In [1]:
import getpass
import os


try:
    # load environment variables from .env file (requires `python-dotenv`)
    from dotenv import load_dotenv

    load_dotenv()
except ImportError:
    pass

os.environ["LANGSMITH_TRACING"] = "true"
if "LANGSMITH_API_KEY" not in os.environ:
    os.environ["LANGSMITH_API_KEY"] = getpass.getpass(\
        prompt="Enter your LangSmith API key (optional): "
    )
if "LANGSMITH_PROJECT" not in os.environ:
    os.environ["LANGSMITH_PROJECT"] = getpass.getpass(
        prompt='Enter your LangSmith Project Name (default = "default"): '
    )
    if not os.environ.get("LANGSMITH_PROJECT"):
        os.environ["LANGSMITH_PROJECT"] = "default"
if "OPENAI_API_KEY" not in os.environ:
    os.environ["OPENAI_API_KEY"] = getpass.getpass(
        prompt="Enter your OpenAI API key (required if using OpenAI): "
    )

## Theme Extraction

https://python.langchain.com/docs/tutorials/extraction/

#### Initialise Environment

In [2]:
# Initiate environment
import getpass
import os

if not os.environ.get("OPENAI_API_KEY"):
  os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter API key for OpenAI: ")

from langchain.chat_models import init_chat_model

llm = init_chat_model("gpt-4o-mini", model_provider="openai")

#### Load Data

In [3]:
# Load Data

def extract_chunks(file_path, lines_per_chunk=10):
    """
    Extracts text from a .txt file and divides it into chunks, each containing a specified number of lines.
    
    Args:
        file_path (str): The path to the text file.
        lines_per_chunk (int): The number of lines per chunk. Defaults to 10.
        
    Returns:
        list: A list of strings, where each string is a chunk containing up to 'lines_per_chunk' lines.
    """
    chunks = []
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            lines = f.readlines()
            
        # Iterate over the lines in steps of 'lines_per_chunk'
        for i in range(0, len(lines), lines_per_chunk):
            # Join the lines to form a chunk, remove tab characters, and strip any trailing whitespace
            chunk = ''.join(lines[i:i+lines_per_chunk]).replace('\t', '').strip()
            if chunk:  # Only add non-empty chunks
                chunks.append(chunk)
    except FileNotFoundError:
        print(f"Error: The file '{file_path}' does not exist.")
    except Exception as e:
        print(f"An error occurred: {e}")
        
    return chunks


import glob

# Get all .txt file paths from the specified directory
file_paths = glob.glob("test_data/*.txt")

# Initialize an empty list for all chunks
input_texts = []

# Loop over each file and extract its chunks
for file_path in file_paths:
    chunks = extract_chunks(file_path)  # using your already-defined function
    input_texts.extend(chunks)  # Add all chunks from this file to the list

# Now input_texts contains chunks from all .txt files
print(f"Collected {len(input_texts)} chunks from {len(file_paths)} files.")

print(input_texts[0])

Collected 378 chunks from 6 files.
﻿In a world of abundant ready-made entertainments, how do we foster creativity and imagination?
Need to see humility with those in power, transparency, and acknowledgement that 'we don't know'.
"Around climate I crave strong leadership" - it's not there. 
How to make time for anything. How can we slow down, enable young people to slow down? Check space for new ideas. 
People from minority backgrounds need to be on boards where decisions are made: excluded, unable to take part. 
For trust, people need to be less fearful - have time. (enable connection to the local... more secure living).
We have issues with the council trusting us to do things for the city'.
Trust: understand the city leadership as humans, break down the distance.
Can we drip-feed info on the why and how of decision - making better.
Cities assemble for imagination and curiosity, not scrutiny... no commitment to it.


In [4]:
# Create codebook

codebook = [
    "Leadership & Governance",
    "Trust & Relationships",
    "Inclusivity & Diversity",
    "Communication & Information",
    "Socioeconomic Inequality",
    "Public Services & Infrastructure",
    "Community Identity & Pride",
    "Youth Engagement & Future Focus",
    "Safety & Security",
    "Public Spaces & Accessibility",
    "Environmental Concerns",
    "Educational Opportunity",
    "Community Cohesion",
    "Future Planning & Vision",
    "Housing & Neighborhoods",
    "Employment & Economic Development",
    "Youth Development",
    "Demographic Changes",
    "Innovation & Creativity",
    "Intergenerational Connections",
    "Climate Change & Sustainability",
    "Health & Wellbeing",
    "Political Alignment & Cooperation",
    "Environmental Access & Inclusion",
    "Natural World Appreciation",
    "Urban Green Infrastructure",
    "Environmental Responsibility",
    "Outdoor City Identity",
    "Transport & Accessibility",
    "Climate Action & Sustainability",
    "Biodiversity & Conservation",
    "Water Resources Management",
    "Community Gardening & Food Production",
    "Environmental Education",
    "Planning & Environmental Integration",
    "Geographic Division and Inequality",
    "Civic Infrastructure",
    "Cultural Identity and Heritage",
    "Governance and Civic Engagement",
    "Education and Youth",
    "Housing and Urban Planning",
    "Diversity and Inclusion",
    "Safety and Perception",
    "Leisure and Recreation",
    "Communication and Promotion",
    "Creative and Cultural Expression",
    "Community Spaces and Facilities",
    "Sports and Physical Activity",
    "Sustainability and Environmental Awareness",
    "Transportation and Mobility",
    "Hospitality and Entertainment",
    "Commercial Development",
    "Heritage and History",
    "Belonging and Identity",
    "Digital and Technological Progress",
    "Authenticity and Character",
    "Volunteering and Civic Participation",
    "Knowledge and Educational Resources",
    "Economic Resilience and Transformation",
    "Social Infrastructure",
    "City Identity and Narrative",
    "Urban Development and Regeneration",
    "Connectivity and Transport",
    "Social Equity and Inclusion",
    "Food Systems and Security",
    "Community Engagement & Participation"
]

In [5]:
# Define Theme and Data class with Pydantic

from typing import Optional

from pydantic import BaseModel, Field
from typing import List, Optional


class Theme(BaseModel):
    """Themes identified in the text"""

    # ^ Doc-string for the entity Person.
    # This doc-string is sent to the LLM as the description of the schema Person,
    # and it can help to improve extraction results.

    # Note that:
    # 1. Each field is an `optional` -- this allows the model to decline to extract it!
    # 2. Each field has a `description` -- this description is used by the LLM.
    # Having a good description can help improve extraction results.
    theme: Optional[str] = Field(
        default=None, description="Name of the theme identified", enum=codebook)
    matching_quotes: Optional[str] = Field(
        default=None, description="All the quotes from the text that match the identified theme, each on a new line."
    )


class Data(BaseModel):
    """Extracted data about themes"""

    # Creates a model so that we can extract multiple entities.
    themes: List[Theme]

In [6]:
# Create prompt template

from typing import Optional

from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from pydantic import BaseModel, Field

# Define a custom prompt to provide instructions and any additional context.
# 1) You can add examples into the prompt template to improve extraction quality
# 2) Introduce additional parameters to take context into account (e.g., include metadata
#    about the document from which the text was extracted.)
prompt_template = ChatPromptTemplate.from_messages(
    [
        (
            "system", "You are an expert qualitative research algorithm. You are given a text to code deductively using a list of codes.\n"
                      "Your job is to extract the themes from the text and their associated quotes. Each quote can have multiple themes and each theme can have multiple quotes.\n"
                      f"You can only extract themes that belong to this list {str(codebook)}"
        ,
        ),

        MessagesPlaceholder("examples"),


        ("human", "{text}"),
    ]
)

In [7]:
# Define Reference Examples

import uuid
from typing import Dict, List, TypedDict

from langchain_core.messages import (
    AIMessage,
    BaseMessage,
    HumanMessage,
    SystemMessage,
    ToolMessage,
)
from pydantic import BaseModel, Field



from langchain_core.utils.function_calling import tool_example_to_messages

examples = [
    (
        # Full text with all sentences and formatting removed:
        "Can we use empty buildings as new spaces for democratic decision-making.\n"
        "Big businesses in Sheffield.... very few.\n"
        "Local politicians engage relatively well where they live... but uninformed about transversal themes (mental health/food).\n"
        "In the age of disinformation, how can we build trust at a community level... build fertile ground for collective action.\n"
        "City leadership and fellow citizens have come out of a rough patch - tree felling.\n"
        "Innovative leadership and transparent decision-making inspire public trust and foster robust community participation.\n"
        "Local area committees are good but not fully started yet.\n"
        "Humans, not organisations make the difference - humans with resources that happen to be within organisations.\n"
        "You have to invest to enable people to be heard... and processes need to be genuine and influence decision making.\n"
        "What if every decision was made with young people/the next generation in mind.\n"
        "How can decisions be announced in a more distributed way... not just at city hall.",
        
        # Structured data with themes and their matching quotes:
        Data(themes=[
            Theme(
                theme="Governance and Civic Engagement",
                matching_quotes="Can we use empty buildings as new spaces for democratic decision-making."
            ),
            Theme(
                theme="Social Equity and Inclusion",
                matching_quotes="Local politicians engage relatively well where they live... but uninformed about transversal themes (mental health/food)."
            ),
            Theme(
                theme="Trust & Relationships",
                matching_quotes="In the age of disinformation, how can we build trust at a community level... build fertile ground for collective action.\n"
                                "Innovative leadership and transparent decision-making inspire public trust and foster robust community participation."
            ),
            Theme(
                theme="Community Engagement & Participation",
                matching_quotes=(
                    "In the age of disinformation, how can we build trust at a community level... build fertile ground for collective action.\n"
                    "You have to invest to enable people to be heard... and processes need to be genuine and influence decision making.\n"
                    "Innovative leadership and transparent decision-making inspire public trust and foster robust community participation."
                )
            ),
            Theme(
                theme="Community Spaces and Facilities",
                matching_quotes="Humans, not organisations make the difference - humans with resources that happen to be within organisations."
            ),
            Theme(
                theme="Youth Engagement & Future Focus",
                matching_quotes="What if every decision was made with young people/the next generation in mind."
            ),
            Theme(
                theme="Communication & Information",
                matching_quotes="How can decisions be announced in a more distributed way... not just at city hall."
            ),
            Theme(
                theme="Leadership & Governance",
                matching_quotes=(
                    "City leadership and fellow citizens have come out of a rough patch - tree felling.\n"
                    "Innovative leadership and transparent decision-making inspire public trust and foster robust community participation."
                )
            ),
        ])
    ),
    # (
    #     """Local area committees are good but not fully started yet.""",
    #     Data(themes=[]),
    # )
]

messages = []

for text, tool_call in examples:
    messages.extend(
        tool_example_to_messages(text, [tool_call])
    )

  tool_example_to_messages(text, [tool_call])


In [11]:
messages[0]

HumanMessage(content='Can we use empty buildings as new spaces for democratic decision-making.\nBig businesses in Sheffield.... very few.\nLocal politicians engage relatively well where they live... but uninformed about transversal themes (mental health/food).\nIn the age of disinformation, how can we build trust at a community level... build fertile ground for collective action.\nCity leadership and fellow citizens have come out of a rough patch - tree felling.\nInnovative leadership and transparent decision-making inspire public trust and foster robust community participation.\nLocal area committees are good but not fully started yet.\nHumans, not organisations make the difference - humans with resources that happen to be within organisations.\nYou have to invest to enable people to be heard... and processes need to be genuine and influence decision making.\nWhat if every decision was made with young people/the next generation in mind.\nHow can decisions be announced in a more distri

In [8]:
input_list = []

for i in input_texts:
    input_list.append({"text": i, "examples": messages})

In [10]:
for key, value in input_list[0].items():
    print(key)
    print(value)
    print('\n')

text
﻿In a world of abundant ready-made entertainments, how do we foster creativity and imagination?
Need to see humility with those in power, transparency, and acknowledgement that 'we don't know'.
"Around climate I crave strong leadership" - it's not there. 
How to make time for anything. How can we slow down, enable young people to slow down? Check space for new ideas. 
People from minority backgrounds need to be on boards where decisions are made: excluded, unable to take part. 
For trust, people need to be less fearful - have time. (enable connection to the local... more secure living).
We have issues with the council trusting us to do things for the city'.
Trust: understand the city leadership as humans, break down the distance.
Can we drip-feed info on the why and how of decision - making better.
Cities assemble for imagination and curiosity, not scrutiny... no commitment to it.


examples
[HumanMessage(content='Can we use empty buildings as new spaces for democratic decision-ma

In [12]:
# Query LLM

runnable = prompt_template | llm.with_structured_output(
    schema=Data,
    method="function_calling",
    include_raw=False,
)



response = runnable.batch(input_list)

In [16]:
# Collect Outputs into Dataframe

import pandas as pd

def create_themes_dataframe(responses):
    """
    Given a list of response dictionaries, each containing a 'themes' list,
    produce a Pandas DataFrame with two columns:
      - 'Theme'
      - 'Matching Quotes'

    Args:
        responses (list): A list of dictionaries or Pydantic-like objects
                          each containing a 'themes' key.

    Returns:
        pd.DataFrame: A DataFrame with all themes and matching quotes
                      aggregated across all responses.
    """
    
    # In case each response is still a pydantic model, convert to dict
    # You can skip this step if your responses are already dictionaries.
    responses = [resp.model_dump() if hasattr(resp, "model_dump") else resp for resp in responses]
    
    # 1. Collect rows from each response
    rows = []
    for response in responses:
        # If responses are Pydantic models, convert them with model_dump() if needed.
        theme_items = response.get("themes", [])
        for t in theme_items:
            rows.append({
                "Theme": t["theme"],
                "Matching Quotes": t["matching_quotes"]
            })
    
    # 2. Create a DataFrame from the collected rows
    df = pd.DataFrame(rows)
    
    # 3. Group by "Theme" and aggregate the matching quotes.
    grouped_data = []
    for theme, group in df.groupby("Theme"):
        # Join all matching quotes for the theme into one string (each separated by a newline)
        all_quotes = "\n".join(group["Matching Quotes"].tolist())
        # Split into individual quotes, strip whitespace, and ignore empty entries.
        quotes_list = [quote.strip() for quote in all_quotes.split("\n") if quote.strip()]
        count = len(quotes_list)
        grouped_data.append({
            "Theme": theme,
            "Matching Quotes": all_quotes,
            "Frequency": count
        })
    
    # 4. Convert the grouped data into a DataFrame.
    df_merged = pd.DataFrame(grouped_data, columns=["Theme", "Matching Quotes", "Frequency"])

    df_merged = df_merged.sort_values('Frequency', ascending=False)

    return df_merged


def display_multiline(df):
    """
    Return a Pandas Styler object that renders the newlines within cells
    in a Jupyter Notebook (or another HTML display).
    
    Call `display(display_multiline(df))` in a Jupyter cell to see multiline quotes.
    """
    return df.style.set_properties(**{'white-space': 'pre-wrap'})


def reduce_matching_quotes(df, n):
    """
    Reduces the number of matching quotes for each theme in the DataFrame to at most n quotes.
    
    Each cell in the "Matching Quotes" column is expected to have quotes separated by newline characters.
    
    Args:
        df (pd.DataFrame): A DataFrame with columns "Theme" and "Matching Quotes".
        n (int): The maximum number of matching quotes to keep for each theme.
        
    Returns:
        pd.DataFrame: A new DataFrame with the "Matching Quotes" column updated to have at most n quotes per theme.
    """
    # Make a copy of the dataframe to avoid modifying the original one.
    df_reduced = df.copy()
    
    # Process the "Matching Quotes" for each row: split by newline, take first n lines, and join them back.
    df_reduced["Matching Quotes"] = df_reduced["Matching Quotes"].apply(
        lambda quotes: "\n".join(quotes.split("\n")[:n])
    )

    return df_reduced

from IPython.display import display

df = create_themes_dataframe(response)
df_sample = reduce_matching_quotes(df,3)

display(display_multiline(df_sample))

Unnamed: 0,Theme,Matching Quotes,Frequency
19,Community Engagement & Participation,"Local area committees are good but not fully started yet. You have to invest to enable people to be heard... and processes need to be genuine and influence decision making. Lots of communities doing great stuff but from a strategic perspective very limited in influencing decision-making, Sheffield did not embrace big local groups.",360
71,Public Spaces & Accessibility,"Why are parks in different areas not managed or maintained equally. S3 not disability friendly otherwise would have stayed. S20 - very green/more attractive space very clean, no fly tipping.",242
21,Community Identity & Pride,"People living in Sheffield all their lives but do not feel they are Sheffielders. London to Sheffield, Birmingham/London Sheffield is great, it's the people, friendly. S3 people and community make it special - safe space.",230
76,Socioeconomic Inequality,Big businesses in Sheffield.... very few. Funding - power. no fairness or equity in funding. Different areas have differing priorities - survival mode vs. leisure focus.,172
63,Leisure and Recreation,Lego sculptures treasure hunt in the city centre. Friendly and approachable. I enjoy coming to the Youth club.,167
70,Public Services & Infrastructure,What is the strategy for Pitsmoor? Why is it not thriving? All outstanding secondary schools located in specific areas - gifted children in poor areas have no chances. Over reliance on public sector and voluntary in Sheffield - where is the private sector.,164
79,Transport & Accessibility,Weekend buses to reservoirs + other destinations. Tramlines are a good example of regional / national stewardship. Bus routes have been cut but regeneration is where the new tram is. Transport is a huge barrier to participation in the city.,164
53,Health & Wellbeing,"S1 church congregation is changing, attending prayers is essential, mental health. Need more wrap around, mental health, social care through to enterprise. Risk of health service for young.",162
22,Community Spaces and Facilities,"What has replaced local pubs for social gatherings? Don’t have to get transport = some areas are community natural spaces. Using buildings - not for housing, but for community, bringing people together.",162
38,Environmental Concerns,Redoing an allotment with my family and they helped by listening to my ideas to make it wildlife friendly. Clean air zone. Maintaining and creating green spaces.,161


## Derive Subthemes from Themes

In [48]:
def extract_top_n_dict(df: pd.DataFrame, n: int) -> list:
    """
    Extracts a list of strings from the top n rows of a DataFrame.
    Each string contains the theme prepended to its matching quote.
    
    Parameters:
    - df: pd.DataFrame
        A pandas DataFrame with at least two columns: 'Theme' and 'Matching Quotes'.
    - n: int
        The number of top rows to select from the DataFrame.
    
    Returns:
    - list
        A list of strings where each string is of the format:
        "<Theme>: <Matching Quote>"
    """
    # Select the top n rows
    top_n = df.sort_values('Frequency', ascending=False).head(n)

    # Create a dictionary mapping from 'Theme' to the combined string: "<theme> <matching_quote>"
    theme_quote_list = [
        f"**{theme}**: {quote}"
        for theme, quote in zip(top_n['Theme'], top_n['Matching Quotes'])
    ]
    
    return theme_quote_list


In [49]:
top_themes = extract_top_n_dict(df, 5)

In [50]:
top_themes[0]

'**Community Engagement & Participation**: Local area committees are good but not fully started yet.\nYou have to invest to enable people to be heard... and processes need to be genuine and influence decision making.\nLots of communities doing great stuff but from a strategic perspective very limited in influencing decision-making, Sheffield did not embrace big local groups.\nNeighbourhoods want to work with the council but doors are not open.\nWeaponization of funding accessibility/restricted resources/funding threats.\nOpportunities to challenge and scrutinise.\nHow can the community make its voice heard (LACs not working)?\nHow do we engage the voices of youth and diverse communities?\nLACs are rubbish - dysfunctional, all chat/no actions or follow up.\nSupport for community organisations - employer links.\nHaving a say in the look and feel of your area.\nMeaningful consultation and local action committees but needs better communication about how to participate.\nAbility to engage i

In [56]:
# Define Theme and Data class with Pydantic

from typing import Optional

from pydantic import BaseModel, Field
from typing import List, Optional


class SubTheme(BaseModel):
    """Themes identified in the text"""

    # ^ Doc-string for the entity Person.
    # This doc-string is sent to the LLM as the description of the schema Person,
    # and it can help to improve extraction results.

    # Note that:
    # 1. Each field is an `optional` -- this allows the model to decline to extract it!
    # 2. Each field has a `description` -- this description is used by the LLM.
    # Having a good description can help improve extraction results.
    theme: Optional[str] = Field(
        default=None, description="Name of the theme")
    subtheme: Optional[str] = Field(
        default=None, description="Name of the sub-theme identified")
    matching_quotes: Optional[str] = Field(
        default=None, description="All the quotes from the text that match the identified subtheme, each on a new line."
    )


class Data(BaseModel):
    """Extracted data about subthemes"""

    # Creates a model so that we can extract multiple entities.
    subthemes: List[SubTheme]

In [57]:
# Create prompt template

from typing import Optional

from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from pydantic import BaseModel, Field

# Define a custom prompt to provide instructions and any additional context.
# 1) You can add examples into the prompt template to improve extraction quality
# 2) Introduce additional parameters to take context into account (e.g., include metadata
#    about the document from which the text was extracted.)
prompt_template = ChatPromptTemplate.from_messages(
    [
        (
            "system", "You are an expert qualitative researcher.\n" 
                      "You will be given some quotes that have been grouped under a theme. Each quote is separated by a new line ('\n').\n" 
                      "Your job is to break down the theme into sub-themes that really show the detail of what the community cares about.\n"
                      "Each quote can have multiple themes and each theme can have multiple quotes.\n"
                      "Please provide as an output each sub-theme identified along with the associated quotes\n",
        ),
        MessagesPlaceholder("examples"),
        ("human", "{text}"),
    ]
)

In [60]:
# Reference Examples
# Define Reference Examples

import uuid
from typing import Dict, List, TypedDict

from langchain_core.messages import (
    AIMessage,
    BaseMessage,
    HumanMessage,
    SystemMessage,
    ToolMessage,
)
from pydantic import BaseModel, Field



from langchain_core.utils.function_calling import tool_example_to_messages

examples = [
    (
        # Full text with all sentences and formatting removed:
        "**Community Engagement & Participation**: "
        "Public/private/community co-creation and involvement.\n"
        "City wide events and debates like the festival of debate where there was opportunity to discuss ideas/experiences.\n"
        "Weaponization of funding accessibility/restricted resources/funding threats.\n"
        "Speaking in a climate protest when I was nine everyone was very helpful.\n"
        "Barriers to participation - language.\n"
        "Sheffield is a city of sanctuary and is a space where new communities can form e.g. Ukrainian refugees.\n"
        "Sheffield is family-orientated, kid-friendly, and it’s clear to students that many people choose to build their lives here.\n"
        "Neighbourhoods want to work with the council but doors are not open.\n"
        "Lots of communities doing great stuff but from a strategic perspective very limited in influencing decision-making.\n"
        "People want to make a difference and singing from the same song sheet will be important.\n"
        "Show up and show your support.\n"
        "Opportunities to challenge and scrutinise.\n"
        "People don’t know who/how to ask - ensuring people affected or impacted by decisions are introduced + diversity of voice.\n"
        "Opportunities are there but lack of trust - needs to feels meaningful + know it made a difference.\n"
        "Support for community organisations - employer links.\n"
        "Community campaign - we helped each other.\n"
        "City of sanctuary - welcoming in spite of national rhetoric.\n"
        "This! Everyone’s helped me feel included and my opinions heard.\n"
        ,

        # Structured data with themes and their matching quotes:
        Data(subthemes=[
            SubTheme(
                theme="Community Engagement & Participation",
                subtheme="Barriers to Engagement (Trust, Funding, Access, Language, Time, Childcare, etc.)",
                matching_quotes="Neighbourhoods want to work with the council but doors are not open.\n"
                "Weaponization of funding accessibility/restricted resources/funding threats.\n"
                "Barriers to participation - language.\n"
                "Opportunities are there but lack of trust - needs to feels meaningful + know it made a difference.\n"
                "People don’t know who/how to ask - ensuring people affected or impacted by decisions are introduced + diversity of voice.\n"
            ),
            SubTheme(
                theme="Community Engagement & Participation",
                subtheme="Collaboration, Co-production, and Partnership (Council, Third Sector, Businesses, Universities)",
                matching_quotes="Lots of communities doing great stuff but from a strategic perspective very limited in influencing decision-making.\n"
                "Support for community organisations - employer links.\n"
                "City wide events and debates like the festival of debate where there was opportunity to discuss ideas/experiences.\n"
                "Public/private/community co-creation and involvement.\n"
            ),
            SubTheme(
                theme="Community Engagement & Participation",
                subtheme="Activism, Volunteering, and Community Spirit",
                matching_quotes="Opportunities to challenge and scrutinise.\n"
                "Speaking in a climate protest when I was nine everyone was very helpful.\n"
                "Community campaign - we helped each other.\n"
                "This! Everyone’s helped me feel included and my opinions heard.\n"
                "Show up and show your support.\n"
            ),
            SubTheme(
                theme="Community Engagement & Participation",
                subtheme="Pride in Sheffield’s Community Identity (Family-Friendly, Welcoming, “City of Sanctuary”)",
                matching_quotes="Sheffield is family-orientated, kid-friendly, and it’s clear to students that many people choose to build their lives here.\n"
                "Sheffield is a city of sanctuary and is a space where new communities can form e.g. Ukrainian refugees.\n"
                "People want to make a difference and singing from the same song sheet will be important.\n"
                "City of sanctuary - welcoming in spite of national rhetoric.\n"
            ),
        ])
    ),
    (
        # Full text with all sentences and formatting removed:
        "**Public Spaces & Accessibility**: "
        "Too many barriers for people with disabilities. Meetings for LACs not hybrid, only live feed.\n"
        "Peak District, high number of parts accessibility.\n"
        "City centre is rubbish - Sheffield does not feel like the 4th largest city.\n"
        "Free of litter, broken glass and fly tipping.\n"
        "Accessibility - language barriers, disability, inclusive approach.\n"
        "Mobility issues impact - buses and tram not accessible.\n"
        "Not very well. Poor accessibility throughout the city even new builds e.g. toilets.\n"
        "Have cleaners.\n"
        "Not good enough signposting in town - bus stops for example.\n"
        "Barriers with bus passes and blue badges - criteria discriminate.\n"
        "Litter - gives a bad impression.\n"
        "Feel like everything you need is within walking distance, or accessible via public transport.\n"
        "Access to nature is limited by transport.\n"
        "Too much litter.\n"
        "Disability friendly city (accessibility info).\n"
        "Access is not equal - cars have best access and are not able to access on bus.\n"
        ,

        # Structured data with themes and their matching quotes:
        Data(subthemes=[
            SubTheme(
                theme="Public Spaces & Accessibility",
                subtheme="Accessibility for People with Disabilities and Inclusive Design",
                matching_quotes="Too many barriers for people with disabilities. Meetings for LACs not hybrid, only live feed.\n"
                "Mobility issues impact - buses and tram not accessible.\n"
                "Not good enough signposting in town - bus stops for example.\n"
                "Accessibility - language barriers, disability, inclusive approach.\n"
                "Disability friendly city (accessibility info).\n"
                "Not very well. Poor accessibility throughout the city even new builds e.g. toilets.\n"
            ),
            SubTheme(
                theme="Public Spaces & Accessibility",
                subtheme="Transport and Mobility (Public Transport, Active Travel, Car Culture)",
                matching_quotes="Feel like everything you need is within walking distance, or accessible via public transport.\n"
                "Peak District, high number of parts accessibility.\n"
                "Barriers with bus passes and blue badges - criteria discriminate.\n"
                "Access is not equal - cars have best access and are not able to access on bus.\n"
                "Access to nature is limited by transport.\n"
            ),
            SubTheme(
                theme="Public Spaces & Accessibility",
                subtheme="Cleanniness, Safety, and Maintenance (Litter, Fly-Tipping, Public Toilets, etc.)",
                matching_quotes="Litter - gives a bad impression.\n"
                "Too much litter.\n"
                "Free of litter, broken glass and fly tipping.\n"
                "Have cleaners.\n"
                "City centre is rubbish - Sheffield does not feel like the 4th largest city.\n"
            ),
        ])
    )
]


messages = []

for text, tool_call in examples:
    messages.extend(
        tool_example_to_messages(text, [tool_call])
    )

In [61]:
# Run Query

runnable = prompt_template | llm.with_structured_output(
    schema=Data,
    method="function_calling",
    include_raw=False,
)

input_texts = top_themes
input_list = []

for i in input_texts:
    input_list.append({"text": i, "examples": messages})

response = runnable.batch(input_list)

In [64]:
response[0]

Data(subthemes=[SubTheme(theme='Community Engagement & Participation', subtheme='Barriers to Participation and Communication', matching_quotes='You have to invest to enable people to be heard... and processes need to be genuine and influence decision making.\nOpportunities are there but lack of trust - needs to feel meaningful + know it made a difference.\nPeople don’t know who/how to ask - ensuring people affected or impacted by decisions are introduced + diversity of voice.\nIndividuals should have more engagement with community organisations.\nI want to be engaged with before a decision is made - not just have a decision explained to me.\nCapacity challenges around participation.\nBarriers to participation - language.\nCommunities - level activities needed to break through the cycle of hopelessness.'), SubTheme(theme='Community Engagement & Participation', subtheme='Desire for Genuine Engagement and Influence', matching_quotes='Local area committees are good but not fully started ye

In [78]:
def create_subthemes_dataframe(responses):
    """
    Given a list of response dictionaries (or Pydantic models) each containing a 'subthemes' key,
    produces a Pandas DataFrame with the columns:
      - 'Theme'
      - 'SubTheme'
      - 'Matching Quotes'
      - 'Frequency'
      
    For each unique (Theme, SubTheme) pair, this function aggregates the matching quotes and counts
    the frequency as the total number of non-empty lines (quotes).
    
    The final DataFrame is sorted first alphabetically by theme and then by frequency (descending)
    
    Args:
        responses (list): A list of dictionaries or Pydantic-like objects, each containing a 'subthemes'
                          key, where 'subthemes' is a list of items with keys 'theme', 'subtheme',
                          and 'matching_quotes'.
    
    Returns:
        pd.DataFrame: A DataFrame with aggregated rows for each (Theme, SubTheme) combination.
    """
    
    # In case each response is a Pydantic model, convert to dict.
    responses = [
        resp.model_dump() if hasattr(resp, "model_dump") else resp
        for resp in responses
    ]
    
    # 1. Collect rows from each response from its 'subthemes' list.
    rows = []
    for response in responses:
        subtheme_items = response.get("subthemes", [])
        for item in subtheme_items:
            rows.append({
                "Theme": item["theme"],
                "SubTheme": item["subtheme"],
                "Matching Quotes": item["matching_quotes"]
            })
    
    # 2. Create a DataFrame from the collected rows.
    df = pd.DataFrame(rows)
    
    # 3. Group by both 'Theme' and 'SubTheme' and aggregate matching quotes and calculate frequency.
    grouped_data = []
    grouped = df.groupby(["Theme", "SubTheme"])
    for (theme, subtheme), group in grouped:
        # Combine all matching quotes into one string (separated by newlines)
        all_quotes = "\n".join(group["Matching Quotes"].tolist())
        # Split the combined string into individual lines, stripping whitespace and filtering out empty lines.
        quotes_list = [quote.strip() for quote in all_quotes.split("\n") if quote.strip()]
        frequency = len(quotes_list)
        grouped_data.append({
            "Theme": theme,
            "SubTheme": subtheme,
            "Matching Quotes": all_quotes,
            "Frequency": frequency
        })
    
    # 4. Convert the grouped data into a DataFrame.
    df_merged = pd.DataFrame(grouped_data, columns=["Theme", "SubTheme", "Matching Quotes", "Frequency"])
    
    # 5. Sort the DataFrame first by Theme (alphabetically) and then by Frequency (descending)
    df_merged = df_merged.sort_values(by=["Theme", "Frequency"], ascending=[True, False])
    
    return df_merged

def extract_top_n_subthemes(aggregated_df, n):
    """
    Extracts the top n subthemes for each theme from an aggregated DataFrame.
    
    Args:
        aggregated_df (pd.DataFrame): A DataFrame with aggregated rows for each (Theme, SubTheme) combination,
                                      including columns 'Theme', 'SubTheme', 'Matching Quotes', and 'Frequency'.
        n (int): The number of top subthemes to extract for each theme.
        
    Returns:
        pd.DataFrame: A DataFrame containing, for each theme, the top n subthemes sorted by descending frequency.
    """
    # Group by "Theme" and for each group select the top n subthemes by frequency.
    top_n_df = aggregated_df.groupby("Theme", group_keys=False).apply(lambda x: x.nlargest(n, 'Frequency'))
    
    return top_n_df


In [81]:
df = create_subthemes_dataframe(response)
df_top_n = extract_top_n_subthemes(df,3)
df_sample = reduce_matching_quotes(df_top_n, 3)

display(display_multiline(df_sample))

  top_n_df = aggregated_df.groupby("Theme", group_keys=False).apply(lambda x: x.nlargest(n, 'Frequency'))


Unnamed: 0,Theme,SubTheme,Matching Quotes,Frequency
1,Community Engagement & Participation,Barriers to Participation and Communication,You have to invest to enable people to be heard... and processes need to be genuine and influence decision making. Opportunities are there but lack of trust - needs to feel meaningful + know it made a difference. People don’t know who/how to ask - ensuring people affected or impacted by decisions are introduced + diversity of voice.,8
5,Community Engagement & Participation,Desire for Genuine Engagement and Influence,"Local area committees are good but not fully started yet. LACs are rubbish - dysfunctional, all chat/no actions or follow up. Need to find different ways for people to engage beyond meetings.",8
4,Community Engagement & Participation,Community-led Initiatives and Events,Community events and festivals. Festival of Debate and Migration Matters as spaces to engage with ideas. Drop ins that encourage leadership from the community - the power is in their hands.,6
14,Community Identity & Pride,Sense of Belonging and Community Connection,People living in Sheffield all their lives but do not feel they are Sheffielders. One participant said they feel more part of a community in Sheffield than they did in their home-town of only 5000 people. Participants feel like ‘adopted Sheffielders’ and that they’ve really bought into being a Sheffield citizen during their time at University.,11
12,Community Identity & Pride,"Pride in Heritage, Culture, and Local Identity","Shared community experiences clearly a core part of being a Sheffield citizen; one participant noted that the musical ‘standing at sky’s edge’ really reflected this to them. Sheffield rarely feels overwhelming as a city, it has a real village-vibe. Sheffield as a village.",10
11,Community Identity & Pride,Perception and Image of the City,Does Sheffield have a brand? How can we thrive on being different from other cities? Sheffield feels like a chaotic place to live - disjointed.,9
19,Leisure and Recreation,"Outdoor Activities and Natural Spaces (Parks, Peaks, & Outdoor Events)",Sheffield has a lot of parks and quality natural space. Outdoor city. extremely well -trailing running - activities -wide reputation attracts visitors. Wild swimming - very fun!,13
17,Leisure and Recreation,"Community and Cultural Events (Festivals, Events, Gigs, Performances)","Good festivals/events - doc fest, festival of debate, cliffhanger, outdoors festival. Tramlines a major Sheffield event but a shame it’s so expensive now - the free gigs are still some of the best parts. Comedy at the showroom.",12
20,Leisure and Recreation,"Social Spaces and Community Connection (Cafes, Pubs, and Social Activities)",Go for walks. Coffee and pub culture. Going to the amazing pubs and breweries.,10
21,Public Spaces & Accessibility,Community Connection and Green Spaces,Green spaces plus intergenerational activities. Community allotments. Green space brings communities together particularly since covid.,7


In [96]:
prompt = prompt_template.invoke({"text": text_input, "examples": messages})

In [97]:
for i in prompt.to_messages():
    i.pretty_print()


You are an expert qualitative research algorithm. You are given a text to code deductively using a list of codes.
Your job is to extract the themes from the text and their associated quotes. Each quote can have multiple themes and each theme can have multiple quotes.
You can only extract themes that belong to this list ['Leadership & Governance', 'Trust & Relationships', 'Inclusivity & Diversity', 'Communication & Information', 'Socioeconomic Inequality', 'Public Services & Infrastructure', 'Community Identity & Pride', 'Youth Engagement & Future Focus', 'Safety & Security', 'Public Spaces & Accessibility', 'Environmental Concerns', 'Educational Opportunity', 'Community Cohesion', 'Future Planning & Vision', 'Housing & Neighborhoods', 'Employment & Economic Development', 'Youth Development', 'Demographic Changes', 'Innovation & Creativity', 'Intergenerational Connections', 'Climate Change & Sustainability', 'Health & Wellbeing', 'Political Alignment & Cooperation', 'Environmental Acce

In [48]:
type(prompt.to_messages()[0])

<class 'langchain_core.messages.system.SystemMessage'>

In [22]:
help(prompt.to_messages)

Help on method to_messages in module langchain_core.prompt_values:

to_messages() -> 'list[BaseMessage]' method of langchain_core.prompt_values.ChatPromptValue instance
    Return prompt as a list of messages.



In [None]:
Seems to not want to map multiple themes to single quote.
Next step try few shot example
Else different architecture
Parsing approach to extraction, without using tools