In [2]:
import getpass
import os


try:
    # Load environment variables from a local .env file when `python-dotenv`
    # is installed; without it, fall through to the interactive prompts below.
    from dotenv import load_dotenv

    load_dotenv()
except ImportError:
    pass

# LangSmith is optional. Only store the key when one was actually entered, so
# that a blank answer does not leave an empty LANGSMITH_API_KEY behind.
if not os.environ.get("LANGSMITH_API_KEY"):
    langsmith_api_key = getpass.getpass(
        prompt="Enter your LangSmith API key (optional): "
    )
    if langsmith_api_key:
        os.environ["LANGSMITH_API_KEY"] = langsmith_api_key

if os.environ.get("LANGSMITH_API_KEY"):
    os.environ["LANGSMITH_TRACING"] = "true"
    if not os.environ.get("LANGSMITH_PROJECT"):
        # The project name is not a secret, so echo it while typing.
        langsmith_project = input(
            'Enter your LangSmith Project Name (default = "default"): '
        ).strip()
        os.environ["LANGSMITH_PROJECT"] = langsmith_project or "default"
else:
    # No key available: switch tracing off instead of tracing with an empty key.
    os.environ["LANGSMITH_TRACING"] = "false"

# Treat an empty value the same as a missing one, and never store an empty key.
if not os.environ.get("OPENAI_API_KEY"):
    openai_api_key = getpass.getpass(
        prompt="Enter your OpenAI API key (required if using OpenAI): "
    )
    if openai_api_key:
        os.environ["OPENAI_API_KEY"] = openai_api_key

### Extractor (Tutorial)

https://python.langchain.com/docs/tutorials/extraction/

#### Initialise Environment

In [1]:
# Initiate environment
import getpass
import os

# Prompt for the OpenAI key only when the environment holds no non-empty value.
if not os.getenv("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass(prompt="Enter API key for OpenAI: ")

from langchain.chat_models import init_chat_model

# Chat model used later to build the structured-output extraction chain.
llm = init_chat_model("gpt-4o-mini", model_provider="openai")

#### Load Data

In [None]:
# Load Data

def extract_chunks(file_path, lines_per_chunk=10):
    """
    Read a text file and split it into chunks of consecutive lines.

    Tab characters are removed from each chunk and surrounding whitespace is
    stripped; chunks that end up empty are dropped.

    Args:
        file_path (str): The path to the text file.
        lines_per_chunk (int): The number of lines per chunk. Defaults to 10.

    Returns:
        list: A list of strings, where each string is a chunk containing up to
            'lines_per_chunk' lines. An empty list is returned (after printing
            a message) when the file is missing or cannot be read/decoded.

    Raises:
        ValueError: If 'lines_per_chunk' is smaller than 1.
    """
    if lines_per_chunk < 1:
        raise ValueError(f"lines_per_chunk must be >= 1, got {lines_per_chunk}")

    try:
        # "utf-8-sig" reads plain UTF-8 too, but also drops a leading byte-order
        # mark; with "utf-8" the BOM stays glued to the first chunk because
        # str.strip() does not treat U+FEFF as whitespace.
        with open(file_path, "r", encoding="utf-8-sig") as f:
            lines = f.readlines()
    except FileNotFoundError:
        print(f"Error: The file '{file_path}' does not exist.")
        return []
    except (OSError, UnicodeDecodeError) as e:
        # Best effort for I/O and decoding problems only; programming errors
        # are no longer swallowed.
        print(f"An error occurred: {e}")
        return []

    chunks = []
    # Walk over the lines in steps of 'lines_per_chunk'.
    for start in range(0, len(lines), lines_per_chunk):
        chunk = "".join(lines[start:start + lines_per_chunk]).replace("\t", "").strip()
        if chunk:  # Only keep non-empty chunks
            chunks.append(chunk)

    return chunks


import glob

# Get all .txt file paths from the specified directory. glob returns paths in
# an unspecified order, so sort them to keep the chunk order (and therefore
# the batch order sent to the model) the same on every run.
file_paths = sorted(glob.glob("test_data/*.txt"))

# Initialize an empty list for all chunks
input_texts = []

# Loop over each file and add all of its chunks to the list
for file_path in file_paths:
    chunks = extract_chunks(file_path)
    input_texts.extend(chunks)

# Now input_texts contains chunks from all .txt files
print(f"Collected {len(input_texts)} chunks from {len(file_paths)} files.")

# Preview the first chunk; guard against an empty or missing data directory,
# which would otherwise raise IndexError here.
if input_texts:
    print(input_texts[0])
else:
    print("No chunks collected - check that test_data/ contains non-empty .txt files.")

Collected 378 chunks from 6 files.
﻿In a world of abundant ready-made entertainments, how do we foster creativity and imagination?
Need to see humility with those in power, transparency, and acknowledgement that 'we don't know'.
"Around climate I crave strong leadership" - it's not there. 
How to make time for anything. How can we slow down, enable young people to slow down? Check space for new ideas. 
People from minority backgrounds need to be on boards where decisions are made: excluded, unable to take part. 
For trust, people need to be less fearful - have time. (enable connection to the local... more secure living).
We have issues with the council trusting us to do things for the city'.
Trust: understand the city leadership as humans, break down the distance.
Can we drip-feed info on the why and how of decision - making better.
Cities assemble for imagination and curiosity, not scrutiny... no commitment to it.


In [2]:
# Create codebook
#
# Closed list of theme labels for deductive coding. It is attached to the
# `theme` field of the Theme schema and is also interpolated into the system
# prompt, so the exact spelling of each label is what the model sees.
#
# NOTE(review): several labels overlap closely (e.g. "Inclusivity & Diversity"
# vs "Diversity and Inclusion", "Climate Change & Sustainability" vs
# "Climate Action & Sustainability", "Transport & Accessibility" vs
# "Transportation and Mobility") - presumably merged from multiple coding
# rounds; confirm whether they should be consolidated.

codebook = [
    "Leadership & Governance",
    "Trust & Relationships",
    "Inclusivity & Diversity",
    "Communication & Information",
    "Socioeconomic Inequality",
    "Public Services & Infrastructure",
    "Community Identity & Pride",
    "Youth Engagement & Future Focus",
    "Safety & Security",
    "Public Spaces & Accessibility",
    "Environmental Concerns",
    "Educational Opportunity",
    "Community Cohesion",
    "Future Planning & Vision",
    "Housing & Neighborhoods",
    "Employment & Economic Development",
    "Youth Development",
    "Demographic Changes",
    "Innovation & Creativity",
    "Intergenerational Connections",
    "Climate Change & Sustainability",
    "Health & Wellbeing",
    "Political Alignment & Cooperation",
    "Environmental Access & Inclusion",
    "Natural World Appreciation",
    "Urban Green Infrastructure",
    "Environmental Responsibility",
    "Outdoor City Identity",
    "Transport & Accessibility",
    "Climate Action & Sustainability",
    "Biodiversity & Conservation",
    "Water Resources Management",
    "Community Gardening & Food Production",
    "Environmental Education",
    "Planning & Environmental Integration",
    "Geographic Division and Inequality",
    "Civic Infrastructure",
    "Cultural Identity and Heritage",
    "Governance and Civic Engagement",
    "Education and Youth",
    "Housing and Urban Planning",
    "Diversity and Inclusion",
    "Safety and Perception",
    "Leisure and Recreation",
    "Communication and Promotion",
    "Creative and Cultural Expression",
    "Community Spaces and Facilities",
    "Sports and Physical Activity",
    "Sustainability and Environmental Awareness",
    "Transportation and Mobility",
    "Hospitality and Entertainment",
    "Commercial Development",
    "Heritage and History",
    "Belonging and Identity",
    "Digital and Technological Progress",
    "Authenticity and Character",
    "Volunteering and Civic Participation",
    "Knowledge and Educational Resources",
    "Economic Resilience and Transformation",
    "Social Infrastructure",
    "City Identity and Narrative",
    "Urban Development and Regeneration",
    "Connectivity and Transport",
    "Social Equity and Inclusion",
    "Food Systems and Security",
    "Community Engagement & Participation"
]

In [3]:
# Define Theme and Data class with Pydantic

from typing import Optional

from pydantic import BaseModel, Field
from typing import List, Optional


class Theme(BaseModel):
    """Themes identified in the text"""

    # ^ Doc-string for the entity Theme.
    # This doc-string is sent to the LLM as the description of the schema Theme,
    # and it can help to improve extraction results, so it is kept verbatim.

    # Note that:
    # 1. Each field is an `optional` -- this allows the model to decline to extract it!
    # 2. Each field has a `description` -- this description is used by the LLM.
    # Having a good description can help improve extraction results.
    theme: Optional[str] = Field(
        default=None,
        description="Name of the theme identified",
        # Advertise the allowed labels in the generated JSON schema. Passing
        # `enum=` as an extra keyword to Field is deprecated in pydantic v2;
        # `json_schema_extra` is the supported way to add schema keys. This
        # only shapes the schema shown to the model - pydantic does not
        # validate the value against the list.
        json_schema_extra={"enum": codebook},
    )
    matching_quotes: Optional[str] = Field(
        default=None, description="All the quotes from the text that match the identified theme, each on a new line."
    )


class Data(BaseModel):
    """Extracted data about themes"""

    # Container schema: wraps a list of Theme entries so that a single
    # extraction call can return several themes at once. This is the schema
    # handed to `with_structured_output` when the chain is built.
    # The docstring above is left untouched because pydantic model docstrings
    # are sent to the LLM as the schema description.
    themes: List[Theme]

In [5]:
# Create prompt template

from typing import Optional

from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from pydantic import BaseModel, Field

# Custom prompt carrying the coding instructions and any extra context:
# 1) few-shot examples are injected through the "examples" placeholder to
#    improve extraction quality;
# 2) further parameters could be introduced to take context into account
#    (e.g. metadata about the document the text was extracted from).
# The codebook is baked into the system message when this cell runs, so the
# cell must be re-run after the codebook changes.
prompt_template = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are an expert qualitative research algorithm. You are given a text to code deductively using a list of codes.\n"
            "Your job is to extract the themes from the text and their associated quotes. Each quote can have multiple themes and each theme can have multiple quotes.\n"
            f"You can only extract themes that belong to this list {codebook!s}",
        ),
        MessagesPlaceholder(variable_name="examples"),
        ("human", "{text}"),
    ]
)

In [7]:
# Define Reference Examples

import uuid
from typing import Dict, List, TypedDict

from langchain_core.messages import (
    AIMessage,
    BaseMessage,
    HumanMessage,
    SystemMessage,
    ToolMessage,
)
from pydantic import BaseModel, Field


class Example(TypedDict):
    """A representation of an example consisting of text input and expected tool calls.

    For extraction, the tool calls are represented as instances of pydantic model.
    """

    input: str  # This is the example text
    tool_calls: List[BaseModel]  # Instances of pydantic model that should be extracted
    # NOTE(review): the locally defined tool_example_to_messages also reads an
    # optional "tool_outputs" key via example.get(...); that key is not
    # declared on this TypedDict.


def tool_example_to_messages(example: Example) -> List[BaseMessage]:
    """Convert an example into a list of messages that can be fed into an LLM.

    The list of messages per example corresponds to:

    1) HumanMessage: contains the content from which content should be extracted.
    2) AIMessage: contains the extracted information from the model, as tool calls.
    3) ToolMessage: one per tool call, confirming to the model that it requested
       the tool correctly.

    The ToolMessage is required because some of the chat models are hyper-optimized for agents
    rather than for an extraction use case.

    NOTE(review): a later cell imports `tool_example_to_messages` from
    `langchain_core.utils.function_calling`, which rebinds this name, and the
    notebook then calls it with two positional arguments (text, [tool_call]).
    This local definition is therefore not the one used when `messages` is built.

    Args:
        example: Mapping with "input" (the example text) and "tool_calls"
            (pydantic model instances). An optional "tool_outputs" entry
            overrides the default confirmation text for each tool call.

    Returns:
        The human message, the AI message carrying the tool calls, and one
        tool message per (output, tool call) pair.
    """
    messages: List[BaseMessage] = [HumanMessage(content=example["input"])]
    tool_calls = [
        {
            "id": str(uuid.uuid4()),
            # Prefer the pydantic v2 API; `.dict()` is the deprecated v1
            # spelling and is kept only as a fallback for v1-style models.
            "args": (
                tool_call.model_dump()
                if hasattr(tool_call, "model_dump")
                else tool_call.dict()
            ),
            # The name of the function right now corresponds
            # to the name of the pydantic model
            "name": tool_call.__class__.__name__,
        }
        for tool_call in example["tool_calls"]
    ]
    messages.append(AIMessage(content="", tool_calls=tool_calls))
    tool_outputs = example.get("tool_outputs") or [
        "You have correctly called this tool."
    ] * len(tool_calls)
    for output, tool_call in zip(tool_outputs, tool_calls):
        messages.append(ToolMessage(content=output, tool_call_id=tool_call["id"]))
    return messages


from langchain_core.utils.function_calling import tool_example_to_messages

# Few-shot reference examples as (input text, expected Data) pairs. They are
# converted to chat messages below and injected through the "examples"
# placeholder of the prompt.
examples = [
    (
        # Example input text: one statement per line.
        "Can we use empty buildings as new spaces for democratic decision-making.\n"
        "Big businesses in Sheffield.... very few.\n"
        "Local politicians engage relatively well where they live... but uninformed about transversal themes (mental health/food).\n"
        "In the age of disinformation, how can we build trust at a community level... build fertile ground for collective action.\n"
        "City leadership and fellow citizens have come out of a rough patch - tree felling.\n"
        "Innovative leadership and transparent decision-making inspire public trust and foster robust community participation.\n"
        "Local area committees are good but not fully started yet.\n"
        "Humans, not organisations make the difference - humans with resources that happen to be within organisations.\n"
        "You have to invest to enable people to be heard... and processes need to be genuine and influence decision making.\n"
        "What if every decision was made with young people/the next generation in mind.\n"
        "How can decisions be announced in a more distributed way... not just at city hall.",

        # Expected structured output: themes and their matching quotes. Some
        # quotes deliberately appear under more than one theme.
        # NOTE(review): two input lines ("Big businesses in Sheffield...." and
        # "Local area committees...") are not assigned to any theme here -
        # confirm that this is intended.
        Data(themes=[
            Theme(
                theme="Governance and Civic Engagement",
                matching_quotes="Can we use empty buildings as new spaces for democratic decision-making."
            ),
            Theme(
                theme="Social Equity and Inclusion",
                matching_quotes="Local politicians engage relatively well where they live... but uninformed about transversal themes (mental health/food)."
            ),
            Theme(
                theme="Trust & Relationships",
                matching_quotes="In the age of disinformation, how can we build trust at a community level... build fertile ground for collective action.\n"
                                "Innovative leadership and transparent decision-making inspire public trust and foster robust community participation."
            ),
            Theme(
                theme="Community Engagement & Participation",
                matching_quotes=(
                    "In the age of disinformation, how can we build trust at a community level... build fertile ground for collective action.\n"
                    "You have to invest to enable people to be heard... and processes need to be genuine and influence decision making.\n"
                    "Innovative leadership and transparent decision-making inspire public trust and foster robust community participation."
                )
            ),
            Theme(
                theme="Community Spaces and Facilities",
                matching_quotes="Humans, not organisations make the difference - humans with resources that happen to be within organisations."
            ),
            Theme(
                theme="Youth Engagement & Future Focus",
                matching_quotes="What if every decision was made with young people/the next generation in mind."
            ),
            Theme(
                theme="Communication & Information",
                matching_quotes="How can decisions be announced in a more distributed way... not just at city hall."
            ),
            Theme(
                theme="Leadership & Governance",
                matching_quotes=(
                    "City leadership and fellow citizens have come out of a rough patch - tree felling.\n"
                    "Innovative leadership and transparent decision-making inspire public trust and foster robust community participation."
                )
            ),
        ])
    ),
    # Disabled negative example (input with no themes to extract):
    # (
    #     """Local area committees are good but not fully started yet.""",
    #     Data(themes=[]),
    # )
]

# Flatten every (input text, expected output) pair into one chat-message list
# that is passed to the prompt's "examples" placeholder.
messages = []

for text, tool_call in examples:
    messages += tool_example_to_messages(text, [tool_call])

  tool_example_to_messages(text, [tool_call])


In [52]:
# Query LLM

# Extraction chain: the prompt feeds the chat model, which is asked (via
# function calling) to answer with an instance of the `Data` schema.
runnable = prompt_template | llm.with_structured_output(
    schema=Data,
    method="function_calling",
    include_raw=False,
)

# One request payload per text chunk; all of them share the same few-shot messages.
input_list = [{"text": chunk, "examples": messages} for chunk in input_texts]

response = runnable.batch(input_list)

In [53]:
# Collect Outputs into Dataframe

import pandas as pd

def create_themes_dataframe(responses):
    """
    Aggregate the themes extracted from many responses into one row per theme.

    Args:
        responses (list): Items that are either dictionaries with a 'themes'
            key, or objects exposing `model_dump()` that returns such a
            dictionary. `None` items are skipped.

    Returns:
        pd.DataFrame: Columns 'Theme', 'Matching Quotes' (every quote string
            recorded for the theme, joined with newlines) and 'Frequency'
            (number of non-blank quote lines), sorted by 'Frequency' in
            descending order. When no themes are found, an empty DataFrame
            with these three columns is returned.
    """
    columns = ["Theme", "Matching Quotes", "Frequency"]

    # 1. Collect one row per (theme, quotes) entry across all responses.
    rows = []
    for response in responses:
        if response is None:
            continue
        # Pydantic models are converted to plain dictionaries first.
        if hasattr(response, "model_dump"):
            response = response.model_dump()
        for item in response.get("themes") or []:
            theme = item.get("theme")
            if theme is None:
                # Both schema fields are optional. Entries without a theme
                # cannot be grouped (groupby would drop them anyway).
                continue
            rows.append({
                "Theme": theme,
                # A missing quote becomes "" so the newline join below cannot
                # fail on None.
                "Matching Quotes": item.get("matching_quotes") or "",
            })

    # Grouping a frame built from zero rows would raise KeyError('Theme').
    if not rows:
        return pd.DataFrame(columns=columns)

    df = pd.DataFrame(rows)

    # 2. Group by theme and merge the quotes of each group.
    grouped_data = []
    for theme, group in df.groupby("Theme"):
        # Join all matching quotes for the theme into one newline-separated string.
        all_quotes = "\n".join(group["Matching Quotes"].tolist())
        # Count individual quote lines, ignoring blank ones.
        quotes_list = [quote.strip() for quote in all_quotes.split("\n") if quote.strip()]
        grouped_data.append({
            "Theme": theme,
            "Matching Quotes": all_quotes,
            "Frequency": len(quotes_list)
        })

    # 3. Most frequent themes first; a stable sort keeps tied themes in
    #    alphabetical order so the table is reproducible.
    df_merged = pd.DataFrame(grouped_data, columns=columns)
    return df_merged.sort_values("Frequency", ascending=False, kind="stable")


def display_multiline(df):
    """
    Build a Styler for `df` whose cells use the CSS rule
    'white-space: pre-wrap', so that newlines inside a cell are rendered as
    line breaks in a Jupyter Notebook (or another HTML display).

    Use `display(display_multiline(df))` in a Jupyter cell to see multiline quotes.
    """
    cell_css = {"white-space": "pre-wrap"}
    styler = df.style
    return styler.set_properties(**cell_css)


def reduce_matching_quotes(df, n):
    """
    Return a copy of `df` in which every "Matching Quotes" cell is cut down to
    its first n newline-separated lines.

    Args:
        df (pd.DataFrame): A DataFrame with columns "Theme" and "Matching Quotes";
            each "Matching Quotes" cell is a string of quotes separated by newlines.
        n (int): The maximum number of lines to keep in each cell.

    Returns:
        pd.DataFrame: A new DataFrame; the input frame is left unmodified.
    """

    def _keep_first_lines(cell):
        # Split on newlines, keep the leading n pieces, and glue them back together.
        pieces = cell.split("\n")
        return "\n".join(pieces[:n])

    # Work on a copy so the caller's frame is not touched.
    trimmed = df.copy()
    trimmed["Matching Quotes"] = trimmed["Matching Quotes"].apply(_keep_first_lines)
    return trimmed

from IPython.display import display

# Aggregate the batch responses into one row per theme.
df = create_themes_dataframe(response)
# Preview only: keep the first 3 quote lines of each theme.
df_sample = reduce_matching_quotes(df,3)

# Render through the Styler so quotes separated by newlines show up on separate lines.
display(display_multiline(df_sample))

Unnamed: 0,Theme,Matching Quotes,Frequency
22,Community Engagement & Participation,"You have to invest to enable people to be heard... and processes need to be genuine and influence decision making. Basic principles - residents leading change local level first, support for participants enabled through flexible regularity, but we have a very top down city. Lots of communities doing great stuff but from a strategic perspective very limited in influencing decision-making, Sheffield did not embrace big local groups.",356
74,Public Spaces & Accessibility,"S3 not disability friendly otherwise would have stayed. S20 - very green/more attractive space very clean, no fly tipping. Parks and recreation - where are all the park keepers?",235
25,Community Identity & Pride,"People living in Sheffield all their lives but do not feel they are Sheffielders. Alternative grassroots ways to empower communities - Sheffield's radical past/history forgotten. London to Sheffield, Birmingham/London Sheffield is great, it's the people, friendly.",229
41,Environmental Concerns,Peaceful protests about climate change. Clean air zone. Maintaining and creating green spaces.,182
52,Health & Wellbeing,"S1 church congregation is changing, attending prayers is essential, mental health. Risk of health service for young. Mental and physical health issues are a huge barrier to participation.",180
81,Socioeconomic Inequality,Big businesses in Sheffield.... very few. Funding - power. no fairness or equity in funding. Why are parks in different areas not managed or maintained equally.,176
27,Community Spaces and Facilities,"Elected leaders feel more remote from communities - where are the community centres? LACs are rubbish - dysfunctional, all chat/no actions or follow up. What has replaced local pubs for social gatherings?",163
56,Inclusivity & Diversity,"People from minority backgrounds need to be on boards where decisions are made: excluded, unable to take part. Decolonise retail shopping (muslim outlets in Firth Park). Representation and inclusion and equity.",163
86,Transport & Accessibility,Weekend buses to reservoirs + other destinations. Bus routes have been cut but regeneration is where the new tram is. Transport is a huge barrier to participation in the city. People struggle to navigate transport issues.,157
21,Community Cohesion,"Local area committees are good but not fully started yet. Humans, not organisations make the difference - humans with resources that happen to be within organisations. No glue points where community assets are richer.",157


In [96]:
# Render the full prompt for a single chunk so it can be inspected.
# `text_input` was not defined anywhere in the notebook (it only existed as
# leftover kernel state), so bind it explicitly to the first collected chunk.
text_input = input_texts[0]
prompt = prompt_template.invoke({"text": text_input, "examples": messages})

In [97]:
# Print every message of the rendered prompt (system, few-shot examples, human).
for message in prompt.to_messages():
    message.pretty_print()


You are an expert qualitative research algorithm. You are given a text to code deductively using a list of codes.
Your job is to extract the themes from the text and their associated quotes. Each quote can have multiple themes and each theme can have multiple quotes.
You can only extract themes that belong to this list ['Leadership & Governance', 'Trust & Relationships', 'Inclusivity & Diversity', 'Communication & Information', 'Socioeconomic Inequality', 'Public Services & Infrastructure', 'Community Identity & Pride', 'Youth Engagement & Future Focus', 'Safety & Security', 'Public Spaces & Accessibility', 'Environmental Concerns', 'Educational Opportunity', 'Community Cohesion', 'Future Planning & Vision', 'Housing & Neighborhoods', 'Employment & Economic Development', 'Youth Development', 'Demographic Changes', 'Innovation & Creativity', 'Intergenerational Connections', 'Climate Change & Sustainability', 'Health & Wellbeing', 'Political Alignment & Cooperation', 'Environmental Acce

In [48]:
# Debugging aid: check the class of the first rendered prompt message.
type(prompt.to_messages()[0])

<class 'langchain_core.messages.system.SystemMessage'>

In [22]:
# Debugging aid left over from exploration: show the docs for `to_messages`.
help(prompt.to_messages)

Help on method to_messages in module langchain_core.prompt_values:

to_messages() -> 'list[BaseMessage]' method of langchain_core.prompt_values.ChatPromptValue instance
    Return prompt as a list of messages.



In [None]:
# Observations and next steps:
# - The model seems reluctant to map multiple themes to a single quote.
# - Next step: try few-shot examples.
# - Otherwise, try a different architecture, e.g. a parsing-based approach to
#   extraction that does not use tools.