# Topic Modeling + Plots for Paper

## Setup

In [None]:
import os
import time
import xml.etree.ElementTree as ET

import pandas as pd
import anthropic
from datetime import date
from pathlib import Path
import matplotlib.ticker as mtick
import numpy as np
from scipy.stats import entropy
import seaborn as sns
import opinionated  # noqa
import matplotlib.pyplot as plt
from dotenv import load_dotenv
from tqdm import tqdm

# Apply the "opinionated_rc" matplotlib style
# (presumably registered by the `opinionated` import above -- confirm)
plt.style.use("opinionated_rc")

# Load the Anthropic API key from the .env file in the parent directory.
# `load_dotenv` expects the path of the dotenv FILE itself; a bare directory
# such as "../" is not a file, so no variables would be loaded from it.
load_dotenv("../.env")

# Maps the short code that prefixes each meeting "name" to a display name
# in "City, ST" form.
CITY_SHORTCODE_NAME_LUT = {
    "AA": "Ann Arbor, MI",
    "RO": "Royal Oak, MI",
    "AP": "Alpena, MI",
    "CS": "Cedar Springs, MI",
    "GA": "Garden City, MI",
    "IN": "Inkster, MI",
    "JS": "Jackson, MI",
    "LS": "Lansing, MI",
    "LV": "Lathrup Village, MI",
    "LN": "Livonia, MI",
    "MH": "Madison Heights, MI",
    "MT": "Manistee, MI",
    "MP": "Memphis, MI",
    "MC": "Mt Clemens, MI",
    "PE": "Perry, MI",
    "PR": "Pleasant Ridge, MI",
    # NOTE(review): the only non-Michigan entry besides Seattle and Oakland --
    # confirm this is Richmond, VA and not Richmond, MI.
    "RM": "Richmond, VA",
    "SL": "Saline, MI",
    "SC": "St Clair, MI",
    "SH": "Sterling Heights, MI",
    "WT": "Williamston, MI",
    "SEA": "Seattle, WA",
    "OAK": "Oakland, CA",
}

# Short codes of the cities that are kept for analysis
CITIES_OF_INTEREST = ["AA", "RO", "JS", "LS", "SEA", "OAK"]

# Absolute path of the directory that holds the annotated CSV files
ANNOTATIONS_DIR = Path("../data/annotated-for-modeling/").resolve()

In [None]:
# Three-colour palette: RGB triplets on a 0-255 scale, rescaled to 0-1 floats
# before being installed as the default seaborn palette.
COLORBREWER_PALETTE = (
    np.array(
        [
            [27, 158, 119],  # green
            [217, 95, 2],  # orange
            [117, 112, 179],  # purple
        ]
    )
    / 255
)
sns.set_palette(COLORBREWER_PALETTE)

## Data Cleaning and Subsetting

In [None]:
# Accumulator: one cleaned DataFrame is appended per annotation CSV read below,
# and the list is concatenated into a single frame afterwards
data_dfs = []


def split_short_name_to_city_and_date(short_name: str) -> tuple[str, date]:
    """Split a meeting short name into its city short code and meeting date.

    The name is split on "_". The first piece is the city short code, the
    second and third pieces are the month and day, and the LAST piece is a
    two-digit year that is prefixed with "20".

    Example: "OAK_02_16_21" -> ("OAK", date(2021, 2, 16)).

    Raises:
        ValueError: if a date piece is not an integer or the date is invalid.
        IndexError: if the month or day piece is missing.
    """
    pieces = short_name.split("_")

    # Convert in year, month, day order
    year = int("20" + pieces[-1])
    month = int(pieces[1])
    day = int(pieces[2])

    return pieces[0], date(year=year, month=month, day=day)


# Read all data
# Files are visited in sorted order so the row order of the combined frame is
# reproducible (the order yielded by `Path.glob` is not guaranteed).
for filepath in sorted(ANNOTATIONS_DIR.glob("*.csv")):
    # Read the comment data
    df = pd.read_csv(filepath)

    # Normalise column names: lowercase, with spaces replaced by "_"
    df.columns = df.columns.str.lower().str.replace(" ", "_")

    # Split the "name" column into "city_short_code" and "date".
    # Each name is parsed once; building the columns from lists also works
    # for a CSV with zero rows, where unpacking `zip(*...)` would fail.
    parsed_names = [split_short_name_to_city_and_date(name) for name in df["name"]]
    df["city_short_code"] = [short_code for short_code, _ in parsed_names]
    df["date"] = [event_date for _, event_date in parsed_names]

    # Add the city name (NaN for short codes missing from the lookup table)
    df["city_name"] = df["city_short_code"].map(CITY_SHORTCODE_NAME_LUT)

    # Add a year-month column (the date with its day set to 1)
    df["year_month"] = df["date"].apply(lambda x: x.replace(day=1))

    # Using the filename, mark if this was a "training" or "inferred" dataset:
    # the portion is the text after the last "_" in the file stem
    df["dataset_portion"] = filepath.stem.split("_")[-1]

    # Add this file's data to the list
    data_dfs.append(df)

# Fail with an actionable message instead of pandas' generic
# "No objects to concatenate" when the directory has no CSV files
if not data_dfs:
    raise FileNotFoundError(f"No annotation CSV files found in {ANNOTATIONS_DIR}")

# Concatenate all data. `ignore_index=True` gives every row a unique label;
# without it each file's 0..n-1 index repeats in the combined frame.
full_data = pd.concat(data_dfs, ignore_index=True)

# TODO notes carried over from the original:
#   - Cap the portions
#   - Change test to validate

# Replace the filename-derived portion labels with standard names;
# labels that are not keys of the mapping are left unchanged
portion_renames = {"truth": "train", "pred": "inferred", "val": "test"}
full_data["dataset_portion"] = full_data["dataset_portion"].replace(portion_renames)

# Subset the data to only the columns we care about, in this order
columns_to_keep = [
    "name",
    "city_short_code",
    "city_name",
    "date",
    "year_month",
    "dataset_portion",
    "meeting_section",
    "speaker_role",
    "start",
    "end",
    "text",
]
full_data = full_data[columns_to_keep]

# Spot-check the rows of a single meeting (shown as the cell output)
is_spot_check_meeting = full_data["name"].str.contains("OAK_02_16_21")
full_data.loc[is_spot_check_meeting]

In [None]:
# Keep only rows from the cities of interest
in_scope_city = full_data["city_short_code"].isin(CITIES_OF_INTEREST)
full_data = full_data.loc[in_scope_city]

# Keep only rows from the "Public Comment" section spoken by a "Commenter";
# every other meeting section and speaker role is dropped
in_public_comment_section = full_data["meeting_section"] == "Public Comment"
spoken_by_commenter = full_data["speaker_role"] == "Commenter"
meeting_comments = full_data.loc[in_public_comment_section & spoken_by_commenter]

meeting_comments.shape

# City display order; the trailing figures are the populations noted by the author
city_order = [
    "Seattle, WA",  # 737,015
    "Oakland, CA",  # 440,646
    "Ann Arbor, MI",  # 123,851
    "Lansing, MI",  # 112,644
    "Royal Oak, MI",  # 58,211
    "Jackson, MI",  # 31,309
]

## Comments by City by Dataset Portion

In [None]:
# Get the percent of comments by city (per month)
# i.e. how much of the training or inferred data comes from this month

# First get the total number of comments per city
city_comment_counts = meeting_comments["city_name"].value_counts()

# Get the total number of comments per city per month
city_month_comment_counts = meeting_comments.groupby(["city_name", "year_month"]).size()

# Look both totals up for every comment with vectorised operations instead of
# a Python-level `iterrows` loop. The values are taken as NumPy arrays so the
# division below involves no index alignment.
city_totals_per_row = meeting_comments["city_name"].map(city_comment_counts).to_numpy()
city_month_totals_per_row = (
    meeting_comments.groupby(["city_name", "year_month"])["name"]
    .transform("size")
    .to_numpy()
)

# One output row per comment, in the original row order, with a fresh index
processed_percent_comments = (
    meeting_comments[["name", "city_name", "year_month", "date", "dataset_portion"]]
    .rename(columns={"dataset_portion": "Dataset Portion"})
    .reset_index(drop=True)
)
processed_percent_comments.insert(
    loc=4,
    column="percent_of_comments",
    value=(city_month_totals_per_row / city_totals_per_row) * 100,
)

# Stack the cities in `city_order`, each sorted by month ascending.
# Rows whose city is not listed in `city_order` are left out.
percent_comments_df = pd.concat(
    [
        processed_percent_comments[
            processed_percent_comments["city_name"] == city
        ].sort_values("year_month", ascending=True)
        for city in city_order
    ]
)

# Bar plot of percent_of_comments by date, one facet per city (3 facets per
# row), with bars coloured by dataset portion
g = sns.catplot(
    data=percent_comments_df,
    x="date",
    y="percent_of_comments",
    hue="Dataset Portion",
    col="city_name",
    col_wrap=3,
    kind="bar",
)

# Update the x-axis, and y-axis labels
# NOTE(review): the x variable is "date" while the axis is labelled "Month"
# and the plotted percentage is computed per month -- confirm this is intended
g.set_axis_labels("Month", "Percent of Total Public Comments")
g.figure.autofmt_xdate()
# Blank the titles first, then put the city name in the right-aligned title
# (presumably so the default centred facet title does not remain -- confirm)
g.set_titles("")
g.set_titles("{col_name}", loc="right")

# For each ax in the figure, limit the x-axis to at most 6 major ticks
# (MaxNLocator(6)) and format the y-axis ticks as whole-number percentages
for ax in g.axes:
    ax.xaxis.set_major_locator(mtick.MaxNLocator(6))
    ax.yaxis.set_major_formatter(mtick.PercentFormatter(decimals=0))

# Tight Layout
g.figure.tight_layout()

# Move the legend outside and to the right of the plot
sns.move_legend(g, loc="upper right", bbox_to_anchor=(1.1, 1))

In [None]:
# Number of Oakland rows per (meeting name, month, dataset portion),
# shown sorted by meeting name
oakland_rows = percent_comments_df.loc[percent_comments_df["city_name"] == "Oakland, CA"]
oakland_meeting_counts = oakland_rows[
    ["name", "year_month", "Dataset Portion"]
].value_counts()
oakland_meeting_counts.to_frame().sort_values("name")

## Topic Modeling

In [None]:
# Candidate topics, each with seed keywords. The names and keywords are
# rendered verbatim into the classification prompt, so spelling matters.
# Some keywords appear under more than one topic ("events", "accountability",
# "discrimination").
TOPIC_SEEDS = {
    "Housing and Urban Development": [
        "zoning",
        "construction",
        "redevelopment",
        "growth",
        "planning",
        "housing",
        "rent",
        "single family",
        "duplex",
        "apartment",
    ],
    "Transportation and Mobility": [
        "public transit",
        "traffic",
        "bus",
        "car",
        "bike lanes",
        "pedestrian",
        "parking",
    ],
    "Public Safety and Law Enforcement": [
        "police",
        "crime",
        "emergency",
        "safety",
        "property",
        "theft",
        "violence",
        "gun",
    ],
    "Environment and Sustainability": [
        "climate",
        "green",
        "conservation",
        "energy",
    ],
    "Homelessness": [
        "homeless",
        "eviction",
        "shelter",
        "outreach",
        "mental health",
        "substance abuse",
    ],
    "Parks and Recreation": [
        "parks",
        "outdoors",
        "community",
        "events",
        "greenspace",
    ],
    "Economic Development": [
        "business",
        "jobs",
        "tax",
        "revitalization",
    ],
    "Arts and Culture": [
        "events",
        "festivals",
        "museums",
        "performances",
        "sculpture",
        "public art",
    ],
    "Education and Youth Services": [
        "schools",
        "libraries",
        "programs",
        "youth",
        "kids",
        "students",
    ],
    "Governance and Civic Engagement": [
        "transparency",
        "public participation",
        "elections",
        "accountability",
    ],
    "Israel-Palestine": [
        "Israel",
        "Palestine",
        "genocide",
        "Hamas",
        "Jewish",
        "Muslim",
        "discrimination",
    ],
    "Police Reform": [
        "accountability",
        "community oversight",
        "training",
        "defund",
        "reform",
        "discrimination",
        "racist",
    ],
    "Utilities": [
        "water",
        "electricity",
        "sewage",
        "internet",
        "utilities",
        "services",
        "waste",
    ],
}

In [None]:
# Anthropic API client; raises KeyError here if ANTHROPIC_API_KEY is not set
client = anthropic.Anthropic(api_key=os.environ["ANTHROPIC_API_KEY"])

# System prompt sent with every classification request
# (surrounding whitespace is stripped)
SYSTEM_MESSAGE_PROMPT = """
You are assisting a computational social science researcher by classifying public comments from city council meetings into various topics. The individual comment to be classified, and the possible topical classifications will be provided to you. Always format your response in XML.
""".strip()  # noqa: E501

# User message template. It has two `str.format` placeholders:
#   {topic_list_str} -- the rendered list of topics and their keywords
#   {public_comment} -- the text of the comment to classify
# The template asks for a <classification-response> element containing a
# <reasoning> and a <topic> child, and allows "Other" when no topic fits.
USER_MESSAGE_PROMPT = """
### Context

You are tasked with classifying a public comment into a single topic from a provided list.

### Steps

1. First, review the list of topics.

2. Next, carefully read the public comment.

3. Classify this comment into exactly one of the topics from the provided list. Consider the main theme or subject of the comment and how it aligns with the available topics.

4. Before making your final classification, provide your reasoning in two sentences. Explain why you believe the comment fits best into the topic you've chosen. Include this reasoning within <reasoning> tags.

5. After providing your reasoning, state your final classification. Choose only one topic from the list provided, or, if none of the topics are appropriate, choose "Other".

Remember:
- Choose only one topic for classification.
- Provide clear reasoning for your choice.
- Ensure your classification is based solely on the content of the public comment and the provided list of topics, or, if necessary, choose "Other".

Here is an example response structure:

<classification-response>
    <reasoning>...</reasoning>
    <topic>...</topic>
</classification-response>

### Topic Information

{topic_list_str}

### Public Comment

{public_comment}

### Classification

""".strip()  # noqa: E501

# Template for one topic: the topic name on the first line, followed by a
# "- " bulleted list of its keywords
TOPIC_SEED_STR = """
{topic_name}
- {keywords}
""".strip()

# Render every topic in TOPIC_SEEDS (in dict order), one keyword per bullet
topic_seed_strs = [
    TOPIC_SEED_STR.format(
        topic_name=seed_topic,
        keywords="\n- ".join(seed_keywords),
    )
    for seed_topic, seed_keywords in TOPIC_SEEDS.items()
]

# Topics are separated by a blank line in the final prompt section
TOPIC_SEED_LIST_STR = "\n\n".join(topic_seed_strs)

def _extract_topic(response_text: str) -> str:
    """Return the topic named in a classification response.

    If the text contains a <classification-response>...</classification-response>
    span, only that span is parsed, so prose before or after the XML does not
    break parsing. Otherwise the whole text is parsed as XML.

    Raises:
        xml.etree.ElementTree.ParseError: if the XML is malformed.
        ValueError: if there is no <topic> child or it has no text.
    """
    open_tag = "<classification-response>"
    close_tag = "</classification-response>"
    start = response_text.find(open_tag)
    end = response_text.rfind(close_tag)
    if start != -1 and end > start:
        response_text = response_text[start : end + len(close_tag)]

    root = ET.fromstring(response_text)

    # Only the topic is used; the reasoning is thrown away
    topic = root.findtext("topic")
    if topic is None or not topic.strip():
        raise ValueError("response has no <topic> value")
    return topic.strip()


# Classify a random sample of (at most) 50 comments.
# NOTE: this is a sample, not every comment. The sample is capped at the
# number of available comments so `sample` cannot raise on a small frame.
comments_to_classify = meeting_comments.sample(n=min(50, len(meeting_comments)))

classified_comments = []
for _, row in tqdm(
    comments_to_classify.iterrows(),
    desc="Classifying Comments",
    # Must match the number of rows actually iterated, otherwise the
    # progress bar never reaches 100%
    total=len(comments_to_classify),
):
    # Sleep to avoid rate limiting
    time.sleep(0.5)

    try:
        # Send the message
        message = client.messages.create(
            model="claude-3-5-sonnet-20240620",
            max_tokens=1000,
            temperature=0,
            system=SYSTEM_MESSAGE_PROMPT,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": USER_MESSAGE_PROMPT.format(
                                topic_list_str=TOPIC_SEED_LIST_STR,
                                public_comment=row["text"],
                            ),
                        }
                    ],
                }
            ],
        )

        # Unpack the content and pull the topic out of the XML
        topic = _extract_topic(message.content[0].text)

    except Exception as e:
        # Deliberately best-effort: one failed request or unparsable response
        # must not abort the run. The comment is skipped and the failure is
        # reported with the meeting it came from.
        print(f"Failed to classify a comment from {row['name']}: {e!r}")
        continue

    # Keep the original row and add the topic
    classified_comments.append(
        {
            **row.to_dict(),
            "topic": topic,
        }
    )

In [None]:
# Collect the classified rows into a frame and write it to a CSV file
# in the working directory, without the index column
classified_comments_df = pd.DataFrame(data=classified_comments)
classified_comments_df.to_csv(
    path_or_buf="claude-35-sonnet-classifications.csv",
    index=False,
)

In [None]:
# Build the plotting frame from the classifier output. `meeting_comments` has
# no topic column (its columns are fixed when `full_data` is subset), so
# indexing it with "topic_name" raises KeyError. The topics live in the
# "topic" column of `classified_comments_df`, renamed here to the "topic_name"
# that the plotting cells use.
plotting_comments = classified_comments_df.rename(columns={"topic": "topic_name"})

# Filter out all undefined topics
# NOTE(review): the classification prompt's fallback label is "Other", not
# "Undefined" -- confirm whether "Other" should be excluded here as well.
plotting_comments = plotting_comments[plotting_comments["topic_name"] != "Undefined"]

### Percent of Comments by Topic

In [None]:
# TODO notes carried over from the original:
#   - Keep this version of "all of the data".
#   - For the next two versions of the heatmap, fit the topic model on the
#     training and valid set and apply on the inferred set.
#   - Add one version of the heatmap that uses the "inferred" with the true
#     comments.
#   - Add one version of the heatmap that uses the "inferred" with the
#     inferred comments.
#   - Ignore the "Commenter" Speaker Role check for the "inferred" inferred
#     data.
#   - Get keywords for each major topic, and a few representative comments.
#   - Flip the axes and make versions of the plots with the keywords.

# Number of comments for every (city, topic) pair that occurs in the data
city_topic_sizes = plotting_comments.groupby(["city_name", "topic_name"]).size()
topic_city_counts = city_topic_sizes.reset_index(name="comment_count")

# Share of each city's comments that fall in each topic.
# Despite the "pct" name this is a 0-1 fraction, not a 0-100 percentage.
city_comment_totals = topic_city_counts.groupby("city_name")["comment_count"].transform(
    "sum"
)
topic_city_counts["comment_pct"] = topic_city_counts["comment_count"] / city_comment_totals

# Heatmap of each city's topic shares (rows: cities, columns: topics).
# `comment_pct` is a 0-1 fraction, so the cells are annotated with a percent
# format; a plain ".1f" would print every cell as a value between 0.0 and 1.0.
sns.heatmap(
    data=topic_city_counts.pivot(
        index="city_name", columns="topic_name", values="comment_pct"
    ),
    annot=True,
    fmt=".1%",
)
_ = plt.xticks(rotation=45, ha="right")

## Safety Checks on Model

In [None]:
# Per-city model metrics. The keys are city names WITHOUT the state suffix.
city_name_f1_lut = {
    "Ann Arbor": 0.854,
    "Royal Oak": 0.781,
    "Jackson": 0.789,
    "Lansing": 0.627,
    "Seattle": 0.957,
    "Oakland": 0.719,
}

city_name_k_alpha_lut = {
    "Ann Arbor": 0.918,
    "Royal Oak": 0.931,
    "Jackson": 0.876,
    "Lansing": 0.953,
    "Seattle": 0.982,
    "Oakland": 0.900,
}

# Create dataframe of city_name, f1, k_alpha, comment_count,
# and entropy(topic_percents)
city_f1_entropy_rows = []
for city_name, city_df in topic_city_counts.groupby("city_name"):
    # `city_name` values carry a state suffix (e.g. "Ann Arbor, MI") while the
    # lookup tables are keyed by the bare city name. Looking up the full name
    # skips every city and leaves the frame empty, so strip the suffix first.
    city_key = city_name.partition(",")[0].strip()
    if city_key not in city_name_f1_lut:
        continue

    # Get the f1 and k_alpha
    f1 = city_name_f1_lut[city_key]
    k_alpha = city_name_k_alpha_lut[city_key]

    # Get the comment count
    comment_count = city_df["comment_count"].sum()

    # Get the entropy of this city's topic shares
    topic_percents = city_df["comment_pct"].values
    entropy_val = entropy(topic_percents)

    city_f1_entropy_rows.append(
        {
            "city_name": city_name,
            "model_f1": f1,
            "k_alpha": k_alpha,
            "comment_count": comment_count,
            "topic_entropy": entropy_val,
        }
    )

city_f1_entropy_df = pd.DataFrame(city_f1_entropy_rows)
city_f1_entropy_df

### Model F1 by Number of Comments

In [None]:
# Regression plot: model F1 (y) against number of comments per city (x)
_ = sns.lmplot(data=city_f1_entropy_df, x="comment_count", y="model_f1")

### Model F1 by Entropy of Topic Distribution

In [None]:
# Regression plot: model F1 (y) against topic-distribution entropy (x)
_ = sns.lmplot(data=city_f1_entropy_df, x="topic_entropy", y="model_f1")

In [None]:
# Regression plot: model F1 (y) against k_alpha (x)
_ = sns.lmplot(data=city_f1_entropy_df, x="k_alpha", y="model_f1")

In [None]:
# Regression plot: topic-distribution entropy (y) against k_alpha (x)
_ = sns.lmplot(data=city_f1_entropy_df, x="k_alpha", y="topic_entropy")