## Extension: BERTopic 

This jupyter file follows as we attempt to do a comparative analysis between STM and a modern framework, namely BERTopic. 

In [237]:
import pandas as pd
from sklearn.cluster import KMeans
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from sentence_transformers import SentenceTransformer
from umap import UMAP
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [238]:
DATA_DIR = "/Users/giomhern/04 Projects/topic-models/data"
df = pd.read_csv(f"{DATA_DIR}/gadarian_bertopic_input.csv")
texts = df["open.ended.response"].astype(str).tolist()

In [239]:
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric="cosine", random_state=42)
vectorizer_model = CountVectorizer(stop_words="english")
kmeans_model = KMeans(n_clusters=3, random_state=42)
topic_model = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    vectorizer_model=vectorizer_model,
    hdbscan_model = kmeans_model, 
    calculate_probabilities=False,
    verbose=True
)

In [240]:
topics, _ = topic_model.fit_transform(texts)

# Show top 15 words for each topic
for topic_idx in topic_model.get_topics().keys():
    print(f"\n--- Topic {topic_idx} ---")
    for word, weight in topic_model.get_topic(topic_idx)[:15]:
        print(f"{word:<15} {weight:.5f}")

2025-05-29 19:28:56,609 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 11/11 [00:00<00:00, 40.16it/s]
2025-05-29 19:28:56,889 - BERTopic - Embedding - Completed ✓
2025-05-29 19:28:56,890 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-05-29 19:28:57,185 - BERTopic - Dimensionality - Completed ✓
2025-05-29 19:28:57,185 - BERTopic - Cluster - Start clustering the reduced embeddings

divide by zero encountered in matmul


overflow encountered in matmul


invalid value encountered in matmul

2025-05-29 19:28:57,195 - BERTopic - Cluster - Completed ✓
2025-05-29 19:28:57,198 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-05-29 19:28:57,208 - BERTopic - Representation - Completed ✓



--- Topic 0 ---
jobs            0.10629
people          0.06557
illegal         0.06493
americans       0.06314
welfare         0.06115
care            0.05780
taxes           0.04862
security        0.04679
social          0.04641
immigrants      0.04515

--- Topic 1 ---
illegal         0.10536
people          0.09744
immigrants      0.08177
country         0.07615
mexico          0.06034
border          0.05925
coming          0.05522
legal           0.04923
entering        0.04482
english         0.03863

--- Topic 2 ---
immigration     0.09282
think           0.09010
immigrants      0.07798
people          0.06006
country         0.05926
need            0.05137
legally         0.04147
illegal         0.04138
worry           0.04075
come            0.03783


In [250]:
with open("bertopic_top_words.txt", "w") as f:
    for topic_idx in topic_model.get_topics().keys():
        if topic_idx == -1:
            continue  # Skip outlier topic if present
        f.write(f"--- Topic {topic_idx} ---\n")
        top_words = topic_model.get_topic(topic_idx)[:15]
        for word, weight in top_words:
            f.write(f"{word:<15} {weight:.5f}\n")
        f.write("\n")

In [251]:
df["topic"] = topics

topic_labels = {
    0: "Economic Costs",
    1: "Border Control",
    2: "Moral Reasoning"
}
df["topic_label"] = df["topic"].map(topic_labels)

In [252]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns

# Setup: classify Topic 1
df["is_topic_1"] = (df["topic"] == 0).astype(int)  # Adjust if needed

# Center pid_rep
df["pid_centered"] = df["pid_rep"] - df["pid_rep"].mean()
df["interaction"] = df["pid_centered"] * df["treatment"]

# Fit logistic model with interaction
X = sm.add_constant(df[["treatment", "pid_centered", "interaction"]])
y = df["is_topic_1"]
model = sm.Logit(y, X).fit(disp=0)

# Prediction grid
pid_vals = np.linspace(df["pid_rep"].min(), df["pid_rep"].max(), 100)
grid = []
for t in [0, 1]:
    for pid in pid_vals:
        centered = pid - df["pid_rep"].mean()
        grid.append({
            "const": 1,
            "treatment": t,
            "pid_centered": centered,
            "interaction": centered * t,
            "pid_rep": pid,
            "label": "Treated" if t == 1 else "Control"
        })

pred_df = pd.DataFrame(grid)
pred_X = pred_df[["const", "treatment", "pid_centered", "interaction"]]

# Predict with confidence intervals
pred = model.get_prediction(pred_X).summary_frame(alpha=0.05)
pred_df["predicted"] = pred["predicted"]
pred_df["lower"] = pred["ci_lower"]
pred_df["upper"] = pred["ci_upper"]

In [254]:

plt.figure(figsize=(6, 5))

for label, color in zip(["Control", "Treated"], ["blue", "red"]):
    subset = pred_df[pred_df["label"] == label]
    plt.plot(subset["pid_rep"], subset["predicted"], color=color, label=label)
    plt.fill_between(subset["pid_rep"], subset["lower"], subset["upper"], color=color, alpha=0.2)

# Custom x-axis ticks
plt.xticks(
    [df["pid_rep"].min(), df["pid_rep"].mean(), df["pid_rep"].max()],
    labels=["Strong Democrat", "Moderate", "Strong Republican"]
)

# Remove x-axis label and title
plt.xlabel("")
plt.title("")

# Y-axis label and limits
plt.ylabel("Predicted Probability")
plt.ylim(0, 1)

# Black plot frame
for spine in plt.gca().spines.values():
    spine.set_edgecolor("black")

# Add legend
plt.legend()

# Save to file
plt.tight_layout()
plt.savefig("/Users/giomhern/04 Projects/topic-models/output/gadarian/bert_border_control_minimal.pdf", bbox_inches="tight", dpi=300)
plt.close()

## ANES Data 

In [255]:
df = pd.read_csv(f"{DATA_DIR}/final_anes_metadata.csv")
df["text"] = df[["mii_1", "mii_2"]].fillna("").agg(" ".join, axis=1).str.strip()
df = df[
    (df["pid_summary"] > 0) &
    (df["highest grade completed"] > 0) &
    (df["age"] > 0) &
    (df["text"] != "") &
    df["female"].notna()
].copy()

# Extract text and covariates
texts = df["text"].tolist()
covariates = df[["pid_summary", "age", "highest grade completed", "female"]]

In [256]:
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from umap import UMAP
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans

# Setup components
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric="cosine", random_state=42)
vectorizer_model = CountVectorizer(stop_words="english")
kmeans_model = KMeans(n_clusters=10, random_state=42)

# Fit BERTopic model
topic_model = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    vectorizer_model=vectorizer_model,
    hdbscan_model=kmeans_model,
    calculate_probabilities=False,
    verbose=True
)

topics = topic_model.fit_transform(texts)[0]
df["topic"] = topics

2025-05-29 20:00:31,494 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 47/47 [00:00<00:00, 79.44it/s]
2025-05-29 20:00:32,093 - BERTopic - Embedding - Completed ✓
2025-05-29 20:00:32,093 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-05-29 20:00:34,097 - BERTopic - Dimensionality - Completed ✓
2025-05-29 20:00:34,098 - BERTopic - Cluster - Start clustering the reduced embeddings

divide by zero encountered in matmul


overflow encountered in matmul


invalid value encountered in matmul

2025-05-29 20:00:34,103 - BERTopic - Cluster - Completed ✓
2025-05-29 20:00:34,104 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-05-29 20:00:34,114 - BERTopic - Representation - Completed ✓


In [257]:
with open("top_bertopic_words.txt", "w") as f:
    for topic_num in topic_model.get_topic_info().head(10)["Topic"]:
        if topic_num == -1:
            continue  # skip outliers
        f.write(f"\n--- Topic {topic_num} ---\n")
        for word, weight in topic_model.get_topic(topic_num)[:15]:
            f.write(f"{word:<15} {weight:.5f}\n")

In [259]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns

# Select topic
target_topic = 5  # adjust as needed
df["is_topic"] = (df["topic"] == target_topic).astype(int)

# Party dummy: 0 = Democrat (pid < 4), 1 = Republican (pid >= 4)
df["is_republican"] = (df["pid_summary"] >= 4).astype(int)

# Center education
df["edu_centered"] = df["highest grade completed"] - df["highest grade completed"].mean()

# Interaction term
df["interaction"] = df["edu_centered"] * df["is_republican"]

In [260]:
X = sm.add_constant(df[["edu_centered", "is_republican", "interaction"]])
y = df["is_topic"]

model = sm.Logit(y, X).fit(disp=0)

In [261]:
# Prediction grid
edu_vals = np.linspace(13, 17, 100)
grid = []

for party in [0, 1]:  # 0 = Democrat, 1 = Republican
    for edu in edu_vals:
        edu_c = edu - df["highest grade completed"].mean()
        grid.append({
            "const": 1,
            "edu_centered": edu_c,
            "is_republican": party,
            "interaction": edu_c * party,
            "education": edu,
            "label": "Republican" if party else "Democrat"
        })

pred_df = pd.DataFrame(grid)
pred_X = pred_df[["const", "edu_centered", "is_republican", "interaction"]]

# Predict with CI
pred = model.get_prediction(pred_X).summary_frame(alpha=0.05)
pred_df["predicted"] = pred["predicted"]
pred_df["lower"] = pred["ci_lower"]
pred_df["upper"] = pred["ci_upper"]

In [265]:
import matplotlib.pyplot as plt

plt.figure(figsize=(6, 5))

# Plot lines and confidence intervals
for label, color in zip(["Democrat", "Republican"], ["blue", "red"]):
    subset = pred_df[pred_df["label"] == label]
    plt.plot(subset["education"], subset["predicted"], color=color, label=label)
    plt.fill_between(subset["education"], subset["lower"], subset["upper"], color=color, alpha=0.2)

# Customize axes
plt.xlabel("")  # Remove x-axis label
plt.ylabel("Predicted Probability")
plt.title("")   # Remove title
plt.ylim(0, 0.4)

# Add x-axis ticks for clarity if needed
plt.xticks(
    [13, 14, 15, 16, 17],
    labels=["13", "14", "15", "16", "17"]
)

# Add legend
plt.legend()

# Add black frame
for spine in plt.gca().spines.values():
    spine.set_edgecolor("black")

# Save if desired
plt.tight_layout()
plt.savefig("/Users/giomhern/04 Projects/topic-models/output/anes/topic_edu_party.pdf", dpi=300, bbox_inches="tight")
plt.close()