In [1]:
import pandas as pd
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.vectorizers import ClassTfidfTransformer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()
from nltk.corpus import stopwords
from bertopic.representation import KeyBERTInspired
import re

In [2]:
df = pd.read_csv('/Users/sanjanakarthick/Downloads/df_with_topics.csv')

In [3]:


def remove_boilerplate(text):
    if not isinstance(text, str):
        return ""

    patterns = [
        r"mr\.? speaker",
        r"madam speaker",
        r"yield(ing)? (back|to|for)",
        r"i yield( the balance of my time)?",
        r"motion to (reconsider|suspend)",
        r"suspend the rules",
        r"unanimous consent",
        r"pursuant to",
        r"the clerk will report",
        r"i move to",
        r"without objection",
        r"so ordered",
        r"the gentleman from [A-Za-z]+",
        r"the gentlewoman from [A-Za-z]+",
        r"seeks recognition",
        r"for what purpose does",
        r"under the speaker['’]s announced policy",
        r"request(ing)? (permission|consent|to print)",
        r"additional material",
        r"amendment(?:s)? (was|were) agreed",
        r"amendment(?:s)? (is|are) adopted",
        r"printed in the record",
        r"record(?! of homeless| of housing)",  # keep real records
    ]

    for p in patterns:
        text = re.sub(p, "", text, flags=re.IGNORECASE)

    return text.strip()

df["clean_policy_text"] = df["clean_text"].apply(remove_boilerplate)


In [4]:
df["clean_policy_text"].head(20)

0     pass b housing exclude certain disability bene...
1     pass authorize housing agencies share certain ...
2     pass ensure children homeless veterans include...
3     consume support homeless veterans assistance o...
4     pass direct labor prioritize provision service...
5     pass runaway homeless youth authorize appropri...
6     spring proud introduce bipartisan runaway home...
7     pass homeless assistance enable indian tribes ...
8     pass homeless assistance reauthorize purposes ...
9     agree res recognizing anniversary homeless ass...
10    encourage join coalition supporting fairer sim...
11    runaway homeless youth authorize appropriation...
12    strong support commend jersey effective leader...
13    salute jersey well done excellent work bringin...
14    pass direct veterans affairs ensure extent pos...
15    close business yesterday wednesday july debt s...
16    support homeless veterans veterans health care...
17    intern chloe becker shadowing accorded ful

In [5]:
mask = (
    ~df["clean_text"].str.contains("extraneous material", case=False, na=False) &
    ~df["clean_text"].str.contains("tabular material", case=False, na=False) &
    ~df["clean_text"].str.contains("printed in the record", case=False, na=False) &
    ~df["clean_text"].str.contains("material follows", case=False, na=False) &
    ~df["clean_text"].str.contains("further proceedings", case=False, na=False) &
    ~df["clean_text"].str.contains("amendment numbered", case=False, na=False) &
    ~df["clean_text"].str.contains("amendment to be printed", case=False, na=False) &
    ~df["clean_text"].str.contains("clerk read", case=False, na=False) &
    ~df["clean_text"].str.contains("bill to", case=False, na=False) &
    ~df["clean_text"].str.contains("motion to", case=False, na=False)
)

df = df[mask].copy()

In [6]:
STOP = set(stopwords.words("english"))

procedural_stopwords = [
    "yield", "gentleman", "gentlewoman", "speaker", "chair", "chairman",
    "chairwoman", "presiding", "colleague", "colleagues",
    "recognize", "recognized", "recognition", "remarks",
    "motion", "proceed", "order", "ordered", "reconsider", "suspend",
    "pursuant", "referred",
    "unanimous", "consent", "rollcall", "yeas", "nays",
    "subsection", "section", "paragraph", "subparagraph", "clause",
    "title", "chapter", "subtitle",
    "print", "printed", "printing", "material", "tabular",
    "extraneous", "matter",
    "record", "volume", "vol", "congressional", "app", "follows",
    "insert", "strike",
    "thank", "appreciate", "congratulate",
    "rise", "rise today", "rise in", "before", "gentlelady",
    "tempore", "pro", "pro tempore", "amendment", "amendments",
    "adopt", "adopted", "agree", "agreed"
]

STOP = STOP.union(procedural_stopwords)


In [23]:
def subcluster_topic(df, topic_number, min_cluster_size=20):
    
    subset = df[df["topic"] == topic_number].copy()
    docs = subset["clean_policy_text"].tolist()

    embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

    umap_model = UMAP(
        n_neighbors=30,
        n_components=5,
        min_dist=0.1,
        metric="cosine"
    )

    hdbscan_model = HDBSCAN(
        min_cluster_size=min_cluster_size,
        min_samples=5,
        cluster_selection_epsilon=0.01
    )

    vectorizer_model = CountVectorizer(
        stop_words=list(STOP),
        ngram_range=(1, 2),
        min_df=0.03,
        max_df=1.0
    )

    ctfidf_model = ClassTfidfTransformer()

    sub_model = BERTopic(
        embedding_model=embedding_model,
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
        vectorizer_model=vectorizer_model,
        ctfidf_model=ctfidf_model,
        representation_model=KeyBERTInspired(),
        verbose=True
    )

    topics, probs = sub_model.fit_transform(docs)
    subset["subtopic"] = topics
    
    return subset, sub_model

In [24]:
target_topics = [0, 2, 3]

subtopic_results = []
models = {}

for t in target_topics:
    subset, model = subcluster_topic(df, topic_number=t)
    subtopic_results.append(subset)
    models[t] = model

2025-11-23 13:56:13,917 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2025-11-23 13:56:14,308 - BERTopic - Embedding - Completed ✓
2025-11-23 13:56:14,308 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-11-23 13:56:14,370 - BERTopic - Dimensionality - Completed ✓
2025-11-23 13:56:14,371 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-11-23 13:56:14,372 - BERTopic - Cluster - Completed ✓
2025-11-23 13:56:14,373 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-11-23 13:56:14,472 - BERTopic - Representation - Completed ✓
2025-11-23 13:56:16,546 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2025-11-23 13:56:16,584 - BERTopic - Embedding - Completed ✓
2025-11-23 13:56:16,585 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-11-23 13:56:16,605 - BERTopic - Dimensionality - Completed ✓
2025-11-23 13:56:16,606 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-11-23 13:56:16,607 - BERTopic - Cluster - Completed ✓
2025-11-23 13:56:16,608 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-11-23 13:56:16,666 - BERTopic - Representation - Completed ✓
2025-11-23 13:56:18,732 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/2 [00:00<?, ?it/s]

2025-11-23 13:56:19,128 - BERTopic - Embedding - Completed ✓
2025-11-23 13:56:19,128 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-11-23 13:56:19,189 - BERTopic - Dimensionality - Completed ✓
2025-11-23 13:56:19,189 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-11-23 13:56:19,194 - BERTopic - Cluster - Completed ✓
2025-11-23 13:56:19,197 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-11-23 13:56:19,766 - BERTopic - Representation - Completed ✓


In [25]:
df_subtopics = pd.concat(subtopic_results).reset_index(drop=True)

In [26]:
df_subtopics["topic_name"] = df_subtopics["Name"]

subtopic_labels = {}
for t in models.keys():
    info = models[t].get_topic_info()
    subtopic_labels[t] = dict(zip(info["Topic"], info["Name"]))

def get_subtopic_name(row):
    return subtopic_labels.get(row["topic"], {}).get(row["subtopic"], None)

df_subtopics["subtopic_name"] = df_subtopics.apply(get_subtopic_name, axis=1)


In [27]:
result = df_subtopics[[
    "topic", "topic_name",
    "subtopic", "subtopic_name",
    "Speaker", "Party", "Title", "clean_policy_text"
]]
result.head()


Unnamed: 0,topic,topic_name,subtopic,subtopic_name,Speaker,Party,Title,clean_policy_text
0,0,0_defense_military_air_army,0,0_amended_provisions_consideration waived_pass...,Ford,Democrat,NATIONAL DEFENSE AUTHORIZATION ACT FOR FISCAL ...,assisting distinguished armed services request...
1,0,0_defense_military_air_army,1,1_consideration report_amended_provisions_comp...,Christensen,Democrat,"HOUSING, EMPLOYMENT, AND LIVING PROGRAMS FOR V...",concur amendments authorize interior lease cer...
2,0,0_defense_military_air_army,-1,-1_amended_consideration res_provisions_postpo...,Lewis,,DEPARTMENTS OF VETERANS AFFAIRS AND HOUSING AN...,consideration provisions whole postpone consid...
3,0,0_defense_military_air_army,-1,-1_amended_consideration res_provisions_postpo...,Guthrie,Republican,PANDEMIC IS OVER ACT; Congressional Record Vol...,call terminate health emergency declared respe...
4,0,0_defense_military_air_army,-1,-1_amended_consideration res_provisions_postpo...,Wicker,Republican,"TRANSPORTATION, HOUSING AND URBAN DEVELOPMENT,...",reserving right object point clarification ass...


In [28]:
topic_and_subtopic_title_summary = (
   result.groupby(["topic_name", "subtopic_name"])["Title"]
    .agg(["count"])
    .reset_index()
   .set_index(["topic_name", "subtopic_name"])
)

topic_and_subtopic_title_summary

Unnamed: 0_level_0,Unnamed: 1_level_0,count
topic_name,subtopic_name,Unnamed: 2_level_1
0_defense_military_air_army,-1_amended_consideration res_provisions_postpone consideration,53
0_defense_military_air_army,0_amended_provisions_consideration waived_passage intervening,33
0_defense_military_air_army,1_consideration report_amended_provisions_comply xxi,24
2_veterans_housing_health_affairs,-1_request indiana_request iowa_request pennsylvania_carolina request,10
3_child_violence_trafficking_victims,-1_housing assistance_rental assistance_homelessness_homeless,43
