In [24]:
import pandas as pd
from bertopic import BERTopic
from umap import UMAP
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.vectorizers import ClassTfidfTransformer
import seaborn as sns
import matplotlib.pyplot as plt

In [25]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()

In [26]:
#read csv file
df = pd.read_csv('/Users/sanjanakarthick/Downloads/df_with_topics.csv')


In [27]:
selected_topics = [0, 2, 3]
df_subset = df[df["topic"].isin(selected_topics)].copy()
df_subset.head()

Unnamed: 0,Chamber,Year,Speaker,Party,State,Title,speech_text,turn_number,speech_length_words,clean_text,topic,Name
0,House,2025,Hill,Republican,AR,HOUSING UNHOUSED DISABLED VETERANS ACT; Congre...,"Mr. Speaker, I move to suspend the rules and p...",1,422,pass b housing exclude certain disability bene...,3,3_child_violence_trafficking_victims
1,House,2020,San Nicolas,Democrat,GU,HOMELESS ASSISTANCE ACT OF 2019; Congressional...,"Madam Speaker, I move to suspend the rules and...",1,402,pass authorize housing agencies share certain ...,3,3_child_violence_trafficking_victims
2,House,2019,Takano,Democrat,CA,HOMELESS VETERAN FAMILIES ACT; Congressional R...,"Mr. Speaker, I move to suspend the rules and p...",1,234,pass ensure children homeless veterans include...,3,3_child_violence_trafficking_victims
4,House,2021,Takano,Democrat,CA,HOMELESS VETERANS WITH CHILDREN REINTEGRATION ...,"Mr. Speaker, I move to suspend the rules and p...",1,313,pass direct labor prioritize provision service...,3,3_child_violence_trafficking_victims
7,House,2019,Green,,,TRIBAL ACCESS TO HOMELESS ASSISTANCE ACT; Cong...,"Mr. Speaker, I move to suspend the rules and p...",1,450,pass homeless assistance enable indian tribes ...,3,3_child_violence_trafficking_victims


In [28]:
def get_sentiment(text):
    scores = analyzer.polarity_scores(str(text))
    return scores["compound"]

df_subset["sentiment_score"] = df_subset["speech_text"].apply(get_sentiment)
df_subset.head()

Unnamed: 0,Chamber,Year,Speaker,Party,State,Title,speech_text,turn_number,speech_length_words,clean_text,topic,Name,sentiment_score
0,House,2025,Hill,Republican,AR,HOUSING UNHOUSED DISABLED VETERANS ACT; Congre...,"Mr. Speaker, I move to suspend the rules and p...",1,422,pass b housing exclude certain disability bene...,3,3_child_violence_trafficking_victims,0.9918
1,House,2020,San Nicolas,Democrat,GU,HOMELESS ASSISTANCE ACT OF 2019; Congressional...,"Madam Speaker, I move to suspend the rules and...",1,402,pass authorize housing agencies share certain ...,3,3_child_violence_trafficking_victims,0.9791
2,House,2019,Takano,Democrat,CA,HOMELESS VETERAN FAMILIES ACT; Congressional R...,"Mr. Speaker, I move to suspend the rules and p...",1,234,pass ensure children homeless veterans include...,3,3_child_violence_trafficking_victims,0.9863
4,House,2021,Takano,Democrat,CA,HOMELESS VETERANS WITH CHILDREN REINTEGRATION ...,"Mr. Speaker, I move to suspend the rules and p...",1,313,pass direct labor prioritize provision service...,3,3_child_violence_trafficking_victims,0.9201
7,House,2019,Green,,,TRIBAL ACCESS TO HOMELESS ASSISTANCE ACT; Cong...,"Mr. Speaker, I move to suspend the rules and p...",1,450,pass homeless assistance enable indian tribes ...,3,3_child_violence_trafficking_victims,0.9597


In [29]:
topic_sentiment_summary = (
    df_subset.groupby("topic")["sentiment_score"]
    .agg(["mean", "median", "count"])
    .reset_index()
)

topic_sentiment_summary

Unnamed: 0,topic,mean,median,count
0,0,0.335707,0.44275,110
1,2,0.151167,0.1027,52
2,3,0.858205,0.9848,43


In [30]:
#most positive speeches
df_subset.sort_values("sentiment_score", ascending=False).head(10)[
    ["topic", "Speaker", "speech_text"]
]   

Unnamed: 0,topic,Speaker,speech_text
1196,3,Leach,"Mr. Speaker, I move to suspend the rules and p..."
266,3,Kennedy,"Mr. Chairman, I offer an amendment in the natu..."
8,3,Moore,"Mr. Speaker, I move to suspend the rules and p..."
45,3,Waters,"Mr. Speaker, pursuant to House Resolution 1017..."
953,3,Leach,"Mr. Speaker, I move to suspend the rules and p..."
50,3,Frank,"Mr. Speaker, I move to suspend the rules and p..."
1059,3,Sherwood,"Mr. Speaker, I move to suspend the rules and p..."
260,3,Pearce,"Mr. Speaker, I move to suspend the rules and p..."
241,3,Pearce,"Mr. Speaker, I move to suspend the rules and p..."
1159,0,Hyde,"Mr. Speaker, I move to suspend the rules and c..."


In [31]:
#most negative speeches
df_subset.sort_values("sentiment_score").head(10)[
    ["topic", "Speaker", "speech_text"]
]

Unnamed: 0,topic,Speaker,speech_text
154,3,Baker,"Madam Speaker, I move to suspend the rules and..."
449,0,Smith,"I suppose that that is in order, Mr. Speaker, ..."
1185,0,Pryce,"Mr. Speaker, by direction of the Committee on ..."
631,0,Smith,"Madam Speaker, I raise a point of order agains..."
916,0,Quillen,"Mr. Speaker, by direction of the Committee on ..."
1177,0,Regula,"Mr. Speaker, in the interests of expediting th..."
1394,0,Flake,"Mr. Speaker, I raise a point of order against ..."
583,0,Obey,"Mr. Speaker, I offer a motion to instruct conf..."
846,0,Lowey,"Mr. Speaker, I rise to offer the motion to ins..."
883,0,Kolbe,"Mr. Speaker, I ask unanimous consent that duri..."


In [32]:
topic_sentiment_summary.rename(columns={
    "mean": "avg_sentiment",
    "count": "num_speeches"
})

topic_sentiment_summary["topic"] = topic_sentiment_summary.index
topic_sentiment_summary

Unnamed: 0,topic,mean,median,count
0,0,0.335707,0.44275,110
1,1,0.151167,0.1027,52
2,2,0.858205,0.9848,43


### SENTIMENT

In [33]:
STOP = [ "agency",
"the","a","an","and","or","if","of","for","in","on","at","by","to","from","with", "people", "state", "want", "going",
"is","are","was","were","be","been","being","as","it","its","this","that","these","those",
"i","me","my","we","our","you","your","he","she","they","them","their","his","her", 'objection', 'rollcall',
"not","but","do","does","did","so","because","can","could","should","would","will","may","might",
"also","just","very","much","more","such","than","then","there","here","when","where","who","what","which","how",
"all","any","each","every","some","no","nor","only","own","same","too","into","up","down","about","over","under",
"thank","thankyou","yield","gentleman","gentlewoman","chair","chairman","chairwoman","speaker","presiding",
"recognize","recognized","recognition","remarks","floor","colleague","colleagues","proceed","order","ordered",
"committee","subcommittee","motion","unanimous","consent","debate","amendment","amend","section","title",
"chapter","paragraph","subparagraph","clause","statutory","herein","whereas","therefore","resolved","pursuant",
"enact","enacted","mr","mrs","ms","madam","sir","act","bill","resolution","senate","house","representative",
"representatives","congress","member","members","session","hearing","witness","testimony","law","legislation",
"legislative","vote","enactment","proceedings","statute","appropriation","federal","agency","program",
"department","secretary","policy","administration","executive","authority","director","office","ranking",
"majority","minority","sec","subsec","subtitle","appendix","figure","table","page","pages","line","lines",
"insert","strike","sentence","word","words","heading","part","division","codified","usc","code","public","law",
"number","numbers","date","dates","document","doc","clerk","shall","must","make","made","makes","take","taken",
"took","get","got","getting","give","gave","let","say","said","says","see","seen","look","looked","know","known",
"think","thought","myself","ours","ourselves","your","yours","yourself","yourselves","him","his","himself",
"hers","herself","theirs","themselves","itself","one","ones","someone","anyone","everyone","noone","many",
"much","none","both","either","neither","while","though","although","however","therefore","through","across",
"around","between","within","without","such","as","now","today","time","times","year","years","day","days",
"long","short","new","old","first","second","third","next","last","us","including","need","percent","miss", 'suspend',
"dr","chairperson","president","gentlelady","gentlemen","lady","panel","rules","rule","laws","policies", 'pre html', 'html'
"reading","read","united","states","america","american","washington","dc","bureau","tomorrow","week","body","week", 
"gentlewoman","body","floor","colleague","colleagues", 'move', 'congressional', 'record', 'vol', "appropriation", "senator", 
'fl', 'johnson', 'ga', 'tx', 'recorded', 'smith', 'ny', 'rise', 'nation', 'honor', 'great', 'revise extend',' ask revise', 'revise', 
'extend', 'ask', 'purposes ask', 'ask immediate','rollcall', 'voted', 'present', 'present voted', 'ask', 'objection', 'reconsider', 
'laid', 'tempore_pro' 'tempore_pro_extraneous','yeas', 'nays', 'tabular', 'material', 'extraneous', 'matter', 'matter extraneous', 'tempore',
'pro', 'pro_extraneous', 'hereby', 'incorporated', 'incorporated herein', 'record', 'vol', 'volume', 'volume ', 'congressional record', 
"include consideration", 'include', 'consideration', 'pass']

In [34]:
def subcluster_topic(df, topic_number, min_cluster_size=8, nr_topics=None):
    subset = df[df["topic"] == topic_number].copy()
    docs = subset["clean_text"].tolist()

    embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

    umap_model = UMAP(
        n_neighbors=15,
        n_components=5,
        min_dist=0.05,
        metric="cosine",
        random_state=42,
    )
    
    hdbscan_model = HDBSCAN(
        min_cluster_size=min_cluster_size,
        min_samples=1,
        cluster_selection_epsilon=0.05
    )

    vectorizer_model = CountVectorizer(
        stop_words=list(STOP),
        ngram_range=(1, 2),
        min_df=2
    )

    ctfidf_model = ClassTfidfTransformer()

    sub_model = BERTopic(
        embedding_model=embedding_model,
        umap_model=UMAP(n_neighbors=30, n_components=5, min_dist=0.1),
        hdbscan_model=HDBSCAN(min_cluster_size=20, min_samples=5),
        vectorizer_model=CountVectorizer(stop_words=STOP, ngram_range=(1, 2), min_df=10),
        ctfidf_model=ClassTfidfTransformer(),
        verbose=True
    )


    subtopics, _ = sub_model.fit_transform(docs)
    subset["subtopic"] = subtopics
    
    return subset, sub_model


In [35]:
target_topics = [0, 2, 3]

subtopic_results = []
models = {}

for t in target_topics:
    subset, model = subcluster_topic(df, topic_number=t, min_cluster_size=8)
    models[t] = model
    subtopic_results.append(subset)


2025-11-23 13:27:45,086 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2025-11-23 13:27:45,332 - BERTopic - Embedding - Completed ✓
2025-11-23 13:27:45,333 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-11-23 13:27:46,016 - BERTopic - Dimensionality - Completed ✓
2025-11-23 13:27:46,017 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-11-23 13:27:46,020 - BERTopic - Cluster - Completed ✓
2025-11-23 13:27:46,022 - BERTopic - Representation - Fine-tuning topics using representation models.


ValueError: max_df corresponds to < documents than min_df

In [None]:
df_subtopics = pd.concat(subtopic_results).reset_index(drop=True)
df_subtopics.head()

Unnamed: 0,Chamber,Year,Speaker,Party,State,Title,speech_text,turn_number,speech_length_words,clean_text,topic,Name,subtopic
0,Senate,1995,Ford,Democrat,KY,NATIONAL DEFENSE AUTHORIZATION ACT FOR FISCAL ...,I will be assisting the distinguished chairman...,2,69,assisting distinguished armed services request...,0,0_defense_military_air_army,-1
1,House,2010,Christensen,Democrat,VI,"HOUSING, EMPLOYMENT, AND LIVING PROGRAMS FOR V...","Mr. Speaker, I move to suspend the rules and c...",1,213,concur amendments authorize interior lease cer...,0,0_defense_military_air_army,4
2,House,1995,Lewis,,,DEPARTMENTS OF VETERANS AFFAIRS AND HOUSING AN...,"Mr. Speaker, I ask unanimous consent that duri...",1,161,consideration provisions whole postpone consid...,0,0_defense_military_air_army,5
3,House,2023,Guthrie,Republican,KY,PANDEMIC IS OVER ACT; Congressional Record Vol...,"Mr. Speaker, pursuant to House Resolution 75, ...",1,231,call terminate health emergency declared respe...,0,0_defense_military_air_army,-1
4,Senate,2015,Wicker,Republican,MS,"TRANSPORTATION, HOUSING AND URBAN DEVELOPMENT,...","Reserving the right to object, just for point ...",7,51,reserving right object point clarification ass...,0,0_defense_military_air_army,-1


In [None]:
subtopic_labels = {}

for t in models.keys():
    info = models[t].get_topic_info()
    subtopic_labels[t] = dict(zip(info["Topic"], info["Name"]))

In [None]:
def get_subtopic_name(row):
    topic = row["topic"]
    sub = row["subtopic"]
    return subtopic_labels.get(topic, {}).get(sub, None)

df_subtopics["subtopic_name"] = df_subtopics.apply(get_subtopic_name, axis=1)
df_subtopics.head()

Unnamed: 0,Chamber,Year,Speaker,Party,State,Title,speech_text,turn_number,speech_length_words,clean_text,topic,Name,subtopic,subtopic_name
0,Senate,1995,Ford,Democrat,KY,NATIONAL DEFENSE AUTHORIZATION ACT FOR FISCAL ...,I will be assisting the distinguished chairman...,2,69,assisting distinguished armed services request...,0,0_defense_military_air_army,-1,-1_property_civil_court_government
1,House,2010,Christensen,Democrat,VI,"HOUSING, EMPLOYMENT, AND LIVING PROGRAMS FOR V...","Mr. Speaker, I move to suspend the rules and c...",1,213,concur amendments authorize interior lease cer...,0,0_defense_military_air_army,4,4_national_purposes_amendments_commerce
2,House,1995,Lewis,,,DEPARTMENTS OF VETERANS AFFAIRS AND HOUSING AN...,"Mr. Speaker, I ask unanimous consent that duri...",1,161,consideration provisions whole postpone consid...,0,0_defense_military_air_army,5,5_minutes_numbered_printed_debatable
3,House,2023,Guthrie,Republican,KY,PANDEMIC IS OVER ACT; Congressional Record Vol...,"Mr. Speaker, pursuant to House Resolution 75, ...",1,231,call terminate health emergency declared respe...,0,0_defense_military_air_army,-1,-1_property_civil_court_government
4,Senate,2015,Wicker,Republican,MS,"TRANSPORTATION, HOUSING AND URBAN DEVELOPMENT,...","Reserving the right to object, just for point ...",7,51,reserving right object point clarification ass...,0,0_defense_military_air_army,-1,-1_property_civil_court_government


In [None]:
result = df_subtopics[[
    "topic",
    "subtopic",
    "Name",
    "subtopic_name",
    "Speaker", 
    "Party",
    "Title",
    "clean_text"
]]

result = result.rename(columns={"Name": "topic_name"})

result

Unnamed: 0,topic,subtopic,topic_name,subtopic_name,Speaker,Party,Title,clean_text
0,0,-1,0_defense_military_air_army,-1_property_civil_court_government,Ford,Democrat,NATIONAL DEFENSE AUTHORIZATION ACT FOR FISCAL ...,assisting distinguished armed services request...
1,0,4,0_defense_military_air_army,4_national_purposes_amendments_commerce,Christensen,Democrat,"HOUSING, EMPLOYMENT, AND LIVING PROGRAMS FOR V...",concur amendments authorize interior lease cer...
2,0,5,0_defense_military_air_army,5_minutes_numbered_printed_debatable,Lewis,,DEPARTMENTS OF VETERANS AFFAIRS AND HOUSING AN...,consideration provisions whole postpone consid...
3,0,-1,0_defense_military_air_army,-1_property_civil_court_government,Guthrie,Republican,PANDEMIC IS OVER ACT; Congressional Record Vol...,call terminate health emergency declared respe...
4,0,-1,0_defense_military_air_army,-1_property_civil_court_government,Wicker,Republican,"TRANSPORTATION, HOUSING AND URBAN DEVELOPMENT,...",reserving right object point clarification ass...
...,...,...,...,...,...,...,...,...
200,3,0,3_child_violence_trafficking_victims,0_housing_assistance_subsection_inserting,Cleaver,Democrat,INDIAN VETERANS HOUSING OPPORTUNITY ACT OF 200...,pass exclude consideration income native housi...
201,3,0,3_child_violence_trafficking_victims,0_housing_assistance_subsection_inserting,Leach,Republican,AMERICAN HOMEOWNERSHIP AND ECONOMIC OPPORTUNIT...,pass expand homeownership purposes follows ass...
202,3,1,3_child_violence_trafficking_victims,1_tribe_housing_indian_native,Case,Democrat,LEECH LAKE BAND OF OJIBWE RESERVATION RESTORAT...,pass provide transfer certain land minnesota b...
203,3,1,3_child_violence_trafficking_victims,1_tribe_housing_indian_native,Sherwood,Republican,OMNIBUS INDIAN ADVANCEMENT ACT; Congressional ...,pass authorize construction wakpa sica reconci...


In [None]:
#sort by topic and subtopic, and count unique titles
#set topic and subtopic names as index

topic_and_subtopic_title_summary = (
   result.groupby(["topic_name", "subtopic_name"])["Title"]
    .agg(["count"])
    .reset_index()
   .set_index(["topic_name", "subtopic_name"])
)

topic_and_subtopic_title_summary

Unnamed: 0_level_0,Unnamed: 1_level_0,count
topic_name,subtopic_name,Unnamed: 2_level_1
0_defense_military_air_army,-1_property_civil_court_government,38
0_defense_military_air_army,0_return_respect_tax_income,13
0_defense_military_air_army,1_amendments_report_considered_appropriations,12
0_defense_military_air_army,2_education_nature substitute_substitute_nature,12
0_defense_military_air_army,3_ways means_ways_means_considered,10
0_defense_military_air_army,4_national_purposes_amendments_commerce,9
0_defense_military_air_army,5_minutes_numbered_printed_debatable,8
0_defense_military_air_army,6_considered_report_waived_points,8
2_veterans_housing_health_affairs,0_request_passed_pennsylvania_request pennsylvania,32
2_veterans_housing_health_affairs,1_request_request york_york_florida,20
