In [72]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
from bertopic import BERTopic
from nltk.corpus import stopwords

In [73]:
# Show full quotation text in rendered tables instead of truncating it.
pd.set_option('display.max_colwidth', None)

# Configuration: everything a reader might want to tune for this cell.
QUOTEBANK_PATH = "../data/binary/us-politicians.pickle"
N_SAMPLES = int(1e4)  # number of quotes drawn for topic modelling
SEED = 42             # fixed seed so the same sample is drawn on every re-run

# Load the pickled quote table.
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
quotebank = pd.read_pickle(QUOTEBANK_PATH)

# Draw a reproducible random subset; cap at the table size so a smaller
# dataset does not make `sample` raise.
data = quotebank.sample(n=min(N_SAMPLES, len(quotebank)), random_state=SEED)
data.head()

Unnamed: 0,speaker_id,quote_id,quotation,speaker,party
152425,76,2018-09-08-003225,"And when people don't participate, then that vacuum is filled by lobbyists and special interests and we get into a downward spiral where people get more and more discouraged and they think nothing's going to make a difference.",Barack Obama,29552
902779,439729,2017-10-22-024075,I mean everyone I know is trying to come up with solutions and the best solution is to stop the addiction.,Rick Scott,29468
867683,22686,2020-03-27-060462,"This public health emergency situation has affected all of us across the nation,",Donald Trump,29468
1541083,50597,2018-11-07-101123,The left doesn't always have it right. The right doesn't always have it right.,Mary Kathryn Heitkamp,29552
549016,22686,2018-10-01-042264,"I think the discussion around this is, in general, almost entirely among those who haven't seen the film and I am really excited it's coming out, because I think the film speaks for itself.",Donald Trump,29468


In [74]:
# Stopwords and special characters removal
# Replace every non-word character with a space. `regex=True` is passed
# explicitly because pandas >= 2.0 treats the pattern as a literal string by
# default, which would leave all punctuation in place; the raw string avoids
# the invalid-escape warning for '\W'.
data['quotation'] = data['quotation'].str.replace(r'\W', ' ', regex=True)

# `stop` stays a list (as before); a set copy gives constant-time membership
# tests while filtering every token of every quote.
stop = stopwords.words('english')
stop_set = set(stop)

# Drop stop words (case-insensitive match) and re-join the remaining tokens
# with single spaces.
data['quotation'] = data['quotation'].apply(
    lambda x: ' '.join(word for word in x.split() if word.lower() not in stop_set)
)

# Plain list of cleaned quotes, used as the document collection downstream.
quotes = list(data['quotation'])
data.head()


Unnamed: 0,speaker_id,quote_id,quotation,speaker,party
152425,76,2018-09-08-003225,people participate vacuum filled lobbyists special interests get downward spiral people get discouraged think nothing going make difference,Barack Obama,29552
902779,439729,2017-10-22-024075,mean everyone know trying come solutions best solution stop addiction,Rick Scott,29468
867683,22686,2020-03-27-060462,public health emergency situation affected us across nation,Donald Trump,29468
1541083,50597,2018-11-07-101123,left always right right always right,Mary Kathryn Heitkamp,29552
549016,22686,2018-10-01-042264,think discussion around general almost entirely among seen film really excited coming think film speaks,Donald Trump,29468


In [75]:
# Build the BERTopic model and fit it on the cleaned quotes.
# The verbose log printed below reports the stages it runs: document
# embeddings, UMAP dimensionality reduction, HDBSCAN clustering.
# NOTE(review): no random seed is fixed here, so topic assignments may differ
# between runs — confirm whether reproducibility is required.
topic_model = BERTopic(
    n_gram_range=(1, 3),
    min_topic_size=30,
    verbose=True,
)
# One topic id and one probability entry per input quote (per BERTopic docs).
topics, probs = topic_model.fit_transform(quotes)

Batches: 100%|██████████| 313/313 [01:44<00:00,  3.00it/s]
2021-12-06 10:33:55,899 - BERTopic - Transformed documents to Embeddings
2021-12-06 10:34:06,862 - BERTopic - Reduced dimensionality with UMAP
2021-12-06 10:34:07,326 - BERTopic - Clustered UMAP embeddings with HDBSCAN


In [76]:
# Summary table of the fitted topics (columns Topic / Count / Name in the
# output below); preview only the first ten rows.
freq = topic_model.get_topic_info()
freq.iloc[:10]

Unnamed: 0,Topic,Count,Name
0,-1,5121,-1_think_us_say_make
1,0,978,0_president_trump_vote_republicans
2,1,290,1_think_would_say_answer
3,2,227,2_isis_syria_terrorism_iraq
4,3,224,3_wall_border_immigration_borders
5,4,196,4_children_families_schools_school
6,5,183,5_putin_collusion_russian_russians
7,6,183,6_military_nations_countries_allies
8,7,150,7_china_tariffs_deal china_chinese
9,8,138,8_investigation_fbi_cohen_attorney general


In [77]:
# Show the top terms (with their scores) of the topic found in the LAST row
# of the topic-info table.
# NOTE(review): the rows displayed from `freq` are ordered by descending Count,
# so the last row is presumably the smallest topic rather than a frequent
# one — confirm this is the intended topic to inspect.
topic_nr = freq.iloc[-1]["Topic"] # topic id taken from the last row of the topic table
topic_model.get_topic(topic_nr)

[('religion', 0.02753138092152062),
 ('church', 0.024791230074559918),
 ('religious liberty', 0.01912133520785656),
 ('people faith', 0.01912133520785656),
 ('christian', 0.012578903844595968),
 ('security talking religion', 0.01044931783541539),
 ('protestant presbyterian', 0.01044931783541539),
 ('protestant', 0.01044931783541539),
 ('tell religious liberty', 0.01044931783541539),
 ('talking religion talking', 0.01044931783541539)]