In [1]:
from bertopic import BERTopic
import pandas as pd
import numpy as np
import bz2
import json
from sklearn.feature_extraction.text import CountVectorizer
from sentence_transformers import SentenceTransformer
from hdbscan import HDBSCAN

In [2]:
DATASET_PATH = './quotes_nyt.json'
PATH_MODEL = './bertopic_model'

In [3]:
# Load dataset
df = pd.read_json(DATASET_PATH, orient='index')
df.shape

(858367, 9)

In [4]:
quotes = df.quotation.to_list()

In [5]:
# Define stopwords
stop_words_file = "stop_words_english.txt"
with open(stop_words_file, "r", encoding='utf-8') as f:
    stop_words = f.read().splitlines() 

In [6]:
vectorizer_model = CountVectorizer(stop_words=stop_words)
hdbscan_model = HDBSCAN(core_dist_n_jobs=1, prediction_data = True)

In [7]:
# Fit
topic_model = BERTopic(min_topic_size=100,
                       low_memory=True,
                       calculate_probabilities=False,
                       verbose=True,
                       vectorizer_model=vectorizer_model,
                       hdbscan_model=hdbscan_model).fit(quotes)

Batches:   0%|          | 0/26824 [00:00<?, ?it/s]

2021-12-16 09:57:11,758 - BERTopic - Transformed documents to Embeddings
2021-12-16 10:11:08,885 - BERTopic - Reduced dimensionality with UMAP
2021-12-16 10:20:45,845 - BERTopic - Clustered UMAP embeddings with HDBSCAN


In [8]:
topic_model.save(PATH_MODEL)

In [17]:
topic_model.get_topic_info()[:10]

Unnamed: 0,Topic,Count,Name
0,-1,420545,-1_feminine_husbands_wives_painter
1,0,3780,0_beijings_beijing_chinaus_shanghai
2,1,3008,1_putins_vladimir_russias_moscow
3,2,2279,2_justices_appellate_courtroom_judiciary
4,3,2201,3_parenting_childs_toddlers_preschool
5,4,2194,4_yorker_yorks_manhattan_nyc
6,5,2084,5_deduction_deductions_taxed_taxation
7,6,1642,6_negro_blackness_negroes_africanamericans
8,7,1531,7_hillarys_congresswoman_palin_clintons
9,8,1491,8_iranians_irans_tehran_hardliners


In [10]:
# Labeling quotes
topics, _ = topic_model.transform(quotes)

Batches:   0%|          | 0/26824 [00:00<?, ?it/s]

In [11]:
df['topic'] = topics

In [None]:
# Save augmented dataset
df_json = df.to_json('quotes_topics', orient="index")

In [16]:
df.head()

Unnamed: 0,quoteID,quotation,speaker,qids,date,numOccurrences,probas,urls,phase,topic
0,2020-02-18-004289,an appetite for power.,,[],2020-02-18 14:44:45,3,"[[None, 0.3665], [Robin Niblett, 0.3339], [Jos...","[https://hypervocal.com/items/3249757, https:/...",E,188
1,2020-01-09-006199,Andrew Yang's Lies About Supporting Medicare f...,Andrew Yang,"[Q11118258, Q28723576]",2020-01-09 01:21:54,2,"[[Andrew Yang, 0.7197], [None, 0.2804]]",[https://www.nytimes.com/2020/01/08/opinion/me...,E,3851
2,2020-01-22-017789,eager to erase the image of congressional Repu...,Eric Cantor,[Q497271],2020-01-22 21:20:52,2,"[[Eric Cantor, 0.5013], [None, 0.3045], [Kevin...",[http://mobile.nytimes.com/2020/01/22/us/polit...,E,-1
3,2020-01-31-022641,Given the partisan nature of this impeachment ...,Lisa Murkowski,[Q22360],2020-01-31 00:00:00,24,"[[Lisa Murkowski, 0.6433], [None, 0.224], [Joh...",[http://feeds.foxnews.com/~r/foxnews/politics/...,E,6535
4,2020-01-23-024008,"He got on top of me, and he raped me.",Annabella Sciorra,[Q231395],2020-01-23 00:00:00,75,"[[Annabella Sciorra, 0.5251], [Harvey Weinstei...",[https://www.rawstory.com/2020/01/sopranos-act...,E,3838


In [13]:
# Save model
topic_model.get_topic_info().to_json('topics_info', orient="index")

In [27]:
topic_model.get_topics()[0]

[('beijings', 0.0021196342063963215),
 ('beijing', 0.0019653227601270696),
 ('chinaus', 0.0016565958543176365),
 ('shanghai', 0.0014343154481039663),
 ('zhang', 0.0009808039439948859),
 ('chineseamerican', 0.000726674785055448),
 ('yuan', 0.0006794125806808262),
 ('euchina', 0.0005865849466177186),
 ('shenzhen', 0.0005258193750105361),
 ('zhou', 0.0004967961347313305)]