In [8]:
from bertopic import BERTopic
import pandas as pd
import numpy as np
import bz2
import json
from sklearn.feature_extraction.text import CountVectorizer
from sentence_transformers import SentenceTransformer
from hdbscan import HDBSCAN

In [9]:
DATASET_PATH = './quotes_nyt.json'
PATH_MODEL = './bertopic_model'

In [10]:
# Load dataset
df = pd.read_json(DATASET_PATH, orient='index')
df.shape

(858367, 9)

In [11]:
quotes = df.quotation.to_list()

In [12]:
# Define stopwords
stop_words_file = "stop_words_english.txt"
with open(stop_words_file, "r", encoding='utf-8') as f:
    stop_words = f.read().splitlines() 

In [13]:
vectorizer_model = CountVectorizer(stop_words=stop_words)
hdbscan_model = HDBSCAN(core_dist_n_jobs=1, prediction_data = True)

In [None]:
# Fit
topic_model = BERTopic(min_topic_size=100,
                       low_memory=True,
                       calculate_probabilities=False,
                       verbose=True,
                       vectorizer_model=vectorizer_model,
                       hdbscan_model=hdbscan_model).fit(quotes)

Batches:   0%|          | 0/26824 [00:00<?, ?it/s]

In [22]:
topic_model.save(PATH_MODEL)

In [23]:
topic_model.get_topic_info()[:10]

Unnamed: 0,Topic,Count,Name
0,-1,314589,-1_dresses_yards_feminine_coat
1,0,3177,0_beijing_chinaus_shanghai_beijings
2,1,1830,1_deduction_deductions_taxation_taxed
3,2,1734,2_putins_moscow_russias_russianamerican
4,3,1435,3_koreas_jongun_koreans_pyongyang
5,4,1349,4_yorker_yorks_yorkers_nyc
6,5,1246,5_justices_courtroom_judiciary_judicial
7,6,1243,6_fixing_patchwork_fixes_repair
8,7,1153,7_roster_aguero_seed_offseason
9,8,1151,8_iranians_irans_tehran_iranian


In [24]:
# Labeling quotes
topics, _ = topic_model.transform(quotes)

Batches:   0%|          | 0/19848 [00:00<?, ?it/s]

In [25]:
df['topic'] = topics

In [27]:
# Save augmented dataset
df_json = df.to_json('quotes_topics', orient="index")

In [30]:
# Save model
topic_model.get_topic_info().to_json('topics_info', orient="index")

In [63]:
last_try = topic_model.get_topic_info()[:10]

In [32]:
topic_model.get_topics()

{-1: [('dresses', 6.11164238291428e-05),
  ('yards', 6.111113742318652e-05),
  ('feminine', 6.066398051988231e-05),
  ('coat', 5.863026816313527e-05),
  ('literary', 5.8308150872335795e-05),
  ('costume', 5.810646736118082e-05),
  ('pants', 5.6342074543663055e-05),
  ('sexist', 5.5101943030017045e-05),
  ('sexually', 5.497895983696153e-05),
  ('wore', 5.4773472518015376e-05)],
 0: [('beijing', 0.0024400950954891563),
  ('chinaus', 0.0020660239639000736),
  ('shanghai', 0.0018740817069641032),
  ('beijings', 0.0018229791670903133),
  ('yuan', 0.0012188325333786683),
  ('shenzhen', 0.0012133556729582414),
  ('tariff', 0.0009607258359702299),
  ('zhang', 0.0009168107572412004),
  ('guangzhou', 0.0008874981253057831),
  ('chinabashing', 0.0007160555255818507)],
 1: [('deduction', 0.004414755023012282),
  ('deductions', 0.0038657511647054873),
  ('taxation', 0.0033671616518013837),
  ('taxed', 0.002430218481095354),
  ('highesttaxed', 0.001429182551143739),
  ('earners', 0.00139098627039344

In [31]:
df['topics'] = topics
df.head()