In [1]:
from bertopic import BERTopic
import pandas as pd
import numpy as np
import bz2
import json
from sklearn.feature_extraction.text import CountVectorizer
from hdbscan import HDBSCAN

In [7]:
DATASET_PATH = './quotes_nyt.json'
PATH_MODEL = './bertopic_model'

In [8]:
# Load dataset
df = pd.read_json(DATASET_PATH, orient='index')
df.shape

(858367, 9)

In [9]:
quotes = df.quotation.to_list()

In [10]:
vectorizer_model = CountVectorizer(stop_words='english')
hdbscan_model = HDBSCAN(core_dist_n_jobs=1, prediction_data=True)

In [11]:
topic_model = BERTopic(min_topic_size=100,
                       low_memory=True,
                       calculate_probabilities=False,
                       verbose=True,
                       vectorizer_model=vectorizer_model,
                       hdbscan_model=hdbscan_model).load(PATH_MODEL)


In [7]:
# Fit
topic_model = BERTopic(min_topic_size=100,
                       nr_topics='auto',
                       low_memory=True,
                       calculate_probabilities=False,
                       verbose=True,
                       vectorizer_model=vectorizer_model,
                       hdbscan_model=hdbscan_model).fit(training_set)

Batches:   0%|          | 0/31250 [00:00<?, ?it/s]

2021-12-16 21:34:47,756 - BERTopic - Transformed documents to Embeddings
2021-12-16 21:53:56,860 - BERTopic - Reduced dimensionality with UMAP
2021-12-16 22:06:18,711 - BERTopic - Clustered UMAP embeddings with HDBSCAN
2021-12-16 22:14:57,959 - BERTopic - Reduced number of topics from 11959 to 7907


In [12]:
topic_model.save(PATH_MODEL)

In [None]:
#Number of topics
len(topic_model.get_topics())

7073

In [12]:
topic_model.get_topic_info()[:20]

Unnamed: 0,Topic,Count,Name
0,-1,420063,-1_defendants_defendant_prosecution_dressed
1,0,4380,0_russias_putins_vladimir_moscow
2,1,3926,1_beijing_chinas_chinaus_beijings
3,2,2647,2_justices_judges_judicial_judiciary
4,3,2290,3_negro_africanamericans_blacks_africanamerican
5,4,2205,4_singing_sing_sang_songwriting
6,5,1980,5_yorkers_yorker_brooklyn_yorks
7,6,1942,6_solutions_fixing_fixes_repair
8,7,1904,7_cristiano_ronaldo_mourinho_zlatan
9,8,1791,8_deduction_taxation_deductions_taxed


It is really impressive how Bertopic is able to extract so well-clustered topics.

In [13]:
# Labeling quotes
topics, _ = topic_model.transform(quotes)

Batches:   0%|          | 0/26824 [00:00<?, ?it/s]

In [14]:
# Add a 'topic' column to the original dataframe 
df['topic'] = topics

In [15]:
df.shape

(858367, 10)

Remove outliers which are the quotes labeled with -1.

In [19]:
df = df[df.topic!=-1]

In [23]:
# Save augmented dataset
df.to_json('quotes_topics.json', orient="index")

In [24]:
# Save topics information
topic_model.get_topic_info().to_json('topics_info.json', orient="index")