# BERTopic Modelling

```sh
pip install bertopic
```

In [1]:
# Built-in library
import re
import json
from typing import Any, Optional, TypeAlias, Union
import logging
import warnings

# Standard imports
import numpy as np
from pprint import pprint
import pandas as pd
import polars as pl
from rich import print
import torch

# Visualization
import matplotlib.pyplot as plt


# Pandas display limits: up to 1,000 rows / 1,000 columns, 600 characters per cell
pd.set_option("display.max_rows", 1_000)
pd.set_option("display.max_columns", 1_000)
pd.set_option("display.max_colwidth", 600)

# Silence every warning for the rest of the session
warnings.filterwarnings(action="ignore")

# Black code formatter (optional): loads the `lab_black` IPython extension.
# NOTE(review): presumably this line errors when `lab_black` is not installed —
# confirm, or remove it if auto-formatting is not needed.
%load_ext lab_black

# Auto-reload imports: load the `autoreload` extension and select mode 2 so
# edited modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

### Load Data

In [2]:
from datasets import Dataset, load_dataset

# Dataset identifier and the split requested from it.
path: str = "jamescalam/reddit-topics"
split: str = "train"

# Fetch the requested split; the bare `data` below displays its summary.
data: Dataset = load_dataset(path, split=split)
data

Dataset({
    features: ['sub', 'title', 'selftext', 'upvote_ratio', 'id', 'created_utc'],
    num_rows: 3791
})

In [3]:
# Remove data points with small text size.
# Posts whose `selftext` is missing/None are treated as empty text (and thus
# dropped) instead of raising a TypeError inside `len`.
MIN_SELFTEXT_CHARS: int = 30  # a post is kept only if its body is longer than this

data = data.filter(lambda x: len(x.get("selftext") or "") > MIN_SELFTEXT_CHARS)
data.num_rows

3118

In [4]:
from bertopic import BERTopic
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.representation import KeyBERTInspired
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from spacy.lang.en.stop_words import STOP_WORDS
from umap import UMAP


# Sentence-transformer checkpoint used to embed the documents.
encoder_checkpoint: str = "all-MiniLM-L6-v2"
# spaCy's English stop words plus URL/HTML residue tokens.
stopwords: list[str] = list(STOP_WORDS) + ["http", "https", "amp", "com"]

# Fixed seed for UMAP. Without it the dimensionality reduction is stochastic,
# so topics and counts differ on every "Restart & Run All".
umap_random_state: int = 42

# Step 1 - Embedding model
embedding_model: SentenceTransformer = SentenceTransformer(encoder_checkpoint)

# Step 2 - Dimensionality reduction (seeded for reproducible topics)
umap_model = UMAP(
    n_neighbors=3,
    n_components=3,
    min_dist=0.05,
    random_state=umap_random_state,
)

# Step 3 - Clustering
hdbscan_model = HDBSCAN(
    max_cluster_size=100,
    prediction_data=True,  # Whether to generate extra cached data for predicting labels
    gen_min_span_tree=True,  # Whether to generate the min spanning tree with regard to mutual reachability distance for later analysis.
)

# Step 4 - Tokenizer
# Add this to remove stopwords that can pollute topics
vectorizer_model = CountVectorizer(stop_words=stopwords)

# Step 5 - Class-based TF-IDF weighting
ctfidf_model = ClassTfidfTransformer()

# Step 6 - (Optional) Fine-tune topic representations with
# a `bertopic.representation` model
representation_model = KeyBERTInspired()

2023-12-07 18:17:34.649643: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [5]:
# Assemble the BERTopic pipeline from the components prepared earlier.
model = BERTopic(
    # General settings
    language="english",
    top_n_words=5,
    calculate_probabilities=True,
    verbose=True,
    # Pipeline components, numbered as in the cell that defines them
    embedding_model=embedding_model,  # Step 1 - Extract embeddings
    umap_model=umap_model,  # Step 2 - Reduce dimensionality
    hdbscan_model=hdbscan_model,  # Step 3 - Cluster reduced embeddings
    vectorizer_model=vectorizer_model,  # Step 4 - Tokenize topics
    ctfidf_model=ctfidf_model,  # Step 5 - Extract topic words
    representation_model=representation_model,  # Step 6 - (Optional) Fine-tune topic representations
)

model

<bertopic._bertopic.BERTopic at 0x7f9d59dbfee0>

In [6]:
# Fit the topic model on the post bodies (takes approx 2m 20s).
# `topics` and `probs` are the two values returned by `fit_transform`.
selftext_docs = data["selftext"]
topics, probs = model.fit_transform(selftext_docs)

2023-12-07 18:17:43,195 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/98 [00:00<?, ?it/s]

2023-12-07 18:19:22,289 - BERTopic - Embedding - Completed ✓
2023-12-07 18:19:22,290 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2023-12-07 18:19:34,775 - BERTopic - Dimensionality - Completed ✓
2023-12-07 18:19:34,776 - BERTopic - Cluster - Start clustering the reduced embeddings
2023-12-07 18:19:36,763 - BERTopic - Cluster - Completed ✓
2023-12-07 18:19:36,767 - BERTopic - Representation - Extracting topics from clusters using representation models.
2023-12-07 18:20:22,826 - BERTopic - Representation - Completed ✓


In [7]:
# Show the parameters the model reports about itself.
model_params = model.get_params()
print(model_params)

In [8]:
# Check the number of topics.
# `get_params()["nr_topics"]` only echoes the constructor argument, which was
# never set for this model, so it does not reflect the fitted result. Count the
# rows of the fitted topic table instead, leaving out the `-1` bucket shown in
# `get_topic_info()`.
fitted_topic_info = model.get_topic_info()
num_topics = int((fitted_topic_info["Topic"] != -1).sum())
print(f"Number of topics: {num_topics}")

In [9]:
# Preview the topic assigned to the first few posts, next to their titles.
# The title column is read once (not on every iteration), and slicing keeps the
# preview from raising IndexError when fewer than 5 documents are available.
preview_size = 5
preview_titles = data["title"][:preview_size]
for topic_id, title in zip(topics[:preview_size], preview_titles):
    print(f"{topic_id}: {title}")

In [10]:
# First five rows of the per-topic summary table.
model.get_topic_info().head(n=5)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,729,-1_gpu_512_market_stock,"[gpu, 512, market, stock, inflation]","[*Disclaimer:* huge bricks of text coming up...\n\n**Stock Valuation:** \n\nEuropean comps 888 Holdings, Entain Plc, and Flutter Entertainment trade at avg. multiples of 17.9x EBITDA and 3.5x sales. 888 Holdings is half the size of Kindred on a revenue basis, and a quarter of the size on an EBITDA basis. Entain is 2.5x larger than Kindred on a revenue basis, but less than 2x its size on an EBITDA basis. Flutter is 4.5x the size of Kindred on a revenue basis but only 2.5x its size on an EBITDA basis. These comparisons point to the significantly higher profitability of Kindred vis-à-vis it..."
1,0,63,0_investing_financial_savings_funds,"[investing, financial, savings, funds, 000]","[Have a general question? Want to offer some commentary on markets? Maybe you would just like to throw out a neat fact that doesn't warrant a self post? Feel free to post here! \n\nIf your question is ""I have $10,000, what do I do?"" or other ""advice for my personal situation"" questions, you should include relevant information, such as the following:\n\n* How old are you? What country do you live in? \n* Are you employed/making income? How much? \n* What are your objectives with this money? (Buy a house? Retirement savings?) \n* What is your time horizon? Do you need this money next m..."
2,1,35,1_multiprocessing_async_parallelism_parallelise,"[multiprocessing, async, parallelism, parallelise, pytorch]","[I want to exploit some parallelism in the custom model I've built and submit asynchronous kernels to CUDA which execute in parallel.\n\nSay I have a ModuleList full of operations which are fully independent of each other but which require the same input. The results of those operations are later collated into a single object to be passed on in the forward pass.\n\nBelow is a minimally illustrative example of how I *think* streams and events are used.\n\n from torch.cuda import Stream, stream, Event\n from torch import empty\n def func(x, module_list):\n out = empty(len(mod..."
3,2,35,2_pytorch_tensor_torch_training_outputs,"[pytorch, tensor, torch, training_outputs, training_inputs]","[Im attempting to make a self driving car in a game right now I have a network taking in 5 inputs from the game and I want 4 binary outputs ex\[0,1,0,1\] I've tried a bunch of functions/loss calculations and nothing I can get to work. What i want on the output is a mutli hot vector. The code I'm using right now is below. Any advice is appreciated.\n\n&amp;#x200B;\n\n import torch\n import torchvision\n from torchvision import transforms,datasets\n import numpy as np\n import torch.nn as nn\n import torch.nn.functional as F\n import torch.optim as optim\n from torch..."
4,3,35,3_sentimentanalysisapp_sentiment_nlp_nltk,"[sentimentanalysisapp, sentiment, nlp, nltk, texts]","[Hi everyone,\n\nI am doing an Aspect Based Sentiment Analysis using BERT Model, however, I noticed that the state of art XLNet model over performed the BERT model in most of NLP applications. I couldn't see any implementation for Aspect Based Sentiment Analysis on Internet , so I am curious if it is possible to do it?, I was planning to perform sentiment analysis for news articles but after reading this post it seem that it will not be easy, I have an assessment and I was asked to perform 2 NLP tasks so I chose sentiment analysis and summarisation \nNow I'll perform sentiment for article..."


In [11]:
# Build readable labels for the topics: four words per label joined by " - ",
# no topic-number prefix, each word cut to at most 15 characters.
topic_labels: list[str] = model.generate_topic_labels(
    separator=" - ",
    nr_words=4,
    word_length=15,
    topic_prefix=False,
)
topic_labels[:5]

['gpu - 512 - market - stock',
 'investing - financial - savings - funds',
 'multiprocessing - async - parallelism - parallelise',
 'pytorch - tensor - torch - training_output',
 'sentimentanalys - sentiment - nlp - nltk']

In [12]:
# Attach the generated labels to the model, then re-display the topic table
# to check the custom labels alongside the default names.
model.set_topic_labels(topic_labels)

model.get_topic_info().head(n=5)

Unnamed: 0,Topic,Count,Name,CustomName,Representation,Representative_Docs
0,-1,729,-1_gpu_512_market_stock,gpu - 512 - market - stock,"[gpu, 512, market, stock, inflation]","[*Disclaimer:* huge bricks of text coming up...\n\n**Stock Valuation:** \n\nEuropean comps 888 Holdings, Entain Plc, and Flutter Entertainment trade at avg. multiples of 17.9x EBITDA and 3.5x sales. 888 Holdings is half the size of Kindred on a revenue basis, and a quarter of the size on an EBITDA basis. Entain is 2.5x larger than Kindred on a revenue basis, but less than 2x its size on an EBITDA basis. Flutter is 4.5x the size of Kindred on a revenue basis but only 2.5x its size on an EBITDA basis. These comparisons point to the significantly higher profitability of Kindred vis-à-vis it..."
1,0,63,0_investing_financial_savings_funds,investing - financial - savings - funds,"[investing, financial, savings, funds, 000]","[Have a general question? Want to offer some commentary on markets? Maybe you would just like to throw out a neat fact that doesn't warrant a self post? Feel free to post here! \n\nIf your question is ""I have $10,000, what do I do?"" or other ""advice for my personal situation"" questions, you should include relevant information, such as the following:\n\n* How old are you? What country do you live in? \n* Are you employed/making income? How much? \n* What are your objectives with this money? (Buy a house? Retirement savings?) \n* What is your time horizon? Do you need this money next m..."
2,1,35,1_multiprocessing_async_parallelism_parallelise,multiprocessing - async - parallelism - parallelise,"[multiprocessing, async, parallelism, parallelise, pytorch]","[I want to exploit some parallelism in the custom model I've built and submit asynchronous kernels to CUDA which execute in parallel.\n\nSay I have a ModuleList full of operations which are fully independent of each other but which require the same input. The results of those operations are later collated into a single object to be passed on in the forward pass.\n\nBelow is a minimally illustrative example of how I *think* streams and events are used.\n\n from torch.cuda import Stream, stream, Event\n from torch import empty\n def func(x, module_list):\n out = empty(len(mod..."
3,2,35,2_pytorch_tensor_torch_training_outputs,pytorch - tensor - torch - training_output,"[pytorch, tensor, torch, training_outputs, training_inputs]","[Im attempting to make a self driving car in a game right now I have a network taking in 5 inputs from the game and I want 4 binary outputs ex\[0,1,0,1\] I've tried a bunch of functions/loss calculations and nothing I can get to work. What i want on the output is a mutli hot vector. The code I'm using right now is below. Any advice is appreciated.\n\n&amp;#x200B;\n\n import torch\n import torchvision\n from torchvision import transforms,datasets\n import numpy as np\n import torch.nn as nn\n import torch.nn.functional as F\n import torch.optim as optim\n from torch..."
4,3,35,3_sentimentanalysisapp_sentiment_nlp_nltk,sentimentanalys - sentiment - nlp - nltk,"[sentimentanalysisapp, sentiment, nlp, nltk, texts]","[Hi everyone,\n\nI am doing an Aspect Based Sentiment Analysis using BERT Model, however, I noticed that the state of art XLNet model over performed the BERT model in most of NLP applications. I couldn't see any implementation for Aspect Based Sentiment Analysis on Internet , so I am curious if it is possible to do it?, I was planning to perform sentiment analysis for news articles but after reading this post it seem that it will not be easy, I have an assessment and I was asked to perform 2 NLP tasks so I chose sentiment analysis and summarisation \nNow I'll perform sentiment for article..."


### Visualizations

In [13]:
# Bar charts for the top 10 topics, each panel sized 300x300.
barchart_fig = model.visualize_barchart(top_n_topics=10, width=300, height=300)
barchart_fig

In [14]:
# Hierarchy view of the fitted topics.
hierarchy_fig = model.visualize_hierarchy()
hierarchy_fig

In [15]:
# Heatmap restricted to the top 10 topics.
heatmap_fig = model.visualize_heatmap(top_n_topics=10)
heatmap_fig