In [33]:
import os
import pandas as pd
import nltk 
import numpy as np
import scipy 
import matplotlib 
import re
import transformers

In [None]:
# Read the dataset from CSV, using the first column of the file as the index.
# The file is expected to provide the columns 'post_title', 'comment_text' and 'upvotes'.
df = pd.read_csv(
    'your_dataset.csv',
    index_col=0,
)

# Show the first rows as a quick sanity check of what was loaded.
df.head()

Unnamed: 0,post_title,comment_text,upvotes
0,Canada retaliating for Trump’s tariffs with 25...,American here. Canada and Mexico are respondin...,8725
1,Canada retaliating for Trump’s tariffs with 25...,It won't just be Canada and Mexico. That's ju...,4884
2,Canada retaliating for Trump’s tariffs with 25...,As an American I really wish we invested in ed...,3471
3,Canada retaliating for Trump’s tariffs with 25...,"I mean, doesn't surprise me. I hope the Candia...",2610
4,Canada retaliating for Trump’s tariffs with 25...,Trudeau talking to Americans directly saying i...,2214


In [None]:
# Turn every entry of the 'comment_text' column into a plain string so the
# topic model receives a uniform list of documents. str() is applied to each
# value as-is, so a missing value (NaN) becomes the literal text 'nan'.
# The list keeps one entry per row of `df`, in the same order.
comments = df["comment_text"].map(str).tolist()

# Show only the size and a short preview: printing the full list floods the
# notebook output and stores every comment in the saved file.
print(f"{len(comments)} comments loaded")
comments[:5]



In [None]:
# Fit a BERTopic topic model on the Reddit comments.
from bertopic.representation import KeyBERTInspired
from bertopic import BERTopic

# Both arguments below are optional: BERTopic() can also be constructed
# without an explicit representation model or embedding model.
representation_model = KeyBERTInspired()
topic_model = BERTopic(
    embedding_model="all-MiniLM-L6-v2",
    representation_model=representation_model,
)

# NOTE(review): no random seed is set anywhere in this notebook; BERTopic's
# documentation states results can differ between runs unless a seeded
# dimensionality-reduction model is supplied -- confirm whether that matters here.
# fit_transform returns one topic id per comment plus the associated probabilities.
topics, probs = topic_model.fit_transform(comments)

In [None]:
# Persist the fitted model to a directory in safetensors format, including
# the c-TF-IDF data and a reference to the embedding model that was used.
topic_model.save(
    "path_to_model_directory",
    serialization="safetensors",
    save_ctfidf=True,
    save_embedding_model="all-MiniLM-L6-v2",
)

In [50]:
# Store the per-topic summary (topic id, size, name, representation,
# representative documents) in a data frame; it is computed once and reused.
topic_df = topic_model.get_topic_info()

# Bar chart of the top words for the 20 largest topics. A notebook cell only
# renders its LAST expression automatically, so the figure has to be shown
# explicitly here -- otherwise it is created and silently discarded.
topic_model.visualize_barchart(top_n_topics=20).show()

# Inspect the first rows of the topic data frame (this is the cell's output).
topic_df.head()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,2496,-1_trump_government_president_tariffs,"[trump, government, president, tariffs, federa...","[Yes, we are watching the worst aspects of Ame..."
1,0,322,0_faa_aviation_pilots_airlines,"[faa, aviation, pilots, airlines, pilot, donal...","[I am no expert, but the air traffic controlle..."
2,1,167,1_pelosi_bernie_republican_president,"[pelosi, bernie, republican, president, candid...",[To everyone who believed in this man enough t...
3,2,155,2_shit_that_good_fucking,"[shit, that, good, fucking, fuck, lol, summed,...",[This one didn’t. I actually give a shit what ...
4,3,112,3_news_breitbart_journalism_huffington,"[news, breitbart, journalism, huffington, jour...","[>The New York Times, NBC News, National Publi..."


In [51]:
# Heatmap of topic-to-topic similarity, with the topics grouped into 10
# clusters, used to investigate how topics relate to each other.
topic_model.visualize_heatmap(
    n_clusters=10,
)

In [52]:
# Build a human-readable label for every topic from three of its words,
# joined with " - " and without the numeric topic prefix
# (e.g. "faa - aviation - pilots").
topic_labels = topic_model.generate_topic_labels(
    nr_words=3,
    topic_prefix=False,
    word_length=15,
    separator=" - ",
)

# Register the labels on the model so they can be used as custom labels.
topic_model.set_topic_labels(topic_labels)

# Refresh the topic data frame: it now also carries the 'CustomName' column.
topic_df = topic_model.get_topic_info()
topic_df.head()

Unnamed: 0,Topic,Count,Name,CustomName,Representation,Representative_Docs
0,-1,2496,-1_trump_government_president_tariffs,trump - government - president,"[trump, government, president, tariffs, federa...","[Yes, we are watching the worst aspects of Ame..."
1,0,322,0_faa_aviation_pilots_airlines,faa - aviation - pilots,"[faa, aviation, pilots, airlines, pilot, donal...","[I am no expert, but the air traffic controlle..."
2,1,167,1_pelosi_bernie_republican_president,pelosi - bernie - republican,"[pelosi, bernie, republican, president, candid...",[To everyone who believed in this man enough t...
3,2,155,2_shit_that_good_fucking,shit - that - good,"[shit, that, good, fucking, fuck, lol, summed,...",[This one didn’t. I actually give a shit what ...
4,3,112,3_news_breitbart_journalism_huffington,news - breitbart - journalism,"[news, breitbart, journalism, huffington, jour...","[>The New York Times, NBC News, National Publi..."


In [None]:
# Interactive 2D map of the comments, restricted to topic ids 0..29 and
# labelled with the custom topic labels.
topic_model.visualize_documents(
    comments,
    topics=[*range(30)],
    height=600,
    custom_labels=True,
)

In [None]:
# Attach the topic id assigned to each comment as a new column of the dataset.
df['topic_id'] = list(topic_model.topics_)

# Map every comment to its topic: join on the topic id, keep only the columns
# of interest, and expose the custom topic name as 'topic_label'.
topic_comment_df = (
    df.merge(topic_df, left_on='topic_id', right_on='Topic')
    .reindex(columns=['post_title', 'comment_text', 'upvotes', 'CustomName', 'topic_id'])
    .rename(columns={'CustomName': 'topic_label'})
)

# Inspect the resulting data frame.
topic_comment_df.head()

In [None]:
# Write the comment-to-topic data frame to a CSV file (placeholder path).
topic_comment_df.to_csv(
    "/your_path/topic_comment_dataset.csv",
)