In [1]:
# import libraries
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.representation import MaximalMarginalRelevance
from bertopic.vectorizers import ClassTfidfTransformer
import pandas as pd
import umap
import torch
import gc
from bertopic.representation import MaximalMarginalRelevance
import pandas as pd

In [2]:
def implement_bertopic(df, text_col, nr_topics=25, diversity=0.9, random_state=42):
    """
    Fit a BERTopic model on one text column of a dataframe.

    Parameters:
        df (pd.DataFrame): The input dataframe containing the text data.
        text_col (str): The column name in the dataframe that contains the text to be analyzed.
        nr_topics (int): The number of topics to reduce to. Default is 25.
        diversity (float): Diversity setting handed to MaximalMarginalRelevance.
            Default is 0.9 (the value that used to be hard-coded).
        random_state (int): Seed handed to UMAP so repeated runs use the same
            seed. Default is 42 (the value that used to be hard-coded).

    Returns:
        BERTopic: The trained BERTopic model.

    Raises:
        KeyError: If `text_col` is not present in `df` (raised by the column lookup).
        ValueError: If the selected column contains no documents.
    """
    # Every entry goes through str() so non-string values are accepted.
    # Note that this turns missing values into literal documents such as
    # "nan" / "None" instead of dropping them, which keeps the document count
    # equal to the number of rows in df.
    comments = [str(doc) for doc in df[text_col]]

    # Fail early with a clear message instead of an obscure error from the
    # embedding / dimensionality-reduction stack.
    if not comments:
        raise ValueError(f"Column {text_col!r} contains no documents to fit BERTopic on.")

    # Maximal Marginal Relevance is used as the representation model; a higher
    # diversity value asks for more varied topic keywords.
    representation_model = MaximalMarginalRelevance(diversity=diversity)

    # Class-based TF-IDF configured to reduce the weight of frequent words
    ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

    # CountVectorizer with English stopwords removed
    vectorizer_model = CountVectorizer(stop_words="english")

    # UMAP for dimensionality reduction, seeded for reproducibility
    umap_model = umap.UMAP(random_state=random_state)

    topic_model = BERTopic(
        representation_model=representation_model,
        vectorizer_model=vectorizer_model,
        ctfidf_model=ctfidf_model,
        umap_model=umap_model,
        # Number of topics to reduce to after clustering
        nr_topics=nr_topics,
        # Calculate probabilities for each topic assignment
        calculate_probabilities=True,
        # Enable verbose output for progress tracking
        verbose=True,
    )

    try:
        # The returned (topics, probabilities) pair is not used here; only the
        # fitted model is handed back to the caller.
        topic_model.fit_transform(comments)
    finally:
        # Collect unreachable Python objects first so that any memory they
        # hold is unoccupied by the time the CUDA cache is emptied. This runs
        # even when fitting fails.
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    return topic_model


In [3]:
# Load the Reddit posts, keep those created on or after 2015-01-01,
# and keep one row per distinct title.
df = pd.read_csv('data/depression_SG_reddit.csv.gz', compression='gzip')
df['created_utc'] = pd.to_datetime(df['created_utc'], unit='s')  # epoch seconds -> datetime
df = df[df['created_utc'] >= '2015-01-01'].reset_index(drop=True)
df = df.drop_duplicates(subset=["title"])

# Later cells join the topic assignments back onto df by title, so the
# titles must be unique at this point.
assert df["title"].is_unique, "duplicate titles remain after drop_duplicates"

# Fit BERTopic on the post titles
topic_model_title = implement_bertopic(df, "title")

2025-01-13 10:15:24,940 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/87 [00:00<?, ?it/s]

2025-01-13 10:15:30,669 - BERTopic - Embedding - Completed ✓
2025-01-13 10:15:30,670 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-01-13 10:16:01,243 - BERTopic - Dimensionality - Completed ✓
2025-01-13 10:16:01,247 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-01-13 10:16:02,001 - BERTopic - Cluster - Completed ✓
2025-01-13 10:16:02,004 - BERTopic - Representation - Extracting topics from clusters using representation models.
2025-01-13 10:16:04,945 - BERTopic - Representation - Completed ✓
2025-01-13 10:16:04,949 - BERTopic - Topic reduction - Reducing number of topics
2025-01-13 10:16:06,091 - BERTopic - Topic reduction - Reduced number of topics from 53 to 25


In [4]:
# Merge topics that belong together in the Singapore context but that the
# pre-trained models are not familiar with,
# e.g. "PES" and "NS" are related.
#
# NOTE(review): the ids below refer to the topic numbering produced by the fit
# in the previous cell. Topic ids presumably change once topics are merged, so
# re-running this cell without refitting would merge different topics -- run
# it once per fit.
topics_to_merge = [
    [-1, 0, 3, 7, 12, 14, 16, 17, 23],
    [21, 19, 9],
    [4, 20],
    [18, 5],
    [11, 6],
    [15, 2],
]

# implement_bertopic fits on str()-converted titles; apply the same conversion
# here so merge_topics sees the same documents the model was trained on.
merge_docs = [str(title) for title in df["title"]]
topic_model_title.merge_topics(merge_docs, topics_to_merge)

# get topic info
topic_models = topic_model_title.get_topic_info()

topic_models

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,1687,-1_man_help_singaporeans_amp,"[man, help, singaporeans, amp, daughter, spore...",[About 12am+ this guy started yelling. Im unsu...
1,0,241,0_loneliness_like_sad_destined,"[loneliness, like, sad, destined, talk, year, ...",[Helpless as I can no longer stay at my house ...
2,1,229,1_imh_help_affordable_bipolar,"[imh, help, affordable, bipolar, seek, counsel...","[I need help on my mental health., Death at Ri..."
3,2,136,2_poly_gpa_secondary_fresh,"[poly, gpa, secondary, fresh, options, hiring,...",[Career advice for late twenties early jobber ...
4,3,126,3_internship_sg_just_resignation,"[internship, sg, just, resignation, battle, ap...","[just started a internship about a week ago, f..."
5,4,126,4_coronavirus_migrant_measures_pfizer,"[coronavirus, migrant, measures, pfizer, wuhan...",[At breaking point: Singapore's migrant worker...
6,5,86,5_attacks_eating_ptsd_antidepressants,"[attacks, eating, ptsd, antidepressants, natio...","[Anxiety/Panic attacks, [Serious] People who e..."
7,6,82,6_pes_nsf_bunk_depression,"[pes, nsf, bunk, depression, cofounder, exempt...","[Can you down NS PES from depression?, NOC Co-..."
8,7,36,7_heard_shes_assaulted_families,"[heard, shes, assaulted, families, december, d...",[My (now ex) boyfriend attempted rape December...
9,8,24,8_breakup_attachment_unavailable_sensitive,"[breakup, attachment, unavailable, sensitive, ...",[How do I (21M) ask my friend out that I previ...


In [5]:
# Write the topic overview table to disk, leaving out the row index.
topic_models.to_csv(path_or_buf="topic_models_title.csv", index=False)

In [6]:
# Per-document topic information; the "Document" column is renamed to "title"
# so it can serve as the join key against df.
document_clusters = (
    topic_model_title
    .get_document_info(list(df["title"]))
    .rename(columns={"Document": "title"})
)

# Right join keeps every row of df. The join key is free text, so
# validate="one_to_one" makes pandas raise a MergeError if a title occurs more
# than once on either side instead of silently multiplying rows.
documents = pd.merge(
    document_clusters,
    df,
    how="right",
    on="title",
    validate="one_to_one",
)

In [7]:
# Write the merged per-document table to disk, leaving out the row index.
documents.to_csv(path_or_buf="document_clusters.csv", index=False)

In [8]:
# Number of rows in the merged per-document table
documents.shape[0]

2783