In [63]:
import pandas as pd

# Load the raw CSV and clean it in one chain: drop the "Unnamed: 0" column,
# then rows containing missing values, then exact duplicate rows.
path = "/Users/gozde/code/g0zzy/stress_sense/raw_data/Data.csv"
data = (
    pd.read_csv(path)
    .drop(columns=["Unnamed: 0"])
    .dropna()
    .drop_duplicates()
)

In [64]:
import re

def strip_urls(text: str) -> str:
    """
    Remove URLs (http, https, www, youtu links) from a string.

    Matching is case-insensitive, so links written as ``HTTPS://...``,
    ``WWW....`` or ``Youtu.be/...`` are removed as well as their lowercase
    forms.

    Args:
        text: Raw text that may contain links.

    Returns:
        ``text`` with every matched link deleted and leading/trailing
        whitespace stripped. Whitespace that surrounded a removed link inside
        the text is left untouched.
    """
    # Each pattern consumes from its prefix up to the next whitespace character.
    # remove http/https URLs
    text = re.sub(r"http\S+", "", text, flags=re.IGNORECASE)
    # remove www.* URLs
    text = re.sub(r"www\.\S+", "", text, flags=re.IGNORECASE)
    # remove youtube short links
    text = re.sub(r"youtu\.be\S+", "", text, flags=re.IGNORECASE)
    return text.strip()

* Do not apply any other preprocessing steps (lowercasing, stopword removal, lemmatization, stemming, etc.), as they might remove important context for the model to learn from. Only URLs and similar noise need to be removed from the data.

* **Transformers are trained on raw(ish) text**

In [65]:
# Trim leading/trailing whitespace from every statement.
data["statement"] = data["statement"].map(str.strip)

In [66]:
# Run the strip_urls helper over every statement.
data["statement"] = data["statement"].map(strip_urls)

In [67]:
data.head(10)

Unnamed: 0,statement,status
0,oh my gosh,Anxiety
1,"trouble sleeping, confused mind, restless hear...",Anxiety
2,"All wrong, back off dear, forward doubt. Stay ...",Anxiety
3,I've shifted my focus to something else but I'...,Anxiety
4,"I'm restless and restless, it's been a month n...",Anxiety
5,"every break, you must be nervous, like somethi...",Anxiety
6,"I feel scared, anxious, what can I do? And may...",Anxiety
7,Have you ever felt nervous but didn't know why?,Anxiety
8,"I haven't slept well for 2 days, it's like I'm...",Anxiety
9,"I'm really worried, I want to cry.",Anxiety


In [68]:
data.shape

(51093, 2)

In [69]:
# Keep only the rows labelled Stress or Anxiety (as an independent copy of
# that slice), then pull their statements out as a plain Python list.
is_stress_or_anxiety = data["status"].isin(["Stress", "Anxiety"])
new_df = data.loc[is_stress_or_anxiety].copy()
stressed_posts = new_df["statement"].tolist()

In [70]:
new_df.head(10)

Unnamed: 0,statement,status
0,oh my gosh,Anxiety
1,"trouble sleeping, confused mind, restless hear...",Anxiety
2,"All wrong, back off dear, forward doubt. Stay ...",Anxiety
3,I've shifted my focus to something else but I'...,Anxiety
4,"I'm restless and restless, it's been a month n...",Anxiety
5,"every break, you must be nervous, like somethi...",Anxiety
6,"I feel scared, anxious, what can I do? And may...",Anxiety
7,Have you ever felt nervous but didn't know why?,Anxiety
8,"I haven't slept well for 2 days, it's like I'm...",Anxiety
9,"I'm really worried, I want to cry.",Anxiety


In [71]:
new_df.shape

(5919, 2)

In [72]:
# Install sentence-transformers via the %pip magic.
# NOTE(review): no version is pinned here — consider pinning one for reproducibility.
%pip install sentence-transformers

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


In [73]:
from sentence_transformers import SentenceTransformer

# Embed every selected post with the all-MiniLM-L6-v2 sentence-transformer.
# normalize_embeddings=True asks encode() for normalised vectors.
sbert = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = sbert.encode(
    stressed_posts,
    show_progress_bar=True,
    normalize_embeddings=True,
)

Batches: 100%|██████████| 185/185 [00:13<00:00, 13.54it/s]


In [74]:
embeddings

array([[ 0.03366001, -0.03712323,  0.01179676, ..., -0.05401971,
         0.04482312, -0.03369883],
       [ 0.01052009, -0.04578955,  0.04768023, ...,  0.01939357,
        -0.06008147, -0.02400911],
       [ 0.05667384, -0.06692987,  0.0137174 , ...,  0.01170356,
        -0.11973175, -0.05436602],
       ...,
       [ 0.07792854, -0.05168048,  0.05081963, ...,  0.04558662,
        -0.08271255,  0.00072075],
       [ 0.0443574 , -0.108092  ,  0.03242296, ..., -0.00546338,
        -0.0618789 , -0.00842193],
       [ 0.0619358 ,  0.0029512 ,  0.0452257 , ...,  0.00549195,
        -0.07962377,  0.04503339]], shape=(5919, 384), dtype=float32)

In [75]:
# Install BERTopic via the %pip magic.
# NOTE(review): no version is pinned here — consider pinning one for reproducibility.
%pip install bertopic

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


In [76]:
from bertopic import BERTopic

# Baseline BERTopic fit on the precomputed sentence embeddings.
# UMAP is stochastic: without a fixed random_state the topics change on every
# run. The UMAP settings below are the ones BERTopic uses by default, plus a
# seed (a fixed seed makes UMAP run single-threaded, so it is a little slower).
from umap import UMAP

topic_model = BERTopic(
    umap_model=UMAP(
        n_neighbors=15,
        n_components=5,
        min_dist=0.0,
        metric="cosine",
        random_state=42,
    ),
    min_topic_size=100,      # try 50, 80, 100; pick the one that gives 8–12 coherent topics
    n_gram_range=(1, 2),
    calculate_probabilities=True,
    verbose=True
)

topics, probs = topic_model.fit_transform(stressed_posts, embeddings)  # pass precomputed embeddings!
topic_info = topic_model.get_topic_info()
topic_info.head(10)

2025-09-03 13:56:58,757 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-09-03 13:57:00,061 - BERTopic - Dimensionality - Completed ✓
2025-09-03 13:57:00,061 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-09-03 13:57:00,261 - BERTopic - Cluster - Completed ✓
2025-09-03 13:57:00,263 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-09-03 13:57:01,098 - BERTopic - Representation - Completed ✓


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,210,-1_the_and_my_to,"[the, and, my, to, it, of, was, that, in, but]",[I've been making a fool of myself for the pas...
1,0,5401,0_and_to_the_my,"[and, to, the, my, of, it, in, that, im, me]","[I'm relapsing on my past HA, at least right n..."
2,1,308,1_restless_is_the_sleep,"[restless, is, the, sleep, to, why, you, im, r...","[restless and restless, want to be angry, Rest..."


In [77]:
# Top words per topic: for the topics in the first ten rows of topic_info
# (the -1 row is skipped), print the first ten (word, score) pairs.
shown_topics = [t for t in topic_info["Topic"].head(10) if t != -1]
for tid in shown_topics:
    print("Topic", tid, topic_model.get_topic(tid)[:10])

# Sample 5 docs from a topic
import numpy as np
def sample_topic(tid, k=5, seed=None):
    """
    Print up to ``k`` randomly chosen documents assigned to topic ``tid``.

    Reads the notebook-level ``topics`` (one topic id per document) and
    ``stressed_posts`` (the documents, indexed the same way).

    Args:
        tid: Topic id to sample from.
        k: Maximum number of documents to print; fewer are printed when the
            topic has fewer than ``k`` documents.
        seed: Optional seed for the random generator. Pass an int to get the
            same sample on every call; ``None`` (default) draws a fresh
            random sample each time.

    Returns:
        None. Each sampled document is printed, cut to its first 200
        characters and followed by "...".
    """
    rng = np.random.default_rng(seed)
    # Positions of the documents whose assigned topic equals `tid`.
    idx = np.where(np.array(topics) == tid)[0]
    # Sample without replacement so no document is shown twice.
    for i in rng.choice(idx, size=min(k, len(idx)), replace=False):
        print("-", stressed_posts[i][:200], "...\n")

Topic 0 [('and', np.float64(0.05264873776552635)), ('to', np.float64(0.0488812488960592)), ('the', np.float64(0.04246602995609705)), ('my', np.float64(0.04054899415204985)), ('of', np.float64(0.031035622148064563)), ('it', np.float64(0.029134535506475573)), ('in', np.float64(0.02425288967949876)), ('that', np.float64(0.02390682802249852)), ('im', np.float64(0.023332905139487114)), ('me', np.float64(0.02263502363168563))]
Topic 1 [('restless', np.float64(0.2268177727203945)), ('is', np.float64(0.05171397311796096)), ('the', np.float64(0.043257488140505895)), ('sleep', np.float64(0.03918209872235521)), ('to', np.float64(0.03896605847584167)), ('why', np.float64(0.037052601512180594)), ('you', np.float64(0.03646484522473232)), ('im', np.float64(0.0356290551262489)), ('restlessness', np.float64(0.03387201784387406)), ('restless and', np.float64(0.032630925234269595))]


In [78]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords

# English stop-word list from NLTK. Where the "stopwords" corpus has not been
# downloaded yet, stopwords.words() raises LookupError, so fetch the corpus
# once and retry instead of failing on a fresh environment.
try:
    stop_words = stopwords.words("english")
except LookupError:
    import nltk

    nltk.download("stopwords", quiet=True)
    stop_words = stopwords.words("english")

# Count unigrams and bigrams, ignoring the stop words above.
vectorizer = CountVectorizer(stop_words=stop_words, ngram_range=(1, 2))

In [79]:
from bertopic import BERTopic

# BERTopic fit using the custom stop-word-aware vectorizer.
# UMAP is stochastic: without a fixed random_state the topics change on every
# run. The UMAP settings below are the ones BERTopic uses by default, plus a
# seed (a fixed seed makes UMAP run single-threaded, so it is a little slower).
from umap import UMAP

topic_model = BERTopic(
    umap_model=UMAP(
        n_neighbors=15,
        n_components=5,
        min_dist=0.0,
        metric="cosine",
        random_state=42,
    ),
    # n_gram_range is not passed: BERTopic ignores it when a vectorizer_model
    # is supplied, and `vectorizer` already carries its own ngram_range.
    vectorizer_model=vectorizer,
    min_topic_size=100,      # try 50, 80, 100; pick the one that gives 8–12 coherent topics
    calculate_probabilities=True,
    verbose=True
)

topics, probs = topic_model.fit_transform(stressed_posts, embeddings)  # pass precomputed embeddings!
topic_info = topic_model.get_topic_info()
topic_info.head(10)

2025-09-03 13:57:01,200 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-09-03 13:57:02,456 - BERTopic - Dimensionality - Completed ✓
2025-09-03 13:57:02,456 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-09-03 13:57:02,672 - BERTopic - Cluster - Completed ✓
2025-09-03 13:57:02,674 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-09-03 13:57:03,371 - BERTopic - Representation - Completed ✓


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,110,-1_rabies_im_hiv_anxiety,"[rabies, im, hiv, anxiety, cat, got, water, ge...",[Anxious about Rabies even though there's no r...
1,0,5486,0_im_like_anxiety_ive,"[im, like, anxiety, ive, feel, dont, get, know...",[How I overcame health anxiety (read this is y...
2,1,323,1_restless_sleep_im_restlessness,"[restless, sleep, im, restlessness, restless r...","[Restless.. and.. restless.., Why are you so r..."


In [80]:
# Top words per topic: walk the first ten rows of topic_info and, for every
# topic other than -1, print its first ten (word, score) pairs.
for tid in topic_info["Topic"].head(10):
    if tid != -1:
        print("Topic", tid, topic_model.get_topic(tid)[:10])

Topic 0 [('im', np.float64(0.03709199565985355)), ('like', np.float64(0.024035741365322653)), ('anxiety', np.float64(0.021955521530779246)), ('ive', np.float64(0.01955641682698614)), ('feel', np.float64(0.01888903613904251)), ('dont', np.float64(0.01772381541377867)), ('get', np.float64(0.016975976508118062)), ('know', np.float64(0.015662768015663475)), ('time', np.float64(0.014688172254500831)), ('really', np.float64(0.013573663666862204))]
Topic 1 [('restless', np.float64(0.3288216465217983)), ('sleep', np.float64(0.07170521537813838)), ('im', np.float64(0.05403716125967268)), ('restlessness', np.float64(0.04785297428917061)), ('restless restless', np.float64(0.0445845328835058)), ('dont', np.float64(0.041073371824443136)), ('heart', np.float64(0.04021388477010722)), ('feeling', np.float64(0.039874161110571335)), ('like', np.float64(0.03542050737094046)), ('im restless', np.float64(0.0351703939661033))]


The topics are still not very coherent. Try a better-performing embedding model.

In [81]:
from sentence_transformers import SentenceTransformer

# Re-embed the posts with the all-mpnet-base-v2 model; this overwrites the
# earlier `sbert` and `embeddings` variables.
sbert = SentenceTransformer("all-mpnet-base-v2")
embeddings = sbert.encode(
    stressed_posts,
    show_progress_bar=True,
    normalize_embeddings=True,
)

Batches: 100%|██████████| 185/185 [01:41<00:00,  1.82it/s]


In [82]:
# BERTopic fit on the current `embeddings`, with default topic representation.
# UMAP is stochastic: without a fixed random_state the topics change on every
# run. The UMAP settings below are the ones BERTopic uses by default, plus a
# seed (a fixed seed makes UMAP run single-threaded, so it is a little slower).
from umap import UMAP

topic_model = BERTopic(
    umap_model=UMAP(
        n_neighbors=15,
        n_components=5,
        min_dist=0.0,
        metric="cosine",
        random_state=42,
    ),
    min_topic_size=100,      # try 50, 80, 100; pick the one that gives 8–12 coherent topics
    n_gram_range=(1, 2),
    calculate_probabilities=True,
    verbose=True
)

topics, probs = topic_model.fit_transform(stressed_posts, embeddings)  # pass precomputed embeddings!
topic_info = topic_model.get_topic_info()
topic_info.head(10)

2025-09-03 13:58:48,000 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-09-03 13:58:49,545 - BERTopic - Dimensionality - Completed ✓
2025-09-03 13:58:49,546 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-09-03 13:58:49,752 - BERTopic - Cluster - Completed ✓
2025-09-03 13:58:49,754 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-09-03 13:58:50,632 - BERTopic - Representation - Completed ✓


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,227,-1_the_and_to_my,"[the, and, to, my, it, of, in, was, that, but]",[Anxious about Rabies even though there's no r...
1,0,5103,0_and_to_the_my,"[and, to, the, my, of, it, in, that, im, me]",[Anxiety seriously holding me back. Any insigh...
2,1,589,1_restless_the_you_im,"[restless, the, you, im, nervous, is, why, to,...","[I'm so restless, I don't know why, Restless a..."


UMAP: squashes high-dimensional embeddings (384 or 768 dims) down to a smaller space (like 5–10 dims). This makes clustering easier and can emphasize local structure.

HDBSCAN: does the actual clustering on that reduced space.

In [83]:
from umap import UMAP
# Settings for the dimensionality-reduction step, gathered in one mapping and
# then unpacked into the UMAP constructor.
umap_settings = {
    "n_neighbors": 8,       # more local clusters
    "n_components": 10,     # preserve more info
    "min_dist": 0.05,
    "metric": "cosine",
    "random_state": 42,
}
umap_model = UMAP(**umap_settings)

In [84]:
import hdbscan
# Settings for the clustering step, gathered in one mapping and then unpacked
# into the HDBSCAN constructor.
hdbscan_settings = {
    "min_cluster_size": 30,             # start here; try 30–60
    "min_samples": 10,
    "metric": "euclidean",
    "cluster_selection_method": "eom",
    "prediction_data": True,            # REQUIRED for probabilities
}
hdbscan_model = hdbscan.HDBSCAN(**hdbscan_settings)

In [85]:
# Vectorizer for the topic representations: sklearn's built-in English stop
# list, unigrams + bigrams, and document-frequency bounds (min_df / max_df).
vectorizer = CountVectorizer(
    stop_words="english",
    ngram_range=(1, 2),
    min_df=5,
    max_df=0.3,
)

# Assemble BERTopic from the explicitly configured components defined above.
topic_model = BERTopic(
    verbose=True,
    calculate_probabilities=True,
    vectorizer_model=vectorizer,
    hdbscan_model=hdbscan_model,
    umap_model=umap_model,
    embedding_model=sbert,
)

# Fit on the posts, handing over the precomputed embeddings.
topics, probs = topic_model.fit_transform(stressed_posts, embeddings)
topic_info = topic_model.get_topic_info()
topic_info.head(20)

2025-09-03 13:58:50,780 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-09-03 13:58:54,842 - BERTopic - Dimensionality - Completed ✓
2025-09-03 13:58:54,843 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-09-03 13:58:55,085 - BERTopic - Cluster - Completed ✓
2025-09-03 13:58:55,087 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-09-03 13:58:55,447 - BERTopic - Representation - Completed ✓


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,2008,-1_ultrasound_kidney_liver_urine,"[ultrasound, kidney, liver, urine, hypochondri...",[Would you go to the doctor again? Alright. Th...
1,0,283,0_restlessness_want sleep_oh god_default,"[restlessness, want sleep, oh god, default, ag...","[What is the sign of restlessness -_-, Panic, ..."
2,1,274,1_ex_police_abuse_bf,"[ex, police, abuse, bf, abusive, door, loves, ...",[Extra: Apparently he was jailed before marryi...
3,2,244,2_cardiologist_pulse_beats_ecg,"[cardiologist, pulse, beats, ecg, ekg, shortne...",[Worried about diagnosis I've been having some...
4,3,219,3_zoloft_dose_lexapro_xanax,"[zoloft, dose, lexapro, xanax, pill, ssri, pro...",[Health Anxiety and Xanax My health anxiety in...
5,4,201,4_lymph_node_nodes_lymph node,"[lymph, node, nodes, lymph node, lymph nodes, ...",[Are swollen lymph nodes always something to w...
6,5,176,5_brain tumor_tumor_aneurysm_tinnitus,"[brain tumor, tumor, aneurysm, tinnitus, sneez...","[First it was lung disease, now it’s a brain t..."
7,6,173,6_ex_invited_friendship_messages,"[ex, invited, friendship, messages, dating, gi...",[Anxiety? Stress? Guilt? Scarred? I’ll try to ...
8,7,155,7_rent_bills_assistance_url,"[rent, bills, assistance, url, income, cost, s...","[>Like, I budget and we are responsible but th..."
9,8,140,8_abuse_abused_memories_sexually,"[abuse, abused, memories, sexually, sexual, ni...","[So, to start with I have ptsd from years of e..."


In [86]:
topic_info.shape

(38, 5)

In [87]:
# Top words per topic: print the full (word, score) list of every topic in
# topic_info except -1.
real_topics = topic_info.loc[topic_info["Topic"] != -1, "Topic"]
for tid in real_topics:
    print("Topic", tid, topic_model.get_topic(tid)[:])

Topic 0 [('restlessness', np.float64(0.3018701169115991)), ('want sleep', np.float64(0.09680581022536351)), ('oh god', np.float64(0.0605036313908522)), ('default', np.float64(0.05599260373229614)), ('agitated', np.float64(0.04533724144316514)), ('dancing', np.float64(0.04533724144316514)), ('lazy', np.float64(0.04340560356576938)), ('pls', np.float64(0.0427111901316542)), ('blah', np.float64(0.041335464586347605)), ('complain', np.float64(0.040158237438447694))]
Topic 1 [('ex', np.float64(0.040476444349575744)), ('police', np.float64(0.036263159049526185)), ('abuse', np.float64(0.035255470349810664)), ('bf', np.float64(0.03085301014029086)), ('abusive', np.float64(0.026033587970712185)), ('door', np.float64(0.024926727270324096)), ('loves', np.float64(0.02384061087733486)), ('violence', np.float64(0.02023101895060227)), ('dating', np.float64(0.01822484396714653)), ('court', np.float64(0.01786892126037358))]
Topic 2 [('cardiologist', np.float64(0.04139334425529853)), ('pulse', np.float6

## ^ The hyperparameters above need more tuning to get better topics!

In [106]:
# Second tuning round: same pipeline as before with a larger UMAP
# neighbourhood, a larger minimum cluster size and looser vectorizer
# document-frequency bounds. Imports are grouped at the top of the cell.
import hdbscan
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from umap import UMAP

umap_model = UMAP(
    n_neighbors=10,       # more local clusters
    n_components=10,     # preserve more info
    min_dist=0.05,
    metric="cosine",
    random_state=42
)

hdbscan_model = hdbscan.HDBSCAN(
    min_cluster_size=80,   # increase when you need fewer topics
    min_samples=10,
    metric="euclidean",
    cluster_selection_method="eom",
    prediction_data=True   # REQUIRED for probabilities
)

vectorizer = CountVectorizer(stop_words="english", ngram_range=(1,2), min_df=2, max_df=0.5)

topic_model = BERTopic(
    embedding_model=sbert,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer,
    calculate_probabilities=True,
    verbose=True
)
topics, probs = topic_model.fit_transform(stressed_posts, embeddings)  # pass precomputed embeddings!
topic_info = topic_model.get_topic_info()
# The bare `topic_info.head(20)` that used to sit here was removed: only the
# last expression of a cell is displayed, so it produced no output.

# Top words per topic
for tid in topic_info["Topic"]:
    if tid == -1:
        continue
    print("Topic", tid, topic_model.get_topic(tid)[:])  # list of (word, score)

2025-09-03 15:18:36,583 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-09-03 15:18:40,851 - BERTopic - Dimensionality - Completed ✓
2025-09-03 15:18:40,852 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-09-03 15:18:41,047 - BERTopic - Cluster - Completed ✓
2025-09-03 15:18:41,049 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-09-03 15:18:41,492 - BERTopic - Representation - Completed ✓


Topic 0 [('restless', np.float64(0.3881834339601408)), ('restlessness', np.float64(0.06889082193107779)), ('restless restless', np.float64(0.06784374448406616)), ('feeling restless', np.float64(0.048282579205958714)), ('feel restless', np.float64(0.04389381906339793)), ('heart restless', np.float64(0.030808897346353486)), ('sleep restless', np.float64(0.030808897346353486)), ('don worry', np.float64(0.030614793668932362)), ('oh god', np.float64(0.03004738913126938)), ('nervous don', np.float64(0.027665081501363743))]
Topic 1 [('lymph', np.float64(0.031781539706695755)), ('flu', np.float64(0.0265896843300256)), ('swollen', np.float64(0.024162704057952622)), ('lump', np.float64(0.02238521680119616)), ('node', np.float64(0.021611693168892762)), ('nodes', np.float64(0.019862489175609708)), ('lymph node', np.float64(0.019502312205036572)), ('lymph nodes', np.float64(0.017684686008414913)), ('fever', np.float64(0.015378807966367041)), ('cough', np.float64(0.014664437133846943))]
Topic 2 [('p

In [107]:
topic_info.head(20)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,2048,-1_rabies_hiv_social anxiety_reaction,"[rabies, hiv, social anxiety, reaction, smoked...",[Having anxiety and PTSD over being excluded f...
1,0,590,0_restless_restlessness_restless restless_feel...,"[restless, restlessness, restless restless, fe...","[Why are you so restless, why are you always r..."
2,1,424,1_lymph_flu_swollen_lump,"[lymph, flu, swollen, lump, node, nodes, lymph...",[Swollen lymph node in neck and groin for mont...
3,2,286,2_police_abuse_ex_bf,"[police, abuse, ex, bf, door, loves, met, viol...",[He's been violent pretty much since 6months i...
4,3,247,3_cardiologist_pulse_beats_ecg,"[cardiologist, pulse, beats, ecg, shortness br...","[Those of you that Beat Cardiophobia, I Could ..."
5,4,207,4_survey_app_meditation_mindfulness,"[survey, app, meditation, mindfulness, relaxat...",[[Repost] The Effectiveness of a 4-Week Online...
6,5,187,5_brain tumor_ear_aneurysm_tinnitus,"[brain tumor, ear, aneurysm, tinnitus, ears, s...",[Fear of Brain Aneurysm presenting as Headache...
7,6,182,6_grades_studying_classes_motivation,"[grades, studying, classes, motivation, physic...",[My physics class is wringing me out The more ...
8,7,182,7_rent_homeless_bills_paid,"[rent, homeless, bills, paid, assistance, inco...","[Long story short, the alternator + module + r..."
9,8,180,8_mg_zoloft_lexapro_dose,"[mg, zoloft, lexapro, dose, buspar, pill, ssri...",[Zoloft and Buspar Is anyone else on this comb...


In [108]:
# Merge the fitted topics down to 15. reduce_topics() updates the model in
# place, so re-read the per-document assignments afterwards; otherwise the
# notebook-level `topics` / `probs` returned by fit_transform() would still
# refer to the pre-reduction topic ids (sample_topic() reads `topics`).
topic_model.reduce_topics(stressed_posts, nr_topics=15)
topics = topic_model.topics_
probs = topic_model.probabilities_


2025-09-03 15:18:45,554 - BERTopic - Topic reduction - Reducing number of topics
2025-09-03 15:18:45,562 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-09-03 15:18:45,963 - BERTopic - Representation - Completed ✓
2025-09-03 15:18:45,965 - BERTopic - Topic reduction - Reduced number of topics from 21 to 15


<bertopic._bertopic.BERTopic at 0x396555b10>

In [109]:
# Refresh the topic table from the model, then print the full (word, score)
# list of every topic except -1.
topic_info = topic_model.get_topic_info()
real_topics = topic_info.loc[topic_info["Topic"] != -1, "Topic"]
for tid in real_topics:
    print("Topic", tid, topic_model.get_topic(tid)[:])

Topic 0 [('restlessness', np.float64(0.09086124888645698)), ('restless restless', np.float64(0.08929499577339291)), ('feeling restless', np.float64(0.063437073204608)), ('feel restless', np.float64(0.05763779437234563)), ('sleep restless', np.float64(0.04034258001859497)), ('heart restless', np.float64(0.04034258001859497)), ('don worry', np.float64(0.040338606710893614)), ('oh god', np.float64(0.03949963090925287)), ('nervous don', np.float64(0.036240823301123076)), ('want sleep', np.float64(0.03620799500014846))]
Topic 1 [('lymph', np.float64(0.028035185612398546)), ('swollen', np.float64(0.022303881746611243)), ('node', np.float64(0.019035425588186102)), ('nodes', np.float64(0.017282698514807886)), ('lymph node', np.float64(0.017181372146158887)), ('stool', np.float64(0.016281042086312102)), ('lymph nodes', np.float64(0.015365725553471642)), ('colon', np.float64(0.013603621750012186)), ('colon cancer', np.float64(0.012199469661919406)), ('lymphoma', np.float64(0.011095468099101577))