In [107]:
import pandas as pd

# Loading the dataset
path = "/Users/gozde/code/g0zzy/stress_sense/raw_data/Data.csv"
data = pd.read_csv(path)

data.drop(columns=["Unnamed: 0"], inplace=True)
data.dropna(inplace=True)
data.drop_duplicates(inplace=True)

In [108]:
import re

def strip_urls(text: str) -> str:
    """
    Remove URLs (http, https, www, youtu links) from a string.
    """
    # remove http/https URLs
    text = re.sub(r"http\S+", "", text)
    # remove www.* URLs
    text = re.sub(r"www\.\S+", "", text)
    # remove youtube short links
    text = re.sub(r"youtu\.be\S+", "", text)
    return text.strip()

* Do not do any other preprocessing steps such as lowercasing,  removing stopwords, lemmatization, stemming, etc. as they might remove important context for the model to learn from. Just have to remove the URLs etc from data. 

* **Transformers are trained on raw(ish) text**

In [109]:
data.statement = data.statement.apply(lambda x: x.strip())

In [110]:
data.statement = data.statement.apply(strip_urls)

In [111]:
data.head(10)

Unnamed: 0,statement,status
0,oh my gosh,Anxiety
1,"trouble sleeping, confused mind, restless hear...",Anxiety
2,"All wrong, back off dear, forward doubt. Stay ...",Anxiety
3,I've shifted my focus to something else but I'...,Anxiety
4,"I'm restless and restless, it's been a month n...",Anxiety
5,"every break, you must be nervous, like somethi...",Anxiety
6,"I feel scared, anxious, what can I do? And may...",Anxiety
7,Have you ever felt nervous but didn't know why?,Anxiety
8,"I haven't slept well for 2 days, it's like I'm...",Anxiety
9,"I'm really worried, I want to cry.",Anxiety


In [112]:
data.shape

(51093, 2)

In [140]:
suicidal = data[data['status'].isin(['Suicidal'])].copy()
suicidal['statement'].to_list()[:5]


["I am so exhausted of this. Just when I think I can finally rest, just when I think maybe things are starting to settle, another hurdle comes flying at me. This month alone we found out my mum could be dying, my girlfriend left me, my parents revealed that they wanted a divorce, my grandad was hospitalised again and just now my little sister's been rushed to A&amp;E with possible brain damage. If there is a god up there they must fucking hate me. it is like life is trying to get me to kill myself and honestly I think I would be better off dead. I attempted when I was 12 but I was stupid and there was no way I could cut deep enough. Now I am 15 and everything is so much worse than it ever has been and I just cannot hold on much longer -- it is going to take a miracle to get me through this. I feel so alone. I feel like the world hates me and I have no idea what I did wrong to deserve this. I thought I was getting better. I was doing so well and now everything's just come crashing down 

In [None]:
new_df = data[data['status'].isin(['Stress', 'Anxiety'])].copy()
stressed_posts = new_df['statement'].to_list()

In [114]:
new_df.head(10)

Unnamed: 0,statement,status
0,oh my gosh,Anxiety
1,"trouble sleeping, confused mind, restless hear...",Anxiety
2,"All wrong, back off dear, forward doubt. Stay ...",Anxiety
3,I've shifted my focus to something else but I'...,Anxiety
4,"I'm restless and restless, it's been a month n...",Anxiety
5,"every break, you must be nervous, like somethi...",Anxiety
6,"I feel scared, anxious, what can I do? And may...",Anxiety
7,Have you ever felt nervous but didn't know why?,Anxiety
8,"I haven't slept well for 2 days, it's like I'm...",Anxiety
9,"I'm really worried, I want to cry.",Anxiety


In [115]:
new_df.shape

(5919, 2)

In [116]:
%pip install sentence-transformers

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


In [117]:
from sentence_transformers import SentenceTransformer

sbert = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = sbert.encode(stressed_posts, normalize_embeddings=True, show_progress_bar=True)

Batches: 100%|██████████| 185/185 [00:12<00:00, 14.55it/s]


In [118]:
embeddings

array([[ 0.03366001, -0.03712323,  0.01179676, ..., -0.05401971,
         0.04482312, -0.03369883],
       [ 0.01052009, -0.04578955,  0.04768023, ...,  0.01939357,
        -0.06008147, -0.02400911],
       [ 0.05667384, -0.06692987,  0.0137174 , ...,  0.01170356,
        -0.11973175, -0.05436602],
       ...,
       [ 0.07792854, -0.05168048,  0.05081963, ...,  0.04558662,
        -0.08271255,  0.00072075],
       [ 0.0443574 , -0.108092  ,  0.03242296, ..., -0.00546338,
        -0.0618789 , -0.00842193],
       [ 0.0619358 ,  0.0029512 ,  0.0452257 , ...,  0.00549195,
        -0.07962377,  0.04503339]], shape=(5919, 384), dtype=float32)

In [119]:
%pip install bertopic

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


In [120]:
from bertopic import BERTopic

topic_model = BERTopic(
    min_topic_size=100,      # try 50, 80, 100; pick the one that gives 8–12 coherent topics
    n_gram_range=(1, 2),
    calculate_probabilities=True,
    verbose=True
)

topics, probs = topic_model.fit_transform(stressed_posts, embeddings)  # pass precomputed embeddings!
topic_info = topic_model.get_topic_info()
topic_info.head(10)

2025-09-03 19:22:52,175 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-09-03 19:22:53,423 - BERTopic - Dimensionality - Completed ✓
2025-09-03 19:22:53,423 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-09-03 19:22:53,621 - BERTopic - Cluster - Completed ✓
2025-09-03 19:22:53,623 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-09-03 19:22:54,420 - BERTopic - Representation - Completed ✓


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,242,-1_the_and_my_to,"[the, and, my, to, it, of, was, that, in, but]",[I've been making a fool of myself for the pas...
1,0,5356,0_and_to_the_my,"[and, to, the, my, of, it, in, that, im, me]",[I'm mad and sulking over a single bad date. I...
2,1,321,1_restless_is_sleep_the,"[restless, is, sleep, the, to, im, why, you, r...","[restless and restless, want to be angry, Rest..."


In [121]:
# Top words per topic
for tid in topic_info["Topic"].head(10):
    if tid == -1:
        continue
    print("Topic", tid, topic_model.get_topic(tid)[:10])  # list of (word, score)

# Sample 5 docs from a topic
import numpy as np
def sample_topic(tid, k=5):
    idx = np.where(np.array(topics) == tid)[0]
    for i in np.random.choice(idx, size=min(k, len(idx)), replace=False):
        print("-", stressed_posts[i][:200], "...\n")

Topic 0 [('and', np.float64(0.0526925385143319)), ('to', np.float64(0.04894260017416556)), ('the', np.float64(0.04240761654491774)), ('my', np.float64(0.04052513037214601)), ('of', np.float64(0.0309790192657049)), ('it', np.float64(0.029104722976379113)), ('in', np.float64(0.024203234403244303)), ('that', np.float64(0.0238967856497673)), ('im', np.float64(0.02336078525772959)), ('me', np.float64(0.02270074327011312))]
Topic 1 [('restless', np.float64(0.21458831386464486)), ('is', np.float64(0.05232389159246527)), ('sleep', np.float64(0.04685726006492148)), ('the', np.float64(0.04220056505689587)), ('to', np.float64(0.03874799912409774)), ('im', np.float64(0.03719313784059267)), ('why', np.float64(0.0367177579103352)), ('you', np.float64(0.03517788178999008)), ('restlessness', np.float64(0.031701143938705885)), ('and', np.float64(0.031101837843728625))]


In [122]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords

stop_words = stopwords.words("english")
vectorizer = CountVectorizer(stop_words=stop_words, ngram_range=(1,2))

In [123]:
from bertopic import BERTopic

topic_model = BERTopic(
    vectorizer_model=vectorizer,
    min_topic_size=100,      # try 50, 80, 100; pick the one that gives 8–12 coherent topics
    n_gram_range=(1, 2),
    calculate_probabilities=True,
    verbose=True
)

topics, probs = topic_model.fit_transform(stressed_posts, embeddings)  # pass precomputed embeddings!
topic_info = topic_model.get_topic_info()
topic_info.head(10)

2025-09-03 19:22:54,517 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-09-03 19:22:55,717 - BERTopic - Dimensionality - Completed ✓
2025-09-03 19:22:55,718 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-09-03 19:22:55,909 - BERTopic - Cluster - Completed ✓
2025-09-03 19:22:55,910 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-09-03 19:22:56,572 - BERTopic - Representation - Completed ✓


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,127,-1_rabies_im_hiv_anxiety,"[rabies, im, hiv, anxiety, got, cat, get, wate...",[Anxious about Rabies even though there's no r...
1,0,5481,0_im_like_anxiety_ive,"[im, like, anxiety, ive, feel, dont, get, know...",[I’ve been reading this vape study I’m getting...
2,1,311,1_restless_sleep_im_restlessness,"[restless, sleep, im, restlessness, restless r...","[I'm so restless, why isn't it, very restless,..."


In [124]:
# Top words per topic
for tid in topic_info["Topic"].head(10):
    if tid == -1:
        continue
    print("Topic", tid, topic_model.get_topic(tid)[:10])  # list of (word, score)

Topic 0 [('im', np.float64(0.03703890823211782)), ('like', np.float64(0.024042901550566233)), ('anxiety', np.float64(0.021954514716800714)), ('ive', np.float64(0.019534611416293396)), ('feel', np.float64(0.01890170824998638)), ('dont', np.float64(0.017705148411988378)), ('get', np.float64(0.016986607527659323)), ('know', np.float64(0.015678192051054227)), ('time', np.float64(0.01468359990259021)), ('really', np.float64(0.013580544984070641))]
Topic 1 [('restless', np.float64(0.3457944175257168)), ('sleep', np.float64(0.06618405209307118)), ('im', np.float64(0.0546628484411642)), ('restlessness', np.float64(0.050685034909206596)), ('restless restless', np.float64(0.047223158835553496)), ('dont', np.float64(0.04350419834262731)), ('heart', np.float64(0.042593844660329956)), ('feeling', np.float64(0.04127415130743989)), ('im restless', np.float64(0.03725186725427661)), ('like', np.float64(0.03676644734586805))]


## Not great performance. Try a better performing model. 

In [125]:
from sentence_transformers import SentenceTransformer

sbert = SentenceTransformer("all-mpnet-base-v2")
embeddings = sbert.encode(stressed_posts, normalize_embeddings=True, show_progress_bar=True)

Batches: 100%|██████████| 185/185 [01:31<00:00,  2.03it/s]


In [126]:
topic_model = BERTopic(
    min_topic_size=100,      # try 50, 80, 100; pick the one that gives 8–12 coherent topics
    n_gram_range=(1, 2),
    calculate_probabilities=True,
    verbose=True
)

topics, probs = topic_model.fit_transform(stressed_posts, embeddings)  # pass precomputed embeddings!
topic_info = topic_model.get_topic_info()
topic_info.head(10)

2025-09-03 19:24:30,538 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-09-03 19:24:32,251 - BERTopic - Dimensionality - Completed ✓
2025-09-03 19:24:32,252 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-09-03 19:24:32,469 - BERTopic - Cluster - Completed ✓
2025-09-03 19:24:32,471 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-09-03 19:24:33,436 - BERTopic - Representation - Completed ✓


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,194,-1_the_and_to_my,"[the, and, to, my, it, of, in, was, that, but]",[Uncontrollable HIV anxiety New to Reddit so I...
1,0,5129,0_and_to_the_my,"[and, to, the, my, of, it, in, that, im, me]",[Pressure on head likely due to stress and anx...
2,1,596,1_restless_you_im_nervous,"[restless, you, im, nervous, the, is, why, to,...","[wake up restless, want to sleep restless, I'm..."


UMAP: squashes high-dimensional embeddings (384 or 768 dims) down to a smaller space (like 5–10 dims). This makes clustering easier and can emphasize local structure.

HDBSCAN: does the actual clustering on that reduced space.

In [127]:
from sklearn.feature_extraction.text import CountVectorizer
from bertopic import BERTopic

from umap import UMAP
umap_model = UMAP(
    n_neighbors=10,       # more local clusters
    n_components=10,     # preserve more info
    min_dist=0.05,
    metric="cosine",
    random_state=42
)

import hdbscan
hdbscan_model = hdbscan.HDBSCAN(
    min_cluster_size=80,   # increase when you need fewer topics
    min_samples=10,
    metric="euclidean",
    cluster_selection_method="eom",
    prediction_data=True   # REQUIRED for probabilities
)

vectorizer = CountVectorizer(stop_words="english", ngram_range=(1,2), min_df=2, max_df=0.5)

topic_model = BERTopic(
    embedding_model=sbert,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer,
    calculate_probabilities=True,
    verbose=True
)
topics, probs = topic_model.fit_transform(stressed_posts, embeddings)  # pass precomputed embeddings!
topic_info = topic_model.get_topic_info()
topic_info.head(20)

# Top words per topic
for tid in topic_info["Topic"]:
    if tid == -1:
        continue
    print("Topic", tid, topic_model.get_topic(tid)[:])  # list of (word, score)

2025-09-03 19:24:33,550 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-09-03 19:24:38,275 - BERTopic - Dimensionality - Completed ✓
2025-09-03 19:24:38,276 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-09-03 19:24:38,448 - BERTopic - Cluster - Completed ✓
2025-09-03 19:24:38,450 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-09-03 19:24:38,837 - BERTopic - Representation - Completed ✓


Topic 0 [('restless', np.float64(0.3881834339601408)), ('restlessness', np.float64(0.06889082193107779)), ('restless restless', np.float64(0.06784374448406616)), ('feeling restless', np.float64(0.048282579205958714)), ('feel restless', np.float64(0.04389381906339793)), ('heart restless', np.float64(0.030808897346353486)), ('sleep restless', np.float64(0.030808897346353486)), ('don worry', np.float64(0.030614793668932362)), ('oh god', np.float64(0.03004738913126938)), ('nervous don', np.float64(0.027665081501363743))]
Topic 1 [('lymph', np.float64(0.031781539706695755)), ('flu', np.float64(0.0265896843300256)), ('swollen', np.float64(0.024162704057952622)), ('lump', np.float64(0.02238521680119616)), ('node', np.float64(0.021611693168892762)), ('nodes', np.float64(0.019862489175609708)), ('lymph node', np.float64(0.019502312205036572)), ('lymph nodes', np.float64(0.017684686008414913)), ('fever', np.float64(0.015378807966367041)), ('cough', np.float64(0.014664437133846943))]
Topic 2 [('p

In [128]:
topic_info.head(20)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,2048,-1_rabies_hiv_social anxiety_reaction,"[rabies, hiv, social anxiety, reaction, smoked...",[Having anxiety and PTSD over being excluded f...
1,0,590,0_restless_restlessness_restless restless_feel...,"[restless, restlessness, restless restless, fe...","[Why are you so restless, why are you always r..."
2,1,424,1_lymph_flu_swollen_lump,"[lymph, flu, swollen, lump, node, nodes, lymph...",[Swollen lymph node in neck and groin for mont...
3,2,286,2_police_abuse_ex_bf,"[police, abuse, ex, bf, door, loves, met, viol...",[He's been violent pretty much since 6months i...
4,3,247,3_cardiologist_pulse_beats_ecg,"[cardiologist, pulse, beats, ecg, shortness br...","[Those of you that Beat Cardiophobia, I Could ..."
5,4,207,4_survey_app_meditation_mindfulness,"[survey, app, meditation, mindfulness, relaxat...",[[Repost] The Effectiveness of a 4-Week Online...
6,5,187,5_brain tumor_ear_aneurysm_tinnitus,"[brain tumor, ear, aneurysm, tinnitus, ears, s...",[Fear of Brain Aneurysm presenting as Headache...
7,6,182,6_grades_studying_classes_motivation,"[grades, studying, classes, motivation, physic...",[My physics class is wringing me out The more ...
8,7,182,7_rent_homeless_bills_paid,"[rent, homeless, bills, paid, assistance, inco...","[Long story short, the alternator + module + r..."
9,8,180,8_mg_zoloft_lexapro_dose,"[mg, zoloft, lexapro, dose, buspar, pill, ssri...",[Zoloft and Buspar Is anyone else on this comb...


In [129]:
info = topic_model.get_topic_info()
print(info[["Topic", "Count", "Name"]].head(20))

# Peek at top words for specific topics to decide merges
def show_top_words(tid, k=10):
    print(f"Topic {tid}:",
        ", ".join([w for w, _ in topic_model.get_topic(tid)[:k]]))

for tid in info["Topic"].tolist()[:]:
    if tid != -1:
        show_top_words(tid)

    Topic  Count                                               Name
0      -1   2048              -1_rabies_hiv_social anxiety_reaction
1       0    590  0_restless_restlessness_restless restless_feel...
2       1    424                           1_lymph_flu_swollen_lump
3       2    286                               2_police_abuse_ex_bf
4       3    247                     3_cardiologist_pulse_beats_ecg
5       4    207                4_survey_app_meditation_mindfulness
6       5    187                5_brain tumor_ear_aneurysm_tinnitus
7       6    182               6_grades_studying_classes_motivation
8       7    182                         7_rent_homeless_bills_paid
9       8    180                           8_mg_zoloft_lexapro_dose
10      9    175                          9_team_boss_manager_fired
11     10    165                          10_dating_ex_gift_cheated
12     11    159                11_abuse_abused_memories_flashbacks
13     12    146            12_ultrasound_radiat

In [130]:
health_help_groups = [[1, 3, 5, 8, 12, 14, 16, 17, 19]]

In [131]:
# Merge them one by one
topic_model.merge_topics(stressed_posts, health_help_groups)
# Inspect new topic distribution
info = topic_model.get_topic_info()
print(info[["Topic","Count","Name"]].head(20))

    Topic  Count                                               Name
0      -1   2048                     -1_rabies_hiv_amp x200b_throat
1       0   1593                      0_throat_lymph_heart rate_flu
2       1    590  1_restlessness_restless restless_feeling restl...
3       2    286                   2_violence_domestic_court_dating
4       3    207                3_survey_app_mindfulness_relaxation
5       4    182               4_feel stressed_physics_uni_homework
6       5    182                 5_homeless_assistance_income_loans
7       6    175                         6_boss_manager_fired_staff
8       7    165                     7_dating_gift_cheated_messages
9       8    159              8_abused_memories_flashbacks_sexually
10      9    126                9_ha_reassurance_googling_amp x200b
11     10    122          10_math_attempted_struggled_anxiety years
12     11     84  11_falling asleep_wake feeling_morning anxiety...


In [132]:
for tid in info["Topic"].tolist()[:]:
    if tid != -1:
        show_top_words(tid)

Topic 0: throat, lymph, heart rate, flu, lump, swollen, tumor, blood pressure, vision, infection
Topic 1: restlessness, restless restless, feeling restless, feel restless, don worry, heart restless, sleep restless, oh god, nervous don, anxious restless
Topic 2: violence, domestic, court, dating, charges, jewish, yelling, dr boyfriend, threw, abusive relationship
Topic 3: survey, app, mindfulness, relaxation, reduce, stress management, stress relief, participate, relieve, participants
Topic 4: feel stressed, physics, uni, homework, chronic stress, math, boring, soccer, hair loss, professor
Topic 5: homeless, assistance, income, loans, loan, url, save money, shelter, housing, savings
Topic 6: boss, manager, fired, staff, shift, tasks, role, quit job, covid, bob
Topic 7: dating, gift, cheated, messages, started dating, marriage, threatened, phone number, texts, blocked
Topic 8: abused, memories, flashbacks, sexually, sexual abuse, abuser, nightmares, assault, rape, molested
Topic 9: ha, r

In [133]:
trauma_groups = [[2, 8]]


In [134]:

topic_model.merge_topics(stressed_posts, trauma_groups)

info = topic_model.get_topic_info()
print(info[["Topic","Count","Name"]].head(20))
for tid in info["Topic"].tolist()[:]:
    if tid != -1:
        show_top_words(tid)

    Topic  Count                                               Name
0      -1   2048                     -1_rabies_hiv_amp x200b_throat
1       0   1593                      0_throat_lymph_heart rate_flu
2       1    590  1_restlessness_restless restless_feeling restl...
3       2    445                            2_ptsd_abused_ex_police
4       3    207                3_survey_app_mindfulness_relaxation
5       4    182               4_feel stressed_uni_physics_homework
6       5    182                 5_homeless_assistance_income_loans
7       6    175                         6_boss_manager_fired_shift
8       7    165                           7_dating_ex_gift_cheated
9       8    126                8_ha_reassurance_googling_amp x200b
10      9    122           9_math_attempted_struggled_anxiety years
11     10     84  10_falling asleep_wake feeling_morning anxiety...
Topic 0: throat, lymph, heart rate, flu, lump, swollen, tumor, blood pressure, vision, infection
Topic 1: restlessne

In [135]:
sleep_groups = [[1,9]]

topic_model.merge_topics(stressed_posts, sleep_groups)

# Inspect new topic distribution
topic_info = topic_model.get_topic_info()

In [136]:
for tid in topic_info["Topic"]:
    if tid == -1:
        continue
    print("Topic", tid, topic_model.get_topic(tid)[:])  # list of (word, score)

Topic 0 [('lymph', np.float64(0.011991598651887795)), ('heart rate', np.float64(0.011971441547386545)), ('flu', np.float64(0.010809776288831806)), ('lump', np.float64(0.009931959177136199)), ('swollen', np.float64(0.009774134380559806)), ('tumor', np.float64(0.009295749237480557)), ('blood pressure', np.float64(0.009250535758297526)), ('vision', np.float64(0.008721652237448073)), ('infection', np.float64(0.008666693550674419)), ('node', np.float64(0.008084904012726535))]
Topic 1 [('restlessness', np.float64(0.03377140080095969)), ('restless restless', np.float64(0.03310918042486818)), ('feeling restless', np.float64(0.023473107975496135)), ('feel restless', np.float64(0.02131294934592358)), ('sleep restless', np.float64(0.014868636231943903)), ('heart restless', np.float64(0.014868636231943903)), ('nervous don', np.float64(0.014699697244312316)), ('lazy', np.float64(0.011306879117966207)), ('anxious restless', np.float64(0.01125520929883891)), ('prayer', np.float64(0.01109344649423921)

In [None]:
topic_to_theme_dict = {
  1: "sleep_issues",
  7: "financial_stress",
  8: "work_stress",
  10: "academic_stress",
  0: "relationships",
  3: "health_anxiety", 
  # fallback:
  -1: "general_stress"
}