In [192]:
import pandas as pd

# Loading the dataset
path = "/Users/gozde/code/g0zzy/stress_sense/raw_data/Data.csv"
data = pd.read_csv(path)

data.drop(columns=["Unnamed: 0"], inplace=True)
data.dropna(inplace=True)
data.drop_duplicates(inplace=True)

In [193]:
import re

def strip_urls(text: str) -> str:
    """
    Remove URLs (http, https, www, youtu links) from a string.
    """
    # remove http/https URLs
    text = re.sub(r"http\S+", "", text)
    # remove www.* URLs
    text = re.sub(r"www\.\S+", "", text)
    # remove youtube short links
    text = re.sub(r"youtu\.be\S+", "", text)
    return text.strip()

In [194]:
META_TOKENS = {
    # search/meta
    "googling","google","search","searching","searched","searches",
    "stop googling","reassure","reassurance","module","thread","post","comment",
    "discord","reddit","subreddit","dm","pm","chat","server",
    # link/platform
    "http","https","www","url","link","links","imgur","tiktok","instagram",
    "youtube","youtu","yt","twitter","xcom","fb","facebook","app","apps",
}

_meta_regex = re.compile(
    r"(http\S+|www\.\S+|youtu\.be/\S+|youtube\.com/\S+)", flags=re.IGNORECASE
)

def clean_text(s: str) -> str:
    s = s.lower().strip()
    # remove urls
    s = _meta_regex.sub("", s)
    s = re.sub(r"\s+", " ", s)
    return s.strip()

def is_meta_heavy(s: str, threshold=2) -> bool:
    """Flag texts that are mostly meta/search/platform chatter."""
    low = s.lower()
    hits = sum(tok in low for tok in META_TOKENS)
    return hits >= threshold

* Do not do any other preprocessing steps such as lowercasing,  removing stopwords, lemmatization, stemming, etc. as they might remove important context for the model to learn from. Just have to remove the URLs etc from data. 

* **Transformers are trained on raw(ish) text**

In [195]:
#data.statement = data.statement.apply(lambda x: x.strip())

In [196]:
#data.statement = data.statement.apply(strip_urls)

In [197]:
cleaned_docs = []
for statement in data.statement.to_list():
    cleaned_text = clean_text(statement)
    if not is_meta_heavy(cleaned_text, threshold=3):   # drop very meta-heavy posts
        cleaned_docs.append(cleaned_text)

print(len(data.statement.to_list()), "→", len(cleaned_docs), "after meta filtering")

51093 → 48342 after meta filtering


In [198]:
data.head(10)

Unnamed: 0,statement,status
0,oh my gosh,Anxiety
1,"trouble sleeping, confused mind, restless hear...",Anxiety
2,"All wrong, back off dear, forward doubt. Stay ...",Anxiety
3,I've shifted my focus to something else but I'...,Anxiety
4,"I'm restless and restless, it's been a month n...",Anxiety
5,"every break, you must be nervous, like somethi...",Anxiety
6,"I feel scared, anxious, what can I do? And may...",Anxiety
7,Have you ever felt nervous but didn't know why?,Anxiety
8,"I haven't slept well for 2 days, it's like I'm...",Anxiety
9,"I'm really worried, I want to cry.",Anxiety


In [199]:
data.shape

(51093, 2)

In [200]:
suicidal = data[data['status'].isin(['Suicidal'])].copy()
suicidal['statement'].to_list()[:5]


["I am so exhausted of this. Just when I think I can finally rest, just when I think maybe things are starting to settle, another hurdle comes flying at me. This month alone we found out my mum could be dying, my girlfriend left me, my parents revealed that they wanted a divorce, my grandad was hospitalised again and just now my little sister's been rushed to A&amp;E with possible brain damage. If there is a god up there they must fucking hate me. it is like life is trying to get me to kill myself and honestly I think I would be better off dead. I attempted when I was 12 but I was stupid and there was no way I could cut deep enough. Now I am 15 and everything is so much worse than it ever has been and I just cannot hold on much longer -- it is going to take a miracle to get me through this. I feel so alone. I feel like the world hates me and I have no idea what I did wrong to deserve this. I thought I was getting better. I was doing so well and now everything's just come crashing down 

In [201]:
new_df = data[data['status'].isin(['Stress', 'Anxiety'])].copy()
stressed_posts = new_df['statement'].to_list()

In [202]:
new_df.head(10)

Unnamed: 0,statement,status
0,oh my gosh,Anxiety
1,"trouble sleeping, confused mind, restless hear...",Anxiety
2,"All wrong, back off dear, forward doubt. Stay ...",Anxiety
3,I've shifted my focus to something else but I'...,Anxiety
4,"I'm restless and restless, it's been a month n...",Anxiety
5,"every break, you must be nervous, like somethi...",Anxiety
6,"I feel scared, anxious, what can I do? And may...",Anxiety
7,Have you ever felt nervous but didn't know why?,Anxiety
8,"I haven't slept well for 2 days, it's like I'm...",Anxiety
9,"I'm really worried, I want to cry.",Anxiety


In [203]:
new_df.shape

(5919, 2)

In [204]:
%pip install sentence-transformers

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


In [205]:
from sentence_transformers import SentenceTransformer

sbert = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = sbert.encode(stressed_posts, normalize_embeddings=True, show_progress_bar=True)

Batches: 100%|██████████| 185/185 [00:17<00:00, 10.88it/s]


In [206]:
embeddings

array([[ 0.03366001, -0.03712323,  0.01179676, ..., -0.05401971,
         0.04482312, -0.03369883],
       [ 0.01052009, -0.04578955,  0.04768023, ...,  0.01939357,
        -0.06008147, -0.02400911],
       [ 0.05667384, -0.06692987,  0.0137174 , ...,  0.01170356,
        -0.11973175, -0.05436602],
       ...,
       [ 0.07792854, -0.05168048,  0.05081963, ...,  0.04558662,
        -0.08271255,  0.00072075],
       [ 0.0443574 , -0.108092  ,  0.03242296, ..., -0.00546338,
        -0.0618789 , -0.00842193],
       [ 0.0619358 ,  0.0029512 ,  0.0452257 , ...,  0.00549195,
        -0.07962377,  0.04503339]], shape=(5919, 384), dtype=float32)

In [207]:
%pip install bertopic

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


In [208]:
from bertopic import BERTopic

topic_model = BERTopic(
    min_topic_size=50,      # try 50, 80, 100; pick the one that gives 8–12 coherent topics
    n_gram_range=(1, 2),
    calculate_probabilities=True,
    verbose=True
)

topics, probs = topic_model.fit_transform(stressed_posts, embeddings)  # pass precomputed embeddings!
topic_info = topic_model.get_topic_info()
topic_info.head(10)

2025-09-04 15:52:35,933 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-09-04 15:52:37,401 - BERTopic - Dimensionality - Completed ✓
2025-09-04 15:52:37,402 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-09-04 15:52:37,559 - BERTopic - Cluster - Completed ✓
2025-09-04 15:52:37,561 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-09-04 15:52:38,417 - BERTopic - Representation - Completed ✓


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,80,-1_the_and_to_my,"[the, and, to, my, it, of, that, was, in, hiv]",[Ridiculously anxious about my teeth it’s maki...
1,0,5466,0_and_to_the_my,"[and, to, the, my, of, it, in, that, im, me]",[I am literally at my breaking point. I seriou...
2,1,314,1_restless_is_sleep_the,"[restless, is, sleep, the, to, why, im, you, r...","[Restless, Restless, Restless, Restless.. and...."
3,2,59,2_the_and_my_rabies,"[the, and, my, rabies, to, of, it, was, in, that]",[I believe Someone really has put a curse on m...


In [209]:
# Top words per topic
for tid in topic_info["Topic"].head(10):
    if tid == -1:
        continue
    print("Topic", tid, topic_model.get_topic(tid)[:10])  # list of (word, score)

# Sample 5 docs from a topic
import numpy as np
def sample_topic(tid, k=5):
    idx = np.where(np.array(topics) == tid)[0]
    for i in np.random.choice(idx, size=min(k, len(idx)), replace=False):
        print("-", stressed_posts[i][:200], "...\n")

Topic 0 [('and', np.float64(0.0478869893267478)), ('to', np.float64(0.04452461659268595)), ('the', np.float64(0.038862091178274966)), ('my', np.float64(0.03732715126261726)), ('of', np.float64(0.02864385321832984)), ('it', np.float64(0.027087122731897926)), ('in', np.float64(0.022526029556623202)), ('that', np.float64(0.022228361107371187)), ('im', np.float64(0.02176874575035048)), ('me', np.float64(0.021070882687500205))]
Topic 1 [('restless', np.float64(0.2113458310535256)), ('is', np.float64(0.048502355714489125)), ('sleep', np.float64(0.04139400346864417)), ('the', np.float64(0.03912077984257763)), ('to', np.float64(0.03636324873717884)), ('why', np.float64(0.03543248140155975)), ('im', np.float64(0.03479912101805159)), ('you', np.float64(0.03412982270795422)), ('restlessness', np.float64(0.031618917845023964)), ('restless and', np.float64(0.0314374657477079))]
Topic 2 [('the', np.float64(0.04994318745471983)), ('and', np.float64(0.04563111863818871)), ('my', np.float64(0.038783799

In [210]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords

stop_words = stopwords.words("english")
extra_stop_words = [
    # meta/search/platform
    "googling","google","search","searching","searched","reassure","reassurance",
    "module","thread","discord","reddit","subreddit","comment","post","posts",
    "link","links","url","http","https","www","imgur","youtube","youtu","tiktok",
    "instagram","facebook","fb","app","apps",
    # generic filler seen
    "dont","im","ive"
]

merged_stop_words = stop_words + extra_stop_words

vectorizer = CountVectorizer(
    stop_words=merged_stop_words,
    ngram_range=(1,2),
    min_df=0.02,     # appear in ≥ 10 docs
    max_df=0.5,    # appear in ≤ 50% of docs
    #If you see topics dominated by rare weird words, raise min_df
    #If you see topics dominated by generic words (feel, want, people, time), lower max_df
    token_pattern=r"(?u)\b[a-z][a-z]+\b"
)

In [211]:
from bertopic import BERTopic

topic_model = BERTopic(
    vectorizer_model=vectorizer,
    min_topic_size=80,      # try 50, 80, 100; pick the one that gives 8–12 coherent topics
    n_gram_range=(1, 2),
    calculate_probabilities=True,
    verbose=True
)

topics, probs = topic_model.fit_transform(stressed_posts, embeddings)  # pass precomputed embeddings!
topic_info = topic_model.get_topic_info()
topic_info.head(10)

2025-09-04 15:52:38,526 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-09-04 15:52:39,735 - BERTopic - Dimensionality - Completed ✓
2025-09-04 15:52:39,736 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-09-04 15:52:39,917 - BERTopic - Cluster - Completed ✓
2025-09-04 15:52:39,918 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-09-04 15:52:40,594 - BERTopic - Representation - Completed ✓


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,100,-1_hantavirus_mice_droppings_bats,"[hantavirus, mice, droppings, bats, incubation...",[Hantavirus - freaking out - vacuumed large am...
1,0,5506,0_panic attack_medication_eye_experienced,"[panic attack, medication, eye, experienced, h...",[Having anxiety and PTSD over being excluded f...
2,1,313,1_restless restless_restless heart_allah_calm ...,"[restless restless, restless heart, allah, cal...","[Restless, restless, anxious. Please be anxiou..."


In [212]:
# Top words per topic
for tid in topic_info["Topic"].head(10):
    if tid == -1:
        continue
    print("Topic", tid, topic_model.get_topic(tid)[:10])  # list of (word, score)

Topic 0 [('panic attack', np.float64(0.004457919903704992)), ('medication', np.float64(0.003607007866384821)), ('eye', np.float64(0.0034400655059576408)), ('experienced', np.float64(0.003129337570735564)), ('heart rate', np.float64(0.003100771515115259)), ('social', np.float64(0.0030864677232975406)), ('lump', np.float64(0.0026504954387945247)), ('exercise', np.float64(0.0025764458990417873)), ('effects', np.float64(0.0024420596330937084)), ('losing', np.float64(0.002411997239379119))]
Topic 1 [('restless restless', np.float64(0.14463960845940668)), ('restless heart', np.float64(0.06416690399619415)), ('allah', np.float64(0.04463986519832602)), ('calm restless', np.float64(0.03959643038625072)), ('restless cant', np.float64(0.03959643038625072)), ('restless want', np.float64(0.03447071540782703)), ('restless since', np.float64(0.03447071540782703)), ('restless every', np.float64(0.03447071540782703)), ('restless know', np.float64(0.03447071540782703)), ('like restless', np.float64(0.02

## Not great performance. Try a better performing model. 

In [213]:
from sentence_transformers import SentenceTransformer

sbert = SentenceTransformer("all-mpnet-base-v2") # better model than all-MiniLM-L6-v2 but slower
embeddings = sbert.encode(stressed_posts, normalize_embeddings=True, show_progress_bar=True)

Batches: 100%|██████████| 185/185 [01:39<00:00,  1.86it/s]


In [214]:
topic_model = BERTopic(
    min_topic_size=60,      # try 50, 80, 100; pick the one that gives 8–12 coherent topics
    n_gram_range=(1, 2),
    calculate_probabilities=True,
    verbose=True
)

topics, probs = topic_model.fit_transform(stressed_posts, embeddings)  # pass precomputed embeddings!
topic_info = topic_model.get_topic_info()
topic_info.head(10)

2025-09-04 15:54:23,367 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-09-04 15:54:25,063 - BERTopic - Dimensionality - Completed ✓
2025-09-04 15:54:25,068 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-09-04 15:54:25,261 - BERTopic - Cluster - Completed ✓
2025-09-04 15:54:25,264 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-09-04 15:54:26,240 - BERTopic - Representation - Completed ✓


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,234,-1_and_the_to_my,"[and, the, to, my, it, of, that, in, was, but]","[My story of HIV anxiety Hi there,\n\nFor the ..."
1,0,5107,0_and_to_the_my,"[and, to, the, my, of, it, in, that, im, me]","[A story about my anxiety in the recent past, ..."
2,1,578,1_restless_im_nervous_you,"[restless, im, nervous, you, why, the, is, to,...","[Restless, Restless, Restless, Why are you so ..."


UMAP: squashes high-dimensional embeddings (384 or 768 dims) down to a smaller space (like 5–10 dims). This makes clustering easier and can emphasize local structure.

HDBSCAN: does the actual clustering on that reduced space.

In [215]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords

stop_words = stopwords.words("english")
extra_stop_words = [
    # meta/search/platform
    "googling","google","search","searching","searched","reassure","reassurance",
    "module","thread","discord","reddit","subreddit","comment","post","posts",
    "link","links","url","http","https","www","imgur","youtube","youtu","tiktok",
    "instagram","facebook","fb","app","apps",
    # generic filler seen
    "dont","im","ive"
]

merged_stop_words = stop_words + extra_stop_words

vectorizer = CountVectorizer(
    stop_words=merged_stop_words,
    ngram_range=(1,2),
    min_df=0.02,     # appear in ≥ 2% docs
    max_df=0.5,    # appear in ≤ 50% of docs
    #If you see topics dominated by rare weird words, raise min_df
    #If you see topics dominated by generic words (feel, want, people, time), lower max_df
    token_pattern=r"(?u)\b[a-z][a-z]+\b"
)

In [226]:
from sklearn.feature_extraction.text import CountVectorizer
from bertopic import BERTopic

from umap import UMAP
umap_model = UMAP(
    n_neighbors=10,       # more local clusters
    n_components=10,     # preserve more info
    min_dist=0.05,
    metric="cosine",
    random_state=42
)

import hdbscan
hdbscan_model = hdbscan.HDBSCAN(
    min_cluster_size=80,   # increase when you need fewer topics
    min_samples=10,
    metric="euclidean",
    cluster_selection_method="eom",
    prediction_data=True   # REQUIRED for probabilities
)

topic_model = BERTopic(
    embedding_model=sbert,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer,
    calculate_probabilities=True,
    verbose=True
)
topics, probs = topic_model.fit_transform(stressed_posts, embeddings)  # pass precomputed embeddings!
topic_info = topic_model.get_topic_info()
topic_info.head(20)

# Top words per topic
for tid in topic_info["Topic"]:
    print("Topic", tid, topic_model.get_topic(tid)[:])  # list of (word, score)

2025-09-04 15:57:56,790 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-09-04 15:58:01,056 - BERTopic - Dimensionality - Completed ✓
2025-09-04 15:58:01,056 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-09-04 15:58:01,221 - BERTopic - Cluster - Completed ✓
2025-09-04 15:58:01,222 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-09-04 15:58:01,901 - BERTopic - Representation - Completed ✓


Topic -1 [('rabies', np.float64(0.0031039746872374677)), ('hiv', np.float64(0.002862695539161277)), ('infection', np.float64(0.0024127165432952283)), ('range', np.float64(0.0018019129313513636)), ('diabetes', np.float64(0.0017238729349380807)), ('appetite', np.float64(0.0016365145375209138)), ('fever', np.float64(0.0016017505104125936)), ('back pain', np.float64(0.0015656832877593897)), ('lump', np.float64(0.0015244158777571574)), ('dry', np.float64(0.00140176555089786))]
Topic 0 [('restless', np.float64(0.41270491105956775)), ('restless restless', np.float64(0.07327062845363908)), ('restlessness', np.float64(0.06868786108390416)), ('feeling restless', np.float64(0.0464796866032355)), ('feel restless', np.float64(0.044502388730168464)), ('sleep restless', np.float64(0.03815290255235048)), ('restless heart', np.float64(0.030893977819277132)), ('restless sleep', np.float64(0.02624397370377709)), ('heart restless', np.float64(0.025889774814537147)), ('want sleep', np.float64(0.02565610487

In [217]:
topic_info.head(20)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,2614,-1_rabies_hiv_infection_range,"[rabies, hiv, infection, range, diabetes, appe...","[Ex-Smoker, Fear, and Excessive Research Hi ev..."
1,0,287,0_restless_restless restless_restlessness_feel...,"[restless, restless restless, restlessness, fe...","[Why are you so restless, very restless, I'm s..."
2,1,269,1_lymph_flu_swollen_node,"[lymph, flu, swollen, node, nodes, lymph node,...","[Swollen lymph node, rash, dermatits and surpr..."
3,2,243,2_cardiologist_pulse_beats_ekg,"[cardiologist, pulse, beats, ekg, ecg, shortne...","[Mitral valve prolapse, anxiety or something e..."
4,3,232,3_abuse_ex_police_loves,"[abuse, ex, police, loves, bf, court, screamin...",[I obsess about this in my head 24/7 and even ...
5,4,231,4_zoloft_side effects_dose_mg,"[zoloft, side effects, dose, mg, xanax, buspar...",[Anxiety med. that's not an antidepressant? He...
6,5,228,5_survey_meditation_management_stress management,"[survey, meditation, management, stress manage...",[[Repost] The Effectiveness of a 4-Week Online...
7,6,227,6_nervous nervous_tbtb_nervous worried_like ne...,"[nervous nervous, tbtb, nervous worried, like ...",[Ado is nervous because he failed to damage th...
8,7,205,7_tumor_ultrasound_blood test_radiation,"[tumor, ultrasound, blood test, radiation, hyp...",[How I overcame health anxiety (read this is y...
9,8,162,8_team_boss_manager_business,"[team, boss, manager, business, staff, mistake...",[My Workplace Makes Me Anxious So I’m not too ...


In [218]:
info = topic_model.get_topic_info()
print(info[["Topic", "Count", "Name"]].head(20))

# Peek at top words for specific topics to decide merges
def show_top_words(tid, k=10):
    print(f"Topic {tid}:",
        ", ".join([w for w, _ in topic_model.get_topic(tid)[:k]]))

for tid in info["Topic"].tolist()[:]:
    if tid != -1:
        show_top_words(tid)

    Topic  Count                                               Name
0      -1   2614                      -1_rabies_hiv_infection_range
1       0    287  0_restless_restless restless_restlessness_feel...
2       1    269                           1_lymph_flu_swollen_node
3       2    243                     2_cardiologist_pulse_beats_ekg
4       3    232                            3_abuse_ex_police_loves
5       4    231                      4_zoloft_side effects_dose_mg
6       5    228   5_survey_meditation_management_stress management
7       6    227  6_nervous nervous_tbtb_nervous worried_like ne...
8       7    205            7_tumor_ultrasound_blood test_radiation
9       8    162                       8_team_boss_manager_business
10      9    162              9_tumor_brain tumor_aneurysm_floaters
11     10    151                10_suicidal_suicide_feeling way_bpd
12     11    148                   11_grades_studying_classes_grade
13     12    143                 12_rent_homeles

In [219]:
health_help_groups = [[1, 3, 5, 8, 12, 13, 14, 16, 17]]

In [220]:
# Merge them one by one
topic_model.merge_topics(stressed_posts, health_help_groups)
# Inspect new topic distribution
info = topic_model.get_topic_info()
print(info[["Topic","Count","Name"]].head(20))

    Topic  Count                                               Name
0      -1   2614                        -1_rabies_hiv_boyfriend_sex
1       0   1530                      0_lymph_node_nodes_lymph node
2       1    287  1_restless restless_restlessness_feeling restl...
3       2    243                       2_cardiologist_beats_ecg_bpm
4       3    231                       3_buspar_ssris_dosage_prozac
5       4    227  4_nervous nervous_tbtb_nervous worried_like ne...
6       5    205                    5_radiation_pcos_docs_pregnancy
7       6    162                 6_floaters_sneeze_glasses_left eye
8       7    151          7_suicide_feeling way_bpd_causes symptoms
9       8    148                  8_grades_classes_grade_motivation
10      9    121                 9_als_left hand_thumb_pins needles


In [221]:
for tid in info["Topic"].tolist()[:]:
    if tid != -1:
        show_top_words(tid)

Topic 0: lymph, node, nodes, lymph node, abuse, lymph nodes, boyfriend, colon, survey, colon cancer
Topic 1: restless restless, restlessness, feeling restless, feel restless, sleep restless, restless heart, restless sleep, heart restless, want sleep, getting restless
Topic 2: cardiologist, beats, ecg, bpm, side chest, holter, stress test, like heart, resting, anxiety heart
Topic 3: buspar, ssris, dosage, prozac, benzos, lorazepam, propranolol, pills, ibuprofen, hydroxyzine
Topic 4: nervous nervous, tbtb, nervous worried, like nervous, oh god, sincere, worried know, nervous like, anxiously, worried worried
Topic 5: radiation, pcos, docs, pregnancy, gov au, health wa, wa gov, cci, cci health, au
Topic 6: floaters, sneeze, glasses, left eye, side head, head pain, blind, screen, visual, double vision
Topic 7: suicide, feeling way, bpd, causes symptoms, understanding anxiety, anxiety anxiety, showering, years later, stop feeling, commit suicide
Topic 8: grades, classes, grade, motivation, h

In [222]:
trauma_groups = [[2, 8]]


In [223]:

topic_model.merge_topics(stressed_posts, trauma_groups)

info = topic_model.get_topic_info()
print(info[["Topic","Count","Name"]].head(20))
for tid in info["Topic"].tolist()[:]:
    if tid != -1:
        show_top_words(tid)

   Topic  Count                                               Name
0     -1   2614                      -1_rabies_hiv_boyfriend_range
1      0   1530                      0_lymph_node_nodes_lymph node
2      1    391                   1_cardiologist_beats_ecg_monitor
3      2    287  2_restless restless_restlessness_feeling restl...
4      3    231                       3_buspar_ssris_dosage_prozac
5      4    227  4_nervous nervous_tbtb_nervous worried_like ne...
6      5    205                     5_radiation_pregnant_pcos_docs
7      6    162                 6_floaters_sneeze_glasses_left eye
8      7    151          7_suicide_feeling way_bpd_causes symptoms
9      8    121                 8_als_left hand_thumb_pins needles
Topic 0: lymph, node, nodes, lymph node, abuse, lymph nodes, boyfriend, colon, survey, colon cancer
Topic 1: cardiologist, beats, ecg, monitor, grades, bpm, side chest, holter, stress test, classes
Topic 2: restless restless, restlessness, feeling restless, feel 

In [224]:
sleep_groups = [[1,9]]

topic_model.merge_topics(stressed_posts, sleep_groups)

# Inspect new topic distribution
topic_info = topic_model.get_topic_info()

IndexError: index 10 is out of bounds for axis 0 with size 10

In [None]:
for tid in topic_info["Topic"]:
    if tid == -1:
        continue
    print("Topic", tid, topic_model.get_topic(tid)[:])  # list of (word, score)

Topic 0 [('ha', np.float64(0.006488729250713299)), ('lymph', np.float64(0.005710478748406089)), ('heart rate', np.float64(0.005468879929905738)), ('flu', np.float64(0.005082835045853778)), ('lump', np.float64(0.004646079169779355)), ('swollen', np.float64(0.004590839339444098)), ('tumor', np.float64(0.004520149935529683)), ('blood pressure', np.float64(0.00424892128765092)), ('red', np.float64(0.004118251117690768)), ('infection', np.float64(0.004065827508212995))]
Topic 1 [('restlessness', np.float64(0.018060195461820458)), ('restless restless', np.float64(0.014889907924740067)), ('feeling restless', np.float64(0.011994490648297287)), ('feel restless', np.float64(0.011509174592726497)), ('sleep restless', np.float64(0.008706753362553383)), ('oh god', np.float64(0.007486630750168906)), ('restless heart', np.float64(0.006983866231717633)), ('dementia', np.float64(0.006653927123022298)), ('nervous restless', np.float64(0.006366187785901057)), ('tbtb', np.float64(0.006366187785901057))]
T

In [None]:
topic_to_theme_dict = {
    1: "sleep_issues",
    7: "financial_stress",
    8: "work_stress",
    10: "academic_stress",
    0: "relationships",
    3: "health_anxiety",
    # fallback:
    -1: "general_stress"
}