**IMPORT ALL LIBRARIES**

In [1]:
import pandas as pd
import numpy as np
import contractions
import re

from sentence_transformers import SentenceTransformer
import torch

from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer

from bertopic import BERTopic

import pickle
import joblib

from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora.dictionary import Dictionary
from sklearn.metrics.pairwise import cosine_similarity

  from .autonotebook import tqdm as notebook_tqdm


# **LOAD DATASET AND PREPROCESS**

In [2]:
df = pd.read_csv("medical_data.csv")
df

  df = pd.read_csv("medical_data.csv")


Unnamed: 0,user_id,username,time,rating,original_text,translated_text,final_with_stopword,final_no_stopword,ori_wc,final_wc,...,category_12,category_13,category_14,category_15,category_16,category_17,category_18,category_19,category_20,category_21
0,1.144028e+20,Sajida Dar,1583977664092,5.0,"Great people,great staff and great facility to...","Great people,great staff and great facility to...",great people great staff and great facility to...,great people great staff great facility go thu...,9,10,...,,,,,,,,,,
1,1.027817e+20,Tamara Walker,1604380776309,2.0,Security guard asked me to put a mask on my 13...,Security guard asked me to put a mask on my 13...,security guard ask me to put a mask on my 13 m...,security guard ask put mask 13 month old baby ...,65,68,...,,,,,,,,,,
2,1.147746e+20,Christina Veres,1560489213707,4.0,One of the better Cub foods that I have shoppe...,One of the better Cub foods that I have shoppe...,one of the good cub food that i have shop at e...,one good cub food shop everything need right e...,39,39,...,,,,,,,,,,
3,1.108501e+20,Keegan Leahy,1581384638564,1.0,Front of house has amazing staff.. Owner howev...,Front of house has amazing staff.. Owner howev...,front of house have amazing staff owner howeve...,front house amazing staff owner however lot gr...,51,51,...,,,,,,,,,,
4,1.106635e+20,Kyle Ebert,1592438884887,2.0,Came in early\nNo one was there except one per...,Came in early\nNo one was there except one per...,come in early no one be there except one perso...,come early no one except one person need simpl...,60,62,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
316339,1.020788e+20,Kevin Portee,1577891990033,5.0,I feel like a new person on every visit,I feel like a new person on every visit,i feel like a new person on every visit,feel like new person every visit,9,9,...,,,,,,,,,,
316340,1.143790e+20,James Rose,1522868195876,5.0,The aqua therapy I have been receiving there h...,The aqua therapy I have been receiving there h...,the aqua therapy i have be receive there have ...,aqua therapy receive helpful,12,12,...,,,,,,,,,,
316341,1.015381e+20,Maxwell Rosa,1540950220650,5.0,Had baby #1 last year in June. Staff was amaz...,Had baby #1 last year in June. Staff was amaz...,have baby 1 last year in june staff be amaze s...,baby 1 last year june staff amaze shoutout mid...,30,30,...,,,,,,,,,,
316342,1.028744e+20,Yolonda Taylor,1536770366671,1.0,I gave the staff my authorization number for m...,I gave the staff my authorization number for m...,i give the staff my authorization number for m...,give staff authorization number botox injectio...,116,119,...,,,,,,,,,,


**PREPROCES**

In [3]:
#EXPAND CONTRACTIONS
def expand_cont(text):
    if isinstance(text, str) and text.strip():
        try:
            return contractions.fix(text)
        except IndexError:
            return text
    else:
        return ""

df['cont_expanded'] = df['translated_text'].apply(expand_cont)
df

#CLEAN TEXT
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r"http\S+", "", text)            # Remove URLs
    text = re.sub(r"[^a-z\s]", "", text)           # Remove punctuation/numbers
    text = re.sub(r"\s+", " ", text).strip()       # Normalize whitespace
    return text

#apply preprocessing from the expanded contractions
df['clean_text'] = df['cont_expanded'].apply(preprocess_text)

#CALCULATE WORD COUNT
df['wc'] = df['clean_text'].str.split().str.len()
df

#DROP DUPLICATES
df = df.drop_duplicates(subset='clean_text').reset_index(drop=True)
df

#DROP LOW VALUE TEXT
df = df[df['wc'] >= 5 ].reset_index(drop=True)

In [30]:
df

Unnamed: 0,user_id,username,time,rating,original_text,translated_text,final_with_stopword,final_no_stopword,ori_wc,final_wc,...,category_16,category_17,category_18,category_19,category_20,category_21,cont_expanded,clean_text,wc,topic
0,1.144028e+20,Sajida Dar,1583977664092,5.0,"Great people,great staff and great facility to...","Great people,great staff and great facility to...",great people great staff and great facility to...,great people great staff great facility go thu...,9,10,...,,,,,,,"Great people,great staff and great facility to...",great peoplegreat staff and great facility to go,8,159
1,1.027817e+20,Tamara Walker,1604380776309,2.0,Security guard asked me to put a mask on my 13...,Security guard asked me to put a mask on my 13...,security guard ask me to put a mask on my 13 m...,security guard ask put mask 13 month old baby ...,65,68,...,,,,,,,Security guard asked me to put a mask on my 13...,security guard asked me to put a mask on my mo...,66,28
2,1.147746e+20,Christina Veres,1560489213707,4.0,One of the better Cub foods that I have shoppe...,One of the better Cub foods that I have shoppe...,one of the good cub food that i have shop at e...,one good cub food shop everything need right e...,39,39,...,,,,,,,One of the better Cub foods that I have shoppe...,one of the better cub foods that i have shoppe...,39,17
3,1.108501e+20,Keegan Leahy,1581384638564,1.0,Front of house has amazing staff.. Owner howev...,Front of house has amazing staff.. Owner howev...,front of house have amazing staff owner howeve...,front house amazing staff owner however lot gr...,51,51,...,,,,,,,Front of house has amazing staff.. Owner howev...,front of house has amazing staff owner however...,51,63
4,1.106635e+20,Kyle Ebert,1592438884887,2.0,Came in early\nNo one was there except one per...,Came in early\nNo one was there except one per...,come in early no one be there except one perso...,come early no one except one person need simpl...,60,62,...,,,,,,,Came in early\nNo one was there except one per...,came in early no one was there except one pers...,62,19
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
305846,1.020788e+20,Kevin Portee,1577891990033,5.0,I feel like a new person on every visit,I feel like a new person on every visit,i feel like a new person on every visit,feel like new person every visit,9,9,...,,,,,,,I feel like a new person on every visit,i feel like a new person on every visit,9,-1
305847,1.143790e+20,James Rose,1522868195876,5.0,The aqua therapy I have been receiving there h...,The aqua therapy I have been receiving there h...,the aqua therapy i have be receive there have ...,aqua therapy receive helpful,12,12,...,,,,,,,The aqua therapy I have been receiving there h...,the aqua therapy i have been receiving there h...,12,3
305848,1.015381e+20,Maxwell Rosa,1540950220650,5.0,Had baby #1 last year in June. Staff was amaz...,Had baby #1 last year in June. Staff was amaz...,have baby 1 last year in june staff be amaze s...,baby 1 last year june staff amaze shoutout mid...,30,30,...,,,,,,,Had baby #1 last year in June. Staff was amaz...,had baby last year in june staff was amazing s...,28,-1
305849,1.028744e+20,Yolonda Taylor,1536770366671,1.0,I gave the staff my authorization number for m...,I gave the staff my authorization number for m...,i give the staff my authorization number for m...,give staff authorization number botox injectio...,116,119,...,,,,,,,I gave the staff my authorization number for m...,i gave the staff my authorization number for m...,115,-1


# **LOAD SENTENCE EMBEDDING**

In [4]:
embeddings = np.load("E:/JUSTYN/JUSTYN/BERTOPIC/embeddings.npy")
texts = df['clean_text'].tolist()

In [5]:

np.save("positive_embeddings.npy", embeddings)
embeddings

array([[ 0.04224291, -0.00934046,  0.0281748 , ..., -0.01225406,
        -0.02216125, -0.013539  ],
       [ 0.00086574,  0.12689483, -0.01475409, ..., -0.04474968,
         0.0565543 ,  0.05384716],
       [ 0.04120927, -0.03191343, -0.02362255, ..., -0.02323264,
        -0.02680399, -0.00689324],
       ...,
       [-0.10180993, -0.09129896,  0.04284954, ..., -0.0484487 ,
        -0.01394621,  0.06315085],
       [-0.0371679 ,  0.010715  ,  0.04975103, ..., -0.04702248,
        -0.00412026, -0.00575238],
       [-0.06922076, -0.06632493,  0.02183732, ..., -0.07766668,
         0.01874336,  0.04496736]], dtype=float32)

# **DEFINE UMAP, HDBSCAN, VECTORIZER**

In [6]:
umap_model = UMAP(
    n_neighbors=20,
    n_components=5,
    min_dist=0.0,
    metric="cosine"
)

hdbscan_model = HDBSCAN(
    min_cluster_size=30,
    metric="euclidean",
    cluster_selection_method="eom",
    prediction_data=True  # Required for topic probabilities
)

vectorizer_model = CountVectorizer(ngram_range=(1, 2), stop_words="english")

In [7]:
embeddings

array([[ 0.04224291, -0.00934046,  0.0281748 , ..., -0.01225406,
        -0.02216125, -0.013539  ],
       [ 0.00086574,  0.12689483, -0.01475409, ..., -0.04474968,
         0.0565543 ,  0.05384716],
       [ 0.04120927, -0.03191343, -0.02362255, ..., -0.02323264,
        -0.02680399, -0.00689324],
       ...,
       [-0.10180993, -0.09129896,  0.04284954, ..., -0.0484487 ,
        -0.01394621,  0.06315085],
       [-0.0371679 ,  0.010715  ,  0.04975103, ..., -0.04702248,
        -0.00412026, -0.00575238],
       [-0.06922076, -0.06632493,  0.02183732, ..., -0.07766668,
         0.01874336,  0.04496736]], dtype=float32)

# **INITIALIZE AND FIT BERTOPIC**

In [8]:
# Reuse same embedding model or leave None since you have embeddings already
topic_model = BERTopic(
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    calculate_probabilities=True,
    verbose=True
)

# Use the same texts and loaded embeddings
topics, probs = topic_model.fit_transform(texts, embeddings)


2025-07-17 14:32:41,974 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-07-17 14:36:21,758 - BERTopic - Dimensionality - Completed ✓
2025-07-17 14:36:21,766 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-07-17 15:33:52,552 - BERTopic - Cluster - Completed ✓
2025-07-17 15:33:52,652 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-07-17 15:34:35,199 - BERTopic - Representation - Completed ✓


# **SAVE EVERYTHING**

**DATASET + CLEANING + ASSIGNED TOPICS**

In [9]:
df["topic"] = topics
df.to_csv("data_with_topic.csv", index=False)
df


Unnamed: 0,user_id,username,time,rating,original_text,translated_text,final_with_stopword,final_no_stopword,ori_wc,final_wc,...,category_16,category_17,category_18,category_19,category_20,category_21,cont_expanded,clean_text,wc,topic
0,1.144028e+20,Sajida Dar,1583977664092,5.0,"Great people,great staff and great facility to...","Great people,great staff and great facility to...",great people great staff and great facility to...,great people great staff great facility go thu...,9,10,...,,,,,,,"Great people,great staff and great facility to...",great peoplegreat staff and great facility to go,8,159
1,1.027817e+20,Tamara Walker,1604380776309,2.0,Security guard asked me to put a mask on my 13...,Security guard asked me to put a mask on my 13...,security guard ask me to put a mask on my 13 m...,security guard ask put mask 13 month old baby ...,65,68,...,,,,,,,Security guard asked me to put a mask on my 13...,security guard asked me to put a mask on my mo...,66,28
2,1.147746e+20,Christina Veres,1560489213707,4.0,One of the better Cub foods that I have shoppe...,One of the better Cub foods that I have shoppe...,one of the good cub food that i have shop at e...,one good cub food shop everything need right e...,39,39,...,,,,,,,One of the better Cub foods that I have shoppe...,one of the better cub foods that i have shoppe...,39,17
3,1.108501e+20,Keegan Leahy,1581384638564,1.0,Front of house has amazing staff.. Owner howev...,Front of house has amazing staff.. Owner howev...,front of house have amazing staff owner howeve...,front house amazing staff owner however lot gr...,51,51,...,,,,,,,Front of house has amazing staff.. Owner howev...,front of house has amazing staff owner however...,51,63
4,1.106635e+20,Kyle Ebert,1592438884887,2.0,Came in early\nNo one was there except one per...,Came in early\nNo one was there except one per...,come in early no one be there except one perso...,come early no one except one person need simpl...,60,62,...,,,,,,,Came in early\nNo one was there except one per...,came in early no one was there except one pers...,62,19
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
305846,1.020788e+20,Kevin Portee,1577891990033,5.0,I feel like a new person on every visit,I feel like a new person on every visit,i feel like a new person on every visit,feel like new person every visit,9,9,...,,,,,,,I feel like a new person on every visit,i feel like a new person on every visit,9,-1
305847,1.143790e+20,James Rose,1522868195876,5.0,The aqua therapy I have been receiving there h...,The aqua therapy I have been receiving there h...,the aqua therapy i have be receive there have ...,aqua therapy receive helpful,12,12,...,,,,,,,The aqua therapy I have been receiving there h...,the aqua therapy i have been receiving there h...,12,3
305848,1.015381e+20,Maxwell Rosa,1540950220650,5.0,Had baby #1 last year in June. Staff was amaz...,Had baby #1 last year in June. Staff was amaz...,have baby 1 last year in june staff be amaze s...,baby 1 last year june staff amaze shoutout mid...,30,30,...,,,,,,,Had baby #1 last year in June. Staff was amaz...,had baby last year in june staff was amazing s...,28,-1
305849,1.028744e+20,Yolonda Taylor,1536770366671,1.0,I gave the staff my authorization number for m...,I gave the staff my authorization number for m...,i give the staff my authorization number for m...,give staff authorization number botox injectio...,116,119,...,,,,,,,I gave the staff my authorization number for m...,i gave the staff my authorization number for m...,115,-1


**TOPIC INFO**

In [10]:
topic_info = topic_model.get_topic_info()
topic_info.to_csv("topic_info.csv", index=False)
topic_info

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,142616,-1_doctor_time_staff_care,"[doctor, time, staff, care, dr, did, just, app...",[this review is written in regards to my secon...
1,0,54006,0_dentist_dental_teeth_tooth,"[dentist, dental, teeth, tooth, cleaning, offi...",[i visited the easley office of aspen dental f...
2,1,6385,1_billing_insurance_pay_paid,"[billing, insurance, pay, paid, collections, p...",[this is not our usual medical clinic my daugh...
3,2,3393,2_covid_covid test_test_testing,"[covid, covid test, test, testing, covid testi...","[i do not drive how i get covid test, do not g..."
4,3,3231,3_therapy_physical therapy_physical_pt,"[therapy, physical therapy, physical, pt, shou...",[the staff is so friendly and welcoming i love...
...,...,...,...,...,...
342,341,30,341_delivery_deliver_shipping_delivery need,"[delivery, deliver, shipping, delivery need, g...",[across the board first time using delivery an...
343,342,30,342_prices need_decent price_need decent_price...,"[prices need, decent price, need decent, price...","[usually have what i need for a decent price, ..."
344,343,30,343_bugs_bed bugs_bed_ants,"[bugs, bed bugs, bed, ants, roaches, roach, cr...","[bed bugs are just what the doctor ordered, be..."
345,344,30,344_vape_vaping_vape shop_juice,"[vape, vaping, vape shop, juice, vape bar, vap...",[best vape store hands down staff is very know...


**TOPIC + KEYWORDS + C-TF-IDF SCORES EACH KEYWORD**

In [11]:
topic_info = topic_model.get_topic_info()
topic_ids = topic_info['Topic'].tolist()
#topic keywords + c-td-idf scores for each topic
topic_keywords = []

for topic_id in topic_ids:
    keywords_scores = topic_model.get_topic(topic_id)
    for word, score in keywords_scores:
        topic_keywords.append({
            "Topic": topic_id,
            "Keyword": word,
            "c-TF-IDF": score
        })

topic_keywords

#convert to df
df_keywords = pd.DataFrame(topic_keywords)
df_keywords.to_csv("topic_ctfidf.csv", index=False)
df_keywords


Unnamed: 0,Topic,Keyword,c-TF-IDF
0,-1,doctor,0.002670
1,-1,time,0.002438
2,-1,staff,0.002348
3,-1,care,0.002337
4,-1,dr,0.002229
...,...,...,...
3465,345,lunch dinner,0.044924
3466,345,dinner,0.040809
3467,345,lunch shop,0.040746
3468,345,lunch lunch,0.037828


**TOKENIZED TEXT FOR NPMI, DIVERSITY, EMBEDDING COHERENCE**

In [12]:
import pickle

texts_tokenized = [doc.split() for doc in df['clean_text']]
with open("tokenized_positive_text.pkl", "wb") as f:
    pickle.dump(texts_tokenized, f)

texts_tokenized

[['great', 'peoplegreat', 'staff', 'and', 'great', 'facility', 'to', 'go'],
 ['security',
  'guard',
  'asked',
  'me',
  'to',
  'put',
  'a',
  'mask',
  'on',
  'my',
  'month',
  'old',
  'baby',
  'when',
  'i',
  'said',
  'he',
  'is',
  'too',
  'little',
  'he',
  'is',
  'under',
  'two',
  'he',
  'raised',
  'his',
  'eyebrows',
  'and',
  'asked',
  'again',
  'for',
  'me',
  'to',
  'mask',
  'my',
  'baby',
  'please',
  'educate',
  'those',
  'enforcing',
  'policies',
  'so',
  'i',
  'do',
  'not',
  'have',
  'to',
  'defend',
  'a',
  'cdc',
  'policy',
  'which',
  'is',
  'cloth',
  'face',
  'coverings',
  'should',
  'not',
  'be',
  'placed',
  'on',
  'young',
  'children',
  'under',
  'age'],
 ['one',
  'of',
  'the',
  'better',
  'cub',
  'foods',
  'that',
  'i',
  'have',
  'shopped',
  'at',
  'everything',
  'you',
  'need',
  'is',
  'right',
  'there',
  'they',
  'even',
  'have',
  'a',
  'place',
  'to',
  'sit',
  'and',
  'relax',
  'while',
 

**MODEL**

In [13]:
# Save the model to a directory (not a single file)
topic_model.save(
    "model_safetensors_positive",
    serialization="safetensors",  # or "pytorch"
    save_ctfidf=True,
    save_embedding_model="sentence-transformers/all-MiniLM-L6-v2"
)


In [14]:
# Save the model to a directory (not a single file)
topic_model.save(
    "model_pytorch_positive",
    serialization="pytorch",  # or "pytorch"
    save_ctfidf=True,
    save_embedding_model="sentence-transformers/all-MiniLM-L6-v2"
)


# **INTERPRETATION**

**CHECK ONE TOPIC**

In [15]:
#topic X
topic_model.get_topic(103)

[('stitches', 0.055308712268208715),
 ('wound', 0.028235615926763512),
 ('finger', 0.02402035854140455),
 ('cut', 0.01840727944360265),
 ('needed stitches', 0.01046869628678356),
 ('stitch', 0.00899748451132552),
 ('cut finger', 0.008725137258461631),
 ('bleeding', 0.008157036285817465),
 ('stitched', 0.007618980297177249),
 ('glue', 0.007527651545079018)]

In [16]:
#print reviews with topic x
topic_x_reviews = df[df["topic"] == 103]
topic_x_reviews

Unnamed: 0,user_id,username,time,rating,original_text,translated_text,final_with_stopword,final_no_stopword,ori_wc,final_wc,...,category_16,category_17,category_18,category_19,category_20,category_21,cont_expanded,clean_text,wc,topic
286,1.056289e+20,Erica Smith,1563841217989,3.0,Only real big problem was I came in for a poss...,Only real big problem was I came in for a poss...,only real big problem be i come in for a possi...,real big problem come possible infection pract...,42,43,...,,,,,,,Only real big problem was I came in for a poss...,only real big problem was i came in for a poss...,43,103
493,1.044740e+20,Anthony Urgo,1543439777737,1.0,After having a biopsy done and the stiches rem...,After having a biopsy done and the stiches rem...,after have a biopsy do and the stiches remove ...,biopsy stiches remove scar come open week late...,161,163,...,,,,,,,After having a biopsy done and the stiches rem...,after having a biopsy done and the stiches rem...,161,103
514,1.120096e+20,Steve Kirkpatrick,1577074050203,5.0,I cut my two fingers on my left hand quite bad...,I cut my two fingers on my left hand quite bad...,i cut my two finger on my left hand quite badl...,cut two finger left hand quite badly tin roof ...,75,74,...,,,,,,,I cut my two fingers on my left hand quite bad...,i cut my two fingers on my left hand quite bad...,74,103
748,1.140708e+20,Chatkan Bunnag,1603225998609,5.0,Quick look at cut and stiches. Able to get ap...,Quick look at cut and stiches. Able to get ap...,quick look at cut and stiches able to get appo...,quick look cut stiches able get appointment im...,21,21,...,,,,,,,Quick look at cut and stiches. Able to get ap...,quick look at cut and stiches able to get appo...,21,103
1267,1.126040e+20,Zach Carrington,1569623707100,1.0,Worst experience I’ve ever had with an urgent ...,Worst experience I’ve ever had with an urgent ...,bad experience i have ever have with an urgent...,bad experience ever urgent care continuously t...,55,59,...,,,,,,,Worst experience I have ever had with an urgen...,worst experience i have ever had with an urgen...,59,103
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295873,1.043312e+20,patrick cote,1593182250585,1.0,cut through half my hand was bleeding through ...,cut through half my hand was bleeding through ...,cut through half my hand be bleed through a be...,cut half hand bleed beach towel tell get call ...,53,53,...,,,,,,,cut through half my hand was bleeding through ...,cut through half my hand was bleeding through ...,53,103
297182,1.032626e+20,Nicole Russell,1555670003246,3.0,Definitely a mixed bag. I have received EXCELL...,Definitely a mixed bag. I have received EXCELL...,definitely a mixed bag i have receive excellen...,definitely mixed bag receive excellent service...,101,102,...,,,,,,,Definitely a mixed bag. I have received EXCELL...,definitely a mixed bag i have received excelle...,99,103
301666,1.143566e+20,chari britnell,1557979048988,4.0,Only came once for staple removal & evaluation...,Only came once for staple removal & evaluation...,only come once for staple removal evaluation b...,come staple removal evaluation fellow andrew '...,20,20,...,,,,,,,Only came once for staple removal & evaluation...,only came once for staple removal evaluation b...,19,103
302455,1.011486e+20,Kaye Kelley,1616796481456,1.0,About late 1997 or early 2000 I went in to Dr ...,About late 1997 or early 2000 I went in to Dr ...,about late 1997 or early 2000 i go in to dr bu...,late 1997 early 2000 go dr budny come anesthes...,101,104,...,,,,,,,About late 1997 or early 2000 I went in to Dr ...,about late or early i went in to dr budny as i...,100,103


**BAR CHART FOR TOPIC'S TOP KEYWORDS**

In [31]:
fig1 = topic_model.visualize_barchart(top_n_topics=347)
fig1

In [32]:
fig1.write_html("A_BERTOPIC_NEG_barchart.html")

**TOPIC RELATIONSHIP = INTERTOPIC DISTANCE MAP**

In [19]:
fig2 = topic_model.visualize_topics()
fig2

In [20]:
fig2.write_html("A_BERTOPIC_NEG_distancemap.html")

**HIERARCHICAL CLUSTERING DENDOGRAM**

In [21]:
# Get total number of topics 
total_topics = len(topic_model.get_topics())
# Use all topics in hierarchy
fig3 = topic_model.visualize_hierarchy(top_n_topics=total_topics)
fig3.show()
fig3.write_html("A_BERTOPIC_NEG_hierarchy.html")

# **EVALUATION**

### **1. GENSIM - WITHOUT OUTLIERS**

In [22]:
from gensim.corpora.dictionary import Dictionary
from gensim.models.coherencemodel import CoherenceModel

In [23]:
# #prepare input
# texts_tokenized = [doc.split() for doc in df['clean_text']]

# #build dictionary and corpus
# dictionary = Dictionary(texts_tokenized)
# corpus = [dictionary.doc2bow(text) for text in texts_tokenized]

# #extract topic keywords
# #each topic = list of words
# topic_words = []
# for topic_id in topic_model.get_topic_info()['Topic']:
#     if topic_id == -1:
#         continue
#     words_scores = topic_model.get_topic(topic_id)
#     if words_scores:  # skip empty topic
#         topic_words.append([word for word, score in words_scores])
    
# #compute coherence
# coherence_model = CoherenceModel(
#     topics=topic_words,
#     texts=texts_tokenized,
#     dictionary=dictionary,
#     coherence='c_v' 
# )
# coherence_model2 = CoherenceModel(
#     topics=topic_words,
#     texts=texts_tokenized,
#     dictionary=dictionary,
#     coherence='u_mass' 
# )
# coherence_model3 = CoherenceModel(
#     topics=topic_words,
#     texts=texts_tokenized,
#     dictionary=dictionary,
#     coherence='c_uci' 
# )
# coherence_model4 = CoherenceModel(
#     topics=topic_words,
#     texts=texts_tokenized,
#     dictionary=dictionary,
#     coherence='c_npmi'  
# )

# coherence_score = coherence_model.get_coherence()
# coherence_score2 = coherence_model2.get_coherence()
# coherence_score3 = coherence_model3.get_coherence()
# coherence_score4 = coherence_model4.get_coherence()
# print("Coherence Score (c_v):", coherence_score)
# print("Coherence Score (u_mass):", coherence_score2)
# print("Coherence Score (c_uci):", coherence_score3)
# print("Coherence Score (c_npmi):", coherence_score4)


In [24]:
from gensim.models import Phrases
from gensim.models.phrases import Phraser
from gensim.corpora.dictionary import Dictionary
from gensim.models.coherencemodel import CoherenceModel

#tokenize documents
texts_tokenized = [doc.split() for doc in df['clean_text']]

#train bigram model
bigram_model = Phrases(texts_tokenized, min_count=3, threshold=10)
bigram_phraser = Phraser(bigram_model)

#apply bigram model to texts
texts_with_phrases = [bigram_phraser[doc] for doc in texts_tokenized]

#build dictionary and corpus 
dictionary = Dictionary(texts_with_phrases)
corpus = [dictionary.doc2bow(text) for text in texts_with_phrases]

#normalize topic words from bertopic
topic_words = []
for topic_id in topic_model.get_topic_info()['Topic']:
    if topic_id == -1:
        continue
    words_scores = topic_model.get_topic(topic_id)

    topic = []
    for word, _ in words_scores:
        # Replace spaces with underscores to match phrases
        word_token = word.replace(" ", "_")
        if word_token in dictionary.token2id:
            topic.append(word_token)

    if len(topic) >= 2:
        topic_words.append(topic)

print(f"Topics retained for coherence: {len(topic_words)}")

#COMPUTE COHERENCE
coherence_model = CoherenceModel(
    topics=topic_words,
    texts=texts_with_phrases,
    dictionary=dictionary,
    coherence='c_v'
)

coherence_model2 = CoherenceModel(
    topics=topic_words,
    texts=texts_with_phrases,
    dictionary=dictionary,
    corpus=corpus,
    coherence='u_mass'
)

coherence_model3 = CoherenceModel(
    topics=topic_words,
    texts=texts_with_phrases,
    dictionary=dictionary,
    coherence='c_uci'
)

coherence_model4 = CoherenceModel(
    topics=topic_words,
    texts=texts_with_phrases,
    dictionary=dictionary,
    coherence='c_npmi'
)

# Display results
print("Coherence Score (c_v):", coherence_model.get_coherence())
print("Coherence Score (u_mass):", coherence_model2.get_coherence())
print("Coherence Score (c_uci):", coherence_model3.get_coherence())
print("Coherence Score (c_npmi):", coherence_model4.get_coherence())



Topics retained for coherence: 345
Coherence Score (c_v): 0.6367987839843254
Coherence Score (u_mass): -4.605794512760808
Coherence Score (c_uci): 0.3245895965604759
Coherence Score (c_npmi): 0.09611192835624054


In [25]:
topic_words

[['dentist',
  'dental',
  'teeth',
  'tooth',
  'cleaning',
  'office',
  'dentists',
  'work',
  'hygienist',
  'dr'],
 ['billing',
  'insurance',
  'pay',
  'paid',
  'collections',
  'payment',
  'billing_department',
  'charged',
  'company',
  'bills'],
 ['covid',
  'covid_test',
  'test',
  'testing',
  'covid_testing',
  'results',
  'tested',
  'rapid',
  'rapid_covid',
  'hours'],
 ['therapy',
  'physical_therapy',
  'physical',
  'pt',
  'shoulder',
  'exercises',
  'therapist',
  'knee',
  'physical_therapist',
  'therapists'],
 ['chiropractic',
  'chiropractor',
  'chiropractors',
  'adjustments',
  'neck',
  'adjustment',
  'dr',
  'pain'],
 ['foot',
  'feet',
  'ankle',
  'podiatrist',
  'toe',
  'shoes',
  'foot_ankle',
  'ingrown',
  'orthotics',
  'inserts'],
 ['massage',
  'massages',
  'massage_therapist',
  'therapist',
  'relaxing',
  'deep_tissue',
  'couples',
  'tissue',
  'tip',
  'couples_massage'],
 ['professional',
  'friendly',
  'helpful',
  'people',
  '

1. c_v = 0 to 1
higher is better.
2. u_mass = negative values
closer to 0 is better.
3. c_uci = 0 to 1+
higher is better
4. c_npmi = -1 to 1
higher is better  
0 is neutral  
values above 0.1 are decent


**RESULT**  
1. c_v = moderate to good
2. u_mass = LOW. maybe bcs sparse data or poor word occurence
3. c_uci = moderate to good
topics are relatively coherent
4. c_npmi = barely acceptable
but 0.1 is a commonly used threshold in topic modeling

### **2. GENSIM - WITH OUTLIERS**

In [26]:
texts_tokenized

[['great', 'peoplegreat', 'staff', 'and', 'great', 'facility', 'to', 'go'],
 ['security',
  'guard',
  'asked',
  'me',
  'to',
  'put',
  'a',
  'mask',
  'on',
  'my',
  'month',
  'old',
  'baby',
  'when',
  'i',
  'said',
  'he',
  'is',
  'too',
  'little',
  'he',
  'is',
  'under',
  'two',
  'he',
  'raised',
  'his',
  'eyebrows',
  'and',
  'asked',
  'again',
  'for',
  'me',
  'to',
  'mask',
  'my',
  'baby',
  'please',
  'educate',
  'those',
  'enforcing',
  'policies',
  'so',
  'i',
  'do',
  'not',
  'have',
  'to',
  'defend',
  'a',
  'cdc',
  'policy',
  'which',
  'is',
  'cloth',
  'face',
  'coverings',
  'should',
  'not',
  'be',
  'placed',
  'on',
  'young',
  'children',
  'under',
  'age'],
 ['one',
  'of',
  'the',
  'better',
  'cub',
  'foods',
  'that',
  'i',
  'have',
  'shopped',
  'at',
  'everything',
  'you',
  'need',
  'is',
  'right',
  'there',
  'they',
  'even',
  'have',
  'a',
  'place',
  'to',
  'sit',
  'and',
  'relax',
  'while',
 

In [27]:
texts_with_phrases

[['great', 'peoplegreat', 'staff', 'and', 'great', 'facility', 'to', 'go'],
 ['security_guard',
  'asked',
  'me',
  'to',
  'put',
  'a',
  'mask',
  'on',
  'my',
  'month_old',
  'baby',
  'when',
  'i',
  'said',
  'he',
  'is',
  'too',
  'little',
  'he',
  'is',
  'under',
  'two',
  'he',
  'raised',
  'his',
  'eyebrows',
  'and',
  'asked',
  'again',
  'for',
  'me',
  'to',
  'mask',
  'my',
  'baby',
  'please',
  'educate',
  'those',
  'enforcing',
  'policies',
  'so',
  'i',
  'do',
  'not',
  'have',
  'to',
  'defend',
  'a',
  'cdc',
  'policy',
  'which',
  'is',
  'cloth',
  'face_coverings',
  'should',
  'not',
  'be',
  'placed',
  'on',
  'young_children',
  'under',
  'age'],
 ['one',
  'of',
  'the',
  'better',
  'cub_foods',
  'that',
  'i',
  'have',
  'shopped',
  'at',
  'everything',
  'you',
  'need',
  'is',
  'right',
  'there',
  'they',
  'even',
  'have',
  'a',
  'place',
  'to',
  'sit',
  'and',
  'relax',
  'while',
  'you',
  'eat',
  'or',


In [28]:
from gensim.models import Phrases
from gensim.models.phrases import Phraser
from gensim.corpora.dictionary import Dictionary
from gensim.models.coherencemodel import CoherenceModel

#tokenize documents
texts_tokenized = [doc.split() for doc in df['clean_text']]

#train bigram model
bigram_model = Phrases(texts_tokenized, min_count=3, threshold=10)
bigram_phraser = Phraser(bigram_model)

#apply bigram model to texts
texts_with_phrases = [bigram_phraser[doc] for doc in texts_tokenized]

#build dictionary and corpus 
dictionary = Dictionary(texts_with_phrases)
corpus = [dictionary.doc2bow(text) for text in texts_with_phrases]

#normalize topic words from bertopic
topic_words = []
for topic_id in topic_model.get_topic_info()['Topic']:
    words_scores = topic_model.get_topic(topic_id)

    topic = []
    for word, _ in words_scores:
        # Replace spaces with underscores to match phrases
        word_token = word.replace(" ", "_")
        if word_token in dictionary.token2id:
            topic.append(word_token)

    if len(topic) >= 2:
        topic_words.append(topic)

print(f"Topics retained for coherence: {len(topic_words)}")

#COMPUTE COHERENCE
coherence_model = CoherenceModel(
    topics=topic_words,
    texts=texts_with_phrases,
    dictionary=dictionary,
    coherence='c_v'
)

coherence_model2 = CoherenceModel(
    topics=topic_words,
    texts=texts_with_phrases,
    dictionary=dictionary,
    corpus=corpus,
    coherence='u_mass'
)

coherence_model3 = CoherenceModel(
    topics=topic_words,
    texts=texts_with_phrases,
    dictionary=dictionary,
    coherence='c_uci'
)

coherence_model4 = CoherenceModel(
    topics=topic_words,
    texts=texts_with_phrases,
    dictionary=dictionary,
    coherence='c_npmi'
)

# Display results
print("Coherence Score (c_v):", coherence_model.get_coherence())
print("Coherence Score (u_mass):", coherence_model2.get_coherence())
print("Coherence Score (c_uci):", coherence_model3.get_coherence())
print("Coherence Score (c_npmi):", coherence_model4.get_coherence())



Topics retained for coherence: 346
Coherence Score (c_v): 0.6359378851456836
Coherence Score (u_mass): -4.59753172103381
Coherence Score (c_uci): 0.3229996536400134
Coherence Score (c_npmi): 0.09576014602206412


### **3. TOPIC DIVERSITY**

In [29]:
def calculate_topic_diversity(topic_words):
    unique_words = set(word for topic in topic_words for word in topic)
    total_words = sum(len(topic) for topic in topic_words)
    return len(unique_words) / total_words

diversity_score = calculate_topic_diversity(topic_words)
print("Topic Diversity:", diversity_score)


Topic Diversity: 0.6238193018480492


# **HIERARCHICAL TOPIC REDUCTION**