In [1]:
import pandas as pd
import numpy as np
import contractions
import re

from sentence_transformers import SentenceTransformer
import torch

from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer

from bertopic import BERTopic

import pickle
import joblib

from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora.dictionary import Dictionary
from sklearn.metrics.pairwise import cosine_similarity

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df = pd.read_csv("medical_data.csv")
df

  df = pd.read_csv("medical_data.csv")


Unnamed: 0,user_id,username,time,rating,original_text,translated_text,final_with_stopword,final_no_stopword,ori_wc,final_wc,...,category_12,category_13,category_14,category_15,category_16,category_17,category_18,category_19,category_20,category_21
0,1.144028e+20,Sajida Dar,1583977664092,5.0,"Great people,great staff and great facility to...","Great people,great staff and great facility to...",great people great staff and great facility to...,great people great staff great facility go thu...,9,10,...,,,,,,,,,,
1,1.027817e+20,Tamara Walker,1604380776309,2.0,Security guard asked me to put a mask on my 13...,Security guard asked me to put a mask on my 13...,security guard ask me to put a mask on my 13 m...,security guard ask put mask 13 month old baby ...,65,68,...,,,,,,,,,,
2,1.147746e+20,Christina Veres,1560489213707,4.0,One of the better Cub foods that I have shoppe...,One of the better Cub foods that I have shoppe...,one of the good cub food that i have shop at e...,one good cub food shop everything need right e...,39,39,...,,,,,,,,,,
3,1.108501e+20,Keegan Leahy,1581384638564,1.0,Front of house has amazing staff.. Owner howev...,Front of house has amazing staff.. Owner howev...,front of house have amazing staff owner howeve...,front house amazing staff owner however lot gr...,51,51,...,,,,,,,,,,
4,1.106635e+20,Kyle Ebert,1592438884887,2.0,Came in early\nNo one was there except one per...,Came in early\nNo one was there except one per...,come in early no one be there except one perso...,come early no one except one person need simpl...,60,62,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
316339,1.020788e+20,Kevin Portee,1577891990033,5.0,I feel like a new person on every visit,I feel like a new person on every visit,i feel like a new person on every visit,feel like new person every visit,9,9,...,,,,,,,,,,
316340,1.143790e+20,James Rose,1522868195876,5.0,The aqua therapy I have been receiving there h...,The aqua therapy I have been receiving there h...,the aqua therapy i have be receive there have ...,aqua therapy receive helpful,12,12,...,,,,,,,,,,
316341,1.015381e+20,Maxwell Rosa,1540950220650,5.0,Had baby #1 last year in June. Staff was amaz...,Had baby #1 last year in June. Staff was amaz...,have baby 1 last year in june staff be amaze s...,baby 1 last year june staff amaze shoutout mid...,30,30,...,,,,,,,,,,
316342,1.028744e+20,Yolonda Taylor,1536770366671,1.0,I gave the staff my authorization number for m...,I gave the staff my authorization number for m...,i give the staff my authorization number for m...,give staff authorization number botox injectio...,116,119,...,,,,,,,,,,


In [3]:
#EXPAND CONTRACTIONS
def expand_cont(text):
    if isinstance(text, str) and text.strip():
        try:
            return contractions.fix(text)
        except IndexError:
            return text
    else:
        return ""

df['cont_expanded'] = df['translated_text'].apply(expand_cont)
df

#CLEAN TEXT
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r"http\S+", "", text)            # Remove URLs
    text = re.sub(r"[^a-z\s]", "", text)           # Remove punctuation/numbers
    text = re.sub(r"\s+", " ", text).strip()       # Normalize whitespace
    return text

#apply preprocessing from the expanded contractions
df['clean_text'] = df['cont_expanded'].apply(preprocess_text)

#CALCULATE WORD COUNT
df['wc'] = df['clean_text'].str.split().str.len()
df

#DROP DUPLICATES
df = df.drop_duplicates(subset='clean_text').reset_index(drop=True)
df

#DROP LOW VALUE TEXT
df = df[df['wc'] >= 5 ].reset_index(drop=True)

## **GENERATE SENTENCE EMBEDDING**

In [4]:
from sentence_transformers import SentenceTransformer
import torch

# Check if CUDA is available
device = "cuda" if torch.cuda.is_available() else "cpu"

print(f"Using device: {device}")

# Load and move the model to GPU
embedding_model = SentenceTransformer("all-MiniLM-L6-v2", device=device)

# Ensure input is a list of strings
texts_cleaned = df['clean_text'].tolist()

# Encode on GPU
embeddings = embedding_model.encode(
    texts_cleaned,
    show_progress_bar=True,
    batch_size=256,
    normalize_embeddings=True  # Helps UMAP clustering
)


Using device: cuda


Batches: 100%|██████████| 1195/1195 [08:51<00:00,  2.25it/s]


In [5]:
embeddings

array([[ 0.04224291, -0.00934046,  0.0281748 , ..., -0.01225406,
        -0.02216125, -0.013539  ],
       [ 0.00086574,  0.12689483, -0.01475409, ..., -0.04474968,
         0.0565543 ,  0.05384716],
       [ 0.04120927, -0.03191343, -0.02362255, ..., -0.02323264,
        -0.02680399, -0.00689324],
       ...,
       [-0.10180993, -0.09129896,  0.04284954, ..., -0.0484487 ,
        -0.01394621,  0.06315085],
       [-0.0371679 ,  0.010715  ,  0.04975103, ..., -0.04702248,
        -0.00412026, -0.00575238],
       [-0.06922076, -0.06632493,  0.02183732, ..., -0.07766668,
         0.01874336,  0.04496736]], dtype=float32)

# **DEFINE UMAP, HDBSCAN, VECTORIZER**


In [6]:
umap_model = UMAP(
    n_neighbors=30, 
    n_components=5, 
    min_dist=0.2, 
    metric='cosine')

hdbscan_model = HDBSCAN(
    min_cluster_size=50, 
    metric='euclidean', 
    cluster_selection_method="eom",
    prediction_data=True)

vectorizer_model = CountVectorizer(
    ngram_range=(1, 2), 
    stop_words="english")


# **INITIALIZE AND FIT BERTOPIC**

In [7]:

# Reuse same embedding model or leave None since you have embeddings already
topic_model = BERTopic(
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    calculate_probabilities=True,
    verbose=True
)

# Use the same texts and loaded embeddings
topics, probs = topic_model.fit_transform(texts_cleaned, embeddings)

2025-07-17 11:29:44,723 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-07-17 11:33:45,272 - BERTopic - Dimensionality - Completed ✓
2025-07-17 11:33:45,279 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-07-17 11:46:15,593 - BERTopic - Cluster - Completed ✓
2025-07-17 11:46:15,662 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-07-17 11:46:57,175 - BERTopic - Representation - Completed ✓


## **SAVE EVERYTHING**

In [8]:
df["topic"] = topics
df

Unnamed: 0,user_id,username,time,rating,original_text,translated_text,final_with_stopword,final_no_stopword,ori_wc,final_wc,...,category_16,category_17,category_18,category_19,category_20,category_21,cont_expanded,clean_text,wc,topic
0,1.144028e+20,Sajida Dar,1583977664092,5.0,"Great people,great staff and great facility to...","Great people,great staff and great facility to...",great people great staff and great facility to...,great people great staff great facility go thu...,9,10,...,,,,,,,"Great people,great staff and great facility to...",great peoplegreat staff and great facility to go,8,46
1,1.027817e+20,Tamara Walker,1604380776309,2.0,Security guard asked me to put a mask on my 13...,Security guard asked me to put a mask on my 13...,security guard ask me to put a mask on my 13 m...,security guard ask put mask 13 month old baby ...,65,68,...,,,,,,,Security guard asked me to put a mask on my 13...,security guard asked me to put a mask on my mo...,66,31
2,1.147746e+20,Christina Veres,1560489213707,4.0,One of the better Cub foods that I have shoppe...,One of the better Cub foods that I have shoppe...,one of the good cub food that i have shop at e...,one good cub food shop everything need right e...,39,39,...,,,,,,,One of the better Cub foods that I have shoppe...,one of the better cub foods that i have shoppe...,39,16
3,1.108501e+20,Keegan Leahy,1581384638564,1.0,Front of house has amazing staff.. Owner howev...,Front of house has amazing staff.. Owner howev...,front of house have amazing staff owner howeve...,front house amazing staff owner however lot gr...,51,51,...,,,,,,,Front of house has amazing staff.. Owner howev...,front of house has amazing staff owner however...,51,7
4,1.106635e+20,Kyle Ebert,1592438884887,2.0,Came in early\nNo one was there except one per...,Came in early\nNo one was there except one per...,come in early no one be there except one perso...,come early no one except one person need simpl...,60,62,...,,,,,,,Came in early\nNo one was there except one per...,came in early no one was there except one pers...,62,21
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
305846,1.020788e+20,Kevin Portee,1577891990033,5.0,I feel like a new person on every visit,I feel like a new person on every visit,i feel like a new person on every visit,feel like new person every visit,9,9,...,,,,,,,I feel like a new person on every visit,i feel like a new person on every visit,9,-1
305847,1.143790e+20,James Rose,1522868195876,5.0,The aqua therapy I have been receiving there h...,The aqua therapy I have been receiving there h...,the aqua therapy i have be receive there have ...,aqua therapy receive helpful,12,12,...,,,,,,,The aqua therapy I have been receiving there h...,the aqua therapy i have been receiving there h...,12,-1
305848,1.015381e+20,Maxwell Rosa,1540950220650,5.0,Had baby #1 last year in June. Staff was amaz...,Had baby #1 last year in June. Staff was amaz...,have baby 1 last year in june staff be amaze s...,baby 1 last year june staff amaze shoutout mid...,30,30,...,,,,,,,Had baby #1 last year in June. Staff was amaz...,had baby last year in june staff was amazing s...,28,-1
305849,1.028744e+20,Yolonda Taylor,1536770366671,1.0,I gave the staff my authorization number for m...,I gave the staff my authorization number for m...,i give the staff my authorization number for m...,give staff authorization number botox injectio...,116,119,...,,,,,,,I gave the staff my authorization number for m...,i gave the staff my authorization number for m...,115,-1


**DATASET + CLEANING + ASSIGNED TOPICS**

In [9]:
df.to_csv("data_with_topics.csv", index=False)

In [10]:
topic_info = topic_model.get_topic_info()
topic_info

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,163812,-1_doctor_staff_time_care,"[doctor, staff, time, care, dr, did, just, pla...",[i went to er on saturday sept for a broken to...
1,0,53992,0_dentist_dental_teeth_tooth,"[dentist, dental, teeth, tooth, office, cleani...",[i had an awful experience with young family d...
2,1,7586,1_pharmacy_prescription_prescriptions_ready,"[pharmacy, prescription, prescriptions, ready,...",[only bad experiences at a pharmacy in my life...
3,2,5739,2_insurance_billing_pay_paid,"[insurance, billing, pay, paid, collections, c...",[i went to doctors express for my shoulder pai...
4,3,3487,3_covid_test_covid test_testing,"[covid, test, covid test, testing, covid testi...",[they charge for a covid test do not go there ...
...,...,...,...,...,...
159,158,52,158_atmosphere_friendly atmosphere_atmosphere ...,"[atmosphere, friendly atmosphere, atmosphere g...","[great atmosphere friendly and informative, fu..."
160,159,52,159_sober_addiction_program_life,"[sober, addiction, program, life, sobriety, re...",[i was a patient at the tonti location in all ...
161,160,51,160_shoulder_exercises_therapy_pt,"[shoulder, exercises, therapy, pt, physical, p...",[i was completely ready to go under the knife ...
162,161,50,161_clean_prices_prices clean_clean prices,"[clean, prices, prices clean, clean prices, pr...","[very clean and good prices, great prices and ..."


**TOPIC INFO**

In [11]:
topic_info.to_csv("topic_info.csv", index=False)

**TOPIC + KEYWORDS + C-TF-IDF SCORES EACH KEYWORD**

In [12]:
topic_info = topic_model.get_topic_info()
topic_ids = topic_info['Topic'].tolist()
#topic keywords + c-td-idf scores for each topic
topic_keywords = []

for topic_id in topic_ids:
    keywords_scores = topic_model.get_topic(topic_id)
    for word, score in keywords_scores:
        topic_keywords.append({
            "Topic": topic_id,
            "Keyword": word,
            "c-TF-IDF": score
        })

topic_keywords

#convert to df
df_keywords = pd.DataFrame(topic_keywords)
df_keywords.to_csv("topic_ctfidf.csv", index=False)
df_keywords

Unnamed: 0,Topic,Keyword,c-TF-IDF
0,-1,doctor,0.004332
1,-1,staff,0.004153
2,-1,time,0.004143
3,-1,care,0.004030
4,-1,dr,0.004019
...,...,...,...
1635,162,climb,0.032054
1636,162,bouldering,0.029750
1637,162,route,0.022485
1638,162,gyms,0.020915


**EMBEDDINGS**

In [13]:
np.save("embeddings.npy", embeddings)
embeddings

array([[ 0.04224291, -0.00934046,  0.0281748 , ..., -0.01225406,
        -0.02216125, -0.013539  ],
       [ 0.00086574,  0.12689483, -0.01475409, ..., -0.04474968,
         0.0565543 ,  0.05384716],
       [ 0.04120927, -0.03191343, -0.02362255, ..., -0.02323264,
        -0.02680399, -0.00689324],
       ...,
       [-0.10180993, -0.09129896,  0.04284954, ..., -0.0484487 ,
        -0.01394621,  0.06315085],
       [-0.0371679 ,  0.010715  ,  0.04975103, ..., -0.04702248,
        -0.00412026, -0.00575238],
       [-0.06922076, -0.06632493,  0.02183732, ..., -0.07766668,
         0.01874336,  0.04496736]], dtype=float32)

**TOKENIZED TEXT FOR NPMI, DIVERSITY, EMBEDDING COHERENCE**

In [14]:
import pickle

texts_tokenized = [doc.split() for doc in df['clean_text']]
with open("tokenized_text.pkl", "wb") as f:
    pickle.dump(texts_tokenized, f)

texts_tokenized

[['great', 'peoplegreat', 'staff', 'and', 'great', 'facility', 'to', 'go'],
 ['security',
  'guard',
  'asked',
  'me',
  'to',
  'put',
  'a',
  'mask',
  'on',
  'my',
  'month',
  'old',
  'baby',
  'when',
  'i',
  'said',
  'he',
  'is',
  'too',
  'little',
  'he',
  'is',
  'under',
  'two',
  'he',
  'raised',
  'his',
  'eyebrows',
  'and',
  'asked',
  'again',
  'for',
  'me',
  'to',
  'mask',
  'my',
  'baby',
  'please',
  'educate',
  'those',
  'enforcing',
  'policies',
  'so',
  'i',
  'do',
  'not',
  'have',
  'to',
  'defend',
  'a',
  'cdc',
  'policy',
  'which',
  'is',
  'cloth',
  'face',
  'coverings',
  'should',
  'not',
  'be',
  'placed',
  'on',
  'young',
  'children',
  'under',
  'age'],
 ['one',
  'of',
  'the',
  'better',
  'cub',
  'foods',
  'that',
  'i',
  'have',
  'shopped',
  'at',
  'everything',
  'you',
  'need',
  'is',
  'right',
  'there',
  'they',
  'even',
  'have',
  'a',
  'place',
  'to',
  'sit',
  'and',
  'relax',
  'while',
 

**MODEL**

In [15]:
# Save the model to a directory (not a single file)
topic_model.save(
    "model_safetensors",
    serialization="safetensors",  # or "pytorch"
    save_ctfidf=True,
    save_embedding_model="sentence-transformers/all-MiniLM-L6-v2"
)

In [16]:
# Save the model to a directory (not a single file)
topic_model.save(
    "model_pytorch",
    serialization="pytorch",  # or "pytorch"
    save_ctfidf=True,
    save_embedding_model="sentence-transformers/all-MiniLM-L6-v2"
)

# **INTERPRETATION**

**CHECK ONE TOPIC**

In [17]:
#topic X
topic_model.get_topic(33)

[('pill', 0.04920458906950968),
 ('pill club', 0.04423332737787105),
 ('club', 0.03930675731540478),
 ('birth control', 0.038493530526587355),
 ('birth', 0.03287255409267887),
 ('control', 0.032593338908063055),
 ('pills', 0.02117791940072585),
 ('goodies', 0.016554587859896882),
 ('package', 0.015558042721099228),
 ('text', 0.013800834366697252)]

In [18]:
#print reviews with topic x
topic_x_reviews = df[df["topic"] == 117]
topic_x_reviews

Unnamed: 0,user_id,username,time,rating,original_text,translated_text,final_with_stopword,final_no_stopword,ori_wc,final_wc,...,category_16,category_17,category_18,category_19,category_20,category_21,cont_expanded,clean_text,wc,topic
156,1.111085e+20,Dustin Ennis,1522166110771,5.0,Great experience every time I go. Dr. Patel an...,Great experience every time I go. Dr. Patel an...,great experience every time i go dr. patel and...,great experience every time go dr. patel staff...,18,18,...,,,,,,,Great experience every time I go. Dr. Patel an...,great experience every time i go dr patel and ...,18,117
993,1.158707e+20,Pat Flynn,1584051328558,4.0,The staff was very professional and Dr Patel i...,The staff was very professional and Dr Patel i...,the staff be very professional and dr patel be...,staff professional dr patel patience understan...,15,15,...,,,,,,,The staff was very professional and Dr Patel i...,the staff was very professional and dr patel i...,15,117
1209,1.042107e+20,Lindsay S.,1597333415485,5.0,Dr. Patel and his staff are awesome! My hygien...,Dr. Patel and his staff are awesome! My hygien...,dr. patel and his staff be awesome my hygienis...,dr. patel staff awesome hygienist adrianne gem...,16,17,...,,,,,,,Dr. Patel and his staff are awesome! My hygien...,dr patel and his staff are awesome my hygienis...,16,117
1701,1.078489e+20,Simone Brown,1576082488063,5.0,I always feel that I am listen to in my concer...,I always feel that I am listen to in my concer...,i always feel that i be listen to in my concer...,always feel listen concern ease talk dr. patel...,36,36,...,,,,,,,I always feel that I am listen to in my concer...,i always feel that i am listen to in my concer...,34,117
2717,1.076896e+20,Barb Fyman,1569764732123,5.0,I am very happy with the results of my surgery...,I am very happy with the results of my surgery...,i be very happy with the result of my surgery ...,happy result surgery dr. patel good doctor ans...,29,29,...,,,,,,,I am very happy with the results of my surgery...,i am very happy with the results of my surgery...,29,117
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
275770,1.160472e+20,Craig Colson,1610556992512,5.0,Staff is wonderful! Dr Patel is very caring an...,Staff is wonderful! Dr Patel is very caring an...,staff be wonderful dr patel be very caring and...,staff wonderful dr patel caring make sure get ...,17,17,...,,,,,,,Staff is wonderful! Dr Patel is very caring an...,staff is wonderful dr patel is very caring and...,17,117
286780,1.134118e+20,Lynn Lazdowski,1600887558897,5.0,Definitely thankful for Dr Patel and Aryanna. ...,Definitely thankful for Dr Patel and Aryanna. ...,definitely thankful for dr patel and aryanna q...,definitely thankful dr patel aryanna quality l...,39,39,...,,,,,,,Definitely thankful for Dr Patel and Aryanna. ...,definitely thankful for dr patel and aryanna q...,39,117
290359,1.075483e+20,Dave Zimel,1605056795664,5.0,Dr Patel is an excellent physician and surgeon...,Dr Patel is an excellent physician and surgeon...,dr patel be an excellent physician and surgeon...,dr patel excellent physician surgeon also gent...,16,16,...,,,,,,,Dr Patel is an excellent physician and surgeon...,dr patel is an excellent physician and surgeon...,16,117
299591,1.117447e+20,Ravi Desai,1584058508270,5.0,I came here just a couple of months ago. They ...,I came here just a couple of months ago. They ...,i come here just a couple of month ago they ca...,come couple month ago care much treatment dr. ...,36,36,...,,,,,,,I came here just a couple of months ago. They ...,i came here just a couple of months ago they c...,36,117


In [19]:
topic_x_reviews.to_csv("topic_33_reviews.csv", index=False)

**BAR CHART FOR TOPIC'S TOP KEYWORDS

In [20]:
fig1 = topic_model.visualize_barchart(top_n_topics=100)
fig1.write_html("topic_barchart.html")

In [21]:
fig1

**TOPIC RELATIONSHIP = INTERTOPIC DISTANCE MAP**

In [22]:
fig2 = topic_model.visualize_topics()
fig2

In [23]:
fig2.write_html("interdistance_matrix.html")

**HIERARCHICAL CLUSTERING DENDOGRAM**

In [24]:
# Get total number of topics 
total_topics = len(topic_model.get_topics())
# Use all topics in hierarchy
fig3 = topic_model.visualize_hierarchy(top_n_topics=total_topics)
fig3.show()

In [25]:
fig3.write_html("hierarchical_clustering_dendrogram.html")

# **EVALUATION**

### **1. GENSIM - WITHOUT OUTLIERS**

In [26]:
from gensim.corpora.dictionary import Dictionary
from gensim.models.coherencemodel import CoherenceModel

In [27]:
from gensim.models import Phrases
from gensim.models.phrases import Phraser
from gensim.corpora.dictionary import Dictionary
from gensim.models.coherencemodel import CoherenceModel

#tokenize documents
texts_tokenized = [doc.split() for doc in df['clean_text']]

#train bigram model
bigram_model = Phrases(texts_tokenized, min_count=3, threshold=10)
bigram_phraser = Phraser(bigram_model)

#apply bigram model to texts
texts_with_phrases = [bigram_phraser[doc] for doc in texts_tokenized]

#build dictionary and corpus 
dictionary = Dictionary(texts_with_phrases)
corpus = [dictionary.doc2bow(text) for text in texts_with_phrases]

#normalize topic words from bertopic
topic_words = []
for topic_id in topic_model.get_topic_info()['Topic']:
    if topic_id == -1:
        continue
    words_scores = topic_model.get_topic(topic_id)

    topic = []
    for word, _ in words_scores:
        # Replace spaces with underscores to match phrases
        word_token = word.replace(" ", "_")
        if word_token in dictionary.token2id:
            topic.append(word_token)

    if len(topic) >= 2:
        topic_words.append(topic)

print(f"Topics retained for coherence: {len(topic_words)}")

#COMPUTE COHERENCE
coherence_model = CoherenceModel(
    topics=topic_words,
    texts=texts_with_phrases,
    dictionary=dictionary,
    coherence='c_v'
)

coherence_model2 = CoherenceModel(
    topics=topic_words,
    texts=texts_with_phrases,
    dictionary=dictionary,
    corpus=corpus,
    coherence='u_mass'
)

coherence_model3 = CoherenceModel(
    topics=topic_words,
    texts=texts_with_phrases,
    dictionary=dictionary,
    coherence='c_uci'
)

coherence_model4 = CoherenceModel(
    topics=topic_words,
    texts=texts_with_phrases,
    dictionary=dictionary,
    coherence='c_npmi'
)

# Display results
print("Coherence Score (c_v):", coherence_model.get_coherence())
print("Coherence Score (u_mass):", coherence_model2.get_coherence())
print("Coherence Score (c_uci):", coherence_model3.get_coherence())
print("Coherence Score (c_npmi):", coherence_model4.get_coherence())



Topics retained for coherence: 163
Coherence Score (c_v): 0.6418538724648942
Coherence Score (u_mass): -3.789140839046333
Coherence Score (c_uci): 0.7755065627609954
Coherence Score (c_npmi): 0.11098602746447721


### **2. GENSIM - WITH OUTLIERS**

In [28]:
from gensim.models import Phrases
from gensim.models.phrases import Phraser
from gensim.corpora.dictionary import Dictionary
from gensim.models.coherencemodel import CoherenceModel

#tokenize documents
texts_tokenized = [doc.split() for doc in df['clean_text']]

#train bigram model
bigram_model = Phrases(texts_tokenized, min_count=3, threshold=10)
bigram_phraser = Phraser(bigram_model)

#apply bigram model to texts
texts_with_phrases = [bigram_phraser[doc] for doc in texts_tokenized]

#build dictionary and corpus 
dictionary = Dictionary(texts_with_phrases)
corpus = [dictionary.doc2bow(text) for text in texts_with_phrases]

#normalize topic words from bertopic
topic_words = []
for topic_id in topic_model.get_topic_info()['Topic']:
    words_scores = topic_model.get_topic(topic_id)

    topic = []
    for word, _ in words_scores:
        # Replace spaces with underscores to match phrases
        word_token = word.replace(" ", "_")
        if word_token in dictionary.token2id:
            topic.append(word_token)

    if len(topic) >= 2:
        topic_words.append(topic)

print(f"Topics retained for coherence: {len(topic_words)}")

#COMPUTE COHERENCE
coherence_model = CoherenceModel(
    topics=topic_words,
    texts=texts_with_phrases,
    dictionary=dictionary,
    coherence='c_v'
)

coherence_model2 = CoherenceModel(
    topics=topic_words,
    texts=texts_with_phrases,
    dictionary=dictionary,
    corpus=corpus,
    coherence='u_mass'
)

coherence_model3 = CoherenceModel(
    topics=topic_words,
    texts=texts_with_phrases,
    dictionary=dictionary,
    coherence='c_uci'
)

coherence_model4 = CoherenceModel(
    topics=topic_words,
    texts=texts_with_phrases,
    dictionary=dictionary,
    coherence='c_npmi'
)

# Display results
print("Coherence Score (c_v):", coherence_model.get_coherence())
print("Coherence Score (u_mass):", coherence_model2.get_coherence())
print("Coherence Score (c_uci):", coherence_model3.get_coherence())
print("Coherence Score (c_npmi):", coherence_model4.get_coherence())



Topics retained for coherence: 164
Coherence Score (c_v): 0.6400038531491186
Coherence Score (u_mass): -3.7763903658609257
Coherence Score (c_uci): 0.7697043345998675
Coherence Score (c_npmi): 0.11019216159765678


## **3. TOPIC DIVERSITY**

In [29]:
def calculate_topic_diversity(topic_words):
    unique_words = set(word for topic in topic_words for word in topic)
    total_words = sum(len(topic) for topic in topic_words)
    return len(unique_words) / total_words

diversity_score = calculate_topic_diversity(topic_words)
print("Topic Diversity:", diversity_score)

Topic Diversity: 0.629657794676806
