In [None]:
import pandas as pd
import numpy as np
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer

import pandas as pd
import numpy as np
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer

# Reassign BERTopic outlier documents (topic == -1) to their closest topic via
# embedding similarity, refresh the topic representations, and persist both the
# relabelled data and the updated model.

# --- Step 1: Define paths, model names and tunables ---
MODEL_PATH = "model_safetensors"  # Folder where your BERTopic model was saved
EMBEDDING_MODEL_ID = "sentence-transformers/all-MiniLM-L6-v2"  # Must match training
EMBEDDINGS_PATH = "embeddings.npy"
CSV_PATH = "data_with_topics.csv"  # Your original file with 'clean_text' and 'topic' columns
OUTPUT_CSV_PATH = "FINAL_REDUCED_DATA.csv"
OUTPUT_MODEL_PATH = "MODEL_REDUCED_OUTLIERS"
OUTLIER_THRESHOLD = 0.1  # try 0.05 for more aggressive reassignment

# --- Step 2: Load embedding model ---
print(f"Loading embedding model: {EMBEDDING_MODEL_ID}...")
embedding_model = SentenceTransformer(EMBEDDING_MODEL_ID)
print("Embedding model loaded.")


# --- Step 3: Load BERTopic model using the embedding model ---
print(f"Loading BERTopic model from: {MODEL_PATH}...")
topic_model = BERTopic.load(MODEL_PATH, embedding_model=embedding_model)
print("BERTopic model loaded successfully.")

# --- Step 4: Load your data and precomputed embeddings ---
print("Loading data and embeddings...")
df = pd.read_csv(CSV_PATH)

# Fail early with a readable message instead of a KeyError further down.
missing_columns = {"clean_text", "topic"} - set(df.columns)
if missing_columns:
    raise ValueError(
        f"{CSV_PATH} is missing required column(s): {sorted(missing_columns)}"
    )

# NOTE(review): astype(str) turns missing values into the literal string "nan";
# confirm 'clean_text' has no nulls or that this matches how training docs were built.
texts = df["clean_text"].astype(str).tolist()
topics = df["topic"].tolist()
embeddings = np.load(EMBEDDINGS_PATH)

# Documents and embeddings are matched by position, so the row counts must agree.
if embeddings.shape[0] != len(texts):
    raise ValueError(
        f"{EMBEDDINGS_PATH} has {embeddings.shape[0]} rows but {CSV_PATH} "
        f"has {len(texts)} documents; they must be aligned one-to-one."
    )
print("Data and embeddings loaded.")

# --- Step 5: Reduce outliers using embedding similarity ---
original_outlier_count = topics.count(-1)
print(f"\nOriginal number of outliers: {original_outlier_count}")

if original_outlier_count > 0:
    print("Reducing outliers using the 'embeddings' strategy...")

    new_topics = topic_model.reduce_outliers(
        documents=texts,
        embeddings=embeddings,
        topics=topics,
        strategy="embeddings",  # <-- make sure it's plural
        threshold=OUTLIER_THRESHOLD,
    )
    new_outlier_count = new_topics.count(-1)
    print(f"Outliers after reduction: {new_outlier_count}")
    print(f"Reassigned: {original_outlier_count - new_outlier_count} documents")

    # --- Step 6: Update topic representations ---
    # update_topics falls back to library defaults for any model argument left
    # unset, so hand back the models already attached to the loaded topic model
    # to keep its vectorizer / c-TF-IDF / representation configuration.
    print("Updating topic representations...")
    topic_model.update_topics(
        docs=texts,
        topics=new_topics,
        vectorizer_model=topic_model.vectorizer_model,
        ctfidf_model=topic_model.ctfidf_model,
        representation_model=topic_model.representation_model,
    )
    print("Topic representations updated.")

    # --- Step 7: Save new topics to CSV ---
    # The original 'topic' column is kept; reduced labels go in a new column.
    df["topic_reduced_embedding"] = new_topics
    df.to_csv(OUTPUT_CSV_PATH, index=False)
    print(f"Saved to '{OUTPUT_CSV_PATH}'.")

    # --- Step 8: Save updated model ---
    topic_model.save(
        OUTPUT_MODEL_PATH,
        serialization="safetensors",
        save_embedding_model=EMBEDDING_MODEL_ID
    )
    print(f"Updated model saved to '{OUTPUT_MODEL_PATH}/'.")

else:
    print("No outliers found. Nothing to reduce.")


Loading embedding model: sentence-transformers/all-MiniLM-L6-v2...
Embedding model loaded.
Loading BERTopic model from: model_safetensors...
BERTopic model loaded successfully.
Loading data and embeddings...
Data and embeddings loaded.

Original number of outliers: 163812
Reducing outliers using the 'embeddings' strategy...




Outliers after reduction: 0
Reassigned: 163812 documents
Updating topic representations...
Topic representations updated.
Saved to 'FINAL_REDUCED_DATA.csv'.
Updated model saved to 'MODEL_REDUCED_OUTLIERS/'.


In [2]:
# Count the topics currently held by the model (one entry per topic id).
topic_table = topic_model.get_topics()
len(topic_table)

163

In [3]:
# Plot the top-10 keywords for every topic the model currently holds. The topic
# count is read from the model rather than hard-coded, so the chart stays
# complete if the number of topics changes between runs.
n_topics_to_plot = len(topic_model.get_topics())
fig1 = topic_model.visualize_barchart(top_n_topics=n_topics_to_plot, n_words=10)
fig1.write_html("TOPKEYWORDS_10_BARCHART.html")
fig1.show()

In [125]:
# Rows whose original assignment (the "topic" column) is topic 30.
topic_x_reviews = df.loc[df["topic"].eq(30)]
topic_x_reviews


Unnamed: 0,user_id,username,time,rating,original_text,translated_text,final_with_stopword,final_no_stopword,ori_wc,final_wc,...,category_17,category_18,category_19,category_20,category_21,cont_expanded,clean_text,wc,topic,topic_reduced_embedding
248,1.136550e+20,Michelle Hefner,1556110693799,5.0,Best place ever....From the Top dogs down ever...,Best place ever....From the Top dogs down ever...,best place ever from the top dog down everyone...,best place ever top dog everyone exceptional c...,29,32,...,,,,,,Best place ever....From the Top dogs down ever...,best place everfrom the top dogs down everyone...,29,30,30
649,1.058955e+20,Elizabeth Frost,1512573873088,3.0,Lots of red tape to actually see a therapist a...,Lots of red tape to actually see a therapist a...,lot of red tape to actually see a therapist at...,lot red tape actually see therapist center ann...,31,31,...,,,,,,Lots of red tape to actually see a therapist a...,lots of red tape to actually see a therapist a...,31,30,30
815,1.030429e+20,Christa Weigelt,1627993413617,4.0,I loved my counselor Dan but the company itsel...,I loved my counselor Dan but the company itsel...,i love my counselor dan but the company itself...,love counselor dan company terrible good pract...,31,31,...,,,,,,I loved my counselor Dan but the company itsel...,i loved my counselor dan but the company itsel...,31,30,30
862,1.179257e+20,Stephanie McDonald,1534303107505,4.0,This was a great resource for me to get some h...,This was a great resource for me to get some h...,this be a great resource for me to get some he...,great resource get help one main benefit felt ...,68,69,...,,,,,,This was a great resource for me to get some h...,this was a great resource for me to get some h...,69,30,30
1015,1.080845e+20,Linda Dorsett,1630250937110,5.0,Professional care with kindness and positive r...,Professional care with kindness and positive r...,professional care with kindness and positive r...,professional care kindness positive result the...,12,12,...,,,,,,Professional care with kindness and positive r...,professional care with kindness and positive r...,12,30,30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
303951,1.070847e+20,Vicki,1528332637706,5.0,The therapists​ & staff are great and the inte...,The therapists​ & staff are great and the inte...,the therapists​ staff be great and the interio...,therapists​ staff great interior nicely decora...,14,12,...,,,,,,The therapists​ & staff are great and the inte...,the therapists staff are great and the interio...,12,30,30
304334,1.163197e+20,Kristine Rhodes,1609877098711,4.0,I love both therapists I've been with. Right ...,I love both therapists I've been with. Right ...,i love both therapist i have be with right now...,love therapist right helen attentive give grea...,80,86,...,,,,,,I love both therapists I have been with. Righ...,i love both therapists i have been with right ...,86,30,30
304906,1.040886e+20,Yvonne Patton,1593043524336,5.0,Wonderful people who try to make your visit co...,Wonderful people who try to make your visit co...,wonderful people who try to make your visit co...,wonderful people try make visit comfortable pa...,30,30,...,,,,,,Wonderful people who try to make your visit co...,wonderful people who try to make your visit co...,30,30,30
305717,1.045578e+20,Lydia Anderson,1624824736583,5.0,"Awesome therapy experience, wonderful staff a...","Awesome therapy experience, wonderful staff a...",awesome therapy experience wonderful staff and...,awesome therapy experience wonderful staff kno...,8,8,...,,,,,,"Awesome therapy experience, wonderful staff a...",awesome therapy experience wonderful staff and...,8,30,30


In [25]:
# How many leading keywords to list for each topic
top_n_words = 10

# Every topic id reported by the model, with the outlier bucket (-1) left out
all_topic_ids = [
    t for t in topic_model.get_topic_info()["Topic"].tolist() if t != -1
]

# One line per topic: "Topic <id>: word1, word2, ..."
for topic_id in all_topic_ids:
    words_scores = topic_model.get_topic(topic_id)  # (word, score) pairs, or a falsy value
    if not words_scores:
        # Nothing to show for this topic id
        continue
    words = [pair[0] for pair in words_scores[:top_n_words]]
    print(f"Topic {topic_id}: {', '.join(words)}")


Topic 0: dentist, dental, teeth, tooth, my, and, the, office, to, they
Topic 1: pharmacy, prescription, prescriptions, pharmacist, medication, ready, it, they, filled, refill
Topic 2: insurance, bill, billing, pay, they, paid, not, that, for, to
Topic 3: covid, test, testing, results, tested, rapid, for, in, get, they
Topic 4: staff, friendly, very, helpful, professional, great, knowledgeable, nice, always, efficient
Topic 5: dr, his, he, staff, him, great, very, is, recommend, best
Topic 6: massage, massages, therapist, relaxing, spa, elements, was, tissue, she, therapists
Topic 7: rude, desk, front, receptionist, she, lady, her, unprofessional, the, not
Topic 8: pharmacy, pharmacist, prescriptions, helpful, pharmacies, are, always, store, service, friendly
Topic 9: dispensary, flower, bud, weed, product, dispensaries, cannabis, prices, products, selection
Topic 10: prices, store, selection, shop, good, high, items, expensive, priced, deals
Topic 11: chiropractic, chiropractor, chirop