In [4]:
import pandas as pd
import numpy as np
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer

# --- Step 1: Define paths and model names ---
MODEL_PATH = "model_safetensors_positive"  # Folder where your BERTopic model was saved
EMBEDDING_MODEL_ID = "sentence-transformers/all-MiniLM-L6-v2"  # Must match training
EMBEDDINGS_PATH = "positive_embeddings.npy"
CSV_PATH = "data_with_topic.csv"  # Your original file with 'clean_text' and 'topic' columns

# --- Step 2: Load embedding model ---
print(f"Loading embedding model: {EMBEDDING_MODEL_ID}...")
embedding_model = SentenceTransformer(EMBEDDING_MODEL_ID)
print("Embedding model loaded.")

# --- Step 3: Load BERTopic model using the embedding model ---
print(f"Loading BERTopic model from: {MODEL_PATH}...")
topic_model = BERTopic.load(MODEL_PATH, embedding_model=embedding_model)
print("BERTopic model loaded successfully.")

# --- Step 4: Load your data and precomputed embeddings ---
print("Loading data and embeddings...")
df = pd.read_csv(CSV_PATH)
texts = df["clean_text"].astype(str).tolist()
topics = df["topic"].tolist()
embeddings = np.load(EMBEDDINGS_PATH)
print("Data and embeddings loaded.")

# --- Step 5: Reduce outliers using embedding similarity ---
original_outlier_count = topics.count(-1)
print(f"\nOriginal number of outliers: {original_outlier_count}")

if original_outlier_count > 0:
    print("Reducing outliers using the 'embeddings' strategy...")

    new_topics = topic_model.reduce_outliers(
        documents=texts,
        embeddings=embeddings,
        topics=topics,
        strategy="embeddings",  # <-- make sure it's plural
        threshold=0.2          # try 0.05 for more aggressive reassignment
    )

    new_outlier_count = new_topics.count(-1)
    print(f"Outliers after reduction: {new_outlier_count}")
    print(f"Reassigned: {original_outlier_count - new_outlier_count} documents")

    # --- Step 6: Update topic representations ---
    print("Updating topic representations...")
    topic_model.update_topics(docs=texts, topics=new_topics)
    print("Topic representations updated.")

    # --- Step 7: Save new topics to CSV ---
    df["topic_reduced_embedding"] = new_topics
    df.to_csv("data_with_topic_reduced_embedding.csv", index=False)
    print("Saved to 'data_with_topic_reduced_embedding.csv'.")

    # --- Step 8: Save updated model ---
    topic_model.save(
        "model_reduced_outliers_embedding",
        serialization="safetensors",
        save_embedding_model=EMBEDDING_MODEL_ID
    )
    print("Updated model saved to 'model_reduced_outliers_embedding/'.")

else:
    print("No outliers found. Nothing to reduce.")


Loading embedding model: sentence-transformers/all-MiniLM-L6-v2...
Embedding model loaded.
Loading BERTopic model from: model_safetensors_positive...
BERTopic model loaded successfully.
Loading data and embeddings...
Data and embeddings loaded.

Original number of outliers: 142616
Reducing outliers using the 'embeddings' strategy...




Outliers after reduction: 7
Reassigned: 142609 documents
Updating topic representations...
Topic representations updated.
Saved to 'data_with_topic_reduced_embedding.csv'.
Updated model saved to 'model_reduced_outliers_embedding/'.


## **INTERPRETATION**

In [9]:
import pandas as pd

# Load the new CSV file with the reduced outlier topic assignments
df_reduced = pd.read_csv("data_with_topic_reduced_embedding.csv")

# --- 1. Verify that all rows are still present ---
print(f"Total number of rows in the new file: {len(df_reduced)}")
print("As you can see, no rows were deleted from your dataset.")

# --- 2. See the 7 remaining outlier rows ---
print("\n--- Displaying the 7 remaining outlier documents ---")
# Filter the DataFrame to show only the rows where the new topic is -1
remaining_outliers = df_reduced[df_reduced["topic_reduced_embedding"] == -1]

# Display the full content of the text for these outliers
with pd.option_context('display.max_colwidth', None):
    display(remaining_outliers)

# --- 3. (Optional) See the new topic distribution ---
print("\n--- New Topic Distribution ---")
# This shows how the 142,609 reassigned documents were distributed
# among your existing topics.
print(df_reduced["topic_reduced_embedding"].value_counts())

Total number of rows in the new file: 305851
As you can see, no rows were deleted from your dataset.

--- Displaying the 7 remaining outlier documents ---


Unnamed: 0,user_id,username,time,rating,original_text,translated_text,final_with_stopword,final_no_stopword,ori_wc,final_wc,...,category_17,category_18,category_19,category_20,category_21,cont_expanded,clean_text,wc,topic,topic_reduced_embedding
11368,1.113031e+20,Natasha Spearman,1524081116563,1.0,Do u need a green card,Do u need a green card,do you need a green card,need green card,6,6,...,,,,,,Do you need a green card,do you need a green card,6,-1,-1
210598,1.146879e+20,Badass Mechanic,1603236104519,5.0,"In these troubling political times with gun violence peaking, human rights scandals on us soil, and ongoing corruption investigations, it’s always important to find the silver lining in things...\nInternational Relations with Russia have never been better!","In these troubling political times with gun violence peaking, human rights scandals on us soil, and ongoing corruption investigations, it’s always important to find the silver lining in things...\nInternational Relations with Russia have never been better!",in these troubling political time with gun violence peak human right scandal on u soil and ongoing corruption investigation it be always important to find the silver lining in thing international relation with russia have never be good,troubling political time gun violence peak human right scandal u soil ongoing corruption investigation always important find silver lining thing international relation russia never good,37,38,...,,,,,,"In these troubling political times with gun violence peaking, human rights scandals on us soil, and ongoing corruption investigations, it is always important to find the silver lining in things...\nInternational Relations with Russia have never been better!",in these troubling political times with gun violence peaking human rights scandals on us soil and ongoing corruption investigations it is always important to find the silver lining in things international relations with russia have never been better,38,-1,-1
223429,1.168669e+20,Bryan Klabunde,1600015360031,1.0,"Not sure why they feel the need to broadcast Fox news in times like these, but each to their own.","Not sure why they feel the need to broadcast Fox news in times like these, but each to their own.",not sure why they feel the need to broadcast fox news in time like these but each to their own,not sure feel need broadcast fox news time like,20,20,...,,,,,,"Not sure why they feel the need to broadcast Fox news in times like these, but each to their own.",not sure why they feel the need to broadcast fox news in times like these but each to their own,20,-1,-1
262338,1.001018e+20,Elijha Trott,1540242622936,1.0,There is a password and username for the guest WiFi.,There is a password and username for the guest WiFi.,there be a password and username for the guest wifi,password username guest wifi,10,10,...,,,,,,There is a password and username for the guest WiFi.,there is a password and username for the guest wifi,10,-1,-1
286341,1.002245e+20,Marley Shmenderson,1511050145046,2.0,Where is the WiFi password or someone who knows it!,Where is the WiFi password or someone who knows it!,where be the wifi password or someone who know it,wifi password someone know,10,10,...,,,,,,Where is the WiFi password or someone who knows it!,where is the wifi password or someone who knows it,10,-1,-1
289746,1.080675e+20,sara winkler,1598645248483,4.0,B no bc fbm b,B no bc fbm b,b no because fbm b,b no fbm b,5,5,...,,,,,,B no because fbm b,b no because fbm b,5,-1,-1
304977,1.164394e+20,Travis William,1609460203353,5.0,A bubble of civilization within the Fourth Reich.,A bubble of civilization within the Fourth Reich.,a bubble of civilization within the fourth reich,bubble civilization within fourth reich,8,8,...,,,,,,A bubble of civilization within the Fourth Reich.,a bubble of civilization within the fourth reich,8,-1,-1



--- New Topic Distribution ---
topic_reduced_embedding
 0      54016
 1       8157
 13      5111
 86      4395
 3       3744
        ...  
 336       44
 328       41
 339       38
 332       36
-1          7
Name: count, Length: 347, dtype: int64


## **INSPECT TOPICS FOR DEPLOYMENT**

In [10]:
len(topic_model.get_topics())


347

In [12]:
fig1 = topic_model.visualize_barchart(top_n_topics = 347, n_words = 10)
fig1.write_html("TOPKEYWORDS_10_BARCHART.html")
fig1.show()

groups = {
    "
}