In [None]:
import pandas as pd
from bertopic import BERTopic
import spacy
from tqdm import tqdm


# Small English spaCy pipeline; clean_text uses it for tokenisation,
# stop-word flags and part-of-speech tags.
nlp = spacy.load("en_core_web_sm")

# Load the clustered documents and keep only rows that have both a text and
# a cluster value.
df = pd.read_csv("../clustering/intermediate_data/clustered_embeddings.csv")
# .copy() detaches the filtered frame from the original so that the column
# assignments made later (e.g. df["clean_text"] = ...) operate on an
# independent frame and do not raise pandas' SettingWithCopyWarning.
df = df[df["text"].notnull() & df["cluster"].notnull()].copy()

def clean_text(text):
    """Return a lower-cased, space-joined string of the content words in *text*.

    The text is run through the module-level spaCy pipeline ``nlp``. A token
    is kept only if it is purely alphabetic, is not a stop word, is not
    tagged as an adposition ("ADP"), and is longer than three characters.
    """
    kept = []
    for tok in nlp(text):
        # Drop punctuation/numbers and spaCy stop words first.
        if not tok.is_alpha or tok.is_stop:
            continue
        # Drop adpositions and very short tokens.
        if tok.pos_ == "ADP" or len(tok.text) <= 3:
            continue
        kept.append(tok.text.lower())
    return " ".join(kept)

# --- 1. Clean every document with clean_text --------------------------------
print("Cleaning texts...")
df["clean_text"] = df["text"].apply(clean_text)
texts = df["clean_text"].tolist()
clusters = df["cluster"].tolist()  # not used below; kept for later cells

# --- 2. Fit a single BERTopic model on all cleaned documents ----------------
print(f"Training BERTopic on {len(texts)} cleaned documents...")
bertopic_model = BERTopic()
topics, _ = bertopic_model.fit_transform(texts)

# One topic id per row, in the same order as `texts`.
df["bertopic_topic"] = topics

# BERTopic assigns -1 to documents it treats as outliers; its word list is
# generic, so it is only used as a label when a cluster has nothing else.
OUTLIER_TOPIC = -1
# Hoisted: the stop-word set does not change between iterations.
stop_words = nlp.Defaults.stop_words
results = []

# --- 3. For each cluster, pick its dominant topic and describe it -----------
print("Processing clusters and saving top topics...")
# groupby iterates clusters in sorted key order and splits the frame once,
# instead of re-scanning the whole frame with a boolean mask per cluster.
for cluster_id, cluster_df in tqdm(df.groupby("cluster"), desc="Clusters"):
    topic_counts = cluster_df["bertopic_topic"].value_counts()

    # Prefer the most frequent real topic; fall back to the outlier topic
    # only when every document in the cluster was marked as an outlier.
    real_topic_counts = topic_counts.drop(labels=OUTLIER_TOPIC, errors="ignore")
    if real_topic_counts.empty:
        top_topic_id = topic_counts.idxmax()
    else:
        top_topic_id = real_topic_counts.idxmax()

    # get_topic yields (word, score) pairs, or a falsy value if unavailable.
    topic_words = bertopic_model.get_topic(top_topic_id)
    filtered_words = []
    if topic_words:
        filtered_words = [
            word
            for word, _ in topic_words
            if len(word) > 3 and word.lower() not in stop_words
        ]
    # Also use the placeholder when filtering removed every word, so the
    # CSV never contains an empty label.
    top_words = ", ".join(filtered_words) if filtered_words else "<no words>"

    # Sample documents: the first three texts of the cluster in frame order.
    rep_docs = cluster_df["text"].dropna().head(3).tolist()
    rep_combined = " ||| ".join(rep_docs)

    results.append({
        "topic_num": cluster_id,
        "top_words": top_words,
        "representative_docs": rep_combined,
    })


# --- 4. Persist one row per cluster ------------------------------------------
output_df = pd.DataFrame(results)
output_df.to_csv("BERTopic_clustered_topics.csv", index=False)
print("\n✅ Output saved to BERTopic_clustered_topics.csv")



Cluster -1
BERTopic: [('the', 0.12156235925511119), ('is', 0.10446538617401516), ('and', 0.0888109093128488), ('so', 0.08670870942497806), ('we', 0.08246129114379448), ('of', 0.0800898072539109), ('to', 0.07883009302490357), ('that', 0.07589316637397782), ('this', 0.06735298332505273), ('be', 0.049982558128675185)]

Cluster 0
BERTopic: [('the', 0.12156235925511119), ('is', 0.10446538617401516), ('and', 0.0888109093128488), ('so', 0.08670870942497806), ('we', 0.08246129114379448), ('of', 0.0800898072539109), ('to', 0.07883009302490357), ('that', 0.07589316637397782), ('this', 0.06735298332505273), ('be', 0.049982558128675185)]

Cluster 1
BERTopic: [('is', 0.13674509277258126), ('the', 0.13009239898492286), ('of', 0.11273507987014557), ('to', 0.09215218599523319), ('and', 0.09208061490885204), ('we', 0.07446044921846459), ('so', 0.06985334362642347), ('that', 0.06754340845366785), ('this', 0.062104722523206396), ('in', 0.05573063539470624)]

Cluster 2
BERTopic: [('the', 0.12156235925511