In [None]:
import pandas as pd
from gensim import corpora, models
import spacy
from tqdm import tqdm


# Load the clustered documents and keep only rows that carry both a text and
# a cluster label.
df = pd.read_csv("../clustering/intermediate_data/clustered_embeddings.csv")
has_text = df["text"].notnull()
has_cluster = df["cluster"].notnull()
df = df[has_text & has_cluster]

# spaCy English pipeline; apply_lda() uses it to tokenise the cluster text.
nlp = spacy.load("en_core_web_sm")

# One row per cluster, with all of that cluster's texts joined by single spaces.
grouped = (
    df.groupby("cluster")["text"]
    .apply(" ".join)
    .reset_index()
)
grouped.columns = ["cluster", "combined_text"]


def apply_lda(texts, num_topics=1, num_words=5, random_state=None):
    """Fit an LDA model on *texts* and return the top words of each topic.

    Each text is tokenised with the module-level spaCy pipeline ``nlp``;
    only alphabetic, non-stop-word tokens are kept, lower-cased.

    Args:
        texts: Iterable of raw document strings.
        num_topics: Number of LDA topics to fit.
        num_words: Number of top words to return per topic.
        random_state: Optional seed forwarded to ``LdaModel`` so that repeated
            runs give the same topics. ``None`` keeps gensim's default
            (non-deterministic) behaviour.

    Returns:
        A list with ``num_topics`` entries, each a list of up to ``num_words``
        words. If no usable token survives filtering, every entry is
        ``["<insufficient data>"]``.
    """
    # NOTE(review): spaCy rejects texts longer than ``nlp.max_length``; very
    # large concatenated clusters may need chunking before they reach here.
    tokenized = [
        [token.text.lower() for token in nlp(text) if token.is_alpha and not token.is_stop]
        for text in texts
    ]

    # An empty vocabulary cannot be modelled. Build a fresh list per topic so
    # that callers mutating one entry do not silently change the others.
    if not any(tokenized):
        return [["<insufficient data>"] for _ in range(num_topics)]

    dictionary = corpora.Dictionary(tokenized)
    corpus = [dictionary.doc2bow(tokens) for tokens in tokenized]

    lda_model = models.LdaModel(
        corpus,
        num_topics=num_topics,
        id2word=dictionary,
        passes=10,
        random_state=random_state,
    )
    return [
        [word for word, _ in lda_model.show_topic(topic_id, topn=num_words)]
        for topic_id in range(num_topics)
    ]


# Collect the first three texts of every cluster in one grouped pass instead
# of re-filtering the whole frame once per cluster inside the loop.
rep_docs_by_cluster = {
    cluster_key: cluster_texts.dropna().head(3).tolist()
    for cluster_key, cluster_texts in df.groupby("cluster")["text"]
}

results = []

cluster_rows = zip(grouped["cluster"], grouped["combined_text"])
for cluster_id, combined_text in tqdm(cluster_rows, total=len(grouped), desc="Processing clusters with LDA"):
    # apply_lda is called with its default single topic, so only the first
    # (and only) word list is used.
    lda_words = apply_lda([combined_text])[0]

    results.append({
        "topic_num": cluster_id,
        "top_words": ", ".join(lda_words),
        "representative_docs": " ||| ".join(rep_docs_by_cluster.get(cluster_id, [])),
    })


# Persist one row per cluster: its id, top LDA words and sample documents.
output_df = pd.DataFrame(results)
output_df.to_csv("lda_clustered_topics.csv", index=False)
print("\n✅ LDA output saved to lda_clustered_topics.csv")


ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject