In [None]:
import pandas as pd
from top2vec import Top2Vec


INPUT_CSV = "../clustering/intermediate_data/clustered_embeddings.csv"
OUTPUT_CSV = "top2vec_clustered_topics.csv"
OUTPUT_COLUMNS = ["topic_num", "top_words", "word_scores", "representative_docs"]


def filter_text_rows(frame):
    """Return the rows of *frame* whose ``text`` value is present and non-blank.

    Rows where ``text`` is null or only whitespace are dropped and the index
    is reset so positions line up with the list of documents used for
    training.
    """
    text = frame["text"]
    has_text = text.notna() & text.astype(str).str.strip().ne("")
    return frame[has_text].reset_index(drop=True)


def build_top2vec_kwargs(n_docs, workers=4):
    """Return Top2Vec keyword arguments scaled to a corpus of *n_docs* documents.

    The values are heuristics: they fall back to what the Top2Vec
    documentation lists as defaults (``min_count=50``, UMAP ``n_neighbors=15``,
    HDBSCAN ``min_cluster_size=15``) for large corpora and shrink for small
    ones, where those defaults tend to leave every document unclustered.
    """
    min_cluster_size = max(2, min(15, n_docs // 10))
    n_neighbors = max(2, min(15, n_docs - 1))
    min_count = max(2, min(50, n_docs // 20))
    return {
        "speed": "learn",
        "workers": workers,
        "min_count": min_count,
        # Custom arg dicts replace the library defaults wholesale, so the
        # remaining default keys are spelled out explicitly.
        "umap_args": {
            "n_neighbors": n_neighbors,
            "n_components": 5,
            "metric": "cosine",
        },
        "hdbscan_args": {
            "min_cluster_size": min_cluster_size,
            "metric": "euclidean",
            "cluster_selection_method": "eom",
        },
    }


def select_unseen_docs(docs, doc_ids, used_doc_ids, limit):
    """Pick up to *limit* documents whose ids are not in *used_doc_ids*.

    Documents are taken in the given order. Every id that is picked is added
    to *used_doc_ids* (mutated in place) so later topics cannot reuse it.
    Returns the list of selected documents, possibly empty.
    """
    selected = []
    if limit <= 0:
        return selected
    for doc, doc_id in zip(docs, doc_ids):
        if doc_id in used_doc_ids:
            continue
        used_doc_ids.add(doc_id)
        selected.append(doc)
        if len(selected) == limit:
            break
    return selected


# The guard is true in a notebook cell and when run as a script, so the
# pipeline still executes there and still defines df/texts/model/output_df at
# module level; it only keeps a plain import of this file side-effect free.
if __name__ == "__main__":
    df = filter_text_rows(pd.read_csv(INPUT_CSV))

    # Top2Vec expects a list of strings, so coerce any non-string cells.
    texts = df["text"].astype(str).tolist()
    if not texts:
        raise ValueError(f"No usable 'text' rows found in {INPUT_CSV}")
    print(f"Training Top2Vec on {len(texts)} documents...")

    try:
        model = Top2Vec(texts, **build_top2vec_kwargs(len(texts)))
    except ValueError as err:
        # This message was observed during "Finding topics" on a 65-document
        # run; it most likely means clustering produced no topics at all.
        if "need at least one array to concatenate" not in str(err):
            raise
        raise ValueError(
            "Top2Vec found no topics: the corpus is probably too small or too "
            "uniform for the clustering settings. Add documents or lower "
            "hdbscan_args['min_cluster_size']."
        ) from err

    topic_words, word_scores, topic_nums = model.get_topics()
    n_docs_per_topic = 3

    # Per the Top2Vec docs, asking for more documents than a topic contains
    # is rejected, so each request is clamped to the topic's size.
    topic_sizes, sized_topic_nums = model.get_topic_sizes()
    size_by_topic = {
        int(num): int(size) for num, size in zip(sized_topic_nums, topic_sizes)
    }

    used_doc_ids = set()
    csv_data = []

    for words, scores, topic_num in zip(topic_words, word_scores, topic_nums):
        # Search more than needed to allow filtering overlaps.
        num_docs = min(n_docs_per_topic * 5, size_by_topic.get(int(topic_num), 0))
        if num_docs == 0:
            continue

        docs, _doc_scores, doc_ids = model.search_documents_by_topic(
            topic_num=topic_num,
            num_docs=num_docs,
        )

        selected_docs = select_unseen_docs(
            docs, doc_ids, used_doc_ids, n_docs_per_topic
        )
        if not selected_docs:
            continue

        csv_data.append({
            "topic_num": topic_num,
            "top_words": ", ".join(words),
            "word_scores": ", ".join([f"{s:.4f}" for s in scores]),
            "representative_docs": " ||| ".join(selected_docs),
        })

    # Explicit columns keep the header row even when no topic was exported.
    output_df = pd.DataFrame(csv_data, columns=OUTPUT_COLUMNS)
    output_df.to_csv(OUTPUT_CSV, index=False)

    print("✅ Topics and representative documents saved to top2vec_clustered_topics.csv (without overlap)")


2025-06-26 22:12:56,170 - top2vec - INFO - Pre-processing documents for training
2025-06-26 22:12:56,182 - top2vec - INFO - Downloading all-MiniLM-L6-v2 model


Training Top2Vec on 65 documents...


2025-06-26 22:13:17,184 - top2vec - INFO - Creating joint document/word embedding
2025-06-26 22:13:17,924 - top2vec - INFO - Creating lower dimension embedding of documents
2025-06-26 22:13:31,974 - top2vec - INFO - Finding dense areas of documents
2025-06-26 22:13:31,979 - top2vec - INFO - Finding topics


ValueError: need at least one array to concatenate