### Deep-learning-based approach (sentence embeddings + BERTopic)

In [None]:
# ==== Setup ====
# pip install -U pandas bertopic sentence-transformers umap-learn hdbscan scikit-learn

import pandas as pd
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
from umap import UMAP
import hdbscan

# 1) Load data
path = "status_examples.csv"   # <- adjust path if needed
df = pd.read_csv(path)

# Column holding the free-text statements to cluster (no automatic fallback:
# change this constant if your CSV uses a different column name).
TEXT_COL = "statement"
if TEXT_COL not in df.columns:
    raise KeyError(
        f"Column {TEXT_COL!r} not found in {path!r}; available columns: {list(df.columns)}"
    )

# Fill missing values *before* casting: astype(str) turns NaN into the literal
# string "nan", after which fillna("") has nothing left to replace and every
# missing row would be embedded as the word "nan".
texts = df[TEXT_COL].fillna("").astype(str).tolist()

# 2) Pretrained embedding model.
# all-MiniLM-L6-v2 is compact but is an English model; for multilingual data
# swap in e.g. "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2".
embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
embeddings = embedder.encode(
    texts,
    batch_size=64,
    show_progress_bar=True,
    normalize_embeddings=True,
)

# 3) Dimensionality reduction & clustering backends
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, random_state=42)
hdbscan_model = hdbscan.HDBSCAN(
    min_cluster_size=10,
    min_samples=5,
    metric="euclidean",
    cluster_selection_method="eom",
    # Required because calculate_probabilities=True below: BERTopic derives the
    # per-topic probability matrix from HDBSCAN's membership vectors, which are
    # only available when prediction data was generated during fitting.
    prediction_data=True,
)

# 4) Fit BERTopic
topic_model = BERTopic(
    # Hand over the same embedder that produced `embeddings`, so that a later
    # topic_model.transform(new_docs) embeds new documents consistently.
    # (BERTopic's `language` argument is not used once an embedding model is given.)
    embedding_model=embedder,
    calculate_probabilities=True,
    verbose=True,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    # For nicer labels, pass `representation_model=...` here or call
    # topic_model.update_topics(texts, representation_model=...) after fitting.
)

# Pre-computed embeddings are reused here instead of being recomputed by BERTopic.
topics, probs = topic_model.fit_transform(texts, embeddings)

# 5) Inspect topics
# topic_model.get_topic(topic_id) -> list of (word, weight)
topic_info = topic_model.get_topic_info()
print(topic_info.head(10))

# 6) Add assignments back to your dataframe
df_out = df.copy()
df_out["topic_id"] = topics

# Build each label once per topic (not once per row): the top-4 topic words,
# skipping the empty placeholders BERTopic uses when a topic has fewer words.
topic_labels = {
    topic_id: ", ".join(
        word for word, _ in topic_model.get_topic(topic_id)[:4] if word
    )
    for topic_id in set(topics)
    if topic_id != -1
}
topic_labels[-1] = "outlier"  # -1 is the noise/outlier cluster assigned by HDBSCAN
df_out["topic_label"] = df_out["topic_id"].map(topic_labels)

# 7) Save results
df_out.to_csv("topic_clusters_bertopic.csv", index=False)
print("Saved: topic_clusters_bertopic.csv")

# 8) Optional: visualize
# topic_model.visualize_topics().show()
# topic_model.visualize_barchart(top_n_topics=12).show()
# topic_model.visualize_hierarchy().show()
