# Embeddings + KMeans Topic Mining


## Step 0: Install & Imports

In [None]:
# !pip install sentence-transformers scikit-learn pandas
import pandas as pd

#Pretrained Transformer encoder to convert sentences/documents into embeddings (vectors that capture meaning).
from sentence_transformers import SentenceTransformer

#K-Means clustering algorithm.
from sklearn.cluster import KMeans

#Later used to pull top keywords per cluster.
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np, os, random
# Seed Python's built-in `random` module so anything drawing from it is
# repeatable across runs. This call does not seed NumPy's global generator.
random.seed(42)

## Step 1: Load the CSV

In [None]:
# Location of the input CSV. It must provide a "text" column, which the
# encoding and keyword steps read.
csv_path = r"/mnt/data/topics_100.csv"
df = pd.read_csv(csv_path)

# Fail fast with a clear message instead of a bare KeyError in a later cell.
if "text" not in df.columns:
    raise ValueError(
        f"{csv_path} has no 'text' column; found columns: {list(df.columns)}"
    )

# Preview the first rows (notebook cell output).
df.head()

## Step 2: Encode

In [None]:

# Load a small, fast sentence-embedding model. With this model's defaults,
# every text is mapped to a 384-dimensional vector.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Encoding options:
#   batch_size=64             -> encode in mini-batches (speed vs. memory)
#   show_progress_bar=True    -> show progress while encoding
#   normalize_embeddings=True -> L2-normalize each vector so a dot product
#                                matches cosine similarity; this often
#                                stabilizes clustering
emb = model.encode(
    df["text"].astype(str).tolist(),
    batch_size=64,
    show_progress_bar=True,
    normalize_embeddings=True,
)

# Notebook cell output: (number of texts, embedding size).
emb.shape

## Step 3: KMeans

In [None]:
# Number of clusters (topics) to extract.
k = 10

# Fit K-Means on the embeddings; random_state pins the initialization so
# repeated runs give the same assignment.
kmeans = KMeans(n_clusters=k, n_init="auto", random_state=42).fit(emb)

# One cluster id per row, in the same order as the rows of `df`.
labels = kmeans.labels_
df["cluster"] = labels
df.head()

## Step 4: Cluster Keywords

In [None]:
def keywords_for_cluster(texts, topn=12):
    """Return the most frequent unigrams/bigrams across ``texts``.

    Terms are counted with a ``CountVectorizer`` (English stop words removed,
    at most 5000 features, 1- and 2-grams) and ranked by total count.

    Args:
        texts: Iterable of document strings belonging to one cluster.
        topn: Maximum number of terms to return.

    Returns:
        Up to ``topn`` terms, most frequent first; ties keep the vectorizer's
        vocabulary order. An empty list is returned when there are no
        documents, when ``topn`` is not positive, or when no term survives
        the vectorizer's filtering.
    """
    docs = list(texts)
    if not docs or topn <= 0:
        # Guarding topn also avoids the ``[-0:]`` slice, which would select
        # every term instead of none.
        return []

    # Prefer terms shared by at least two documents. For tiny clusters that
    # filter leaves nothing (or exceeds the document count) and the
    # vectorizer raises ValueError, so fall back to min_df=1.
    for min_df in (2, 1):
        if min_df > len(docs):
            continue
        vec = CountVectorizer(
            stop_words="english",
            max_features=5000,
            ngram_range=(1, 2),
            min_df=min_df,
        )
        try:
            counts = vec.fit_transform(docs)
        except ValueError:
            continue
        break
    else:
        # Nothing usable even with min_df=1 (e.g. only stop words).
        return []

    freqs = np.asarray(counts.sum(axis=0)).ravel()
    terms = vec.get_feature_names_out()
    # A stable sort on negated counts gives descending frequency with a
    # deterministic order among equally frequent terms.
    order = np.argsort(-freqs, kind="stable")[:topn]
    return [terms[i] for i in order]

# Print the top keywords for every cluster id in 0..k-1.
for c in range(k):
    # Drop missing values and coerce to str (the encoding step also coerces
    # with astype(str)) so NaN or non-string cells cannot break tokenization.
    docs_c = df.loc[df["cluster"] == c, "text"].dropna().astype(str).tolist()
    if not docs_c:
        print(f"Cluster {c}: (no docs)")
        continue
    print(f"Cluster {c}: " + ", ".join(keywords_for_cluster(docs_c, topn=12)))

## Step 5: Save

In [None]:
# Write the DataFrame, including the "cluster" column added in Step 3, to a
# CSV at this relative path; index=False leaves the row index out of the file.
df.to_csv("embeddings_kmeans_topics_assigned.csv", index=False)
# Preview the first rows (notebook cell output).
df.head()