# LDA Topic Modeling (Bag-of-Words)


## Step 0: Install & Imports

In [None]:
# !pip install scikit-learn pandas
import pandas as pd

#Bag-of-words vectorizer that turns text into word-count features.
from sklearn.feature_extraction.text import CountVectorizer

#Brings in scikit-learn’s LDA implementation.
from sklearn.decomposition import LatentDirichletAllocation as LDA
import numpy as np, os, random
# Seeds only Python's built-in `random` module. The LDA model in Step 3 is
# seeded separately through its own `random_state` argument.
random.seed(42)

## Step 1: Load the CSV

In [None]:
# Input corpus location; a relative path is resolved against the current
# working directory.
csv_path = r"topics_100.csv"

# Parse the CSV into a DataFrame, then preview its first five rows.
df = pd.read_csv(filepath_or_buffer=csv_path)
df.head(n=5)

## Step 2: Count Vectorizer

In [None]:
# CountVectorizer settings:
#   stop_words="english" - drop very common English stopwords (e.g. "the", "and").
#   max_features=20000   - cap the vocabulary size; high enough that it is not
#                          expected to bind on a corpus of about 100 documents.
#   min_df=2             - keep only words/bigrams that appear in >= 2 documents.
#   ngram_range=(1, 2)   - use unigrams and bigrams (e.g. "login", "login error").
vec = CountVectorizer(
    stop_words="english",
    max_features=20000,
    min_df=2,
    ngram_range=(1, 2),
)

# Learn the vocabulary and build the sparse count matrix X of shape
# (n_docs, vocab_size).
# Missing values are replaced with "" BEFORE the str cast: a bare astype(str)
# turns NaN into the literal text "nan", which would then be counted as a real
# word. Using "" (rather than dropping rows) keeps X row-aligned with df.
X = vec.fit_transform(df["text"].fillna("").astype(str).tolist())

# Vocabulary strings in the same order as X's columns.
terms = vec.get_feature_names_out()
X.shape

## Step 3: Fit LDA

In [None]:
# How many topics to extract.
n_topics = 6

# LDA configuration:
#   n_components=n_topics   - number of topics.
#   max_iter=50             - maximum number of optimization iterations.
#   learning_method="batch" - batch variational updates that use every document
#                             on each iteration (stable for small/medium data);
#                             for very large corpora, try "online".
#   random_state=42         - fixed seed for reproducibility.
lda = LDA(
    n_components=n_topics,
    max_iter=50,
    learning_method="batch",
    random_state=42,
)

# W (docs x topics): fit the model to X and get back the document-topic
# matrix, i.e. how much of each topic is present in each document.
W = lda.fit_transform(X)

# H (topics x words): the topic-word matrix, i.e. which words define each
# topic. Values are in (pseudo-)count space, not probabilities.
H = lda.components_

(W.shape, H.shape)

## Step 4: Top Words

In [None]:
def top_words_per_topic(H, terms, topn=12):
    """Print the highest-weighted terms of every topic, one line per topic.

    Args:
        H: Topic-term weight matrix; iterating it must yield one row per
            topic, and each row must support ``.argsort()`` (e.g. a NumPy
            array).
        terms: Vocabulary strings, indexable by column position of ``H``.
        topn: Maximum number of terms to list per topic. Values of zero or
            below list no terms; values above the vocabulary size list all
            terms.

    Returns:
        None. Output is written to stdout as ``"LDA Topic <k>: t1, t2, ..."``.
    """
    # Clamp so that a non-positive request means "no terms".
    topn = max(topn, 0)
    for k, row in enumerate(H):
        # Reverse the ascending argsort first, then take the leading slice.
        # The previous form, [-topn:], selects the WHOLE row when topn == 0
        # (because -0 == 0), printing every term instead of none.
        top_idx = row.argsort()[::-1][:topn]
        print(f"LDA Topic {k}: " + ", ".join(terms[i] for i in top_idx))
# Show the 12 strongest terms of each fitted topic.
top_words_per_topic(H=H, terms=terms, topn=12)

## Step 5: Assign & Save

In [None]:
# Label every document with the index of the topic that has the largest
# weight in its row of W.
df["dominant_topic"] = np.argmax(W, axis=1)

# Write the labelled corpus to disk without the DataFrame index, then preview
# the first five rows.
df.to_csv(path_or_buf="lda_topics_assigned.csv", index=False)
df.head(n=5)

In [None]:
# Display the full DataFrame as this cell's output.
df