# NMF Topic Modeling (TF-IDF)


## Step 0: Install & Imports

In [1]:
# !pip install scikit-learn pandas
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
import numpy as np, os, random
random.seed(42)

## Step 1: Load the CSV

In [2]:
csv_path = r"topics_100.csv"
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,id,text
0,79,IPv6 users cannot reach the upload endpoint
1,4,Login form shows captcha error even for first ...
2,55,Audit logs missing entries for sensitive actions
3,3,SSO login loops back to the sign-in page repea...
4,72,Corporate proxy strips authorization headers


## Step 2: TF-IDF

In [3]:
vec = TfidfVectorizer(stop_words="english", max_features=20000, min_df=2, ngram_range=(1,2))
X = vec.fit_transform(df["text"].astype(str).tolist())
terms = vec.get_feature_names_out()
X.shape

(100, 81)

## Step 3: Fit NMF

In [4]:
n_topics = 10
nmf = NMF(n_components=n_topics, init="nndsvda", random_state=42, max_iter=400)
W = nmf.fit_transform(X)
H = nmf.components_
W.shape, H.shape

((100, 10), (10, 81))

## Step 4: Top Words

In [5]:
def top_words_per_topic(H, terms, topn=12):
    for k, row in enumerate(H):
        top_idx = row.argsort()[-topn:][::-1]
        print(f"Topic {k}: " + ", ".join(terms[i] for i in top_idx))
top_words_per_topic(H, terms, topn=12)

Topic 0: login, logout, cause, mobile, error, page, sso, dashboard, token, alert, triggers, password
Topic 1: fails, fields, missing, update, vpn, error, ip, devices, save, android, older, profile
Topic 2: links, app, frequently, exported, mode, headers, csv, android, quickly, switching, mobile, large
Topic 3: integration, does, sync, oauth, change, password, user, mode, save, link, reset, quickly
Topic 4: report, pagination, charts, custom, emails, arrive, resets, save, fields, link, email, hours
Topic 5: returns, api, file, requests, responses, match, webhooks, processed, events, card, payment, fails
Topic 6: slowly, older, dashboard, hours, devices, android, email, arrive, events, switching, app, quickly
Topic 7: shows, form, duplicate, error, email, invoice, sso, token, users, login, arrive, hours
Topic 8: date, switching, resets, invoice, match, does, duplicate, android, quickly, responses, app, custom
Topic 9: notifications, users, push, upload, events, mobile, mode, arrive, area

## Step 5: Assign & Save

In [None]:
df["dominant_topic"] = W.argmax(axis=1)
df.to_csv("nmf_topics_assigned.csv", index=False)
df.head()