<a href="https://colab.research.google.com/github/ganeshmukhiya/Topic-Modeling-NMF/blob/main/NMF_TOPIC_MODEL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

In [2]:
# Load the training split of the 20 Newsgroups corpus.
# Headers, footers and quoted replies are removed from every post so that only
# the message body is kept.
train_data = fetch_20newsgroups(
    subset='train',
    remove=('headers', 'footers', 'quotes'),
)

In [3]:
# Peek at the first post to see what the raw text looks like before it is
# vectorized.
first_post = train_data.data[0]
print("Raw Text Data Example:", first_post, sep="\n")

Raw Text Data Example:
I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is 
all I know. If anyone can tellme a model name, engine specs, years
of production, where this car is made, history, or whatever info you
have on this funky looking car, please e-mail.


In [4]:
# Turn the raw posts into a sparse TF-IDF document-term matrix.
# No custom cleaning is applied here: the vectorizer itself lowercases the
# text and drops scikit-learn's built-in English stop words. Any additional
# preprocessing would have to be added before this step.
vectorizer = TfidfVectorizer(
    lowercase=True,
    stop_words='english',
)
X_train = vectorizer.fit_transform(train_data.data)

In [5]:
# Inspect the TF-IDF matrix: its overall shape, then the stored (non-zero)
# entries of the first document's row.
tfidf_shape = X_train.shape
print(f"\nTF-IDF Matrix Shape: {tfidf_shape}")
print("TF-IDF Matrix Sample (First Document):", X_train[0], sep="\n")


TF-IDF Matrix Shape: (11314, 101322)
TF-IDF Matrix Sample (First Document):
  (0, 59071)	0.10043853867312116
  (0, 57250)	0.1063473585616558
  (0, 41874)	0.224548896412017
  (0, 49800)	0.11869932893481257
  (0, 46690)	0.12504220873599214
  (0, 73174)	0.16142029533900565
  (0, 99608)	0.09418459052541318
  (0, 84050)	0.16329311028814825
  (0, 37208)	0.1434127293323407
  (0, 62594)	0.13037295035007848
  (0, 87913)	0.25808578247347563
  (0, 54493)	0.06961997844491917
  (0, 23430)	0.12937103288512333
  (0, 77676)	0.12197186951739486
  (0, 81450)	0.1461308934288897
  (0, 24583)	0.19644480500804062
  (0, 16806)	0.1407774554706102
  (0, 83208)	0.11339406589538423
  (0, 76269)	0.08978258481915573
  (0, 34742)	0.17300821242559045
  (0, 24108)	0.24723134514216435
  (0, 25437)	0.10548299054214269
  (0, 11174)	0.20599311323287353
  (0, 35902)	0.1266709604197344
  (0, 9843)	0.20797700857530224
  (0, 55606)	0.13822596989753821
  (0, 57247)	0.1352084247105906
  (0, 84312)	0.16368392505928514
  (0, 34

In [6]:
# Factorize the TF-IDF matrix with NMF (X_train ~= W_train @ H) into 20
# components. The seed is pinned via random_state.
nmf = NMF(
    random_state=1,
    n_components=20,
)
W_train = nmf.fit_transform(X_train)  # document-topic weights
H = nmf.components_                   # topic-term weights

In [7]:
# Inspect W (the document-topic matrix): its shape, then the topic weights
# assigned to the first document.
print(f"\nW Matrix Shape: {W_train.shape}")
print("W Matrix Sample (First Document):", W_train[0], sep="\n")


W Matrix Shape: (11314, 20)
W Matrix Sample (First Document):
[0.         0.         0.         0.         0.         0.
 0.         0.03227418 0.1458308  0.         0.         0.
 0.         0.00085594 0.01571497 0.         0.         0.
 0.         0.        ]


In [8]:
# Inspect H (the topic-term matrix): its shape, then the per-term weights of
# the first topic.
print(f"\nH Matrix Shape: {H.shape}")
print("H Matrix Sample (First Topic):", H[0], sep="\n")


H Matrix Shape: (20, 101322)
H Matrix Sample (First Topic):
[0. 0. 0. ... 0. 0. 0.]


In [10]:
# Number of topics (NMF components) to extract.
# The factorization that uses this value expects X_train to be the TF-IDF
# matrix produced by the preprocessing step.
n_topics: int = 20

In [11]:
# Apply NMF with n_topics components.
# An identical model (same component count, random_state=1, same X_train) is
# already fitted earlier in this notebook, and NMF is an expensive step on a
# matrix of this size. Refit only when no fitted model exists yet or when
# n_topics has changed; otherwise reuse the existing nmf / W_train as-is.
# NOTE(review): reuse assumes the earlier fit also used random_state=1 and the
# same X_train, as it does in this notebook -- confirm if that cell is edited.
_previous_nmf = globals().get("nmf")
if getattr(_previous_nmf, "n_components", None) != n_topics:
    nmf = NMF(n_components=n_topics, random_state=1)
    W_train = nmf.fit_transform(X_train)  # document-topic weights
H = nmf.components_  # topic-term weights of whichever model is current

In [13]:
# Display the top words for each topic.
# Each row of H holds one topic's weight for every vocabulary term, so the
# terms with the largest weights are the ones that characterize the topic.
n_top_words = 10  # how many terms to list per topic
feature_names = vectorizer.get_feature_names_out()

for topic_idx, topic in enumerate(H):
    # argsort() is ascending: reverse it, then keep the first n_top_words
    # indices to get the heaviest terms in descending order of weight.
    top_indices = topic.argsort()[::-1][:n_top_words]
    top_terms = [feature_names[i] for i in top_indices]
    print(f"Topic #{topic_idx+1}: {', '.join(top_terms)}")

Topic #1: just, don, like, think, ve, good, really, time, say, ll
Topic #2: use, mac, software, apple, modem, port, memory, pc, printer, used
Topic #3: god, jesus, bible, believe, faith, christ, christian, christians, church, life
Topic #4: geb, dsl, n3jxp, chastity, pitt, cadre, shameful, intellect, skepticism, surrender
Topic #5: key, chip, encryption, clipper, keys, escrow, algorithm, government, secure, security
Topic #6: drive, disk, hard, drives, floppy, boot, ide, cd, controller, hd
Topic #7: game, team, games, year, players, season, play, hockey, win, league
Topic #8: thanks, mail, advance, hi, looking, address, info, email, send, post
Topic #9: car, bike, cars, engine, new, miles, dealer, good, insurance, price
Topic #10: card, video, monitor, vga, drivers, cards, bus, color, driver, ati
Topic #11: people, government, gun, law, right, guns, state, rights, crime, make
Topic #12: windows, dos, ms, running, os, version, microsoft, nt, drivers, driver
Topic #13: window, motif, ser

In [14]:
# Report the dominant topic of the first document.
# Row 0 of W_train holds that document's weight on every topic; the index of
# the largest weight is the dominant topic. It is printed 1-based so it lines
# up with the "Topic #N" numbering used when listing the top words.
document_topic_distribution = W_train[0]
dominant_topic = document_topic_distribution.argmax()
print(f"\nDocument 1 is most associated with Topic #{dominant_topic + 1}")


Document 1 is most associated with Topic #9
