<a href="https://colab.research.google.com/github/ganeshmukhiya/Topic-Modeling-NMF/blob/main/NMF_TEST.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF


In [1]:
# Sanity-check that the 20 Newsgroups corpus loads.
from sklearn.datasets import fetch_20newsgroups

# Request the 'all' subset and ask scikit-learn to remove headers, footers and
# quoted text from each post.
newsgroups = fetch_20newsgroups(
    subset='all',
    remove=('headers', 'footers', 'quotes'),
)

# Summarise what came back: corpus size, number of labels, and the labels.
category_names = newsgroups.target_names
print(f"Total number of documents: {len(newsgroups.data)}")
print(f"Number of categories: {len(category_names)}")
print(f"Categories: {category_names}")


Total number of documents: 18846
Number of categories: 20
Categories: ['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


In [2]:
# Sanity-check the TF-IDF vectorization step.
from sklearn.feature_extraction.text import TfidfVectorizer

# Configure the vectorizer: English stop words are filtered out, and the
# document-frequency bounds (min_df / max_df) prune very rare and very common terms.
vectorizer = TfidfVectorizer(
    stop_words='english',
    min_df=2,
    max_df=0.95,
)

# Learn the vocabulary and build the document-term matrix in a single pass.
documents = newsgroups.data
tfidf = vectorizer.fit_transform(documents)

# Report the matrix dimensions as (documents, vocabulary terms).
print(f"Shape of TF-IDF matrix: {tfidf.shape}")


Shape of TF-IDF matrix: (18846, 51840)


In [3]:
# Sanity-check the NMF factorization.
from sklearn.decomposition import NMF

# Factorize the TF-IDF matrix into n_topics components; random_state is pinned
# so that repeated runs start from the same seed.
n_topics = 10
nmf_model = NMF(random_state=42, n_components=n_topics)

# W: one row of topic weights per document.
W = nmf_model.fit_transform(tfidf)
# H: one row of term weights per topic.
H = nmf_model.components_

# Report both factor shapes.
doc_topic_shape = W.shape
topic_term_shape = H.shape
print(f"Shape of W (document-topic matrix): {doc_topic_shape}")
print(f"Shape of H (topic-term matrix): {topic_term_shape}")


Shape of W (document-topic matrix): (18846, 10)
Shape of H (topic-term matrix): (10, 51840)


In [4]:
#Sample Topic Extraction
def print_top_words(topic_term_matrix, vocabulary, n_top_words=10):
    """Print the highest-weighted terms of every topic.

    Parameters
    ----------
    topic_term_matrix : array-like of shape (n_topics, n_terms)
        One row of term weights per topic (here, ``H`` from the NMF fit).
    vocabulary : sequence of str
        The term belonging to each column index of ``topic_term_matrix``.
    n_top_words : int, default=10
        Number of terms to list per topic; must be non-negative.

    Raises
    ------
    ValueError
        If ``n_top_words`` is negative.
    """
    if n_top_words < 0:
        raise ValueError("n_top_words must be non-negative")
    for topic_idx, topic in enumerate(topic_term_matrix):
        # argsort is ascending, so reverse it to rank terms by descending weight
        # before keeping the first n_top_words indices.
        top_term_ids = topic.argsort()[::-1][:n_top_words]
        print(f"Topic #{topic_idx + 1}:")
        print(" ".join(vocabulary[i] for i in top_term_ids))
        print("\n")


# Extract the feature names (words) from the TF-IDF vectorizer
feature_names = vectorizer.get_feature_names_out()

# Display the top words for each topic
print_top_words(H, feature_names)


Topic #1:
don just like think know good ve time really want


Topic #2:
windows dos file program files window use using run running


Topic #3:
god jesus bible believe christ faith christian christians sin church


Topic #4:
drive scsi ide disk card controller hard drives bus floppy


Topic #5:
key chip encryption clipper keys government escrow use algorithm phone


Topic #6:
thanks does know mail advance hi info looking information help


Topic #7:
00 new 10 sale car price 50 20 shipping offer


Topic #8:
game games team year hockey baseball season players play espn


Topic #9:
edu geb dsl cadre n3jxp chastity pitt skepticism intellect shameful


Topic #10:
people government israel armenian jews armenians gun state did children




In [8]:
# Select a random document from the dataset.
# The draw comes from a generator with a fixed seed, so the same document (and
# therefore the same printed output) is produced on every Restart & Run All.
# The global np.random state is left untouched.
rng = np.random.default_rng(42)
doc_id = int(rng.integers(0, len(W)))  # upper bound is exclusive, as with randint
print(f"Document #{doc_id} Topic Distribution: {W[doc_id]}")


Document #6517 Topic Distribution: [0.03759759 0.00012413 0.         0.         0.         0.
 0.00160295 0.         0.00090766 0.00592252]


In [6]:
# List the ten highest-weighted terms of every topic (one row of H per topic).
# This repeats the per-topic listing shown earlier in the notebook.
for topic_number, term_weights in enumerate(H, start=1):
    # Rank term indices by descending weight and keep the first ten.
    ranked_term_ids = term_weights.argsort()[::-1][:10]
    top_terms = [feature_names[term_id] for term_id in ranked_term_ids]
    print(f"Topic #{topic_number}:")
    print(" ".join(top_terms))
    print("\n")


Topic #1:
don just like think know good ve time really want


Topic #2:
windows dos file program files window use using run running


Topic #3:
god jesus bible believe christ faith christian christians sin church


Topic #4:
drive scsi ide disk card controller hard drives bus floppy


Topic #5:
key chip encryption clipper keys government escrow use algorithm phone


Topic #6:
thanks does know mail advance hi info looking information help


Topic #7:
00 new 10 sale car price 50 20 shipping offer


Topic #8:
game games team year hockey baseball season players play espn


Topic #9:
edu geb dsl cadre n3jxp chastity pitt skepticism intellect shameful


Topic #10:
people government israel armenian jews armenians gun state did children


