<a href="https://colab.research.google.com/github/ganeshmukhiya/Topic-Modeling-NMF/blob/main/removed_stopword.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [14]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
import re

In [12]:
# Load the dataset
newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))


In [15]:
# Define custom stop words with patterns
custom_stop_words_patterns = [
    r'\bhd\b', r'\bcd\b', r'\bscsi\b', r'\bhi\b',
    r'\b(don|just|like|think|know|good|ve|time|people|car)\b',
    r'\b(card|video|monitor|vga|bus|00|cards|color|drivers|ram)\b',
    r'\b(god|jesus|bible|believe|faith|christ|christian|christians|does|say)\b',
    r'\b(geb|dsl|chastity|n3jxp|pitt|cadre|shameful|intellect|skepticism|surrender)\b',
    r'\b(key|chip|encryption|clipper|keys|government|escrow|algorithm|use|public)\b',
    r'\b(drive|disk|drives|ide|hard|controller|floppy)\b',
    r'\b(game|team|games|year|players|season|play|hockey|win|league)\b',
    r'\b(thanks|mail|advance|info|looking|address|anybody)\b',
    r'\b(people|israel|government|armenian|jews|israeli|armenians|state|turkish|rights)\b',
    r'\b(windows|file|dos|window|files|program|using|running|version)\b'
]

In [16]:
# Preprocess the documents to remove words matching these patterns
processed_docs = []
for doc in newsgroups_train.data:
    for pattern in custom_stop_words_patterns:
        doc = re.sub(pattern, '', doc, flags=re.IGNORECASE)
    processed_docs.append(doc)

In [17]:
# Initialize the TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2)

In [18]:
# Fit and transform the dataset
tfidf = tfidf_vectorizer.fit_transform(processed_docs)

In [19]:
# Fit the NMF model
nmf = NMF(n_components=10, random_state=42)
nmf.fit(tfidf)

In [20]:
# Display the topics and top words
for i, topic in enumerate(nmf.components_):
    print(f"Topic #{i+1}:")
    print([tfidf_vectorizer.get_feature_names_out()[index] for index in topic.argsort()[-10:]])

Topic #1:
['no', 'as', 'are', 'what', 'the', 'there', 'this', 'not', 'that', 'is']
Topic #2:
['that', 'have', 'what', 'are', 're', 'can', 'do', 'if', 'your', 'you']
Topic #3:
['is', 'sale', 'or', 'new', 'software', 'system', 'at', 'with', 'and', 'for']
Topic #4:
['but', 'said', 'in', 'had', 'that', 'and', 'him', 'was', 'his', 'he']
Topic #5:
['by', 'when', 'same', 'first', 'at', 'from', 'was', 'in', 'on', 'the']
Topic #6:
['one', 'their', 'who', 'are', 'as', 'by', 'the', 'in', 'and', 'of']
Topic #7:
['we', 'as', 'want', 'this', 'do', 'have', 'will', 'would', 'be', 'to']
Topic #8:
['in', 'have', 'were', 'them', 'their', 'we', 'and', 'are', 'that', 'they']
Topic #9:
['have', 'can', 'this', 'or', 'am', 'if', 'please', 'anyone', 'any', 'me']
Topic #10:
['soon', 'about', 'on', 'too', 'with', 'was', 'but', 'and', 'my', 'it']
