<a href="https://colab.research.google.com/github/ganeshmukhiya/Topic-Modeling-NMF/blob/main/topic_extracted.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

# Fetch the data
newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))

# Vectorize the data using TF-IDF
vectorizer = TfidfVectorizer(stop_words='english')
X_train = vectorizer.fit_transform(newsgroups_train.data)

# Set the number of topics
n_topics = 5

# Apply NMF
nmf = NMF(n_components=n_topics, random_state=1)
W_train = nmf.fit_transform(X_train)
H = nmf.components_

# Extract and display the latent topics with unique top words
feature_names = vectorizer.get_feature_names_out()

# Set to track the words that have already been used
used_words = set()

for topic_idx, topic in enumerate(H):
    # Sort terms by their importance to the topic
    sorted_indices = topic.argsort()[::-1]

    # Find the first term that hasn't been used yet
    top_term = None
    for index in sorted_indices:
        word = feature_names[index]
        if word not in used_words:
            top_term = word
            used_words.add(word)
            break

    print(f"Topic #{topic_idx+1}: {top_term}")


Topic #1: don
Topic #2: windows
Topic #3: god
Topic #4: geb
Topic #5: key


In [None]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

# Fetch the data
newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))

# Vectorize the data using TF-IDF
vectorizer = TfidfVectorizer(stop_words='english')
X_train = vectorizer.fit_transform(newsgroups_train.data)

# Set the number of topics
n_topics = 5

# Apply NMF
nmf = NMF(n_components=n_topics, random_state=1)
W_train = nmf.fit_transform(X_train)
H = nmf.components_

# Extract and display the latent topics with additional words associated with "god"
feature_names = vectorizer.get_feature_names_out()

# Set to track the words that have already been used
used_words = set()

for topic_idx, topic in enumerate(H):
    # Sort terms by their importance to the topic
    sorted_indices = topic.argsort()[::-1]

    # Find the top term that hasn't been used yet
    top_term = None
    for index in sorted_indices:
        word = feature_names[index]
        if word not in used_words:
            top_term = word
            used_words.add(word)
            break

    # Display the topic and additional words if the top word is "god"
    if top_term == "god":
        additional_words = [feature_names[i] for i in sorted_indices[1:6]]  # Next top 5 terms
        print(f"Topic #{topic_idx+1}: {top_term} (Additional words: {', '.join(additional_words)})")
    else:
        print(f"Topic #{topic_idx+1}: {top_term}")


Topic #1: don
Topic #2: windows
Topic #3: god (Additional words: jesus, bible, believe, faith, christian)
Topic #4: geb
Topic #5: key


In [1]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from gensim.models import CoherenceModel
from gensim.corpora import Dictionary
from gensim.utils import simple_preprocess
import gensim

# 1. Fetch the data
newsgroups = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))
documents = newsgroups.data

# 2. Preprocess the text to get bigrams
def preprocess_text(texts):
    # Use gensim's simple_preprocess for tokenization
    return [simple_preprocess(doc, deacc=True) for doc in texts]

processed_docs = preprocess_text(documents)

# 3. Create bigrams using gensim's Phrases model
bigram = gensim.models.Phrases(processed_docs, min_count=5, threshold=100)
bigram_mod = gensim.models.phrases.Phraser(bigram)
bigram_docs = [bigram_mod[doc] for doc in processed_docs]

# 4. Create dictionary and corpus needed for coherence model
dictionary = Dictionary(bigram_docs)
corpus = [dictionary.doc2bow(doc) for doc in bigram_docs]

# 5. Convert the text back into a format suitable for TF-IDF (space-separated bigrams)
bigram_texts = [" ".join(doc) for doc in bigram_docs]

# 6. Apply TF-IDF Vectorizer with bigrams
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_df=0.95, min_df=2, stop_words='english')
X_tfidf = vectorizer.fit_transform(bigram_texts)

# 7. Apply NMF for topic modeling
num_topics = 10  # Define the number of topics
nmf_model = NMF(n_components=num_topics, random_state=42)
W = nmf_model.fit_transform(X_tfidf)
H = nmf_model.components_

# 8. Display topics with top words (bi-grams)
feature_names = vectorizer.get_feature_names_out()
for topic_idx, topic in enumerate(H):
    print(f"Topic #{topic_idx+1}:")
    print(" ".join([feature_names[i] for i in topic.argsort()[:-11:-1]]))  # Top 10 words

# 9. Evaluate topic coherence using Gensim
def format_topics_nmf(H, feature_names, num_top_words):
    topics = []
    for topic_idx, topic in enumerate(H):
        top_words = [feature_names[i] for i in topic.argsort()[:-num_top_words - 1:-1]]
        topics.append(top_words)
    return topics

topics = format_topics_nmf(H, feature_names, 10)

# Convert NMF topics into Gensim-compatible format for coherence
topics_gensim = [[word for word in topic] for topic in topics]

# Coherence Model
coherence_model_nmf = CoherenceModel(topics=topics_gensim, texts=bigram_docs, dictionary=dictionary, coherence='c_v')
coherence_nmf = coherence_model_nmf.get_coherence()
print(f"Coherence Score: {coherence_nmf}")




Topic #1:
just don like think good ve car know time really
Topic #2:
windows dos ms file os mouse problem run running drivers
Topic #3:
intellect geb_cadre edu shameful geb_cadre dsl_pitt geb_cadre shameful surrender dsl_pitt edu dsl_pitt surrender soon chastity intellect jxp_skepticism chastity
Topic #4:
god jesus bible believe faith christ christian christians sin say
Topic #5:
game team games year hockey baseball players season espn play
Topic #6:
drive scsi mb card ide mhz drives mac disk bit
Topic #7:
thanks does know advance thanks advance does know mail hi anybody info
Topic #8:
file program use software image available data edu graphics files
Topic #9:
people government israel jews state said did right children israeli
Topic #10:
key keys chip encryption government clipper use algorithm bit des
Coherence Score: 0.7375409049513133


In [3]:
# Extract and display the latent topics where the top term is a two-word bi-gram
feature_names = vectorizer.get_feature_names_out()

# Set to track the words that have already been used
used_words = set()

for topic_idx, topic in enumerate(H):
    # Sort terms by their importance to the topic
    sorted_indices = topic.argsort()[::-1]

    # Find the top term that is a two-word bi-gram and hasn't been used yet
    top_term = None
    for index in sorted_indices:
        word = feature_names[index]
        if len(word.split()) == 2 and word not in used_words:  # Check if the term is a bi-gram (2 words)
            top_term = word
            used_words.add(word)
            break

    # Display the bi-gram topic and additional words
    if top_term:
        additional_words = [feature_names[i] for i in sorted_indices[1:6]]  # Next top 5 terms
        print(f"Topic #{topic_idx+1}: {top_term} (Additional words: {', '.join(additional_words)})")


Topic #1: don know (Additional words: don, like, think, good, ve)
Topic #2: ms windows (Additional words: dos, ms, file, os, mouse)
Topic #3: intellect geb_cadre (Additional words: edu shameful, geb_cadre dsl_pitt, geb_cadre, shameful surrender, dsl_pitt edu)
Topic #4: believe god (Additional words: jesus, bible, believe, faith, christ)
Topic #5: baseball game (Additional words: team, games, year, hockey, baseball)
Topic #6: ide drive (Additional words: scsi, mb, card, ide, mhz)
Topic #7: thanks advance (Additional words: does, know, advance, thanks advance, does know)
Topic #8: computer graphics (Additional words: program, use, software, image, available)
Topic #9: right people (Additional words: government, israel, jews, state, said)
Topic #10: public key (Additional words: keys, chip, encryption, government, clipper)
