<a href="https://colab.research.google.com/github/ganeshmukhiya/Topic-Modeling-NMF/blob/main/SVD_Coherence.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
import numpy as np
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import CoherenceModel
from gensim.corpora import Dictionary
from gensim.utils import simple_preprocess
from sklearn.datasets import fetch_20newsgroups

# Fetch the dataset
newsgroups_data = fetch_20newsgroups(subset='train')
documents = newsgroups_data.data

# Step 1: Vectorize the documents using TF-IDF
vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
X_train = vectorizer.fit_transform(documents)
feature_names = vectorizer.get_feature_names_out()

# Step 2: Apply SVD
n_topics = 5  # Specify the number of topics
svd = TruncatedSVD(n_components=n_topics, random_state=1)
W_train = svd.fit_transform(X_train)
H = svd.components_

# Step 3: Extract the top words for each topic
top_words = []
for topic_idx, topic in enumerate(H):
    top_terms = [feature_names[i] for i in topic.argsort()[:-11:-1]]  # Top 10 terms
    top_words.append(top_terms)

# Step 4: Prepare the data for Gensim
texts = [simple_preprocess(doc) for doc in documents]

# Create a dictionary representation of the documents
dictionary = Dictionary(texts)

# Convert documents to bag-of-words format
corpus = [dictionary.doc2bow(text) for text in texts]

# Step 5: Calculate coherence score using Gensim's CoherenceModel
coherence_model_svd = CoherenceModel(topics=top_words, texts=texts, dictionary=dictionary, coherence='c_v')
coherence_score = coherence_model_svd.get_coherence()

print(f"SVD Model Coherence Score: {coherence_score}")


SVD Model Coherence Score: 0.713469150066705
