<a href="https://colab.research.google.com/github/ganeshmukhiya/Topic-Modeling-NMF/blob/main/LDA_Coherence.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from gensim.models import CoherenceModel
from gensim.corpora import Dictionary
from gensim.utils import simple_preprocess

# Fetch the data
train_data = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))

# Vectorize the data using TF-IDF
vectorizer = TfidfVectorizer(stop_words='english')
X_train = vectorizer.fit_transform(train_data.data)

# Set the number of topics
n_topics = 5

# Apply LDA
lda = LatentDirichletAllocation(n_components=n_topics, random_state=1)
lda.fit(X_train)
H = lda.components_

# Extract the top words for each topic
feature_names = vectorizer.get_feature_names_out()
top_words = []
for topic_idx, topic in enumerate(H):
    top_terms = [feature_names[i] for i in topic.argsort()[:-11:-1]]  # Top 10 terms
    top_words.append(top_terms)

# Prepare the data for Gensim
# Convert documents to a list of words (simple tokenization)
texts = [simple_preprocess(doc) for doc in train_data.data]

# Create a dictionary representation of the documents
dictionary = Dictionary(texts)

# Convert documents to bag-of-words format
corpus = [dictionary.doc2bow(text) for text in texts]

# Calculate coherence score
coherence_model_lda = CoherenceModel(topics=top_words, texts=texts, dictionary=dictionary, coherence='c_v')
coherence_score = coherence_model_lda.get_coherence()

print(f"LDA Model Coherence Score: {coherence_score}")



LDA Model Coherence Score: 0.4455634450183593
