<a href="https://colab.research.google.com/github/ganeshmukhiya/Topic-Modeling-NMF/blob/main/cross_validation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
#Import Required Libraries
import numpy as np
from sklearn.decomposition import NMF, TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import KFold
from sklearn.decomposition import LatentDirichletAllocation as LDA
from gensim.models import CoherenceModel
from gensim.corpora import Dictionary
from gensim.utils import simple_preprocess


In [9]:
#Vectorize the Data
# Load the dataset (fetch_20newsgroups)
from sklearn.datasets import fetch_20newsgroups
data = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes')).data

# Preprocess and vectorize the data
vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
X = vectorizer.fit_transform(data)

In [10]:
#Define the Cross-Validation Procedure
kfold = KFold(n_splits=5, shuffle=True, random_state=1)  # 5-fold cross-validation


In [11]:
#Initialize Topic Models
n_topics = 5
nmf = NMF(n_components=n_topics, random_state=1)
svd = TruncatedSVD(n_components=n_topics, random_state=1)
lda = LDA(n_components=n_topics, random_state=1)


In [12]:
#Perform Cross-Validation
nmf_coherence_scores = []
svd_coherence_scores = []
lda_coherence_scores = []

for train_index, test_index in kfold.split(X):
    X_train, X_test = X[train_index], X[test_index]

    # Fit NMF
    W_train = nmf.fit_transform(X_train)
    H = nmf.components_

    # Fit SVD
    svd.fit(X_train)
    svd_topics = svd.components_

    # Fit LDA
    lda.fit(X_train)
    lda_topics = lda.components_

    # Get top words for each model
    feature_names = vectorizer.get_feature_names_out()
    nmf_top_words = [[feature_names[i] for i in topic.argsort()[:-11:-1]] for topic in H]
    svd_top_words = [[feature_names[i] for i in topic.argsort()[:-11:-1]] for topic in svd_topics]
    lda_top_words = [[feature_names[i] for i in topic.argsort()[:-11:-1]] for topic in lda_topics]

    # Convert documents to a list of words (simple tokenization)
    texts = [simple_preprocess(doc) for doc in data]
    dictionary = Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]

    # Calculate coherence score for NMF
    coherence_model_nmf = CoherenceModel(topics=nmf_top_words, texts=texts, dictionary=dictionary, coherence='c_v')
    nmf_coherence_scores.append(coherence_model_nmf.get_coherence())

    # Calculate coherence score for SVD
    coherence_model_svd = CoherenceModel(topics=svd_top_words, texts=texts, dictionary=dictionary, coherence='c_v')
    svd_coherence_scores.append(coherence_model_svd.get_coherence())

    # Calculate coherence score for LDA
    coherence_model_lda = CoherenceModel(topics=lda_top_words, texts=texts, dictionary=dictionary, coherence='c_v')
    lda_coherence_scores.append(coherence_model_lda.get_coherence())


In [13]:
#Calculate Average Coherence Scores
print(f"Average NMF Coherence Score: {np.mean(nmf_coherence_scores)}")
print(f"Average SVD Coherence Score: {np.mean(svd_coherence_scores)}")
print(f"Average LDA Coherence Score: {np.mean(lda_coherence_scores)}")


Average NMF Coherence Score: 0.6850835781610958
Average SVD Coherence Score: 0.5063859855274969
Average LDA Coherence Score: 0.526716650730926
