<a href="https://colab.research.google.com/github/ganeshmukhiya/Topic-Modeling-NMF/blob/main/K_VALUE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.datasets import fetch_20newsgroups
from gensim.models import CoherenceModel
from gensim.corpora.dictionary import Dictionary
from sklearn.model_selection import KFold

# Load the dataset
newsgroups_data = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))
documents = newsgroups_data.data

# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')  # Limit to top 1000 features
tfidf = tfidf_vectorizer.fit_transform(documents)

# Tokenize documents for Gensim
def tokenize_document(doc):
    return doc.lower().split()  # Basic tokenization

tokenized_docs = [tokenize_document(doc) for doc in documents]
dictionary = Dictionary(tokenized_docs)

# Initialize NMF parameters
num_topics = 10
k = 5  # Number of folds for cross-validation

# Initialize KFold
kf = KFold(n_splits=k, shuffle=True, random_state=42)
coherence_scores = []

fold = 1
for train_index, test_index in kf.split(documents):
    # Split data into training and test sets
    train_docs = [documents[i] for i in train_index]
    tfidf_train = tfidf[train_index]

    # Train the NMF model on the training data
    nmf_model = NMF(n_components=num_topics, random_state=42)
    nmf_model.fit(tfidf_train)

    # Extract topics (components) from the NMF model
    topics = []
    for topic_idx, topic in enumerate(nmf_model.components_):
        topic_words = [tfidf_vectorizer.get_feature_names_out()[i] for i in topic.argsort()[:-11:-1]]
        topics.append(topic_words)

    # Calculate the coherence score using the 'c_v' measure
    coherence_model = CoherenceModel(
        topics=topics,
        texts=[tokenized_docs[i] for i in train_index],
        dictionary=dictionary,
        coherence='c_v'
    )
    coherence_score = coherence_model.get_coherence()
    coherence_scores.append(coherence_score)

    print(f"Fold {fold}: Coherence Score = {coherence_score:.4f}")
    fold += 1

# After the loop, all coherence scores have been printed
# If you want to use them further, you can access them from coherence_scores

print("\nAll Coherence Scores Across Folds:")
print(coherence_scores)


Fold 1: Coherence Score = 0.6020
Fold 2: Coherence Score = 0.6069
Fold 3: Coherence Score = 0.5973
Fold 4: Coherence Score = 0.5743
Fold 5: Coherence Score = 0.5731

All Coherence Scores Across Folds:
[0.6019699128340276, 0.6068739034349182, 0.5972773545790605, 0.5742667539329986, 0.5731046572037891]


In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.datasets import fetch_20newsgroups
from gensim.models import CoherenceModel
from gensim.corpora.dictionary import Dictionary
from sklearn.model_selection import KFold

# Load the dataset
newsgroups_data = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))
documents = newsgroups_data.data

# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')  # Limit to top 1000 features
tfidf = tfidf_vectorizer.fit_transform(documents)

# Tokenize documents for Gensim
def tokenize_document(doc):
    return doc.lower().split()  # Basic tokenization

tokenized_docs = [tokenize_document(doc) for doc in documents]
dictionary = Dictionary(tokenized_docs)

# Initialize parameters
num_topics = 5
k = 5  # Number of folds for cross-validation

# Initialize KFold
kf = KFold(n_splits=k, shuffle=True, random_state=42)

# Initialize lists to store coherence scores
nmf_coherence_scores = []
lda_coherence_scores = []
svd_coherence_scores = []

fold = 1
for train_index, test_index in kf.split(documents):
    # Split data into training and test sets
    train_docs = [documents[i] for i in train_index]
    tfidf_train = tfidf[train_index]

    # NMF Model
    nmf_model = NMF(n_components=num_topics, random_state=42)
    nmf_model.fit(tfidf_train)

    # Extract topics from NMF model
    nmf_topics = []
    for topic_idx, topic in enumerate(nmf_model.components_):
        nmf_topics.append([tfidf_vectorizer.get_feature_names_out()[i] for i in topic.argsort()[:-11:-1]])

    # Calculate NMF coherence score
    nmf_coherence_model = CoherenceModel(
        topics=nmf_topics,
        texts=[tokenized_docs[i] for i in train_index],
        dictionary=dictionary,
        coherence='c_v'
    )
    nmf_coherence_score = nmf_coherence_model.get_coherence()
    nmf_coherence_scores.append(nmf_coherence_score)

    # LDA Model
    lda_model = LatentDirichletAllocation(n_components=num_topics, random_state=42)
    lda_model.fit(tfidf_train)

    # Extract topics from LDA model
    lda_topics = []
    for topic_idx, topic in enumerate(lda_model.components_):
        lda_topics.append([tfidf_vectorizer.get_feature_names_out()[i] for i in topic.argsort()[:-11:-1]])

    # Calculate LDA coherence score
    lda_coherence_model = CoherenceModel(
        topics=lda_topics,
        texts=[tokenized_docs[i] for i in train_index],
        dictionary=dictionary,
        coherence='c_v'
    )
    lda_coherence_score = lda_coherence_model.get_coherence()
    lda_coherence_scores.append(lda_coherence_score)

    # SVD Model
    svd_model = TruncatedSVD(n_components=num_topics, random_state=42)
    svd_model.fit(tfidf_train)

    # Extract topics from SVD model
    svd_topics = []
    for topic_idx, topic in enumerate(svd_model.components_):
        svd_topics.append([tfidf_vectorizer.get_feature_names_out()[i] for i in topic.argsort()[:-11:-1]])

    # Calculate SVD coherence score
    svd_coherence_model = CoherenceModel(
        topics=svd_topics,
        texts=[tokenized_docs[i] for i in train_index],
        dictionary=dictionary,
        coherence='c_v'
    )
    svd_coherence_score = svd_coherence_model.get_coherence()
    svd_coherence_scores.append(svd_coherence_score)

    # Print coherence scores for the current fold
    print(f"Fold {fold}:")
    print(f"  NMF Coherence Score = {nmf_coherence_score:.4f}")
    print(f"  LDA Coherence Score = {lda_coherence_score:.4f}")
    print(f"  SVD Coherence Score = {svd_coherence_score:.4f}")
    fold += 1

# Print final results
print("\nAll Coherence Scores Across Folds:")
print("NMF Coherence Scores:", nmf_coherence_scores)
print("LDA Coherence Scores:", lda_coherence_scores)
print("SVD Coherence Scores:", svd_coherence_scores)


Fold 1:
  NMF Coherence Score = 0.6254
  LDA Coherence Score = 0.4441
  SVD Coherence Score = 0.4305
Fold 2:
  NMF Coherence Score = 0.6316
  LDA Coherence Score = 0.4104
  SVD Coherence Score = 0.4379
Fold 3:
  NMF Coherence Score = 0.5625
  LDA Coherence Score = 0.4322
  SVD Coherence Score = 0.4150
Fold 4:
  NMF Coherence Score = 0.6301
  LDA Coherence Score = 0.4166
  SVD Coherence Score = 0.4246
Fold 5:
  NMF Coherence Score = 0.6222
  LDA Coherence Score = 0.4198
  SVD Coherence Score = 0.4167

All Coherence Scores Across Folds:
NMF Coherence Scores: [0.6253758249129981, 0.631564507544338, 0.5624880728900317, 0.6300647067909858, 0.6221555595058417]
LDA Coherence Scores: [0.4441385667781782, 0.41037410627213544, 0.43223494277455243, 0.41655749895762534, 0.41979036803442005]
SVD Coherence Scores: [0.4304622142714766, 0.4378764190734475, 0.4149776079610268, 0.4245609436756025, 0.4166606393548677]
