In [None]:
import gensim
from gensim.models import LdaModel
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import silhouette_samples
from gensim.models import CoherenceModel
from sklearn.metrics.pairwise import cosine_similarity

# Fetch the dataset (subset='all' requests both the train and test splits)
data = fetch_20newsgroups(subset='all')['data']

# Preprocess the data: bag-of-words counts limited to 1000 features, dropping
# English stop words, terms present in more than half of the documents, and
# terms present in fewer than 5 documents
vectorizer = CountVectorizer(max_features=1000, max_df=0.5, min_df=5, stop_words='english')
X = vectorizer.fit_transform(data)

# Create the dictionary and corpus
# id2word maps each feature column index to its token; X is transposed because
# Sparse2Corpus reads one document per column by default
id2word = dict(enumerate(vectorizer.get_feature_names_out()))
corpus = gensim.matutils.Sparse2Corpus(X.T)

# Specify the number of topics
num_topics = 10

# Fixed seed so that LDA training and inference give the same result on every
# Restart & Run All (both draw from the model's random state)
RANDOM_SEED = 42

# Train the LDA model
lda_model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=num_topics,
    random_state=RANDOM_SEED,
)

# Get the topic distribution for each document: a list of (topic_id, probability)
# pairs per document. Inference is run exactly once here and reused below, so
# the embeddings and the assigned cluster labels come from the same pass
# (a second pass re-draws its random initialisation and may differ slightly).
document_topics = [lda_model.get_document_topics(doc) for doc in corpus]

# Get the document-topic matrix (same sparse vectors as above, not re-inferred)
document_embeddings = document_topics

# Convert the document embeddings to dense vectors of length num_topics;
# topics omitted from a sparse vector become 0
embedding_matrix = [gensim.matutils.sparse2full(vec, num_topics) for vec in document_embeddings]

# Calculate pairwise cosine distances between document embeddings
# NOTE: this is a dense (n_documents x n_documents) matrix, so memory grows
# quadratically with the number of documents
pairwise_distances = 1 - cosine_similarity(embedding_matrix)

# Assign each document to its most probable topic; these serve as the cluster
# labels for the silhouette analysis
assigned_topics = [max(doc_topics, key=lambda x: x[1])[0] for doc_topics in document_topics]




In [None]:
# Calculate the per-document silhouette coefficients using cosine distance
# between the topic-probability vectors, with each document's dominant topic
# as its cluster label.
# The metric must be stated explicitly: silhouette_samples defaults to
# metric='euclidean' and treats its first argument as feature vectors, so
# handing it a distance matrix without metric='precomputed' would measure
# Euclidean distance between rows of that matrix instead of using the
# distances themselves.
silhouette_values = silhouette_samples(embedding_matrix, assigned_topics, metric="cosine")

# Calculate the average silhouette coefficient
average_silhouette_score = silhouette_values.mean()

# Print the average silhouette coefficient
print("Average Silhouette Score:", average_silhouette_score)



Average Silhouette Score: 0.39238063
