**Topic Modeling on News Articles using LDA & LSA (Gensim)**

This script performs topic modeling on news article content using two unsupervised learning methods: LDA (Latent Dirichlet Allocation) and LSA (Latent Semantic Analysis). It includes text preprocessing (cleaning, stopword removal, tokenization, stemming), vectorization using Bag of Words, topic extraction, and coherence score evaluation to determine the optimal number of topics.

Main steps:

1. Load and clean the text data
2. Remove stopwords, tokenize, and stem words
3. Convert text into bag-of-words representation
4. Train LDA and LSA models to extract topics
5. Evaluate LSA coherence scores and visualize topic quality
6. Print top terms per topic



In [None]:
!conda install -c conda-forge gensim -y

In [None]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import gensim
from gensim import corpora

In [None]:
# Load the news articles dataset from the CSV file in the working directory.
csv_path = "news_articles.csv"
data = pd.read_csv(csv_path)

In [None]:
# Preview the first rows of the DataFrame to check that the CSV loaded as expected.
data.head()

In [None]:
# Summarise the columns, their dtypes, and their non-null counts.
data.info()

In [None]:
# Keep only the 'content' column; it is the text that is cleaned and modeled below.
articles = data['content']

In [None]:
# Print the selected 'content' column before any preprocessing is applied.
print(articles)

In [None]:
# Text preparation: normalise, filter, tokenize, and stem every article.
# NOTE: stopwords.words and word_tokenize need the NLTK "stopwords" corpus and
# "punkt" tokenizer data to be downloaded beforehand (nltk.download).

# Lowercase, then strip punctuation (every character that is neither a word
# character nor whitespace).
articles = articles.str.lower().apply(lambda text: re.sub(r"([^\w\s])", "", text))

# Stop-word removal.
# The punctuation strip above turns contractions such as "don't" into "dont",
# which the raw NLTK list (it stores "don't") no longer matches, so the
# punctuation-stripped form of every stop word is added as well. The trade-off
# is that a stripped contraction and a same-spelled ordinary word are removed
# alike; after the strip they were already indistinguishable.
# A set gives constant-time membership tests instead of scanning the list for
# every word of every article.
en_stopwords = stopwords.words("english")
stopword_set = set(en_stopwords)
stopword_set.update(re.sub(r"([^\w\s])", "", word) for word in en_stopwords)
articles = articles.apply(
    lambda text: ' '.join(word for word in text.split() if word not in stopword_set)
)

# Tokenize each cleaned article into a list of word tokens.
articles = articles.apply(word_tokenize)

# Stemming (done for speed as we have a lot of text)
ps = PorterStemmer()
articles = articles.apply(lambda tokens: [ps.stem(token) for token in tokens])

In [None]:
# Display the preprocessed articles (one list of stemmed tokens per article).
articles

In [None]:
# Build the token <-> integer-id mapping from the tokenized articles
dictionary = corpora.Dictionary(documents=articles)

In [None]:
# Print a short summary of the dictionary built from the articles.
print(dictionary)

In [None]:
# Vectorize: turn each tokenized article into its bag-of-words vector
doc_term = list(map(dictionary.doc2bow, articles))

In [None]:
# Preview the bag-of-words vectors of the first few documents only; printing
# the whole corpus floods the cell output for any sizeable dataset.
# Each vector is a list of (token_id, count) pairs.
print(doc_term[:5])

In [None]:
# Number of topics used by both the first LDA model and the first LSA model (2 topics)

num_topics = 2

In [None]:
import gensim

In [None]:
# Train the LDA topic model on the bag-of-words corpus.
# random_state is pinned so repeated runs produce the same topics, in line
# with the fixed seed used for the LSA models in the coherence search.
lda_model = gensim.models.LdaModel(corpus=doc_term,
                                   id2word=dictionary,
                                   num_topics=num_topics,
                                   random_state=0)

In [None]:
# LDA model: show the top 5 weighted terms for each of the num_topics topics.

lda_model.print_topics(num_topics=num_topics, num_words=5)

In [None]:
from gensim.models import LsiModel

In [None]:
# Train the LSA (LSI) model on the same corpus with the same topic count.
# random_seed=0 matches the seed used in the coherence search, so the
# decomposition is repeatable across runs.
lsa_model = LsiModel(corpus=doc_term, id2word=dictionary, num_topics=num_topics, random_seed=0)

In [None]:
# LSA model: print the top 5 weighted terms for each of the num_topics topics.
print(lsa_model.print_topics(num_topics=num_topics, num_words=5))

In [None]:
from gensim.models.coherencemodel import CoherenceModel
import matplotlib.pyplot as plt

In [None]:
# Accumulators for the coherence search over several topic counts (2–11):
# one coherence score and one fitted model per candidate topic count.

coherence_values, model_list = [], []

In [None]:
# Inclusive lower and upper bounds of the topic counts to evaluate.
min_topics, max_topics = 2, 11

In [None]:
# Train one LSA model per candidate topic count and record its c_v coherence.
# Both result lists are rebuilt here so that re-running this cell cannot append
# a second batch of entries and leave them out of step with the topic range
# used as the x-axis of the coherence plot.
coherence_values = []
model_list = []
for num_topics_i in range(min_topics, max_topics + 1):
    # Fixed seed so every run scores the same decomposition per topic count.
    model = LsiModel(doc_term, num_topics=num_topics_i, id2word=dictionary, random_seed=0)
    model_list.append(model)
    coherence_model = CoherenceModel(model=model, texts=articles, dictionary=dictionary, coherence='c_v')
    coherence_values.append(coherence_model.get_coherence())

In [None]:
# Plot coherence against topic count to find the optimal number of topics

topic_counts = range(min_topics, max_topics + 1)
plt.plot(topic_counts, coherence_values)

In [None]:
# Final LSA model with the chosen number of topics.
# NOTE(review): the topic count is hard-coded, presumably read off the
# coherence plot; adjust it if the curve peaks elsewhere.

final_num_topics = 3
# Use the same seed as the coherence search so the model printed here is the
# decomposition that was actually scored for this topic count.
final_lsa_model = LsiModel(corpus=doc_term, id2word=dictionary, num_topics=final_num_topics, random_seed=0)
print(final_lsa_model.print_topics(num_topics=final_num_topics, num_words=10))