In [1]:
# Imports

import pandas as pd    
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer  #
from sklearn.decomposition import LatentDirichletAllocation  
               

import warnings
warnings.simplefilter('ignore')

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

In [3]:
# Read the CSV (NPR Corpus)

df = pd.read_csv('../data/npr.csv')

# Display the first few rows to understand the structure
df.shape
df.head()
df.info()

(11992, 1)

Unnamed: 0,Article
0,"In the Washington of 2016, even when the polic..."
1,Donald Trump has used Twitter — his prefe...
2,Donald Trump is unabashedly praising Russian...
3,"Updated at 2:50 p. m. ET, Russian President Vl..."
4,"From photography, illustration and video, to d..."


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11992 entries, 0 to 11991
Data columns (total 1 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Article  11992 non-null  object
dtypes: object(1)
memory usage: 93.8+ KB


In [4]:
# Initialize a CountVectorizer to convert text to a matrix of token counts
vectorizer = CountVectorizer(
    max_df=0.95         # ignore terms appearing in >95% of documents
    , min_df=2          # ignore terms appearing in <2 documents
    , stop_words='english'  # remove common English stopwords
)

# Fit the vectorizer and transform the text data into a document-term matrix
dtm = vectorizer.fit_transform(df['Article'])

In [None]:
# Initialize the LDA model with your chosen number of topics (n_components)
lda = LatentDirichletAllocation(
    n_components=7          # Number of topics/clusters
    , random_state=42       # Ensures reproducibility
    , learning_method='batch'   # 'batch' or 'online'
)

# Fit the LDA model to the document-term matrix
lda.fit(dtm)

In [5]:
# Get feature (word) names from the vectorizer
feature_names = vectorizer.get_feature_names_out()

# Function to display top words for each topic
def print_top_words(model, feature_names, n_top_words=10):
    for topic_idx, topic in enumerate(model.components_):
        print(f"Topic #{topic_idx + 1}:")
        # Get indices of the n_top_words with highest weights in this topic
        top_indices = topic.argsort()[::-1][:n_top_words]
        # Map indices to words
        top_words = [feature_names[i] for i in top_indices]
        print("  ", " ".join(top_words))
        print()

# Print the top 10 words for each topic
print_top_words(
    lda, feature_names, n_top_words=10
)

Topic #1:
   says said health people care million company government percent new

Topic #2:
   said trump president police told people news says reports npr

Topic #3:
   says like people just food years new city water time

Topic #4:
   says people health women like study children just patients disease

Topic #5:
   trump said clinton president state people campaign republican court obama

Topic #6:
   like just people think know time really music way new

Topic #7:
   says school students like new education just schools time people



In [6]:
# Transform the document-term matrix to get topic probabilities for each document
topic_distributions = lda.transform(dtm)

# Assign the topic with the highest probability to each article
df['Topic'] = np.argmax(topic_distributions, axis=1)

# Show the DataFrame with assigned topics
df[['Article', 'Topic']].head()

Unnamed: 0,Article,Topic
0,"In the Washington of 2016, even when the polic...",1
1,Donald Trump has used Twitter — his prefe...,1
2,Donald Trump is unabashedly praising Russian...,1
3,"Updated at 2:50 p. m. ET, Russian President Vl...",1
4,"From photography, illustration and video, to d...",2


In [7]:
lda.perplexity(dtm)

5537.59372192337