In [1]:
# Imports
import pandas as pd    
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

import warnings
# NOTE: this blanket filter silences every warning category for the rest of the
# session, including scikit-learn's ConvergenceWarning raised by iterative
# estimators such as NMF. Non-convergence must therefore be checked explicitly.
warnings.simplefilter('ignore')

from IPython.core.interactiveshell import InteractiveShell
# Display the value of every bare expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = 'all'

In [2]:
# Read the NPR article corpus from CSV into a DataFrame

df = pd.read_csv('../data/npr.csv')

# Inspect the structure: dimensions, first rows, then dtypes / non-null counts.
# df.shape and df.head() are displayed even though they are not the last
# expression because ast_node_interactivity is set to 'all' in the import cell;
# df.info() prints its own summary.
df.shape
df.head()
df.info()

(11992, 1)

Unnamed: 0,Article
0,"In the Washington of 2016, even when the polic..."
1,Donald Trump has used Twitter — his prefe...
2,Donald Trump is unabashedly praising Russian...
3,"Updated at 2:50 p. m. ET, Russian President Vl..."
4,"From photography, illustration and video, to d..."


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11992 entries, 0 to 11991
Data columns (total 1 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Article  11992 non-null  object
dtypes: object(1)
memory usage: 93.8+ KB


In [None]:
# Instantiate a TfidfVectorizer w/ params (stopwords='english')
vectorizer = TfidfVectorizer(
    max_df=0.95           # Ignore terms that appear in >95% of documents
    , min_df=2            # Ignore terms that appear in fewer than 2 documents
    , stop_words='english' # Remove common English stopwords
)

# Fit the vocabulary and transform the corpus into a sparse TF-IDF matrix.
# The DataFrame's only column is 'Article' (see the df.info() output above);
# there is no 'text' column, so indexing df['text'] raises KeyError.
tfidf = vectorizer.fit_transform(df['Article'])



In [None]:
# Print the shape of the resulting TF-IDF matrix: one row per document and one
# column per vocabulary term kept by the vectorizer.
print('TF-IDF matrix shape:', tfidf.shape)

In [None]:
# Set the number of topics (components) for NMF
# 7 is a common starting point; you can experiment with this
num_topics = 7

# Instantiate the NMF model
nmf_model = NMF(
    n_components=num_topics
    , random_state=42     # Fixed seed so the factorization is reproducible
    , max_iter=200        # Iteration cap; raise it if the check below fires
)

# Fit the model to the TF-IDF matrix
nmf_model.fit(tfidf)

# The import cell calls warnings.simplefilter('ignore'), so scikit-learn's
# ConvergenceWarning is never shown. Detect a fit that ran all the way to the
# iteration cap explicitly instead of relying on a warning that is suppressed.
if nmf_model.n_iter_ >= nmf_model.max_iter:
    print(
        f"NMF used all {nmf_model.max_iter} iterations and may not have "
        "converged; consider increasing max_iter."
    )

# Get the feature names (words) from the vectorizer
# Use .get_feature_names_out() for compatibility with scikit-learn >= 1.0
feature_names = vectorizer.get_feature_names_out()

# Function to display the top words for each topic
def display_topics(model, feature_names, no_top_words):
    """Print the highest-weighted terms of every topic in ``model``.

    Args:
        model: Object exposing ``components_``, an iterable of per-topic
            weight arrays (each must support ``argsort()``).
        feature_names: Sequence mapping a term index to its string.
        no_top_words: How many leading terms to list for each topic.

    Returns:
        None. Output goes to stdout: a 1-based "Topic N:" heading, the terms
        joined by " | ", and a blank separator line per topic.
    """
    for topic_number, weights in enumerate(model.components_, start=1):
        print(f"Topic {topic_number}:")
        # argsort is ascending, so reverse it to rank terms from heaviest down
        ranked_indices = weights.argsort()[::-1]
        leading_terms = [
            feature_names[term_idx]
            for term_idx in ranked_indices[:no_top_words]
        ]
        print("  ", " | ".join(leading_terms))
        print()

# Show the top 10 words for each topic
display_topics(nmf_model, feature_names, no_top_words=10)

# Topic weights for a single document: transform its TF-IDF row.
# For the first document in the DataFrame:
doc_topic_dist = nmf_model.transform(tfidf[0])
print("Topic distribution for first document:", doc_topic_dist)

# Assign each document its highest-weighted topic (adds a 'topic' column).
# NMF weights are non-negative scores rather than probabilities, so argmax
# picks the dominant topic, not a "most probable" one.
# The stored index is 0-based: value k corresponds to the heading
# "Topic k + 1" printed by display_topics.
df['topic'] = np.argmax(nmf_model.transform(tfidf), axis=1)

# The text column is named 'Article' (there is no 'text' column; selecting it
# raises KeyError).
df[['Article', 'topic']].head()
