In [17]:
# Imports
import pandas as pd    
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

import warnings
warnings.simplefilter('ignore')

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

In [18]:
# Read the CSV (NPR Corpus)

df = pd.read_csv('../data/npr.csv')

# Display the first few rows to understand the structure
df.shape
df.head()
df.info()

(11992, 1)

Unnamed: 0,Article
0,"In the Washington of 2016, even when the polic..."
1,Donald Trump has used Twitter — his prefe...
2,Donald Trump is unabashedly praising Russian...
3,"Updated at 2:50 p. m. ET, Russian President Vl..."
4,"From photography, illustration and video, to d..."


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11992 entries, 0 to 11991
Data columns (total 1 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Article  11992 non-null  object
dtypes: object(1)
memory usage: 93.8+ KB


In [19]:
# Instantiate a TfidfVectorizer w/ params (stopwords='english')
vectorizer = TfidfVectorizer(
    max_df=0.95           # Ignore terms that appear in >95% of documents
    , min_df=2            # Ignore terms that appear in fewer than 2 documents
    , stop_words='english' # Remove common English stopwords
)

# Fit and transform the text data into a TF-IDF matrix
tfidf = vectorizer.fit_transform(df['Article'])

In [20]:
# Print the shape of the resulting TF-IDF matrix
print('TF-IDF matrix shape:', tfidf.shape)

TF-IDF matrix shape: (11992, 54777)


In [21]:
# Set the number of topics (components) for NMF
# 7 is a common starting point; you can experiment with this (plugged into a var in case we want to tweak;)
num_topics = 7

# Instantiate and fit the NMF model
nmf_model = NMF(
    n_components=num_topics
    , random_state=27     # Ensures reproducibility
    , max_iter=200        # Increase if convergence warnings
)

# Fit the model to the TF-IDF matrix
nmf_model.fit(tfidf)

0,1,2
,n_components,7
,init,
,solver,'cd'
,beta_loss,'frobenius'
,tol,0.0001
,max_iter,200
,random_state,27
,alpha_W,0.0
,alpha_H,'same'
,l1_ratio,0.0


In [23]:
# Get feature (word) names from the vectorizer
feature_names = vectorizer.get_feature_names_out()

# Function to display top words for each topic
def print_top_words(model, feature_names, n_top_words=10):
    for topic_idx, topic in enumerate(model.components_):
        print(f"Topic #{topic_idx + 1}:")
        # Get indices of the n_top_words with highest weights in this topic
        top_indices = topic.argsort()[::-1][:n_top_words]
        # Map indices to words
        top_words = [feature_names[i] for i in top_indices]
        print("  ", " ".join(top_words))
        print()

# Print the top 10 words for each topic
print_top_words(
    nmf_model, feature_names
    , n_top_words=10
)

Topic #1:
   says zika people food water study virus women percent disease

Topic #2:
   trump president said campaign donald house white obama republican election

Topic #3:
   health care insurance medicaid coverage obamacare affordable republicans plan tax

Topic #4:
   police said court reports president attack state government russia isis

Topic #5:
   clinton sanders voters campaign hillary democratic state vote delegates party

Topic #6:
   like music just think people know really life song time

Topic #7:
   students school schools education student teachers kids college children devos



In [25]:
# Transform the TFIDF to get topic distros for each document
topic_distributions = nmf_model.transform(tfidf)

# Assign the topic to each article
df['Topic'] = np.argmax(topic_distributions, axis=1)

# Show the DataFrame with assigned topics
df[['Article', 'Topic']].head()

Unnamed: 0,Article,Topic
0,"In the Washington of 2016, even when the polic...",1
1,Donald Trump has used Twitter — his prefe...,1
2,Donald Trump is unabashedly praising Russian...,1
3,"Updated at 2:50 p. m. ET, Russian President Vl...",3
4,"From photography, illustration and video, to d...",6


In [12]:
# Temporarily suppress duplicate outputs (have to do this w/ px otherwise, it will output dupe charts!)

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'last_expr'

In [None]:
# Plotly Express Bar Plot (Interactive)
fig = px.bar(
    x=topic_labels,
    y=topic_counts.values,
    labels={'x':'Topic (Top Words)', 'y':'Article Count'},
    title='Number of Articles per Topic (Plotly Express)',
    text=topic_counts.values
)
fig.update_layout(
    xaxis_tickangle=-45
)
fig.show()

In [26]:
df['Article'].value_counts().sort_index()

Article
      American farms should be in full swing right now. But some farmers are running behind, waiting on work visas for planters and pickers from out of the country. The   visa program is delayed for the third year in a row. It sounds like the setup to a bad joke: A professor and a doctor walk onto a farm. Kathleen Terrence, a pediatrician, kneels in an onion field outside Lisbon, N. Y. with a bunch of kids. As they prepare to plant some 30, 000 onions, they’re all taking tips from Mark Sturges  —   but he’s no farmer, either. He’s a literary critic. ”Are you guys ready?” he asks them, laughing gently. ”It’s gonna be fun.” These are just some of the volunteers who stepped up to plant onions for Kent Family Growers, in upstate New York. But farm owner Dan Kent says he’s worried about the rest of his growing season. The workers he hired are a month late. ”I am assuming that our guys are still in southern Mexico, where they live, waiting for word that they have an appointment at th

In [24]:
# Some Data Viz:

# Count how many articles are assigned to each topic
topic_counts = df['Article'].value_counts().sort_index()

# Create a bar plot with Seaborn
plt.figure(figsize=(10, 6))

sns.barplot(
    x=topic_counts.index # Topics are 0-indexed; shift to 1-based for display
    , y=topic_counts.values
    , palette='viridis'
)

plt.xlabel('Topic')
plt.ylabel('Number of Articles')

plt.title('Number of Articles per Topic')
plt.xticks(topic_counts.index + 1)

plt.tight_layout()
plt.show();

TypeError: can only concatenate str (not "int") to str

Error in callback <function _draw_all_if_interactive at 0x12be427a0> (for post_execute):


KeyboardInterrupt: 

Error in callback <function flush_figures at 0x179125b20> (for post_execute):


KeyboardInterrupt: 

In [6]:
# Get the feature names (words) from the vectorizer
# Use .get_feature_names_out() for compatibility with scikit-learn >= 1.0
feature_names = vectorizer.get_feature_names_out()

# Function to display the top words for each topic
def display_topics(model, feature_names, no_top_words):
    # Loop over each topic in the model
    for topic_idx, topic in enumerate(model.components_):
        print(f"Topic {topic_idx + 1}:")
        # Get the indices of the most significant words for this topic
        top_indices = topic.argsort()[::-1][:no_top_words]
        # Print the top words for this topic
        print("  ", " | ".join([feature_names[i] for i in top_indices]))
        print()

# Show the top 10 words for each topic
display_topics(nmf_model, feature_names, no_top_words=10)

Topic 1:
   says | zika | people | food | water | study | virus | women | percent | disease

Topic 2:
   trump | president | said | campaign | donald | house | white | obama | republican | election

Topic 3:
   health | care | insurance | medicaid | coverage | obamacare | affordable | republicans | plan | tax

Topic 4:
   police | said | court | reports | president | attack | state | government | russia | isis

Topic 5:
   clinton | sanders | voters | campaign | hillary | democratic | state | vote | delegates | party

Topic 6:
   like | music | just | think | people | know | really | life | song | time

Topic 7:
   students | school | schools | education | student | teachers | kids | college | children | devos

