In [1]:
import pandas as pd

In [2]:
# Load the article corpus into a DataFrame (path is relative to the working directory).
npr = pd.read_csv("files/npr.csv")

In [3]:
# Preview the first rows to confirm the file loaded and see the 'Article' text column.
npr.head()

Unnamed: 0,Article
0,"In the Washington of 2016, even when the polic..."
1,Donald Trump has used Twitter — his prefe...
2,Donald Trump is unabashedly praising Russian...
3,"Updated at 2:50 p. m. ET, Russian President Vl..."
4,"From photography, illustration and video, to d..."


In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [5]:
# TF-IDF vectorizer configuration:
#   stop_words='english' -> drop scikit-learn's built-in English stop words
#   min_df=2             -> ignore terms that appear in fewer than 2 documents
#   max_df=0.95          -> ignore terms that appear in more than 95% of documents
tfidf = TfidfVectorizer(
    stop_words='english',
    min_df=2,
    max_df=0.95,
)
tfidf

In [6]:
# Learn the vocabulary and IDF weights from the articles and build the
# document-term matrix in one step.
dtm = tfidf.fit_transform(npr['Article'])

In [7]:
# Inspect the sparse matrix: rows are articles, columns are vocabulary terms.
dtm

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 3033388 stored elements and shape (11992, 54777)>

# NMF

In [8]:
from sklearn.decomposition import NMF

In [9]:
# Ask NMF for 7 components (topics); random_state fixes the seed so results are repeatable.
nmf_model = NMF(n_components=7, random_state=43)

In [10]:
# Fit the NMF model on the TF-IDF document-term matrix.
nmf_model.fit(dtm)

In [11]:
# Vocabulary size; this matches the number of columns in dtm.
len(tfidf.get_feature_names_out())

54777

In [12]:
import random

# Spot-check the vocabulary by printing 20 randomly chosen (index, term) pairs.
# The vocabulary is fetched once and the index range is derived from its
# length, so the cell keeps working if the corpus or vectorizer settings
# change (a hard-coded upper bound would raise IndexError or silently skip
# part of the vocabulary).
feature_names = tfidf.get_feature_names_out()

for i in range(20):
    word_id = random.randrange(len(feature_names))
    print(word_id, feature_names[word_id])

6804 brainwaves
25359 institutes
29187 loudoun
31216 metros
42131 rothko
46571 starves
11275 convulsions
40277 reeks
5905 births
18923 flaky
40156 rectitude
52481 viewing
5088 bauer
34937 ottoman
41826 roadside
20468 genders
29989 manse
41731 ripen
40468 regression
34164 obfuscate


In [13]:
# components_ has one row per topic, so its length equals n_components.
len(nmf_model.components_)

7

In [14]:
# Topic-term weight matrix: each row holds one topic's weight for every vocabulary term.
nmf_model.components_

array([[0.00000000e+00, 2.51363662e-01, 0.00000000e+00, ...,
        1.71442007e-03, 2.39336762e-04, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 8.23609951e-02, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       ...,
       [0.00000000e+00, 3.12435122e-02, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [5.91016111e-03, 0.00000000e+00, 1.50586104e-03, ...,
        7.06635993e-04, 5.86567726e-04, 6.91383950e-04],
       [4.02862210e-03, 5.32569748e-02, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00]])

In [15]:
# Term weights for the first topic only.
nmf_model.components_[0]

array([0.00000000e+00, 2.51363662e-01, 0.00000000e+00, ...,
       1.71442007e-03, 2.39336762e-04, 0.00000000e+00])

In [16]:
# Keep the first topic's term weights under a name for the exploration below.
first_topic = nmf_model.components_[0]

In [17]:
# argsort returns the indices that would sort the weights in ascending order,
# so the last entries are the indices of the highest-weighted terms.
first_topic.argsort()

array([    0, 27208, 27206, ..., 36283, 54692, 42993])

In [20]:
# Weight of a least-representative term for this topic. The index is taken
# from argsort (ascending, so position 0 is a minimum) instead of being copied
# by hand from an earlier output, which would go stale if the data or the fit
# changed. Several terms share the minimum weight, so this is one of the
# least-representative words, not a unique one.
first_topic[first_topic.argsort()[0]]

0.0

In [21]:
# Weight of the most representative term for this topic. argsort is ascending,
# so its last entry is the index of the highest-weighted term; deriving it here
# avoids a hand-copied index that breaks when the data or the fit changes.
first_topic[first_topic.argsort()[-1]]

2.016958171907914

In [22]:
# Indices of the 10 highest-weighted terms for this topic,
# in ascending weight order (strongest term last).
first_topic.argsort()[-10:]

array([14441, 36310, 53989, 52615, 47218, 53152, 19307, 36283, 54692,
       42993])

In [23]:
# Map the 10 highest-weighted term indices of the first topic back to words.
# Words print in ascending weight order, so the strongest term comes last.
top_word_indices = first_topic.argsort()[-10:]

# Fetch the vocabulary once instead of rebuilding the array on every iteration.
feature_names = tfidf.get_feature_names_out()

for index in top_word_indices:
    print(feature_names[index])

disease
percent
women
virus
study
water
food
people
zika
says


In [25]:
# Print the 15 highest-weighted words of every topic (ascending weight order,
# strongest word last). The vocabulary array is fetched once up front; calling
# get_feature_names_out() inside the comprehension would rebuild it for every
# single word of every topic.
feature_names = tfidf.get_feature_names_out()

for index, topic in enumerate(nmf_model.components_):
    print(f'THE TOP 15 WORDS FOR TOPIC #{index}')
    print([feature_names[i] for i in topic.argsort()[-15:]])
    print('\n')

THE TOP 15 WORDS FOR TOPIC #0
['new', 'research', 'like', 'patients', 'health', 'disease', 'percent', 'women', 'virus', 'study', 'water', 'food', 'people', 'zika', 'says']


THE TOP 15 WORDS FOR TOPIC #1
['gop', 'pence', 'presidential', 'russia', 'administration', 'election', 'republican', 'obama', 'white', 'house', 'donald', 'campaign', 'said', 'president', 'trump']


THE TOP 15 WORDS FOR TOPIC #2
['senate', 'house', 'people', 'act', 'law', 'tax', 'plan', 'republicans', 'affordable', 'obamacare', 'coverage', 'medicaid', 'insurance', 'care', 'health']


THE TOP 15 WORDS FOR TOPIC #3
['officers', 'syria', 'security', 'department', 'law', 'isis', 'russia', 'government', 'state', 'attack', 'president', 'reports', 'court', 'said', 'police']


THE TOP 15 WORDS FOR TOPIC #4
['primary', 'cruz', 'election', 'democrats', 'percent', 'party', 'delegates', 'vote', 'state', 'democratic', 'hillary', 'campaign', 'voters', 'sanders', 'clinton']


THE TOP 15 WORDS FOR TOPIC #5
['love', 've', 'don', 'al

# Attaching topic labels

In [26]:
# Reminder of the document-term matrix that will be projected onto the topics.
dtm

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 3033388 stored elements and shape (11992, 54777)>

In [27]:
# (number of articles, number of vocabulary terms)
dtm.shape

(11992, 54777)

In [28]:
# Number of rows in the DataFrame; it matches the number of rows in dtm.
len(npr)

11992

In [29]:
# Project every article onto the fitted topics: one row per article, one weight per topic.
topic_results = nmf_model.transform(dtm)

In [30]:
# (number of articles, number of topics)
topic_results.shape

(11992, 7)

In [31]:
# Topic weights for the first article.
topic_results[0]

array([0.        , 0.12080539, 0.00139813, 0.05914114, 0.01518506,
       0.        , 0.        ])

In [32]:
# Same weights rounded to 2 decimals for easier reading.
topic_results[0].round(2)

array([0.  , 0.12, 0.  , 0.06, 0.02, 0.  , 0.  ])

In [33]:
# Index of the largest weight, i.e. the dominant topic for the first article.
topic_results[0].argmax()

1

In [34]:
# Look at the frame again before a topic column is added to it.
npr.head()

Unnamed: 0,Article
0,"In the Washington of 2016, even when the polic..."
1,Donald Trump has used Twitter — his prefe...
2,Donald Trump is unabashedly praising Russian...
3,"Updated at 2:50 p. m. ET, Russian President Vl..."
4,"From photography, illustration and video, to d..."


In [35]:
# Dominant topic index for every article: argmax across each row's topic weights.
topic_results.argmax(axis=1)

array([1, 1, 1, ..., 0, 4, 3])

In [36]:
# Store each article's dominant topic index in a new 'Topic' column.
# Note: this adds the column to npr in place.
npr['Topic'] = topic_results.argmax(axis=1)

In [38]:
# Check the assigned topic index for the first 10 articles.
npr.head(10)

Unnamed: 0,Article,Topic
0,"In the Washington of 2016, even when the polic...",1
1,Donald Trump has used Twitter — his prefe...,1
2,Donald Trump is unabashedly praising Russian...,1
3,"Updated at 2:50 p. m. ET, Russian President Vl...",3
4,"From photography, illustration and video, to d...",6
5,I did not want to join yoga class. I hated tho...,5
6,With a who has publicly supported the debunk...,0
7,"I was standing by the airport exit, debating w...",0
8,"If movies were trying to be more realistic, pe...",0
9,"Eighteen years ago, on New Year’s Eve, David F...",5
