In [4]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Load the NPR articles CSV into a DataFrame and preview its last five rows.
# The path is relative to the notebook's working directory.
npr = pd.read_csv('../05-Topic-Modeling/npr.csv')
npr.tail()

Unnamed: 0,Article
11987,The number of law enforcement officers shot an...
11988,"Trump is busy these days with victory tours,..."
11989,It’s always interesting for the Goats and Soda...
11990,The election of Donald Trump was a surprise to...
11991,Voters in the English city of Sunderland did s...


In [5]:
# Bag-of-words vectorizer for the article text.
cv = CountVectorizer(
    stop_words='english',  # drop scikit-learn's built-in English stop words
    min_df=2,              # ignore terms found in fewer than 2 documents
    max_df=0.9,            # ignore terms found in more than 90% of documents
)

In [6]:
# Learn the vocabulary from the article text and build the sparse
# document-term count matrix (one row per article, one column per term).
article_text = npr['Article']
dtm = cv.fit_transform(article_text)

In [7]:
# Show the sparse matrix summary: shape, dtype, stored elements and storage format.
dtm

<11992x54777 sparse matrix of type '<class 'numpy.int64'>'
	with 3033388 stored elements in Compressed Sparse Row format>

In [8]:
from sklearn.decomposition import LatentDirichletAllocation

In [9]:
# LDA model configured for 7 topics; the fixed seed keeps the fitted
# topics reproducible from run to run.
lda = LatentDirichletAllocation(
    random_state=23,
    n_components=7,
)

In [10]:
# Fit the LDA model on the document-term matrix built above.
# This is the expensive step of the notebook on the full corpus.
lda.fit(dtm)

In [16]:
# Size of the vocabulary learned by the vectorizer; this equals the
# number of columns in dtm.
len(cv.get_feature_names_out())

54777

In [13]:
# Inspect the topics learned by the fitted LDA model

In [19]:
# components_ holds one row per topic and one column per vocabulary term.
lda.components_.shape

(7, 54777)

In [20]:
# Take the first topic's row: one value per vocabulary term.
single_topic = lda.components_[0]

In [21]:
# argsort returns vocabulary indices ordered from the lowest to the highest
# value, so the most heavily weighted terms for this topic come last.
single_topic.argsort()

array([49173, 52258,  8877, ..., 38079, 42561, 50426])

In [23]:
# NOTE: despite the name, this holds vocabulary *indices*, not word strings.
# They are the last ten entries of the ascending argsort, i.e. the ten
# highest-valued terms for this topic. Index into
# cv.get_feature_names_out() to obtain the actual words.
top_ten_words = single_topic.argsort()[-10:]

In [25]:
# Print the 15 highest-weighted vocabulary terms for every topic.
# The vocabulary array is fetched once up front: get_feature_names_out()
# rebuilds the full feature-name array on each call, so calling it inside
# the comprehension repeated that work for every printed word.
feature_names = cv.get_feature_names_out()
for i, topic in enumerate(lda.components_):
    print(f"TOP 15 WORDS FOR TOPIC #{i}")
    # argsort is ascending, so the last 15 indices are the top terms,
    # listed from least to most heavily weighted.
    print([feature_names[index] for index in topic.argsort()[-15:]])
    print('\n')
    print('\n')

TOP 15 WORDS FOR TOPIC #0
['political', 'party', 'republican', 'election', 'new', 'white', 'people', 'state', 'obama', 'house', 'campaign', 'clinton', 'president', 'said', 'trump']




TOP 15 WORDS FOR TOPIC #1
['home', 'city', 'world', 'day', 'don', 'family', 'years', 'time', 'said', 'women', 'just', 'like', 'children', 'people', 'says']




TOP 15 WORDS FOR TOPIC #2
['ve', 'years', 'don', 'life', 'music', 'way', 'really', 'new', 'says', 'know', 'time', 'think', 'people', 'just', 'like']




TOP 15 WORDS FOR TOPIC #3
['drug', 'just', 'research', 'new', 'patients', 'insurance', 'said', 'students', 'like', 'study', 'percent', 'care', 'people', 'says', 'health']




TOP 15 WORDS FOR TOPIC #4
['according', '000', 'industry', 'year', 'north', 'just', 'like', 'companies', 'said', 'china', 'water', 'people', 'new', 'company', 'says']




TOP 15 WORDS FOR TOPIC #5
['country', 'department', 'case', 'new', 'school', 'years', 'government', 'federal', 'people', 'court', 'law', 'state', 'police', 

In [26]:
# Project every article onto the fitted topics: the result has one row per
# article and one column per topic.
topic_results = lda.transform(dtm)

In [28]:
# Sanity check: (number of articles, number of topics).
topic_results.shape

(11992, 7)

In [29]:
# Index of the highest-scoring topic for the first article.
topic_results[0].argmax()

0

In [30]:
# Label each article with its highest-scoring topic: argmax over axis=1
# takes the best topic column for every row. This adds a 'Topic' column to
# npr in place.
npr['Topic'] = topic_results.argmax(axis=1)

In [31]:
# Display the frame with the new Topic column; pandas truncates the
# middle rows of a long frame in the rendered output.
npr

Unnamed: 0,Article,Topic
0,"In the Washington of 2016, even when the polic...",0
1,Donald Trump has used Twitter — his prefe...,0
2,Donald Trump is unabashedly praising Russian...,0
3,"Updated at 2:50 p. m. ET, Russian President Vl...",0
4,"From photography, illustration and video, to d...",5
...,...,...
11987,The number of law enforcement officers shot an...,5
11988,"Trump is busy these days with victory tours,...",0
11989,It’s always interesting for the Goats and Soda...,1
11990,The election of Donald Trump was a surprise to...,0
