# lecture 21 - latent dirichlet allocation

In [26]:
import numpy as np
import pandas as pd

In [3]:
npr = pd.read_csv('./datasets_files/npr.csv')

In [4]:
npr.head()

Unnamed: 0,Article
0,"In the Washington of 2016, even when the polic..."
1,Donald Trump has used Twitter — his prefe...
2,Donald Trump is unabashedly praising Russian...
3,"Updated at 2:50 p. m. ET, Russian President Vl..."
4,"From photography, illustration and video, to d..."


In [5]:
npr['Article'][2000]

'Updated at 7:50 p. m. ET, The death toll in Wednesday’s terrorist attack near the U. K. Parliament has risen, after a    man died of his injuries. Police say Leslie Rhodes of south London is the fourth person killed by   Khalid Masood, who was killed in the attack. Other victims include Kurt Cochran of Utah, 54, whose wife was also seriously injured Police Constable Keith Palmer, 48 and Aysha Frade, 43, who was reportedly on her way to pick her children up from school. Police arrested 11 people in connection to the case, but have released eight of them with no action, and released one on bail. Providing an update on the wounded, Metropolitan Police Deputy Commissioner Mark Rowley said, ”two people remain in hospital in what is described as a critical condition, and one person is considered to have life threatening injuries.” ”Two of our officers who were injured on Westminster Bridge in the attack also remain in hospital and also have sustained significant injuries,” Rowley said. When

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
cv = CountVectorizer(max_df=0.9, # ignore words that appear in more than 90% of the documents (too common to be informative)
                     min_df=2, # minimum document frequency: a word has to appear in at least 2 documents to be kept
                     stop_words='english' # also drop scikit-learn's built-in English stop words
)

In [8]:
dtm = cv.fit_transform(npr['Article'])

In [None]:
dtm # sparse document-term matrix: 11992 articles (rows) x 54777 vocabulary words (columns)

<11992x54777 sparse matrix of type '<class 'numpy.int64'>'
	with 3033388 stored elements in Compressed Sparse Row format>

In [10]:
from sklearn.decomposition import LatentDirichletAllocation

In [12]:
LDA = LatentDirichletAllocation(n_components=7, # number of topics to extract; there is no right or wrong value, the choice is up to the user
                                random_state=42) # fixed seed so the fitted topics are reproducible between runs

In [13]:
LDA.fit(dtm)

0,1,2
,n_components,7
,doc_topic_prior,
,topic_word_prior,
,learning_method,'batch'
,learning_decay,0.7
,learning_offset,10.0
,max_iter,10
,batch_size,128
,evaluate_every,-1
,total_samples,1000000.0


In [14]:
# we now grab the vocabulary of words, then grab the topics, and finally the highest-probability words per topic

In [15]:
len(cv.get_feature_names_out())

54777

In [17]:
cv.get_feature_names_out()[5000]

'bask'

In [18]:
import random

In [19]:
# Pick a random valid index into the vocabulary.
# random.randint(a, b) includes BOTH endpoints, so randint(0, 54777) could
# return 54777 -- one past the last valid index of a 54777-word vocabulary --
# and make the lookup in the next cell raise IndexError. randrange(n) yields
# 0 .. n-1, and deriving n from the vectorizer avoids hard-coding the size.
random_word_id = random.randrange(len(cv.get_feature_names_out()))

In [None]:
cv.get_feature_names_out()[random_word_id] # this is how we look up a vocabulary word by its index

'relying'

In [None]:
len(LDA.components_) # components_ is a numpy array with one row per topic; each row holds a weight for every vocabulary word (an unnormalised pseudo-count per the scikit-learn docs, not yet a probability)

7

In [24]:
single_topic = LDA.components_[0] # word weights of the first topic (topic #0)

In [25]:
single_topic.argsort()

array([ 2475, 18302, 35285, ..., 22673, 42561, 42993], dtype=int64)

In [27]:
arr = np.array([10,200,1])

In [None]:
arr.argsort() # returns the index positions that would sort the array, from the smallest value to the greatest

array([2, 0, 1], dtype=int64)

In [30]:
single_topic.argsort()[-10:] # indices of the 10 greatest values, i.e. the top 10 words for this topic (highest-weighted word last)

array([33390, 36310, 21228, 10425, 31464,  8149, 36283, 22673, 42561,
       42993], dtype=int64)

In [31]:
top_ten_words = single_topic.argsort()[-10:]

In [33]:
# Print the vocabulary word behind each of the top-ten indices.
# get_feature_names_out() rebuilds the full vocabulary array on every call,
# so fetch it once instead of once per loop iteration.
feature_names = cv.get_feature_names_out()
for index in top_ten_words:
    print(feature_names[index])

new
percent
government
company
million
care
people
health
said
says


So maybe it is a business-related topic.

In [None]:
# we are now going to create a loop that prints the 15 most important words for each of the 7 topics (components)

In [38]:
# Print the 15 highest-weighted words of every topic.
# The vocabulary array is the same for all topics and get_feature_names_out()
# rebuilds it on each call, so fetch it once up front rather than once per
# word (15 x 7 times) inside the comprehension.
feature_names = cv.get_feature_names_out()
for i, topic in enumerate(LDA.components_):
    # argsort is ascending, so the last 15 indices are the 15 largest weights
    top_word_ids = topic.argsort()[-15:]
    print(f'the top 15 words for topic #{i}')
    print([feature_names[index] for index in top_word_ids])
    print('\n')

the top 15 words for topic #0
['companies', 'money', 'year', 'federal', '000', 'new', 'percent', 'government', 'company', 'million', 'care', 'people', 'health', 'said', 'says']


the top 15 words for topic #1
['military', 'house', 'security', 'russia', 'government', 'npr', 'reports', 'says', 'news', 'people', 'told', 'police', 'president', 'trump', 'said']


the top 15 words for topic #2
['way', 'world', 'family', 'home', 'day', 'time', 'water', 'city', 'new', 'years', 'food', 'just', 'people', 'like', 'says']


the top 15 words for topic #3
['time', 'new', 'don', 'years', 'medical', 'disease', 'patients', 'just', 'children', 'study', 'like', 'women', 'health', 'people', 'says']


the top 15 words for topic #4
['voters', 'vote', 'election', 'party', 'new', 'obama', 'court', 'republican', 'campaign', 'people', 'state', 'president', 'clinton', 'said', 'trump']


the top 15 words for topic #5
['years', 'going', 've', 'life', 'don', 'new', 'way', 'music', 'really', 'time', 'know', 'think',

In [40]:
topic_results = LDA.transform(dtm) # per-document topic distribution: one row per article, one column per topic

In [41]:
topic_results

array([[1.61040465e-02, 6.83341493e-01, 2.25376318e-04, ...,
        2.99652737e-01, 2.25479379e-04, 2.25497980e-04],
       [3.63424997e-02, 8.86130697e-01, 4.40751747e-04, ...,
        7.57636804e-02, 4.40866779e-04, 4.40835574e-04],
       [3.28569485e-04, 6.96344889e-01, 3.28302105e-04, ...,
        3.02012902e-01, 3.28724083e-04, 3.28352652e-04],
       ...,
       [1.44467964e-02, 1.60696622e-01, 1.73678310e-01, ...,
        2.24636569e-02, 3.98728349e-04, 3.98359730e-04],
       [4.33560738e-04, 3.53196803e-02, 4.33022554e-04, ...,
        9.62512640e-01, 4.33971991e-04, 4.33490254e-04],
       [3.98777533e-01, 2.54376049e-04, 3.59290659e-01, ...,
        2.40914375e-01, 2.54445555e-04, 2.54253739e-04]])

In [42]:
topic_results.shape

(11992, 7)

In [44]:
topic_results[0].round(2) # probability of document 0 belonging to each of the 7 topics

array([0.02, 0.68, 0.  , 0.  , 0.3 , 0.  , 0.  ])

So in this case document 0 has the highest probability of belonging to topic 1.

In [45]:
topic_results[0].argmax()

1

In [46]:
npr['Topic'] = topic_results.argmax(axis=1) # label each article with the index of its most probable topic

In [47]:
npr

Unnamed: 0,Article,Topic
0,"In the Washington of 2016, even when the polic...",1
1,Donald Trump has used Twitter — his prefe...,1
2,Donald Trump is unabashedly praising Russian...,1
3,"Updated at 2:50 p. m. ET, Russian President Vl...",1
4,"From photography, illustration and video, to d...",2
...,...,...
11987,The number of law enforcement officers shot an...,1
11988,"Trump is busy these days with victory tours,...",4
11989,It’s always interesting for the Goats and Soda...,3
11990,The election of Donald Trump was a surprise to...,4
