# Data

We will be using articles from NPR (National Public Radio), obtained from their website [www.npr.org](http://www.npr.org)

In [1]:
import pandas as pd

In [2]:
# Location of the NPR articles CSV. This is an absolute local Windows path;
# point it at your own copy of npr.csv when running elsewhere.
npr_csv_path = 'C:/Users/danca/Music/NLP/TextFiles/npr.csv'
npr = pd.read_csv(npr_csv_path)

In [3]:
# Preview the first rows of the loaded DataFrame.
npr.head()

Unnamed: 0,Article
0,"In the Washington of 2016, even when the polic..."
1,Donald Trump has used Twitter — his prefe...
2,Donald Trump is unabashedly praising Russian...
3,"Updated at 2:50 p. m. ET, Russian President Vl..."
4,"From photography, illustration and video, to d..."


In [4]:
from sklearn.feature_extraction.text import CountVectorizer

In [5]:
# Bag-of-words vectorizer. It ignores terms that appear in more than 90% of
# the documents (max_df), terms that appear in fewer than 2 documents (min_df),
# and scikit-learn's built-in English stop words.
cv = CountVectorizer(
    max_df=0.9,
    min_df=2,
    stop_words='english',
)

In [6]:
# Learn the vocabulary from the 'Article' column and build the document-term
# count matrix in one step.
dtm = cv.fit_transform(npr['Article'])

In [7]:
# Performing LDA

In [8]:
# Show the sparse matrix summary (documents x vocabulary terms).
dtm

<11992x54777 sparse matrix of type '<class 'numpy.int64'>'
	with 3033388 stored elements in Compressed Sparse Row format>

In [9]:
from sklearn.decomposition import LatentDirichletAllocation

In [10]:
# LDA model with 10 topics; the fixed random_state makes the fit repeatable.
LDA = LatentDirichletAllocation(
    n_components=10,
    random_state=42,
)

In [11]:
# Fit the LDA model on the document-term matrix.
LDA.fit(dtm)

LatentDirichletAllocation(random_state=42)

In [22]:
# Grab the vocabulary of words 

In [16]:
# Size of the vocabulary learned by the vectorizer.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement.
len(cv.get_feature_names_out())

54777

In [18]:
# Look up the word stored at vocabulary index 5500.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement.
cv.get_feature_names_out()[5500]

'benzodiazepines'

In [21]:
# Pick a random word from the vocabulary.
import random

# get_feature_names_out() replaces get_feature_names(), which was removed in
# scikit-learn 1.2. Fetch the vocabulary once and reuse it below.
feature_names = cv.get_feature_names_out()

# Draw the index from the actual vocabulary size instead of a hard-coded bound,
# so every word can be selected and the index can never be out of range.
random_word_id = random.randrange(len(feature_names))

feature_names[random_word_id]

'nipples'

In [15]:
# Grab the topic

In [25]:
# Number of rows in components_ (one per topic).
len(LDA.components_)

10

In [27]:
# Check the container type of the fitted topic-word weights.
type(LDA.components_)

numpy.ndarray

In [28]:
# Shape is (number of topics, vocabulary size).
LDA.components_.shape

(10, 54777)

In [29]:
# Word weights for the first topic (row 0 of components_).
single_topic = LDA.components_[0]

In [30]:
# argsort() returns the index positions that would sort the array in ascending
# order, so the last entries are the indices of the largest weights.
single_topic.argsort()

array([18302,  2475, 44967, ..., 10425, 42561, 42993], dtype=int64)

In [35]:
# Grab the top 20 words for this topic: argsort() is ascending, so the last
# 20 indices are the 20 highest-weighted vocabulary entries.
top_twenty_words = single_topic.argsort()[-20:]

In [36]:
# Fetch the vocabulary once instead of rebuilding it on every loop iteration.
# get_feature_names_out() replaces get_feature_names(), which was removed in
# scikit-learn 1.2.
feature_names = cv.get_feature_names_out()

# Words are printed from the lowest to the highest weight within the top 20.
for i in top_twenty_words:
    print(feature_names[i])

industry
tax
business
percent
pay
people
care
government
year
insurance
000
federal
new
money
companies
million
health
company
said
says


In [37]:
# Grabing the highest probability words per topic

In [38]:
# Number of highest-weighted words to show for each topic.
n_top_words = 15

# Fetch the vocabulary once; the original rebuilt the full list for every
# single word lookup. get_feature_names_out() replaces get_feature_names(),
# which was removed in scikit-learn 1.2.
feature_names = cv.get_feature_names_out()

for i, topic in enumerate(LDA.components_):
    print(f"The Top {n_top_words} Words from topic #{i}")
    # argsort() is ascending, so the last n_top_words indices carry the
    # largest weights (listed from lowest to highest).
    print([feature_names[index] for index in topic.argsort()[-n_top_words:]])
    print('\n')
    print('\n')

The Top 15 Words from topic #0




['people', 'care', 'government', 'year', 'insurance', '000', 'federal', 'new', 'money', 'companies', 'million', 'health', 'company', 'said', 'says']




The Top 15 Words from topic #1
['npr', 'intelligence', 'security', 'new', 'told', 'russian', 'campaign', 'obama', 'news', 'white', 'russia', 'house', 'president', 'said', 'trump']




The Top 15 Words from topic #2
['know', 'little', 'home', 'make', 'way', 'day', 'water', 'time', 'years', 'people', 'food', 'new', 'just', 'like', 'says']




The Top 15 Words from topic #3
['don', 'food', 'work', 'day', 'life', 'time', 'family', 'children', 'years', 'just', 'women', 'world', 'like', 'people', 'says']




The Top 15 Words from topic #4
['supreme', 'order', 'city', 'states', 'federal', 'country', 'president', 'rights', 'government', 'people', 'law', 'state', 'said', 'court', 'says']




The Top 15 Words from topic #5
['going', 've', 'story', 'life', 'don', 'new', 'way', 'time', 'really', 'know', 'think', 'music', 'people', 'just', 'like']


In [39]:
# Topic distribution for every document: one row per article, one column per
# topic.
topic_results = LDA.transform(dtm)

In [42]:
# Index of the highest-scoring topic for the first article.
topic_results[0].argmax()

1

In [None]:
# Label each article with its most probable topic. axis=1 takes the argmax
# within each row; without an axis, argmax() returns a single index into the
# flattened array and that one scalar would be assigned to every article.
npr['Topic'] = topic_results.argmax(axis=1)