In [None]:
import numpy as np
import pandas as pd

# Code to read csv file into colaboratory:
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate the Colab user, then build a PyDrive client (`drive`) that uses
# the application-default Google credentials; later cells use it to fetch files.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [None]:
# NOTE: this cell is disabled. The code below is wrapped in a string literal,
# so nothing is downloaded or read; the cell only evaluates to that string.
'''
downloaded = drive.CreateFile({'id':'1q9Yh9GorYkl_xf3O_P4zBbPYBXtTcuWx'}) 
downloaded.GetContentFile('moviereviews.tsv') 

df= pd.read_csv("moviereviews.tsv", sep='\t')

df.head()

'''

'\ndownloaded = drive.CreateFile({\'id\':\'1q9Yh9GorYkl_xf3O_P4zBbPYBXtTcuWx\'}) \ndownloaded.GetContentFile(\'moviereviews.tsv\') \n\ndf= pd.read_csv("moviereviews.tsv", sep=\'\t\')\n\ndf.head()\n\n'

In [None]:
# Fetch the NPR articles file from Google Drive (looked up by file id), save it
# locally as npr.csv, and load it into the `articles` DataFrame.
downloaded = drive.CreateFile({'id':'1N8lD07IGqQnEQMvvc0NEogtZNPpT3Bw2'}) 
downloaded.GetContentFile('npr.csv') 

articles= pd.read_csv("npr.csv")

articles.head()

# Notice how we don't have the topic of the articles!
# Let's use NMF to attempt to figure out clusters of the articles.

Unnamed: 0,Article
0,"In the Washington of 2016, even when the polic..."
1,Donald Trump has used Twitter — his prefe...
2,Donald Trump is unabashedly praising Russian...
3,"Updated at 2:50 p. m. ET, Russian President Vl..."
4,"From photography, illustration and video, to d..."


# Non-Negative Matrix Factorization

It is an unsupervised algorithm that simultaneously performs `dimensionality reduction and clustering`

We can use it in conjunction with TF-IDF to model topics across documents

The result is a `document term matrix with TF-IDF Vectorisation`

> We can repeat the `topic modeling` task from the previous lecture, but this time, we will use NMF instead of LDA.

>> Like in LDA, we still need to select the number (k) of topics beforehand, and will again have to interpret the topics based off the coefficient values of the words per topic

Overall, LDA produces more coherent topics than NMF (see article in desktop folder)

## Preprocessing

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# max_df -> float in [0, 1] (proportion of documents) or int (absolute count):
# ignore terms whose document frequency is strictly higher than this threshold.
# The LOWER the number, the more terms are ignored.
# Here max_df=0.95 discards words that appear in more than 95% of the documents.

# min_df -> float in [0, 1] or int (raw number of documents): minimum document frequency.
# Here min_df=2 => a word must appear in at least 2 documents to be kept.

# stop_words='english' removes the built-in list of English stop words.


tfidf = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')

In [None]:
# Learn the vocabulary from the article text and build the TF-IDF
# document-term matrix (one row per article, one column per kept term).
document_term_matrix = tfidf.fit_transform(articles['Article'])
document_term_matrix

<11992x54777 sparse matrix of type '<class 'numpy.float64'>'
	with 3033388 stored elements in Compressed Sparse Row format>

## NMF

In [None]:
from sklearn.decomposition import NMF

# Factorize into 7 components (one per topic); random_state is fixed for
# reproducible results.
nmf_model = NMF(n_components=7,random_state=42)

# This can take a while, we're dealing with a large number of documents!
nmf_model.fit(document_term_matrix)

NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=200,
    n_components=7, random_state=42, shuffle=False, solver='cd', tol=0.0001,
    verbose=0)

## Displaying Topics

In [None]:
# Vocabulary size = number of columns in the document-term matrix.
# `vocabulary_` maps each kept term to its column index, so its length is the
# feature count. This avoids materializing the full term list and does not rely
# on `get_feature_names()`, which was removed in scikit-learn 1.2.
len(tfidf.vocabulary_)

54777

In [None]:
import random

# Build the index -> term lookup once instead of on every loop iteration.
# `get_feature_names()` was removed in scikit-learn 1.2, so use its replacement
# when available and fall back to the old method on older versions.
feature_names = (
    tfidf.get_feature_names_out()
    if hasattr(tfidf, 'get_feature_names_out')
    else tfidf.get_feature_names()
)

# Print 10 randomly chosen vocabulary terms to get a feel for the features.
# The upper bound comes from the vocabulary itself rather than a hard-coded size.
for _ in range(10):
    random_word_id = random.randint(0, len(feature_names) - 1)
    print(feature_names[random_word_id])

tubes
retiring
braces
incarcerations
milken
plateau
ulterior
handwriting
wernick
orientation


In [None]:
# Same sampling as the previous cell, drawn again to see different terms.
# The term lookup is built once (not per iteration); `get_feature_names()` was
# removed in scikit-learn 1.2, so prefer its replacement when available.
feature_names = (
    tfidf.get_feature_names_out()
    if hasattr(tfidf, 'get_feature_names_out')
    else tfidf.get_feature_names()
)

# The upper bound comes from the vocabulary itself rather than a hard-coded size.
for _ in range(10):
    random_word_id = random.randint(0, len(feature_names) - 1)
    print(feature_names[random_word_id])

abolitionists
squelch
buprenorphine
quantifiable
distanced
precocious
outboard
clashing
huntley
squeaked


In [None]:
# Number of rows in the component matrix: one per topic.
nmf_model.components_.shape[0]

7

In [None]:
# Topic-term matrix: one row per topic, one column per vocabulary term.
nmf_model.components_

array([[0.00000000e+00, 2.49950821e-01, 0.00000000e+00, ...,
        1.70313822e-03, 2.37544362e-04, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 8.22048918e-02, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       ...,
       [0.00000000e+00, 3.12379960e-02, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [5.89723338e-03, 0.00000000e+00, 1.50186440e-03, ...,
        7.06428924e-04, 5.85500542e-04, 6.89536542e-04],
       [4.01763234e-03, 5.31643833e-02, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00]])

In [None]:
# Number of columns in the component matrix: one coefficient per vocabulary term.
nmf_model.components_.shape[1]

54777

In [None]:
# Row 0 of components_: the coefficient of every vocabulary term for topic #0.
single_topic = nmf_model.components_[0]
single_topic

array([0.00000000e+00, 2.49950821e-01, 0.00000000e+00, ...,
       1.70313822e-03, 2.37544362e-04, 0.00000000e+00])

In [None]:
# Returns the indices that would sort this array in ascending order:
# the first index has the smallest coefficient, the last the largest.
single_topic.argsort()

array([    0, 27208, 27206, ..., 36283, 54692, 42993])

In [None]:
# Coefficient of the word least representative of this topic.
# argsort() is ascending, so its first index points at the smallest coefficient.
# The index is derived here instead of hard-coded, because a literal index goes
# stale whenever the vocabulary or the fitted model changes.
single_topic[single_topic.argsort()[0]]

0.0

In [None]:
# Coefficient of the word most representative of this topic.
# argsort() is ascending, so its last index points at the largest coefficient.
# The index is derived here instead of hard-coded, because a literal index goes
# stale whenever the vocabulary or the fitted model changes.
single_topic[single_topic.argsort()[-1]]

2.005055165418585

In [None]:
# Indices of the top 10 words for this topic: the last 10 entries of the
# ascending argsort, so the strongest word comes last.
single_topic.argsort()[-10:]

array([14441, 36310, 53989, 52615, 47218, 53152, 19307, 36283, 54692,
       42993])

In [None]:
# Keep the vocabulary indices of the 10 highest-coefficient words for topic #0.
top_word_indices = single_topic.argsort()[-10:]

In [None]:
# Map the top-10 indices back to their vocabulary terms.
# The term list is fetched once (not on every iteration); `get_feature_names()`
# was removed in scikit-learn 1.2, so use its replacement when it is available.
feature_names = (
    tfidf.get_feature_names_out()
    if hasattr(tfidf, 'get_feature_names_out')
    else tfidf.get_feature_names()
)
for index in top_word_indices:
    print(feature_names[index])

disease
percent
women
virus
study
water
food
people
zika
says


These look like health articles perhaps... Let's confirm by using .transform() on our vectorized articles to attach a label number. But first, let's view all 7 topics found.

In [None]:
# Build the index -> term lookup once, instead of rebuilding the whole term list
# for every printed word. `get_feature_names()` was removed in scikit-learn 1.2,
# so use its replacement when available and fall back on older versions.
feature_names = (
    tfidf.get_feature_names_out()
    if hasattr(tfidf, 'get_feature_names_out')
    else tfidf.get_feature_names()
)

for index, topic in enumerate(nmf_model.components_):
    print(f'THE TOP 15 WORDS FOR TOPIC #{index}')
    # argsort() is ascending, so the last 15 indices are the words with the
    # highest coefficients (with LDA these were the highest probabilities).
    # The comprehension uses its own variable name so it does not shadow `index`.
    print([feature_names[word_id] for word_id in topic.argsort()[-15:]])
    print('\n')

THE TOP 15 WORDS FOR TOPIC #0
['new', 'research', 'like', 'patients', 'health', 'disease', 'percent', 'women', 'virus', 'study', 'water', 'food', 'people', 'zika', 'says']


THE TOP 15 WORDS FOR TOPIC #1
['gop', 'pence', 'presidential', 'russia', 'administration', 'election', 'republican', 'obama', 'white', 'house', 'donald', 'campaign', 'said', 'president', 'trump']


THE TOP 15 WORDS FOR TOPIC #2
['senate', 'house', 'people', 'act', 'law', 'tax', 'plan', 'republicans', 'affordable', 'obamacare', 'coverage', 'medicaid', 'insurance', 'care', 'health']


THE TOP 15 WORDS FOR TOPIC #3
['officers', 'syria', 'security', 'department', 'law', 'isis', 'russia', 'government', 'state', 'attack', 'president', 'reports', 'court', 'said', 'police']


THE TOP 15 WORDS FOR TOPIC #4
['primary', 'cruz', 'election', 'democrats', 'percent', 'party', 'delegates', 'vote', 'state', 'democratic', 'hillary', 'campaign', 'voters', 'sanders', 'clinton']


THE TOP 15 WORDS FOR TOPIC #5
['love', 've', 'don', 'al

### Attaching Discovered Topic Labels to Original Articles

In [None]:
# Reminder of what we are about to transform: the TF-IDF document-term matrix.
document_term_matrix

<11992x54777 sparse matrix of type '<class 'numpy.float64'>'
	with 3033388 stored elements in Compressed Sparse Row format>

In [None]:
# (number of articles, number of vocabulary terms)
document_term_matrix.shape

(11992, 54777)

In [None]:
# Project every article onto the fitted topics: one row per article,
# one coefficient per topic.
topic_results = nmf_model.transform(document_term_matrix)
topic_results

array([[0.        , 0.12075603, 0.00140297, ..., 0.01518909, 0.        ,
        0.        ],
       [0.00600706, 0.12631211, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.14147252, 0.        , ..., 0.0226561 , 0.        ,
        0.        ],
       ...,
       [0.03188623, 0.        , 0.00840979, ..., 0.00373073, 0.02440375,
        0.        ],
       [0.        , 0.03796415, 0.0107136 , ..., 0.12669893, 0.01177688,
        0.00099946],
       [0.02172572, 0.006454  , 0.0007123 , ..., 0.0123984 , 0.01282932,
        0.00155022]])

In [None]:
# Coefficients of the 1st article, one per topic.
topic_results[0]

array([0.        , 0.12075603, 0.00140297, 0.05919954, 0.01518909,
       0.        , 0.        ])

In [None]:
# We are interested in the highest coefficient (with LDA it was the highest
# probability): its index position is the most representative topic for this
# article.
topic_results[0].argmax()

1

In [None]:
# To do this for every article at once, take the argmax along axis 1
# (this is done further below):
# topic_results.argmax(axis=1)

In [None]:
# (number of articles, number of topics)
topic_results.shape

(11992, 7)

In [None]:
# Topic coefficients of the first article, shown again before rounding.
topic_results[0]

array([0.        , 0.12075603, 0.00140297, 0.05919954, 0.01518909,
       0.        , 0.        ])

In [None]:
# Same coefficients rounded to 2 decimals so the dominant topic is easy to spot.
topic_results[0].round(2)

array([0.  , 0.12, 0.  , 0.06, 0.02, 0.  , 0.  ])

In [None]:
# Index of the largest coefficient for the first article.
topic_results[0].argmax()

# This means that our model thinks that the first article belongs to topic #1.

1

### Combining with Original Data

In [None]:
# The original data, before any topic column is attached.
articles.head()

Unnamed: 0,Article
0,"In the Washington of 2016, even when the polic..."
1,Donald Trump has used Twitter — his prefe...
2,Donald Trump is unabashedly praising Russian...
3,"Updated at 2:50 p. m. ET, Russian President Vl..."
4,"From photography, illustration and video, to d..."


In [None]:
# Dominant topic index for every article: argmax across the topic axis of each row.
topic_results.argmax(axis=1)

array([1, 1, 1, ..., 0, 4, 3])

In [None]:
# Add a new column "Topic" holding each article's dominant topic index
# (the position of its largest NMF coefficient).
articles['Topic'] = topic_results.argmax(axis=1)
articles.head(5)

Unnamed: 0,Article,Topic
0,"In the Washington of 2016, even when the polic...",1
1,Donald Trump has used Twitter — his prefe...,1
2,Donald Trump is unabashedly praising Russian...,1
3,"Updated at 2:50 p. m. ET, Russian President Vl...",3
4,"From photography, illustration and video, to d...",6


In [None]:
# How many distinct topic indices were actually assigned to articles.
articles['Topic'].nunique()

7

# How to assign a single topic label from the "top words per topic" (remember, we have to decide the topic labels ourselves in both LDA and NMF)

In [None]:
# Dictionary mapping each topic index to a human-chosen label, applied to the
# "Topic" column to produce a readable "Topic_Label" column.
# NOTE(review): these labels look like placeholders and several do not match the
# top words printed earlier, e.g. topic 3 (police, court, attack, isis) is
# labelled "Election" and topic 4 (clinton, sanders, voters) is labelled "Music".
# Revisit the labels against the per-topic word lists before relying on them.
my_topic_dictionary = {0: "Health", 1: "Elections", 2:"Politics", 3:"Election",4:"Music", 5:"Education", 6: "whatever"}
articles["Topic_Label"] = articles["Topic"].map(my_topic_dictionary)
articles["Topic_Label"]

0        Elections
1        Elections
2        Elections
3         Election
4         whatever
           ...    
11987     Election
11988    Elections
11989       Health
11990        Music
11991     Election
Name: Topic_Label, Length: 11992, dtype: object

In [None]:
# The articles with both the numeric topic and its mapped label attached.
articles.head()

Unnamed: 0,Article,Topic,Topic_Label
0,"In the Washington of 2016, even when the polic...",1,Elections
1,Donald Trump has used Twitter — his prefe...,1,Elections
2,Donald Trump is unabashedly praising Russian...,1,Elections
3,"Updated at 2:50 p. m. ET, Russian President Vl...",3,Election
4,"From photography, illustration and video, to d...",6,whatever


In [None]:
# Number of distinct labels; it should match the number of distinct topic indices.
articles['Topic_Label'].nunique()

7

In [None]:
# Number of distinct topic indices, for comparison with the label count.
articles['Topic'].nunique()

7