In [1]:
import pickle

import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.decomposition import NMF, LatentDirichletAllocation

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

import nltk
from nltk.corpus import stopwords

In [2]:
nltk_stop = list(stopwords.words('english'))  

In [3]:
pd.set_option('display.max_rows', 10)
pd.set_option('display.max_columns', 40)
pd.set_option('display.max_colwidth', None)

In [4]:
data = pd.read_pickle("pickle/n2_tokenized.pick")
data = data[:50]

## TF/IDF Vectorizer



In [5]:
from stop_words import stop_words as custom_stop
stop_words = custom_stop + nltk_stop
print(len(stop_words))

283


In [6]:
v_tfidf = TfidfVectorizer(max_df=0.15, min_df=0.0001, stop_words=stop_words)
doc_word_ti = v_tfidf.fit_transform(data.tweet)

## NMF

In [7]:
nmf_model = NMF(n_components=7, init='nndsvda')
doc_topic = nmf_model.fit_transform(doc_word_ti)
print(f"Shape: {doc_topic.shape}")
print(f"Number of iterations used: {nmf_model.n_iter_}")

pd.DataFrame(doc_topic)

Shape: (50, 7)
Number of iterations used: 126


Unnamed: 0,0,1,2,3,4,5,6
0,0.000000,0.000000,0.195982,0.0,0.129470,0.124709,0.000000
1,0.000000,0.000000,0.000000,0.0,0.065434,0.000000,0.000000
2,0.021523,0.000000,0.000000,0.0,0.000000,0.006746,0.006081
3,0.000000,0.000000,0.000000,0.0,0.000000,0.392895,0.000000
4,0.239711,0.000000,0.352549,0.0,0.015609,0.000000,0.000000
...,...,...,...,...,...,...,...
45,0.000000,0.000000,0.000000,0.0,0.184746,0.000000,0.000000
46,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.615793
47,0.000000,0.000000,0.453973,0.0,0.000000,0.000000,0.000000
48,0.007957,0.000000,0.000000,0.0,0.000000,0.015588,0.005105


In [8]:
data.to_pickle("pickle/n2_tokenized.pick")

From lecture: The **doc_topic** matrix shows us the documents we started with, and how each document is made up of the 2 resulting topics. We don't know yet what the topics are.

From lecture: The **topic_word** matrix shows us the 2 resulting topics, and the terms that are associated with each topic. By looking at the words below, we an figure out what the topics are.


In [9]:
words = v_tfidf.get_feature_names()
t = nmf_model.components_.argsort(axis=1)[:,-1:-7:-1]
topic_words = [[words[e] for e in l] for l in t]
topic_words

[['win', 'think', 'black', 'enough', 'probably', 'future'],
 ['get', 'trying', 'would', 'everything', 'wait', 'kyle'],
 ['joe', 'bye', 'showing', 'via', 'video', 'often'],
 ['la', 'cest', 'pendant', 'pour', 'fils', 'corruption'],
 ['kamala_harris', 'notch', 'casting', 'sanity', 'nation', 'return'],
 ['dont', 'believe', 'able', 'irrational', 'know', 'want'],
 ['wrong', 'based', 'banana', 'voting', 'hate', 'circle']]

Additional words to combine:

```
['voter_supression', 'black_women', 'white_women', 'white_men', 'black_men', 'white_men', 'people_of_color', 'whole_world', 'stock_market', 'orange_man', 'campaign_rally', 'cover_up']
```

In [10]:
nmf_model.components_

array([[0.00000000e+00, 0.00000000e+00, 3.40482907e-04, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        8.75069114e-02, 0.00000000e+00, 8.88691891e-03],
       ...,
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 8.28936949e-03],
       [2.46933962e-01, 1.99277281e-18, 3.22654021e-02, ...,
        0.00000000e+00, 0.00000000e+00, 3.96632926e-02],
       [0.00000000e+00, 0.00000000e+00, 6.30639090e-02, ...,
        0.00000000e+00, 2.32160956e-01, 0.00000000e+00]])

In [11]:
doc_topic

array([[0.00000000e+00, 0.00000000e+00, 1.95981638e-01, 0.00000000e+00,
        1.29469566e-01, 1.24708791e-01, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        6.54337354e-02, 0.00000000e+00, 0.00000000e+00],
       [2.15234056e-02, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 6.74584962e-03, 6.08128226e-03],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 3.92895219e-01, 0.00000000e+00],
       [2.39710772e-01, 0.00000000e+00, 3.52548670e-01, 0.00000000e+00,
        1.56086739e-02, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 6.53070963e-32,
        1.62188434e-02, 2.93762378e-01, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 3.34322434e-24,
        0.00000000e+00, 0.00000000e+00, 4.96126629e-16],
       [2.16229720e-01, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0

## LDA

In [12]:
lda_model = LatentDirichletAllocation(n_components=5)
doc_topic = lda_model.fit_transform(doc_word_ti)
doc_topic.shape

(50, 5)

In [13]:
words = cv.get_feature_names()
t = lda_model.components_.argsort(axis=1)[:,-1:-7:-1]
topic_words = [[words[e] for e in l] for l in t]
topic_words

NameError: name 'cv' is not defined

## Sentiment Analysis

In [None]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer 

In [None]:
sid_obj = SentimentIntensityAnalyzer()
sentiment = []
for text in data.tweet:
    sentiment.append(sid_obj.polarity_scores(text))
    
pd.concat([data,pd.DataFrame(sentiment)], axis=1)