In [1]:
import pickle

import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.decomposition import NMF, LatentDirichletAllocation

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [2]:
# Notebook-wide pandas display settings: cap printed rows/columns and
# never truncate cell text (tweets are long and need to be read in full).
DISPLAY_OPTIONS = {
    'display.max_rows': 10,
    'display.max_columns': 40,
    'display.max_colwidth': None,
}
for option_name, option_value in DISPLAY_OPTIONS.items():
    pd.set_option(option_name, option_value)

In [3]:
# Load the tweet DataFrame used by every cell below (the `tweet` column is what
# gets vectorized). Presumably written by an earlier preprocessing step — confirm.
# NOTE(review): unpickling can execute arbitrary code; only load pickle files
# you created yourself or otherwise trust.
data = pd.read_pickle("pickle/n2_tokenized.pick")

In [4]:
# Eyeball three random rows to sanity-check the columns. The seed is pinned so
# the preview is identical after Restart Kernel -> Run All.
data.sample(3, random_state=42)

Unnamed: 0,trump,biden,hashtags,user_id,original,tweet
46715,False,True,[],114111998,"@rebeccabardess @ActiveHomeRenew @FI_Playbook @ProjectLincoln And I agree! They are pulling in a chaotic element, I know the dems cast a wide net, and I get they need all the votes they can take. But they're kinda letting a poison in, Biden wanting Kasich and Cindy McCain into his cabinet? Kasich is... https://t.co/KM2wfdsOnS",agree chaotic element cast wide net need take theyre poison joe_biden wanting cabinet
39203,False,True,[],254854633,It’s so hard seeing really compassionate really intelligent people who I used to view as mentors singing Biden’s praises. Like Honors College professors I still love with my whole heart.,hard seeing really compassionate really intelligent people used view singing joe_biden like college still love whole heart
81267,True,False,[],79810128,"@marklevinshow What is the matter with you, Levin? You used to be a man of truth, but you self immolated because of Trump. I will never understand what you and Limbaugh have done for him.",matter levin used man truth self donald_trump never understand done


## TF/IDF Vectorizer



In [29]:
# Corpus-specific stop words passed to the TF-IDF vectorizer. These are filler
# terms, generic election vocabulary and a few place/person names that should
# not define a topic. Each word appears exactly once (the original list carried
# 'people' twice). Kept as a plain list because that is the type scikit-learn
# accepts for a custom `stop_words` argument.
stop_words = [
    # generic filler / conversational words
    'people', 'like', 'even', 'need', 'said', 'see', 'let', 'day', 'today',
    'tomorrow', 'ask', 'look', 'tony', 'someone', 'really', 'say', 'way',
    'pa', 'everyone', 'run', 'state', 'person', 'make', 'much', 'gave',
    'still', 'could', 'right', 'highest', 'total', 'went', 'instead', 'take',
    'better', 'also', 'well', 'thing', 'good', 'never', 'new', 'york', 'done',
    'back', 'made', 'show', 'nothing', 'come', 'simple', 'support', 'thats',
    'quite', 'entire', 'every', 'many',
    # election vocabulary and place names
    'count', 'election', 'night', 'bring', 'thank', 'president', 'country',
    'please', 'city', 'michigan', 'traverse',
]

In [30]:
# Build the document-term matrix from the cleaned tweet text.
# max_df=0.2 discards any term that occurs in more than 20% of the tweets,
# in addition to the hand-picked stop words defined in `stop_words`.
v_tfidf = TfidfVectorizer(
    stop_words=stop_words,
    max_df=0.2,
)
doc_word_ti = v_tfidf.fit_transform(data["tweet"])

## NMF

In [31]:
# Factorize the TF-IDF matrix into 10 topics. `fit_transform` returns the
# document-topic weights; the topic-term weights stay on the fitted model
# as `nmf_model.components_`.
nmf_model = NMF(
    n_components=10,
    init='nndsvd',
    max_iter=500,
)
doc_topic = nmf_model.fit_transform(doc_word_ti)

# Report the result shape and how many of the allowed iterations were used.
print(f"Shape: {doc_topic.shape}")
print(f"Number of iterations used: {nmf_model.n_iter_}")



Shape: (46159, 10)
Number of iterations used: 43


From lecture: The **doc_topic** matrix shows us the documents we started with, and how each document is made up of the resulting topics (10 in this notebook, set by `n_components`). We don't know yet what the topics are.

From lecture: The **topic_word** matrix shows us the resulting topics (10 in this notebook), and the terms that are associated with each topic. By looking at the words below, we can figure out what the topics are.


In [32]:
# Vocabulary terms, positioned to match the columns of nmf_model.components_.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer get_feature_names_out() and fall back only on older installs.
try:
    words = v_tfidf.get_feature_names_out()
except AttributeError:
    words = v_tfidf.get_feature_names()

# argsort is ascending, so each row is read backwards from its last element;
# the -1:-10:-1 slice keeps the 9 highest-weighted term indices per topic.
t = nmf_model.components_.argsort(axis=1)[:, -1:-10:-1]
topic_words = [[words[term_idx] for term_idx in topic_row] for topic_row in t]
topic_words

[['vote',
  'blue',
  'change',
  'popular',
  'red',
  'fame',
  'famous',
  'bidenharris',
  'lady'],
 ['man',
  'world',
  'trying',
  'believe',
  'care',
  'ever',
  'last',
  'party',
  'love'],
 ['covid',
  'exceed',
  'combined',
  'war',
  'badly',
  'losing',
  'foreign',
  'death',
  'locked'],
 ['china',
  'million',
  'swindling',
  'evidence',
  'breaking',
  'xi',
  'russia',
  'communist',
  'usa'],
 ['win',
  'electoral',
  'college',
  'popular',
  'lose',
  'landslide',
  'big',
  'hope',
  'chance'],
 ['voting',
  'terrible',
  'problem',
  'son',
  'account',
  'accumulate',
  'father',
  'wise',
  'daddy'],
 ['white',
  'house',
  'around',
  'black',
  'racist',
  'united',
  'evict',
  'loser',
  'disseminate'],
 ['kamala_harris',
  'hope',
  'terrible',
  'vice',
  'united',
  'joe',
  'elect',
  'son',
  'father'],
 ['national',
  'pretty',
  'scale',
  'currently',
  'debt',
  'history',
  'worst',
  'modern',
  'least'],
 ['rally',
  'watch',
  'live',
  'ma

In [33]:
# Raw topic-term weights learned by NMF (one row per topic, one column per
# vocabulary term); the top-word lists are derived from this matrix.
nmf_model.components_

array([[0.        , 0.00074683, 0.0013498 , ..., 0.        , 0.        ,
        0.        ],
       [0.00199609, 0.        , 0.00397857, ..., 0.        , 0.00041632,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.00159712, 0.00089953, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.00436477, ..., 0.        , 0.01065002,
        0.06456401]])

In [34]:
# Document-topic weights returned by the NMF fit_transform call
# (one row per tweet, one column per topic).
doc_topic

array([[2.61936970e-02, 1.64250000e-02, 0.00000000e+00, ...,
        0.00000000e+00, 9.00551581e-04, 0.00000000e+00],
       [2.25302073e-03, 1.26445181e-02, 4.60508835e-05, ...,
        9.57676137e-05, 0.00000000e+00, 2.77774346e-03],
       [0.00000000e+00, 8.32593281e-03, 0.00000000e+00, ...,
        2.56043932e-03, 4.28948979e-03, 4.74049488e-03],
       ...,
       [3.77743587e-02, 2.47412607e-02, 0.00000000e+00, ...,
        0.00000000e+00, 1.88910194e-03, 0.00000000e+00],
       [0.00000000e+00, 1.20110895e-02, 0.00000000e+00, ...,
        3.08801237e-03, 2.00323496e-04, 6.72587954e-03],
       [9.43072050e-04, 1.47143549e-02, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 3.35364303e-04]])

## LDA

In [None]:
# Fit a 5-topic LDA model. LDA's variational fit starts from a random
# initialization, so the seed is pinned to make topics reproducible across
# Restart Kernel -> Run All.
# NOTE(review): LDA is formulated over term counts; it runs on these TF-IDF
# weights, but a CountVectorizer matrix is the conventional input — confirm
# which was intended.
lda_model = LatentDirichletAllocation(n_components=5, random_state=42)

# NOTE: this rebinds `doc_topic`, replacing the NMF document-topic matrix
# computed earlier in the notebook.
doc_topic = lda_model.fit_transform(doc_word_ti)
doc_topic.shape

In [None]:
# The LDA model was fit on `doc_word_ti`, which came from `v_tfidf`, so that
# vectorizer owns the matching vocabulary. The previous `cv` name is never
# defined in this notebook and raised NameError on a fresh kernel.
# get_feature_names() was removed in scikit-learn 1.2; fall back to it only on
# older installs.
try:
    words = v_tfidf.get_feature_names_out()
except AttributeError:
    words = v_tfidf.get_feature_names()

# argsort is ascending, so read each row from the end: the -1:-7:-1 slice
# keeps the 6 highest-weighted term indices per topic.
t = lda_model.components_.argsort(axis=1)[:, -1:-7:-1]
topic_words = [[words[term_idx] for term_idx in topic_row] for topic_row in t]
topic_words

## Sentiment Analysis

In [None]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer 

In [None]:
# Score every tweet with VADER; polarity_scores returns one dict of scores per
# text, collected here in the same order as the rows of `data`.
# NOTE(review): this scores the cleaned `tweet` column. VADER also uses
# punctuation and capitalization cues, which the `original` column would
# retain — confirm which text is intended.
sid_obj = SentimentIntensityAnalyzer()
sentiment = [sid_obj.polarity_scores(text) for text in data.tweet]

# `data` keeps its original, non-contiguous row labels. A scores frame with a
# default 0..n-1 index would be aligned by label in the axis=1 concat, pairing
# scores with the wrong tweets and padding the rest with NaN. Reusing
# data.index keeps each score on its own row.
pd.concat([data, pd.DataFrame(sentiment, index=data.index)], axis=1)