In [76]:
import pickle

import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.decomposition import NMF, LatentDirichletAllocation

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [2]:
pd.set_option('display.max_rows', 10)
pd.set_option('display.max_columns', 40)
pd.set_option('display.max_colwidth', None)

In [3]:
data = pd.read_pickle("pickle/n2_tokenized.pick")

In [117]:
mask = data.tweet.str.len() > 60
data = data[mask]
data.shape

(46159, 6)

In [118]:
data.sample(3)

Unnamed: 0,trump,biden,hashtags,user_id,original,tweet
21008,False,True,[],1273194782937026560,@Daniell76509317 Biden is the favorite candidate of China. They want him elected so they can resume stealing our jobs and technology. Joe will give them the keys to our country!,joe_biden favorite candidate china resume stealing technology joe country
97185,True,False,[],17486154,Keith Obermann is RIGHT! The network that let's Trump break level #1 will live in infamy. The Twitterverse will literally never stop talking about you as THAT miserable loser network. Do NOT further cripple our DEMOCRACY.,right network donald_trump break level live infamy literally never stop talking miserable loser network cripple democracy
81776,False,True,[],1313772660019560448,"@FLOTUS: ""Why should we trust Joe Biden when he suggests he can do a better job? All you have to do is look back on his 47 years in political life to determine whether you think he's suddenly capable of putting the American people first."" https://t.co/ZDxIUOyhuN",trust joe_biden better job look back political life determine whether suddenly capable people first


## Count Vectorizer

In [120]:
cv = CountVectorizer(max_df=0.1, stop_words=stop_words)
doc_words = cv.fit_transform(data.tweet)

In [121]:
data.iloc[5]

trump                                                                                                                                                                                                                                                                                                 False
biden                                                                                                                                                                                                                                                                                                  True
hashtags                                                                                                                                                                                                                                                                                                 []
user_id                                                                                             

## TF/IDF Vectorizer



In [183]:
stop_words = [
    'people',
    'like'
]

In [184]:
v_tfidf = TfidfVectorizer(max_df=0.2)
doc_word_ti = v_tfidf.fit_transform(data.tweet)

## NMF

In [191]:
nmf_model = NMF(n_components=10, max_iter=500, init='nndsvd')
doc_topic = nmf_model.fit_transform(doc_word_ti)
print(f"Shape: {doc_topic.shape}")
print(f"Number of iterations used: {nmf_model.n_iter_}")



Shape: (46159, 10)
Number of iterations used: 39


From lecture: The **doc_topic** matrix shows us the documents we started with, and how each document is made up of the 2 resulting topics. We don't know yet what the topics are.

From lecture: The **topic_word** matrix shows us the 2 resulting topics, and the terms that are associated with each topic. By looking at the words below, we an figure out what the topics are.


In [192]:
words = v_tfidf.get_feature_names()
t = nmf_model.components_.argsort(axis=1)[:,-1:-7:-1]
topic_words = [[words[e] for e in l] for l in t]
topic_words

[['win', 'even', 'need', 'see', 'said', 'good'],
 ['vote', 'please', 'voting', 'tomorrow', 'kamala_harris', 'let'],
 ['election', 'day', 'tomorrow', 'night', 'outcome', 'general'],
 ['covid', 'today', 'exceed', 'combined', 'war', 'badly'],
 ['president', 'kamala_harris', 'united', 'rally', 'history', 'love'],
 ['people', 'voting', 'terrible', 'problem', 'person', 'black'],
 ['china', 'million', 'support', 'made', 'tony', 'swindling'],
 ['country', 'ask', 'run', 'incredibly', 'selfish', 'narcissist'],
 ['like', 'feel', 'look', 'really', 'someone', 'something'],
 ['white', 'house', 'around', 'never', 'black', 'racist']]

In [175]:
nmf_model.components_

array([[0.00037721, 0.        , 0.00222538, ..., 0.00060699, 0.00926637,
        0.        ],
       [0.00108038, 0.00057542, 0.00302353, ..., 0.        , 0.00792848,
        0.00798595]])

In [107]:
doc_topic

array([[0.01562575, 0.01831046],
       [0.00063259, 0.01221557],
       [0.00033668, 0.00859431],
       ...,
       [0.00074711, 0.01100041],
       [0.00024427, 0.00837314],
       [0.00092973, 0.01882236]])

## LDA

In [100]:
lda_model = LatentDirichletAllocation(n_components=5)
doc_topic = lda_model.fit_transform(doc_word_ti)
doc_topic.shape

KeyboardInterrupt: 

In [None]:
words = cv.get_feature_names()
t = lda_model.components_.argsort(axis=1)[:,-1:-7:-1]
topic_words = [[words[e] for e in l] for l in t]
topic_words

## Sentiment Analysis

In [28]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer 

In [16]:
sid_obj = SentimentIntensityAnalyzer()
sentiment = []
for text in data.tweet:
    sentiment.append(sid_obj.polarity_scores(text))
    
pd.concat([data,pd.DataFrame(sentiment)], axis=1)

ValueError: Shape of passed values is (285793, 10), indices imply (191265, 10)