In [1]:
import pickle

import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.decomposition import NMF, LatentDirichletAllocation

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

import nltk
from nltk.corpus import stopwords

In [2]:
# English stop-word list from the NLTK corpus, copied into a fresh list
# so later edits do not touch the object returned by NLTK.
nltk_stop = [word for word in stopwords.words('english')]

In [3]:
# Notebook display settings: show at most 10 rows and 40 columns, and never
# truncate cell contents (the tweet text is long).
pd.options.display.max_rows = 10
pd.options.display.max_columns = 40
pd.options.display.max_colwidth = None

In [4]:
# Load the tweet DataFrame from the local pickle file.
# Unpickling can execute arbitrary code, so only load files you trust.
pickle_path = "pickle/N2TK_TEMP.pick"
data = pd.read_pickle(pickle_path)

In [5]:
# Preview the first three rows of the loaded frame.
data.iloc[:3]

Unnamed: 0,id,date,time,user_id,username,hashtags,trump,biden,original,tweet
181142,1323379284434669568,2020-11-02,21:39:43,2820503362,artistacriseida,[],False,True,"All these articles showing that Biden is in the lead.... IGNORE THAT AND STILL GO VOTE. All of these maps showing information that may or may not be correct won’t matter on Election Day. Hillary was also in the lead last election, just do your part. ⁽ᶠᵘᶜᵏ ᵗʳᵘᵐᵖ⁾",showing joe_biden lead ignore go_vote showing information may may correct matter election_day lead last_election part
0,1323414585995526144,2020-11-02,23:59:59,1312487180258820096,annapieters17,[],False,True,@FoxNews Lady Gaga’s a nobody. Can’t figure out her own life and can’t even see nobody can help Biden. He’s out of the game from the day he gets in the game.,lady nobody figure life nobody help joe_biden game game
4,1323414585232293888,2020-11-02,23:59:59,2335763630,kylechwatt,[],False,True,"@The_Grupp “It is purely a fortuity that this isn’t one of the great mass casualty events in American history,” Ron Klain, who was Biden’s chief of staff at the time, said of H1N1 in 2019.” https://t.co/Umi317supK",purely fortuity one great mass casualty history joe_biden chief staff time


## TF/IDF Vectorizer



In [6]:
# Extra terms excluded from the TF-IDF vocabulary.
# 'hope' was tried as a stop word and is currently disabled (kept in the vocabulary).
stop_words = ['believe', 'family', 'abcd']

In [7]:
# TF-IDF vectoriser over the `tweet` column.
# min_df / max_df are floats, i.e. document-frequency proportions: terms in
# fewer than 0.01% or more than 20% of the documents are dropped, together
# with the custom stop words.
v_tfidf = TfidfVectorizer(
    stop_words=stop_words,
    min_df=0.0001,
    max_df=0.20,
)

# Sparse document-term matrix (one row per tweet, one column per kept term).
doc_word_ti = v_tfidf.fit_transform(data["tweet"])

## NMF

In [8]:
# Factorise the TF-IDF document-term matrix into 12 topics with NMF.
# random_state is pinned so that Restart & Run All yields the same
# factorisation (and therefore the same topic numbering) on every run.
nmf_model = NMF(n_components=12, init='nndsvda', random_state=42)

# doc_topic has one row per document and one column per topic.
doc_topic = nmf_model.fit_transform(doc_word_ti)
print(f"Shape: {doc_topic.shape}")
print(f"Number of iterations used: {nmf_model.n_iter_}")

# Rich display of the document-topic weights (truncated by display.max_rows).
pd.DataFrame(doc_topic)

Shape: (96000, 12)
Number of iterations used: 83


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,0.001750,0.000140,0.000551,0.003347,0.001436,0.003589,0.003483,0.002183,0.002483,0.002128,0.002709,0.003931
1,0.001090,0.000861,0.000973,0.002905,0.001090,0.000701,0.001116,0.001272,0.002492,0.003854,0.000857,0.008037
2,0.000000,0.000000,0.000339,0.000952,0.000000,0.000000,0.001641,0.058189,0.000000,0.003089,0.001606,0.011817
3,0.000687,0.000047,0.000460,0.002277,0.000042,0.000542,0.000741,0.002020,0.010360,0.002786,0.009283,0.002262
4,0.000175,0.000110,0.001108,0.000000,0.000216,0.076623,0.000000,0.000000,0.000000,0.000335,0.000000,0.005503
...,...,...,...,...,...,...,...,...,...,...,...,...
95995,0.000000,0.131143,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
95996,0.000000,0.046719,0.000080,0.000000,0.000128,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000835
95997,0.000263,0.000000,0.000212,0.000000,0.000000,0.000000,0.052176,0.050152,0.000532,0.000031,0.000070,0.000000
95998,0.000568,0.001478,0.000000,0.000079,0.000374,0.001868,0.000000,0.000021,0.000000,0.000000,0.000439,0.000335


From lecture: The **topic_word** matrix (`nmf_model.components_`) shows us the resulting topics (12 here, one per NMF component) and the terms that are associated with each topic. By looking at the top words for each topic below, we can figure out what the topics are.


In [9]:
# Vocabulary lookup: column index of the TF-IDF matrix -> term.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer get_feature_names_out() and fall back only on older releases.
if hasattr(v_tfidf, "get_feature_names_out"):
    words = v_tfidf.get_feature_names_out()
else:
    words = v_tfidf.get_feature_names()

# For every topic (row of components_), take the column indices of the 14
# largest weights, ordered from heaviest to lightest.
t = nmf_model.components_.argsort(axis=1)[:, -1:-15:-1]

# Translate the indices into terms: one list of 14 words per topic.
topic_words = [[words[term_idx] for term_idx in topic_row] for topic_row in t]
topic_words

[['vote',
  'bidenharris',
  'blue',
  'popular',
  'dont',
  'party',
  'change',
  'black',
  'save',
  'republican',
  'usa',
  'democracy',
  'red',
  'ballot'],
 ['la',
  'si',
  'al',
  'ha',
  'ya',
  'presidente',
  'ser',
  'gane',
  'solo',
  'blanca',
  'yo',
  'usa',
  'hay',
  'bien'],
 ['kamala_harris',
  'love',
  'terrible',
  'hope',
  'joe',
  'change',
  'bidenharris',
  'vice_president',
  'elect',
  'boost',
  'best',
  'usa',
  'campaign',
  'troll'],
 ['get',
  'want',
  'rid',
  'covid',
  'going',
  'national',
  'election_day',
  'pretty',
  'dont',
  'question',
  'currently',
  'scale',
  'help',
  'doesnt'],
 ['via',
  'rally',
  'live',
  'maga',
  'watch',
  'opportunity',
  'campaign',
  'michigan',
  'china',
  'mi',
  'yahoo',
  'breaking',
  'plan',
  'evidence'],
 ['win',
  'going',
  'lose',
  'hope',
  'landslide',
  'big',
  'electoral_college',
  'want',
  'popular',
  'wrong',
  'god',
  'matter',
  'need',
  'bidenharris'],
 ['voting',
  'want'

In [10]:
# Topic-term weight matrix learned by NMF: one row per topic (12), one column
# per vocabulary term of the TF-IDF vectoriser.
nmf_model.components_

array([[5.04167657e-04, 0.00000000e+00, 2.44254170e-04, ...,
        0.00000000e+00, 1.03066845e-03, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        9.40557379e-04, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 2.68991338e-04, 0.00000000e+00, ...,
        0.00000000e+00, 4.06720275e-04, 6.84963707e-04],
       ...,
       [6.99250368e-04, 4.74348523e-04, 2.76565022e-03, ...,
        4.72712033e-03, 4.83826847e-03, 1.42972992e-04],
       [8.21652299e-05, 8.08438889e-04, 7.77167159e-04, ...,
        1.10123885e-03, 0.00000000e+00, 0.00000000e+00],
       [2.80668473e-04, 1.63968889e-03, 0.00000000e+00, ...,
        1.15045893e-03, 4.06741279e-04, 1.30137218e-03]])

In [11]:
# Document-topic weight matrix returned by nmf_model.fit_transform:
# one row per document, one column per topic.
doc_topic

array([[1.74985180e-03, 1.40207384e-04, 5.50517630e-04, ...,
        2.12841727e-03, 2.70936304e-03, 3.93105202e-03],
       [1.09003221e-03, 8.61214468e-04, 9.73207306e-04, ...,
        3.85402327e-03, 8.57081934e-04, 8.03733671e-03],
       [0.00000000e+00, 0.00000000e+00, 3.38964311e-04, ...,
        3.08896965e-03, 1.60593430e-03, 1.18165075e-02],
       ...,
       [2.62518327e-04, 0.00000000e+00, 2.11557388e-04, ...,
        3.11270352e-05, 7.00913277e-05, 0.00000000e+00],
       [5.67930182e-04, 1.47751046e-03, 0.00000000e+00, ...,
        0.00000000e+00, 4.39037967e-04, 3.34989975e-04],
       [6.95471036e-04, 3.70765263e-05, 9.31469817e-04, ...,
        1.81521834e-03, 0.00000000e+00, 5.60372731e-03]])

## LDA

In [12]:
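# Disabled LDA experiment, kept for reference.
# NOTE: re-enabling this cell rebinds `doc_topic`, replacing the NMF
# document-topic matrix computed in the NMF section.
# lda_model = LatentDirichletAllocation(n_components=5)
# doc_topic = lda_model.fit_transform(doc_word_ti)
# doc_topic.shape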

In [13]:
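# Disabled companion to the LDA cell: lists the top 6 terms of each LDA topic.
# NOTE(review): `cv` is not defined anywhere in the visible notebook; bind it to
# the fitted vectoriser before re-enabling. Also note that get_feature_names()
# was removed in scikit-learn 1.2 (use get_feature_names_out()).
# words = cv.get_feature_names()
# t = lda_model.components_.argsort(axis=1)[:,-1:-7:-1]
# topic_words = [[words[e] for e in l] for l in t]
# topic_words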

## Sentiment Analysis

In [14]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer 

In [15]:
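# Disabled VADER sentiment experiment, kept for reference.
# NOTE(review): `data` does not have a 0..n-1 index (see the head() output),
# while pd.DataFrame(sentiment) does; pass index=data.index (or reset the
# index) before concatenating on axis=1, otherwise the rows will be misaligned.
# sid_obj = SentimentIntensityAnalyzer()
# sentiment = []
# for text in data.tweet:
#     sentiment.append(sid_obj.polarity_scores(text))

# pd.concat([data,pd.DataFrame(sentiment)], axis=1)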