In [1]:
import csv
import pickle
from pathlib import Path

import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.decomposition import NMF, LatentDirichletAllocation

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

import nltk
from nltk.corpus import stopwords as nltk_stop
from stop_words import stop_words as custom_stop

In [2]:
# Materialise NLTK's English stop words as a plain list.
# Read from `nltk.corpus.stopwords` rather than from the `nltk_stop` import
# alias: this cell rebinds `nltk_stop` to a list, so a version that called
# `nltk_stop.words(...)` would raise AttributeError the second time it ran.
nltk_stop = list(nltk.corpus.stopwords.words('english'))

In [3]:
# Notebook-wide pandas display settings: cap the rows/columns rendered for a
# frame, and never truncate cell text (None = unlimited) so full tweets show.
for option_name, option_value in (
    ('display.max_rows', 10),
    ('display.max_columns', 40),
    ('display.max_colwidth', None),
):
    pd.set_option(option_name, option_value)

In [4]:
# Load the pickled DataFrame of tweets used by every cell below (the sample
# output further down shows it carries an `original` text column and a
# cleaned `tweet` column). The path is relative to the notebook's working
# directory.
# NOTE(review): unpickling can execute arbitrary code — only load pickle
# files produced by a trusted step of this project.
data = pd.read_pickle("pickle/n2_tokenized_eff.pick")

In [5]:
# Peek at three rows. The seed is fixed so the same rows are displayed on
# every Restart & Run All instead of a different random draw each time.
data.sample(3, random_state=42)

Unnamed: 0,id,date,time,user_id,username,hashtags,trump,biden,original,tweet,num_tokens
6195,1323412817819697153,2020-11-02,23:52:58,907198734,weather_talk,[],False,True,"@rcmahoney @JoeBiden Ryan, your 15 minutes are up. Please, retreat to your safe space prior to the election results, you're not going to like what you see.",joebiden minute retreat safe space prior result,7
10474,1323411631343341568,2020-11-02,23:48:15,768510050822414337,bl00zer,[],False,True,@IIDaraII Personally my biggest gripe about biden’s side of the election is the anti-gun bullshit. the idea that banning legal guns will drop the violent crime rate is stupid and factually incorrect. apart from that i personally feel he’d fuck the 1st amendment too,personally biggest gripe joebiden side idea banning legal gun drop violentcrime rate stupid factually incorrect apart personally feel amendment,19
61146,1323396731963322369,2020-11-02,22:49:03,1960715316,noahcross02,[],False,True,"@lhjh70 @Bowden4Senate @JoeBiden Law abiding citizens aren’t commuting these crimes? So why strip the constitutional right away form millions of law abiding citizens? Doesn’t sound too great does it..and to answer your question about why we have laws, it’s quite simply really",joebiden law abiding citizen commuting crime strip constitutional away form million law abiding citizen sound answer question law simply,19


## TF/IDF Vectorizer



In [6]:
# Stop-word list handed to the TF-IDF vectorizer: the words imported from the
# `stop_words` module (`custom_stop`) followed by NLTK's English list
# (`nltk_stop`). Duplicates between the two are not removed here.
stop_words = custom_stop + nltk_stop

In [7]:
# Build the TF-IDF document-term matrix from the cleaned tweet text.
# Float `max_df` / `min_df` values are document-frequency *proportions*:
# terms appearing in more than 16% of tweets, or in fewer than 0.02% of them,
# are dropped from the vocabulary.
v_tfidf = TfidfVectorizer(
    stop_words=stop_words,
    min_df=0.0002,
    max_df=0.16,
)
doc_word_ti = v_tfidf.fit_transform(data["tweet"])

## NMF

In [8]:
# Factorise the TF-IDF matrix into 12 topics.
# `random_state` is pinned because the NNDSVD-based initialisation relies on
# a randomized SVD; without a seed the topics (and the iteration count
# printed below) can shift slightly from one run to the next.
nmf_model = NMF(n_components=12, init='nndsvda', random_state=42)

# doc_topic holds one row per tweet and one column per topic.
doc_topic = nmf_model.fit_transform(doc_word_ti)
print(f"Shape: {doc_topic.shape}")
print(f"Number of iterations used: {nmf_model.n_iter_}")

# pd.DataFrame(doc_topic)

Shape: (96000, 12)
Number of iterations used: 54


From lecture: The **topic_word** matrix shows us the resulting topics (12 here, one per NMF component) and the terms that are most strongly associated with each topic. By looking at the top words listed below, we can work out what each topic is about.


In [9]:
# List the 30 highest-weighted terms of every topic, strongest first.
#
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in
# 1.2; use `get_feature_names_out()` when it exists and fall back to the old
# method only on older installs. The result is kept as a plain list so that
# `words` has the same type either way.
if hasattr(v_tfidf, "get_feature_names_out"):
    words = list(v_tfidf.get_feature_names_out())
else:
    words = v_tfidf.get_feature_names()

# argsort sorts ascending, so slicing each row backwards from the last column
# yields the vocabulary indices of the 30 largest weights per topic.
t = nmf_model.components_.argsort(axis=1)[:, -1:-31:-1]
topic_words = [[words[idx] for idx in row] for row in t]
topic_words

[['vote',
  'popular',
  'electionday',
  'electoral',
  'party',
  'ballot',
  'electoralcollege',
  'cast',
  'democracy',
  'attorneygeneral',
  'victory',
  'mail',
  'voice',
  'life',
  'nation',
  'bidenharris',
  'enough',
  'voter',
  'counting',
  'voteblue',
  'govote',
  'line',
  'remember',
  'hillaryclinton',
  'former',
  'black',
  'future',
  'lost',
  'change',
  'blue'],
 ['kamalaharris',
  'wise',
  'cover',
  'foreign',
  'vicepresident',
  'bidenharris',
  'blue',
  'legend',
  'change',
  'farm',
  'elect',
  'boost',
  'troll',
  'social',
  'play',
  'ticket',
  'berniesanders',
  'best',
  'msnbc',
  'home',
  'future',
  'luck',
  'woman',
  'opportunity',
  'choose',
  'communism',
  'together',
  'voteblue',
  'month',
  'miss'],
 ['barackobama',
  'cnn',
  'steal',
  'speech',
  'hillaryclinton',
  'pointer',
  'campaign',
  'hammer',
  'sweet',
  'added',
  'bush',
  'crowd',
  'economy',
  'msnbc',
  'miss',
  'news',
  'administration',
  'running',
  

In [None]:
# Fit one NMF model per candidate topic count (2..14) and write the top 30
# words of every topic to a CSV file, one file per model, so the outputs can
# be compared side by side to choose the number of topics.
#
# Every name in this cell is sweep-specific on purpose: the loop must not
# overwrite `nmf_model`, `doc_topic`, `words` or `topic_words` from the
# 12-topic analysis above, otherwise later cells would silently see the last
# (14-topic) model instead.

TOPIC_WORDS_DIR = Path("../etc/topic-words")
# open(..., "w") does not create missing directories.
TOPIC_WORDS_DIR.mkdir(parents=True, exist_ok=True)

# The vocabulary does not depend on the topic count, so fetch it once.
# `get_feature_names()` was removed in scikit-learn 1.2; prefer the
# replacement and fall back only on older installs.
if hasattr(v_tfidf, "get_feature_names_out"):
    sweep_words = list(v_tfidf.get_feature_names_out())
else:
    sweep_words = v_tfidf.get_feature_names()

for n_topics in range(2, 15):
    # Seeded so that re-running the sweep regenerates identical files.
    sweep_model = NMF(n_components=n_topics, init='nndsvda', random_state=42)
    sweep_model.fit(doc_word_ti)

    # Indices of the 30 largest weights per topic, strongest first.
    top_idx = sweep_model.components_.argsort(axis=1)[:, -1:-31:-1]
    sweep_topic_words = [[sweep_words[j] for j in row] for row in top_idx]

    out_path = TOPIC_WORDS_DIR / f"topic_words_{n_topics:02d}.csv"
    with open(out_path, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow([f"topic-{k}" for k in range(1, n_topics + 1)])
        # Transpose so each topic is a column and each row is a rank.
        writer.writerows(zip(*sweep_topic_words))

In [None]:
# nmf_model.components_

In [None]:
# doc_topic

## LDA

In [None]:
# lda_model = LatentDirichletAllocation(n_components=5)
# doc_topic = lda_model.fit_transform(doc_word_ti)
# doc_topic.shape

In [None]:
# words = cv.get_feature_names()
# t = lda_model.components_.argsort(axis=1)[:,-1:-7:-1]
# topic_words = [[words[e] for e in l] for l in t]
# topic_words

## Sentiment Analysis

In [None]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer 

In [None]:
# sid_obj = SentimentIntensityAnalyzer()
# sentiment = []
# for text in data.tweet:
#     sentiment.append(sid_obj.polarity_scores(text))
    
# pd.concat([data,pd.DataFrame(sentiment)], axis=1)