In [1]:
import pickle

import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.decomposition import NMF, LatentDirichletAllocation

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

import nltk
from nltk.corpus import stopwords as nltk_stop
from stop_words import stop_words as custom_stop

In [2]:
# Materialise NLTK's English stop words as a plain list. The name `nltk_stop`
# is rebound from the imported corpus reader to that list, so the call is
# guarded: on a re-run the name already holds a list (no `.words`) and is kept.
if hasattr(nltk_stop, "words"):
    nltk_stop = list(nltk_stop.words('english'))

In [3]:
# Display limits for DataFrames rendered in this notebook: at most 10 rows and
# 40 columns are shown, and cell text is never truncated (max_colwidth=None).
for option_name, option_value in {
    'display.max_rows': 10,
    'display.max_columns': 40,
    'display.max_colwidth': None,
}.items():
    pd.set_option(option_name, option_value)

In [4]:
# Load the pickled DataFrame used for the rest of the notebook (the sample
# shown in the next cell has one row per tweet, with a cleaned `tweet` column).
# NOTE(review): unpickling can execute arbitrary code - only load pickle files
# that this project produced itself.
data = pd.read_pickle(filepath_or_buffer="pickle/n2_tokenized_eff.pick")

In [5]:
# Peek at three randomly chosen rows (unseeded, so the rows differ per run).
data.sample(n=3)

Unnamed: 0,id,date,time,user_id,username,hashtags,trump,biden,original,tweet,num_tokens
1613,1323414108092420097,2020-11-02,23:58:05,116649183,oriyod,[],False,True,@roguesnradvisor I’ve done all I can. Let me live in the Hope for today. I voted for the next President : Joe Biden. the subconsciously cynical will use my mind as it’s personal playground tomorrow.,subconsciously cynical use mind personal playground,6
81956,1323400816485634048,2020-11-02,23:05:16,1237790695622148096,listenhere12,[],True,False,@realDonaldTrump This has to be the most egregious lies that Trump has ever told ... get out your dictionaries. REPORT,egregious lie told dictionary report,5
106741,1323382621976453120,2020-11-02,21:52:58,11575102,tifotter,[],False,True,"@CivMilAir @atrupar Joe Biden will adjust your thermostat, reset all your passwords, leave your car dome light on, leave dishes in the sink, rake leaves and then just leave them. In a pile!",adjust thermostat reset password leave car dome light leave dish sink rake leaf leave pile,15


## TF/IDF Vectorizer



In [6]:
# Combined stop-word list: the project's custom words first, then NLTK's.
stop_words = [*custom_stop, *nltk_stop]

In [7]:
# TF-IDF vectorizer over the cleaned tweet text. With float thresholds,
# scikit-learn treats max_df / min_df as proportions of documents: terms in
# more than 16% or fewer than 0.02% of tweets are dropped from the vocabulary.
v_tfidf = TfidfVectorizer(
    stop_words=stop_words,
    min_df=0.0002,
    max_df=0.16,
)

# Document-term matrix: one row per tweet, one column per retained term.
doc_word_ti = v_tfidf.fit_transform(data["tweet"])

## NMF

Using the code below, I was able to identify the proper number of topics to use with NMF. 

In [8]:
# Write out the top words of NMF models with different topic counts to CSV
# files, so the topic lists can be compared when choosing the number of topics.

import csv


def write_topic_word_csvs(doc_word, vectorizer, topic_counts=range(2, 18),
                          n_top_words=30, out_dir="../etc/topic-words"):
    """Fit one NMF model per topic count and save each model's top words as CSV.

    Parameters
    ----------
    doc_word : document-term matrix to factorise (e.g. ``doc_word_ti``).
    vectorizer : fitted vectorizer whose vocabulary matches ``doc_word``'s columns.
    topic_counts : iterable of ``n_components`` values to try.
    n_top_words : number of highest-weighted words kept per topic.
    out_dir : existing directory that receives ``topic_words_<NN>.csv``.

    Each CSV has one column per topic (header ``topic-1`` ... ``topic-N``) and
    one row per rank, heaviest word first.
    """
    # `get_feature_names` was removed in scikit-learn 1.2; prefer the newer
    # accessor and fall back for older installations.
    get_names = getattr(vectorizer, "get_feature_names_out", None)
    if get_names is None:
        get_names = vectorizer.get_feature_names
    # The vocabulary does not depend on the topic count, so fetch it once.
    words = list(get_names())

    for n_topics in topic_counts:
        model = NMF(n_components=n_topics, init='nndsvda', max_iter=400)
        model.fit(doc_word)
        # Per topic row: indices of the n_top_words largest weights, descending.
        top_idx = model.components_.argsort(axis=1)[:, -1:-(n_top_words + 1):-1]
        # Transpose so that each topic becomes a CSV column.
        columns = np.array([[words[j] for j in row] for row in top_idx]).T

        with open(f"{out_dir}/topic_words_{n_topics:02d}.csv", "w", newline="") as f:
            writer = csv.writer(f)
            writer.writerow([f"topic-{k}" for k in range(1, n_topics + 1)])
            writer.writerows(columns)


# The sweep fits one model per candidate topic count, so it is not executed as
# part of a normal run. Uncomment to regenerate the CSV files.
# write_topic_word_csvs(doc_word_ti, v_tfidf)

It turns out the best result is 15 topics. Each topic represents a logical and relevant concept: there are enough topics to keep similar concepts apart, but not so many that a single concept is split across several near-duplicate topics.

In [9]:
# Fit the final 15-topic NMF model on the TF-IDF matrix. The seed is pinned so
# that any randomness in the fit is reproducible: the topic labels in the
# markdown further down are assigned by topic position, so the ordering must
# not drift between runs.
nmf_model = NMF(n_components=15, init='nndsvda', random_state=42)

# Document-topic weights: one row per tweet, one column per topic.
doc_topic = nmf_model.fit_transform(doc_word_ti)
print(f"Shape: {doc_topic.shape}")
print(f"Number of iterations used: {nmf_model.n_iter_}")

# pd.DataFrame(doc_topic)

Shape: (96000, 15)
Number of iterations used: 32


In [10]:
# Vocabulary terms in the column order of the TF-IDF matrix.
# `get_feature_names` was removed in scikit-learn 1.2, so use
# `get_feature_names_out` when available and fall back on older versions.
if hasattr(v_tfidf, "get_feature_names_out"):
    words = list(v_tfidf.get_feature_names_out())
else:
    words = v_tfidf.get_feature_names()

# For each topic (a row of components_), take the indices of the 7
# highest-weighted terms, heaviest first.
t = nmf_model.components_.argsort(axis=1)[:, -1:-8:-1]
topic_words = [[words[term_idx] for term_idx in topic_row] for topic_row in t]
topic_words

[['vote',
  'electoral',
  'party',
  'ballot',
  'cast',
  'democracy',
  'attorneygeneral'],
 ['barackobama',
  'cnn',
  'steal',
  'speech',
  'hillaryclinton',
  'pointer',
  'hammer'],
 ['voting', 'wise', 'cover', 'foreign', 'voteblue', 'bidenharris', 'party'],
 ['year', 'job', 'tax', 'michigan', 'history', 'growth', 'two'],
 ['voter', 'poll', 'black', 'believe', 'care', 'life', 'guy'],
 ['lady', 'job', 'legend', 'video', 'woman', 'brave', 'change'],
 ['trumpsupporter',
  'racist',
  'blacklivesmatter',
  'white',
  'block',
  'riot',
  'police'],
 ['votejoebiden', 'change', 'woman', 'government', 'fame', 'famous', 'brave'],
 ['rally',
  'hold',
  'michigan',
  'watching',
  'crowd',
  'supporter',
  'superspreader'],
 ['covid', 'death', 'war', 'world', 'coviddeaths', 'foreign', 'losing'],
 ['votedonaldtrump',
  'friend',
  'black',
  'family',
  'hillaryclinton',
  'breakdown',
  'stupid'],
 ['trump', 'plan', 'lie', 'tax', 'coup', 'television', 'whitehouse'],
 ['china', 'evidence

## Topics (NMF `n_components=15`)

**pro-vote** 
\['vote',
  'electoral',
  'party',
  'ballot',
  'cast',
  'democracy',
  'attorneygeneral'\],
  
**past-presidencies**
 \['barackobama',
  'cnn',
  'steal',
  'speech',
  'hillaryclinton',
  'pointer',
  'hammer'\],
  
  **vote-blue**
 \['voting', 'wise', 'cover', 'foreign', 'voteblue', 'bidenharris', 'party'\],
 
 **economy**
 \['year', 'job', 'tax', 'michigan', 'history', 'growth', 'two'\],
 
 **racism**
 \['voter', 'poll', 'black', 'believe', 'care', 'life', 'guy'\],
 
 **female-vp**
 \['lady', 'job', 'legend', 'video', 'woman', 'brave', 'change'\],
 
 **anti-trump-supporter**
 \['trumpsupporter',
  'racist',
  'blacklivesmatter',
  'white',
  'block',
  'riot',
  'police'\],
  
**pro-biden**
 \['votejoebiden', 'change', 'woman', 'government', 'fame', 'famous', 'brave'\],
 
**trump-rallies-during-covid**
 \['rally',
  'hold',
  'michigan',
  'watching',
  'crowd',
  'supporter',
  'superspreader'\],
  
 **covid-19**
 \['covid', 'death', 'war', 'world', 'coviddeaths', 'foreign', 'losing'\],
 
**pro-trump**
 \['votedonaldtrump',
  'friend',
  'black',
  'family',
  'hillaryclinton',
  'breakdown',
  'stupid'\],
  
**trump-lies**
 \['trump', 'plan', 'lie', 'tax', 'coup', 'television', 'whitehouse'\],
 
**foreign-affairs**
 \['china', 'evidence', 'breaking', 'payment', 'swindling', 'family', 'money'\],
 
**voter-suppression**
 \['electionday',
  'pretty',
  'question',
  'national',
  'currently',
  'scale',
  'call'\],
  
**campaign-info**
 \['campaign',
  'michigan',
  'release',
  'bidenharris',
  'bus',
  'ballot',
  'opportunity'\]

In [11]:
# nmf_model.components_

In [12]:
# doc_topic