In [1]:
import pickle

import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.decomposition import NMF, LatentDirichletAllocation

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

import nltk
from nltk.corpus import stopwords as nltk_stop
from stop_words import stop_words as custom_stop

In [2]:
# Materialise NLTK's English stop words as a plain list.
# The import cell binds `nltk_stop` to the NLTK stopwords corpus reader; this
# cell rebinds the same name to a list so it can be concatenated with
# `custom_stop` further down. The guard keeps the cell idempotent: once the
# name already holds the list, re-running the cell is a no-op instead of
# failing with AttributeError ('list' object has no attribute 'words').
if not isinstance(nltk_stop, list):
    nltk_stop = list(nltk_stop.words('english'))

In [3]:
# Notebook-wide pandas display settings: show at most 10 rows and 40 columns
# of a frame, and never truncate the text inside a cell (max_colwidth=None).
for option_name, option_value in (
    ('display.max_rows', 10),
    ('display.max_columns', 40),
    ('display.max_colwidth', None),
):
    pd.set_option(option_name, option_value)

In [4]:
# Load the pickled dataset used for the rest of the notebook. Later cells read
# its `tweet` column as the text corpus.
# NOTE: unpickling can execute arbitrary code -- only load files produced by
# this project.
data = pd.read_pickle(filepath_or_buffer="pickle/n2tk_limited.pick")

## TF/IDF Vectorizer



In [5]:
# Combined stop-word list handed to the TF-IDF vectorizer below: the project's
# own `custom_stop` entries followed by NLTK's English stop words.
# NOTE(review): assumes `custom_stop` is a list (it is concatenated with the
# list built from NLTK above) -- confirm in stop_words.py.
stop_words = custom_stop + nltk_stop

In [6]:
# Build the document-term matrix for topic modelling.
# A float max_df / min_df is a proportion of documents: terms appearing in
# more than 40% of tweets, or in fewer than 0.015% of them, are dropped from
# the vocabulary, together with every entry of `stop_words`.
v_tfidf = TfidfVectorizer(
    stop_words=stop_words,
    min_df=0.00015,
    max_df=0.4,
)

# Learn the vocabulary / IDF weights from the tweets and vectorize them in one
# pass; `doc_word_ti` is the resulting TF-IDF matrix (one row per tweet).
doc_word_ti = v_tfidf.fit_transform(raw_documents=data.tweet)

## NMF

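The (now commented-out) code below was used to choose the number of topics for NMF: it fits one model for each candidate from 15 to 25 topics and writes every model's top 30 words per topic to a CSV file so the results can be compared by hand.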

In [7]:
# DISABLED (kept for reference): topic-count sweep.
# For each candidate n_components in 15..25 this fits an NMF model on the
# TF-IDF matrix, takes the 30 highest-weighted words of every topic, and writes
# them to ../etc/topic-words/topic_words_NN.csv (header row topic-1..topic-N,
# one column per topic) for manual comparison.
# NOTE(review): `get_feature_names()` was removed in scikit-learn 1.2; switch
# to `get_feature_names_out()` before re-enabling this cell.

# # write out the different topics to CSV files to find optimal number of topics

# import csv

# for i in range(15, 26):
#     nmf_model = NMF(n_components=i, init='nndsvda', max_iter=400)
#     doc_topic = nmf_model.fit_transform(doc_word_ti)
#     words = v_tfidf.get_feature_names()
#     t = nmf_model.components_.argsort(axis=1)[:,-1:-31:-1]
#     topic_words = [[words[e] for e in l] for l in t]
#     tw_csv = np.array(topic_words).T
    
#     with open(f"../etc/topic-words/topic_words_{i:02d}.csv", "w+", newline="") as f:
#         writer = csv.writer(f)
#         writer.writerow(['topic-' +  str(x) for x in range(1, i+1)])
#         writer.writerows(tw_csv)

It turns out the best result is 15 topics. Each topic represents a logical and relevant concept: there are enough topics to keep similar concepts apart, but not so many that the topics start to overlap and blend into one another.

In [8]:
# DISABLED (kept for reference): fit of the final 15-topic NMF model.
# When enabled, this fits NMF on `doc_word_ti`, prints the doc-topic shape and
# the iteration count, and writes pickle/nmf_model.pick and
# pickle/nmf_doc_topics.pick -- the same two files the next cell loads.
# Re-enable and re-run it whenever the TF-IDF matrix changes; otherwise the
# cached pickles no longer correspond to the vectorizer fitted above.

# nmf_model = NMF(n_components=15, init='nndsvda')
# doc_topic = nmf_model.fit_transform(doc_word_ti)
# print(f"doc_topic shape: {doc_topic.shape}")
# print(f"Number of iterations used: {nmf_model.n_iter_}")
# with open("pickle/nmf_model.pick", 'wb') as f:
#     pickle.dump( nmf_model, f)

# with open("pickle/nmf_doc_topics.pick", 'wb') as f:
#     pickle.dump( doc_topic, f)
    
# pd.DataFrame(doc_topic)

In [9]:
# Restore the cached NMF artifacts instead of refitting: the fitted model and
# the document-topic matrix it produced.
# NOTE: pickle executes arbitrary code on load -- only open files this project
# wrote itself.
with open("pickle/nmf_model.pick", mode='rb') as model_file:
    nmf_model = pickle.load(model_file)

with open("pickle/nmf_doc_topics.pick", mode='rb') as doc_topic_file:
    doc_topic = pickle.load(doc_topic_file)

In [10]:
# DISABLED (kept for reference): builds `topic_words`, the 20 highest-weighted
# vocabulary terms of every NMF topic (strongest first), and writes the list to
# pickle/topic_words.pick.
# NOTE(review): `get_feature_names()` was removed in scikit-learn 1.2; switch
# to `get_feature_names_out()` before re-enabling this cell.

# words = v_tfidf.get_feature_names()
# t = nmf_model.components_.argsort(axis=1)[:,-1:-21:-1]
# topic_words = [[words[e] for e in l] for l in t]

# with open("pickle/topic_words.pick", 'wb') as f:
#     pickle.dump( topic_words, f)

# topic_words

[['vote',
  'votejoebiden',
  'ballot',
  'cast',
  'votedonaldtrump',
  'party',
  'democracy',
  'friend',
  'voice',
  'mail',
  'voteblue',
  'govote',
  'enough',
  'counting',
  'victory',
  'line',
  'change',
  'nation',
  'electoralcollege',
  'future'],
 ['kamalaharris',
  'wise',
  'cover',
  'foreign',
  'farm',
  'social',
  'troll',
  'boost',
  'vicepresident',
  'elect',
  'legend',
  'campaign',
  'change',
  'votejoebiden',
  'blue',
  'candidate',
  'bidenharris',
  'ticket',
  'together',
  'wish'],
 ['trump',
  'lie',
  'believe',
  'tax',
  'life',
  'whitehouse',
  'hate',
  'god',
  'fact',
  'help',
  'world',
  'end',
  'money',
  'party',
  'away',
  'truth',
  'oh',
  'tweet',
  'change',
  'child'],
 ['barackobama',
  'steal',
  'cnn',
  'pointer',
  'speech',
  'sweet',
  'hammer',
  'crowd',
  'msnbc',
  'bush',
  'economy',
  'miss',
  'hillaryclinton',
  'fire',
  'speaking',
  'administration',
  'listening',
  'size',
  'cut',
  'change'],
 ['voting',

In [13]:
# Reload the cached per-topic word lists rather than recomputing them from the
# model (the code that writes this file is commented out earlier in the notebook).
# NOTE: pickle executes arbitrary code on load -- only open files this project
# wrote itself.
with open("pickle/topic_words.pick", mode='rb') as topic_words_file:
    topic_words = pickle.load(topic_words_file)

## Topics (NMF `n_components=15`)


**hillary clinton mask photo**
**vote-republican**
\['vote',
  'votedonaldtrump',
  'ballot',
  'cast',
  'democracy',
  'party',
  'friend',
  'life',
  'voice',
  'care',
  'voteblue',
  'mail',
  'counting',
  'govote'\]
  
**anti-hillary clinton**
 \['wa',
  'hillaryclinton',
  'talking',
  'born',
  'since',
  'bus',
  'taken',
  'video',
  'truck',
  'running',
  'sign',
  'watching',
  'heaven',
  'fault'\]
  
  
  
 \['tax',
  'plan',
  'pay',
  'cut',
  'virus',
  'job',
  'money',
  'riot',
  'economy',
  'covid',
  'stockmarket',
  'plummeted',
  'increase',
  'business'\],
 \['death',
  'war',
  'covid',
  'world',
  'coviddeaths',
  'foreign',
  'losing',
  'exceed',
  'badly',
  'civil',
  'case',
  'trumprally',
  'rally',
  'study'\],
 \['barackobama',
  'cnn',
  'speech',
  'crowd',
  'msnbc',
  'economy',
  'bush',
  'administration',
  'hillaryclinton',
  'miss',
  'size',
  'speaking',
  'cut',
  'campaign'\],
 \['year',
  'job',
  'history',
  'two',
  'debt',
  'least',
  'worst',
  'michigan',
  'unemployment',
  'growth',
  'term',
  'hell',
  'left',
  'yet'\],
 \['voting',
  'friend',
  'party',
  'ballot',
  'bidenharris',
  'line',
  'mail',
  'govote',
  'criminal',
  'block',
  'believe',
  'caravan',
  'choice',
  'candidate'\],
 \['china',
  'evidence',
  'breaking',
  'payment',
  'swindling',
  'communist',
  'russia',
  'money',
  'xi',
  'mob',
  'job',
  'hunterbiden',
  'corrupt',
  'virus'\],
 \['trumpsupporter',
  'black',
  'racist',
  'white',
  'hate',
  'care',
  'woman',
  'blacklivesmatter',
  'life',
  'police',
  'video',
  'gun',
  'friend',
  'stay'\],
 \['voter',
  'poll',
  'campaign',
  'ballot',
  'close',
  'suppression',
  'fraud',
  'lead',
  'winning',
  'believe',
  'number',
  'call',
  'michigan',
  'victory'\],
 \['votejoebiden',
  'change',
  'woman',
  'lady',
  'government',
  'famous',
  'fame',
  'help',
  'congratulation',
  'brave',
  'red',
  'wi',
  'sending',
  'born'\],
 \['trump',
  'rally',
  'covid',
  'whitehouse',
  'call',
  'coup',
  'television',
  'supporter',
  'lie',
  'superspreader',
  'anthonyfauci',
  'pandemic',
  'leader',
  'word'\],
 \['mask',
  'wearing',
  'photo',
  'report',
  'plane',
  'covid',
  'wear',
  'without',
  'mandate',
  'pandemic',
  'anthonyfauci',
  'dead',
  'taken',
  'votedonaldtrump'\],
 \['electionday',
  'lie',
  'question',
  'national',
  'pretty',
  'scale',
  'currently',
  'fact',
  'nation',
  'corruption',
  'enrichment',
  'illicit',
  'truth',
  'whitehouse'\]\]

In [11]:
# DISABLED: ad-hoc look at the fitted model's raw `components_` array.
# nmf_model.components_

In [12]:
# DISABLED: ad-hoc look at the document-topic matrix loaded from the pickle.
# doc_topic