In [1]:
import pickle

import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.decomposition import NMF, LatentDirichletAllocation

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

import nltk
from nltk.corpus import stopwords as nltk_stop
from stop_words import stop_words as custom_stop

In [2]:
# Materialise NLTK's English stop-word list as a plain Python list.
# The corpus reader is reached through `nltk.corpus.stopwords` rather than the
# imported `nltk_stop` alias: this cell rebinds `nltk_stop` to the list, so
# going through the alias would raise AttributeError if the cell were re-run.
nltk_stop = list(nltk.corpus.stopwords.words('english'))

In [3]:
# Notebook-wide pandas display limits: show up to 100 rows and 40 columns,
# and never truncate the text inside a cell (max_colwidth=None).
for option_name, option_value in (
    ('display.max_rows', 100),
    ('display.max_columns', 40),
    ('display.max_colwidth', None),
):
    pd.set_option(option_name, option_value)

In [4]:
# Load the pickled DataFrame of tweets.
# NOTE: unpickling can execute code embedded in the file, so only load
# pickles that come from a trusted source.
data = pd.read_pickle("pickle/n2tk_limited.pick")

# Only the last expression of a cell is displayed, so the shape is printed
# explicitly; as a bare mid-cell expression it was evaluated and discarded.
print(f"data shape: {data.shape}")

# Column dtypes (at most the first 50 columns).
data.dtypes.head(50)

id                    object
date          datetime64[ns]
time                  object
user_id               object
username              object
hashtags              object
trump                   bool
biden                   bool
original              object
tweet                 object
num_tokens             int64
dtype: object

In [5]:
# Drop rows whose `tweet` text repeats an earlier row (the first occurrence is
# kept), write the de-duplicated frame back to the same pickle path, and
# display the resulting shape.
data = data.drop_duplicates(subset=['tweet'], keep='first')
data.to_pickle("pickle/n2tk_limited.pick")
data.shape

(363310, 11)

## TF/IDF Vectorizer



In [6]:
# Combined stop-word list: the custom stop words first, followed by the
# NLTK stop words.
stop_words = [*custom_stop, *nltk_stop]

In [7]:
# Build the TF-IDF document-term matrix from the tweet text.
# Terms with a document frequency above 0.4 or below 0.00015 (as a fraction of
# all tweets) are dropped, along with the combined stop words.
v_tfidf = TfidfVectorizer(
    stop_words=stop_words,
    min_df=0.00015,
    max_df=0.4,
)
doc_word_ti = v_tfidf.fit_transform(data["tweet"])

## NMF

Using the code below, I was able to identify the proper number of topics to use with NMF. 

In [8]:
# Write the top words of each topic to CSV files, one file per candidate topic count, to find the optimal number of topics.

import csv

# for i in range(15, 26):
#     nmf_model = NMF(n_components=i, init='nndsvda', max_iter=400)
#     doc_topic = nmf_model.fit_transform(doc_word_ti)
#     words = v_tfidf.get_feature_names()
#     t = nmf_model.components_.argsort(axis=1)[:,-1:-31:-1]
#     topic_words = [[words[e] for e in l] for l in t]
#     tw_csv = np.array(topic_words).T
    
#     with open(f"../etc/topic-words/topic_words_{i:02d}.csv", "w+", newline="") as f:
#         writer = csv.writer(f)
#         writer.writerow(['topic-' +  str(x) for x in range(1, i+1)])
#         writer.writerows(tw_csv)

It turns out that the best number of topics is 13. Each topic represents a logical and relevant concept: there are enough topics to keep similar concepts apart, but not so many that the topics start to blend together.

In [14]:
# Factorise the TF-IDF matrix into 13 topics with NMF.
# random_state is pinned so that any randomness used while initialising or
# fitting the model is repeatable, keeping the factorisation (and the topic
# order stored in the pickles below) stable across re-runs of this cell.
nmf_model = NMF(n_components=13, init='nndsvda', random_state=42)
doc_topic = nmf_model.fit_transform(doc_word_ti)
print(f"doc_topic shape: {doc_topic.shape}")
print(f"Number of iterations used: {nmf_model.n_iter_}")

# Persist the fitted model and the document-topic matrix for later reuse.
with open("pickle/nmf_model.pick", 'wb') as f:
    pickle.dump(nmf_model, f)

with open("pickle/nmf_doc_topics.pick", 'wb') as f:
    pickle.dump(doc_topic, f)

# Display the document-topic weights (one row per tweet, one column per topic).
pd.DataFrame(doc_topic)

doc_topic shape: (363310, 13)
Number of iterations used: 64


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0.000836,0.000300,0.000000,0.000892,0.000000,0.000066,0.000000,0.000729,0.000038,0.003112,0.003755,0.000000,0.000308
1,0.003752,0.001583,0.000750,0.000000,0.000577,0.000377,0.000000,0.006958,0.001502,0.000000,0.000000,0.000266,0.001017
2,0.001226,0.001190,0.000000,0.000064,0.047775,0.000000,0.000047,0.006574,0.000035,0.000000,0.000134,0.000000,0.000000
3,0.001565,0.001057,0.002666,0.049943,0.000000,0.003429,0.000857,0.000000,0.000000,0.004633,0.002868,0.000472,0.000000
4,0.000000,0.028202,0.000000,0.000000,0.000000,0.000222,0.000520,0.000000,0.000000,0.000000,0.020264,0.038826,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
363305,0.004147,0.001326,0.000000,0.000000,0.002541,0.000889,0.000000,0.010283,0.001982,0.001314,0.005024,0.000000,0.002434
363306,0.002246,0.000000,0.000000,0.000141,0.000198,0.000482,0.000055,0.000000,0.055385,0.029034,0.000036,0.000000,0.000688
363307,0.002394,0.000766,0.007797,0.001437,0.000000,0.002167,0.001311,0.000000,0.000000,0.002420,0.001564,0.002703,0.006103
363308,0.011595,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [15]:
# with open("pickle/nmf_model.pick", 'rb') as f:
#     nmf_model = pickle.load( f)

# with open("pickle/nmf_doc_topics.pick", 'rb') as f:
#     doc_topic = pickle.load( f)

In [16]:
# Vocabulary terms, indexed by TF-IDF column.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so use get_feature_names_out() when it exists and fall back on older
# versions. list(...) keeps `words` a plain list, as the old method returned.
if hasattr(v_tfidf, "get_feature_names_out"):
    words = list(v_tfidf.get_feature_names_out())
else:
    words = v_tfidf.get_feature_names()

# For each topic (row of components_), the column indices of its 20
# highest-weighted terms, highest weight first.
t = nmf_model.components_.argsort(axis=1)[:, -1:-21:-1]
topic_words = [[words[term_idx] for term_idx in topic_row] for topic_row in t]

# Persist the per-topic word lists for later reuse.
with open("pickle/topic_words.pick", 'wb') as f:
    pickle.dump(topic_words, f)

topic_words

[['hunterbiden',
  'story',
  'twitter',
  'news',
  'real',
  'cnn',
  'life',
  'trumpsupporter',
  'help',
  'world',
  'foxnews',
  'fact',
  'since',
  'call',
  'maybe',
  'video',
  'post',
  'oh',
  'question',
  'getting'],
 ['vote',
  'votejoebiden',
  'ballot',
  'votedonaldtrump',
  'party',
  'voter',
  'candidate',
  'change',
  'poll',
  'cast',
  'electionday',
  'democracy',
  'enough',
  'bidenharris',
  'save',
  'line',
  'mail',
  'life',
  'help',
  'voteblue'],
 ['covid',
  'positive',
  'test',
  'death',
  'whitehouse',
  'pandemic',
  'hospital',
  'tested',
  'virus',
  'negative',
  'response',
  'plan',
  'doctor',
  'die',
  'diagnosis',
  'infected',
  'dead',
  'treatment',
  'economy',
  'getting'],
 ['kamalaharris',
  'mikepence',
  'votejoebiden',
  'nancypelosi',
  'ballot',
  'plan',
  'bidenharris',
  'ticket',
  'vicepresident',
  'amendment',
  'senator',
  'woman',
  'socialist',
  'berniesanders',
  'blue',
  'whitehouse',
  'sign',
  'future',

In [17]:
# Export the topic words as a CSV table: one row per topic, one column per
# word position within the topic's list.
tmp = pd.DataFrame(data=topic_words)
tmp.to_csv(path_or_buf="topic_words.csv")

In [18]:
# Read the saved per-topic word lists back from disk and display them.
with open("pickle/topic_words.pick", mode='rb') as f:
    topic_words = pickle.load(f)

topic_words

[['hunterbiden',
  'story',
  'twitter',
  'news',
  'real',
  'cnn',
  'life',
  'trumpsupporter',
  'help',
  'world',
  'foxnews',
  'fact',
  'since',
  'call',
  'maybe',
  'video',
  'post',
  'oh',
  'question',
  'getting'],
 ['vote',
  'votejoebiden',
  'ballot',
  'votedonaldtrump',
  'party',
  'voter',
  'candidate',
  'change',
  'poll',
  'cast',
  'electionday',
  'democracy',
  'enough',
  'bidenharris',
  'save',
  'line',
  'mail',
  'life',
  'help',
  'voteblue'],
 ['covid',
  'positive',
  'test',
  'death',
  'whitehouse',
  'pandemic',
  'hospital',
  'tested',
  'virus',
  'negative',
  'response',
  'plan',
  'doctor',
  'die',
  'diagnosis',
  'infected',
  'dead',
  'treatment',
  'economy',
  'getting'],
 ['kamalaharris',
  'mikepence',
  'votejoebiden',
  'nancypelosi',
  'ballot',
  'plan',
  'bidenharris',
  'ticket',
  'vicepresident',
  'amendment',
  'senator',
  'woman',
  'socialist',
  'berniesanders',
  'blue',
  'whitehouse',
  'sign',
  'future',