In [1]:
import pickle

import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.decomposition import NMF, LatentDirichletAllocation

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

import nltk
from nltk.corpus import stopwords as nltk_stop
from stop_words import stop_words as custom_stop

In [2]:
# Replace the NLTK corpus reader bound to `nltk_stop` with a plain list of its
# English stop words. The guard keeps this cell safe to re-run: once the name
# holds a list it no longer has a `.words` method, so an unguarded second run
# would raise AttributeError.
if not isinstance(nltk_stop, list):
    nltk_stop = list(nltk_stop.words('english'))

In [3]:
# Notebook display settings: cap how many rows/columns pandas renders and
# lift the column-width limit (None) so long text cells are shown in full.
for _option, _value in (
    ('display.max_rows', 10),
    ('display.max_columns', 40),
    ('display.max_colwidth', None),
):
    pd.set_option(_option, _value)

In [4]:
# Load the tweet DataFrame from its pickle file.
# NOTE(review): unpickling can execute arbitrary code; only load this file
# if it comes from a trusted source.
data = pd.read_pickle(filepath_or_buffer="pickle/n2_tokenized_eff.pick")

In [5]:
# Preview a few random rows. The fixed seed keeps the preview identical across
# Restart & Run All instead of drawing different rows on every execution.
data.sample(3, random_state=42)

Unnamed: 0,id,date,time,user_id,username,hashtags,trump,biden,original,tweet,num_tokens
13580,1323410823516246018,2020-11-02,23:45:02,285108919,booking_it_fast,[],False,True,"Hey @JoeBiden, my kindergartner has some advice for you. https://t.co/XZ4tm4kUp0",hey kindergartner advice,3
43978,1323402012810240005,2020-11-02,23:10:02,23838260,ethannichtern,[],False,True,"@traceydurning They need a story, that's not really true in the aggregate. @FiveThirtyEight has the odds of a Biden victory higher than last week (I only check weekly).",story true odds victory higher week check weekly,8
83015,1323400588189720576,2020-11-02,23:04:22,2259760956,kayla_hinty,[],True,False,the fact that trumpies are planning attacks on bipoc and members of the lgbtq+ community is very very telling of Donald Trump’s presidency and what he represents.,fact attack member community presidency,5


## TF/IDF Vectorizer



In [6]:
# Combined stop-word list handed to the vectorizer: the custom list first,
# followed by NLTK's English stop words.
stop_words = [*custom_stop, *nltk_stop]

In [7]:
# Build the TF-IDF document-term matrix from the `tweet` column.
# Terms appearing in more than 16% of documents (max_df) or in fewer than
# 0.02% of them (min_df) are dropped, as is everything in `stop_words`.
v_tfidf = TfidfVectorizer(
    stop_words=stop_words,
    min_df=0.0002,
    max_df=0.16,
)
doc_word_ti = v_tfidf.fit_transform(data["tweet"])

## NMF

In [8]:
# Factorize the TF-IDF matrix into 12 topics with NMF.
# A fixed random_state pins the seed the estimator uses during initialisation
# and fitting, so the topics and the iteration count below are reproducible
# from run to run.
nmf_model = NMF(n_components=12, init='nndsvda', random_state=42)
doc_topic = nmf_model.fit_transform(doc_word_ti)  # one row per document, one column per topic
print(f"Shape: {doc_topic.shape}")
print(f"Number of iterations used: {nmf_model.n_iter_}")

Shape: (96000, 12)
Number of iterations used: 137


From lecture: The **topic_word** matrix shows us the resulting topics (12 here, matching `n_components` above), and the terms that are associated with each topic. By looking at the words below, we can figure out what the topics are.


In [9]:
# Top words per topic: each row of `components_` holds one topic's weight for
# every vocabulary term, so the highest-weighted columns are the words that
# characterize that topic.
# `get_feature_names()` was removed in scikit-learn 1.2; prefer
# `get_feature_names_out()` and fall back only on releases that predate it.
try:
    words = v_tfidf.get_feature_names_out()
except AttributeError:
    words = v_tfidf.get_feature_names()

n_top_words = 30
# argsort is ascending, so read the last `n_top_words` columns right-to-left
# to get each topic's terms from heaviest to lightest.
t = nmf_model.components_.argsort(axis=1)[:, -1:-(n_top_words + 1):-1]
topic_words = [[words[idx] for idx in row] for row in t]
topic_words

[['vote',
  'electionday',
  'electoral',
  'party',
  'ballot',
  'cast',
  'attorneygeneral',
  'democracy',
  'voter',
  'mail',
  'voice',
  'bidenharris',
  'victory',
  'nation',
  'electoralcollege',
  'voteblue',
  'counting',
  'life',
  'enough',
  'govote',
  'line',
  'remember',
  'future',
  'former',
  'change',
  'black',
  'blue',
  'steal',
  'may',
  'save'],
 ['barackobama',
  'cnn',
  'steal',
  'speech',
  'hillaryclinton',
  'pointer',
  'hammer',
  'sweet',
  'added',
  'bush',
  'crowd',
  'miss',
  'economy',
  'msnbc',
  'running',
  'administration',
  'news',
  'change',
  'fire',
  'speaking',
  'birthdayparty',
  'listening',
  'choose',
  'size',
  'cut',
  'watching',
  'former_president',
  'superspreader',
  'funny',
  'anthonyfauci'],
 ['voting',
  'wise',
  'cover',
  'foreign',
  'voteblue',
  'bidenharris',
  'party',
  'ballot',
  'friend',
  'poll',
  'voter',
  'block',
  'family',
  'caravan',
  'mail',
  'registered',
  'black',
  'line',
  '

In [10]:
# Sweep the number of NMF topics (2..14) and write each run's top words to its
# own CSV (one column per topic, one row per rank) so the runs can be compared
# when choosing the number of topics.
#
# The sweep uses its own variable names on purpose: reusing `nmf_model`,
# `doc_topic`, `words`, `t` and `topic_words` would silently replace the
# model fitted in the NMF section with the last model of this loop.

import csv
import os

sweep_out_dir = "../etc/topic-words"
sweep_top_n = 30  # number of top-weighted words kept per topic

# Create the output folder up front so open() cannot fail on a fresh checkout.
os.makedirs(sweep_out_dir, exist_ok=True)

# The vocabulary does not depend on the topic count, so look it up once.
# `get_feature_names()` was removed in scikit-learn 1.2; fall back to it only
# on releases that predate `get_feature_names_out()`.
try:
    sweep_vocab = v_tfidf.get_feature_names_out()
except AttributeError:
    sweep_vocab = v_tfidf.get_feature_names()

for n_topics in range(2, 15):
    # Fixed seed so re-running the sweep regenerates the same CSVs.
    sweep_model = NMF(n_components=n_topics, init='nndsvda', random_state=42)
    sweep_model.fit(doc_word_ti)

    # argsort is ascending: take the last `sweep_top_n` columns, heaviest first.
    sweep_top_idx = sweep_model.components_.argsort(axis=1)[:, -1:-(sweep_top_n + 1):-1]
    sweep_topic_words = [[sweep_vocab[idx] for idx in row] for row in sweep_top_idx]

    # Explicit UTF-8 so non-ASCII tokens are written the same on every platform.
    with open(f"../etc/topic-words/topic_words_{n_topics:02d}.csv", "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(['topic-' + str(x) for x in range(1, n_topics + 1)])
        # Transpose topics-by-rank into rank-by-topics: one CSV column per topic.
        writer.writerows(zip(*sweep_topic_words))



In [11]:
# nmf_model.components_

In [12]:
# doc_topic

## LDA

In [13]:
# lda_model = LatentDirichletAllocation(n_components=5)
# doc_topic = lda_model.fit_transform(doc_word_ti)
# doc_topic.shape

In [14]:
# words = cv.get_feature_names()
# t = lda_model.components_.argsort(axis=1)[:,-1:-7:-1]
# topic_words = [[words[e] for e in l] for l in t]
# topic_words

## Sentiment Analysis

In [15]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer 

In [16]:
# sid_obj = SentimentIntensityAnalyzer()
# sentiment = []
# for text in data.tweet:
#     sentiment.append(sid_obj.polarity_scores(text))
    
# pd.concat([data,pd.DataFrame(sentiment)], axis=1)