In [1]:
import pickle
import re
import string
import emoji
import pandas as pd
import numpy as np

from textblob import TextBlob
from collections import Counter


from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.decomposition import NMF

import nltk
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer


from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from textblob import TextBlob


In [2]:
# must uncomment & run the first time to DOWNLOAD NLTK data
# I used package identifier 'popular'
# nltk.download()

In [3]:
df = pd.read_pickle("pickle/df_t_raw.pick")
df.shape

(879311, 38)

In [4]:
df['trump'] = df['trump'].astype(int)
df['biden'] = df['biden'].astype(int)

In [5]:
mask = df.biden==0
df[mask]['tweet'].iloc[10]

"@Complex Taliban endorse Trump because he wants to bring home American troops. That's a good thing."

In [6]:
df.sample(10)

Unnamed: 0,id,conversation_id,created_at,date,time,timezone,user_id,username,name,place,...,user_rt_id,user_rt,retweet_id,reply_to,retweet_date,translate,trans_src,trans_dest,biden,trump
3605,1318702356285038592,1318688179621400576,2020-10-20 23:55:16 UTC,2020-10-20,23:55:16,0,282306807,metsupdate,Tootie,,...,,,,"[{'screen_name': 'tonybeef32', 'name': 'Fat Do...",,,,,0,1
11754,1316525052138061825,1316522656796160000,2020-10-14 23:43:26 UTC,2020-10-14,23:43:26,0,1286417985201217542,knowthyselfneo,Thomas Anderson,,...,,,,"[{'screen_name': 'JamesOKeefeIII', 'name': ""Ja...",,,,,0,1
7400,1317249825633636352,1317172943189835779,2020-10-16 23:43:26 UTC,2020-10-16,23:43:26,0,3355913447,angelisthais,Thais de angelis,,...,,,,"[{'screen_name': 'JusticiaDorada', 'name': 'JU...",,,,,1,0
115,1315079601958334464,1315078880982687744,2020-10-10 23:59:44 UTC,2020-10-10,23:59:44,0,1260000492425482247,freespe84645435,Free Speech,,...,,,,"[{'screen_name': 'charliekirk11', 'name': 'Cha...",,,,,0,1
6350,1319063264186294272,1319063264186294272,2020-10-21 23:49:23 UTC,2020-10-21,23:49:23,0,221988097,diabeticjew,Reds Bergamo,,...,,,,[],,,,,1,0
40955,1323407202304974848,1323406337955536896,2020-11-02 23:30:39 UTC,2020-11-02,23:30:39,0,1323174854829334529,dorkinski,Mike Dorkinski,,...,,,,"[{'screen_name': 'KeithOlbermann', 'name': 'Ke...",,,,,1,0
4412,1319064621500780546,1319059524184117250,2020-10-21 23:54:47 UTC,2020-10-21,23:54:47,0,1301242339,lakeeriejake,Jake Peruski,,...,,,,"[{'screen_name': 'Trump_Detester', 'name': 'cα...",,,,,0,1
5362,1315439299081498624,1315439299081498624,2020-10-11 23:49:03 UTC,2020-10-11,23:49:03,0,1287394314297933824,robertlthomps10,Robert L Thompson,,...,,,,[],,,,,0,1
11004,1316525326915305478,1316525326915305478,2020-10-14 23:44:32 UTC,2020-10-14,23:44:32,0,187818698,mtnbiker1971,Joel Hamilton,,...,,,,[],,,,,0,1
12683,1316524731269603328,1316523803451817992,2020-10-14 23:42:10 UTC,2020-10-14,23:42:10,0,972145840319008768,johnpark214,john P.,,...,,,,"[{'screen_name': 'danielascholz7', 'name': 'da...",,,,,0,1


## Preprocessing

We'll start small by simply removing numbers & punctuation and converting each tweet to lowercase. 

In [7]:
# keep only necessary columns
data = df.loc[:,['tweet', 'trump', 'biden']]
data.head(3)

Unnamed: 0,tweet,trump,biden
0,@capriaaf @JoeBiden Plenty of results for #Tru...,1,1
1,@Changinglenses @greger_mary @JRubinBlogger @G...,0,1
2,"Inside a Biden v. Trump marriage: ""you woke me...",1,1


In [8]:
no_links = lambda x: re.sub(r"https?:\/\/\S+", "", x)
no_handles = lambda x: re.sub(r"@[\d\w_]+", "", x)
alphanum = lambda x: re.sub('\w*\d\w*', ' ', x)
punc_lower = lambda x: re.sub('[%s]' % re.escape(string.punctuation), ' ', x.lower())

In [9]:
data['tweet'] = data['tweet'].map(no_handles).map(no_links).map(punc_lower).map(alphanum)

In [10]:
data.head(3)

Unnamed: 0,tweet,trump,biden
0,plenty of results for trumpcrimefamily and ...,1,1
1,he left washington before impeachment,0,1
2,inside a biden v trump marriage you woke me...,1,1


In [11]:
# remove tweets in which trump and biden are both named
mask = (data.trump==1) & (data.biden==1)
data = data[mask]

## spaCy

In [13]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [14]:
data.tweet.iloc[15]

'all of you are the worst sort of faux  journalists    biased to the point of bigoted and totally in the tank for biden   if you had a shred of integrity you would look at the bobulinski allegations   but you don t   a deplorable phd      trump  '

## NLTK Tweet Tokenizer

In [None]:
mask = (df.trump == 0) & (df.biden == 1)
biden_tweets = df[mask]['tweet']

mask = (df.trump == 1) & (df.biden == 0)
trump_tweets = df[mask]['tweet']


In [None]:
trump_tweets.shape

In [None]:
biden_tweets.shape

In [None]:
biden_tweets = biden_tweets[:500]
trump_tweets = trump_tweets[:500]

In [None]:
trump_tweets.head(4)

In [None]:
biden_tweets.head(4)

In [None]:
puncs = [c for c in string.punctuation if c not in ["#", ":"]]
print(puncs)

In [None]:
def tweet_tokenize(tweets):
    """Get all of the tokens in a set of tweets"""
    twt = nltk.tokenize.TweetTokenizer(strip_handles=True, reduce_len=True)
    tokens = [token for tweet in tweets for token in twt.tokenize(tweet)]
    # combine stop words and punctuation
    puncs = [c for c in string.punctuation if c not in ["#", ":"]]
    stop = stopwords.words("english") + puncs + ['”']
    stemmer = PorterStemmer()
    tokens = [ stemmer.stem(token) for tweet in tweets
              for token in twt.tokenize(tweet)
              if token.lower() not in stop]

    
    
    return tokens

In [None]:
%%time
trump_tokens = tweet_tokenize(trump_tweets)

In [None]:
%%time
biden_tokens = tweet_tokenize(biden_tweets)

In [None]:
%%time
trump_count = Counter(trump_tokens)

In [None]:
%%time
biden_count = Counter(biden_tokens)

In [None]:
trump_count.most_common(20)

In [None]:
biden_count.most_common(20)

In [None]:
len(biden_tokens)

In [None]:
trump_tokens[:50]

Nice. Now we need to get these back into string form, and send them through a vectorizer.

In [None]:
tmp = (' ').join(biden_count)
type(tmp)

## NMF

In [None]:
vectorizer = CountVectorizer(stop_words='english', max_df=0.075)

In [None]:
doc_word = vectorizer.fit_transform()

In [None]:
biden_tweets.fileids()