In [1]:
import pandas as pd
import numpy as np
import pickle
import re
import string
import spacy
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import nltk
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from collections import Counter


In [2]:
# must uncomment & run the first time to DOWNLOAD NLTK data
# I used package identifier 'popular'
# nltk.download()

In [3]:
# Load the pickled DataFrame from a path relative to the notebook's working directory.
# NOTE(review): unpickling can execute arbitrary code — only load pickle files from a trusted source.
df = pd.read_pickle("pickle/df_t_raw.pick")
df.shape

(879311, 38)

In [4]:
# Preview the first three rows of the loaded frame.
df.head(3)

Unnamed: 0,id,conversation_id,created_at,date,time,timezone,user_id,username,name,place,...,user_rt_id,user_rt,retweet_id,reply_to,retweet_date,translate,trans_src,trans_dest,biden,trump
0,1321602647368019968,1321554078376226818,2020-10-28 23:59:59 UTC,2020-10-28,23:59:59,0,828075570219085824,jessferg2253,Jessa,,...,,,,"[{'screen_name': 'capriaaf', 'name': 'Capria',...",,,,,True,True
1,1321602647129051136,1321493697565880320,2020-10-28 23:59:59 UTC,2020-10-28,23:59:59,0,255255428,margculbster,margaret c,,...,,,,"[{'screen_name': 'Changinglenses', 'name': 'La...",,,,,True,False
2,1321602646973755395,1321602646973755395,2020-10-28 23:59:59 UTC,2020-10-28,23:59:59,0,15863393,katehinds,Kate Hinds,,...,,,,[],,,,,True,True


In [5]:
# Convert the two candidate-mention flag columns to integers.
for flag in ('trump', 'biden'):
    df[flag] = df[flag].astype(int)

## Preprocessing

We'll start small: strip @handles and links, replace punctuation and any token containing a digit with spaces, and convert each tweet to lowercase.

In [6]:
# Work with just the tweet text and the two candidate flags.
columns_to_keep = ['tweet', 'trump', 'biden']
data = df.loc[:, columns_to_keep]
data.head(3)

Unnamed: 0,tweet,trump,biden
0,@capriaaf @JoeBiden Plenty of results for #Tru...,1,1
1,@Changinglenses @greger_mary @JRubinBlogger @G...,0,1
2,"Inside a Biden v. Trump marriage: ""you woke me...",1,1


In [7]:
# Pre-compiled cleaning patterns. Raw strings keep ``\w`` / ``\d`` / ``\S`` from
# being parsed as (invalid) string escapes, which newer Python versions warn about.
_LINK_RE = re.compile(r"https?://\S+")
_HANDLE_RE = re.compile(r"@\w+")
_HAS_DIGIT_RE = re.compile(r"\w*\d\w*")
_PUNC_RE = re.compile("[%s]" % re.escape(string.punctuation))


def no_links(x):
    """Return ``x`` with every ``http://`` or ``https://`` URL removed."""
    return _LINK_RE.sub("", x)


def no_handles(x):
    """Return ``x`` with every ``@handle`` (``@`` followed by word characters) removed."""
    return _HANDLE_RE.sub("", x)


def alphanum(x):
    """Return ``x`` with every word-character run containing a digit replaced by a space."""
    return _HAS_DIGIT_RE.sub(" ", x)


def punc_lower(x):
    """Return ``x`` lowercased, with each ASCII punctuation character replaced by a space."""
    return _PUNC_RE.sub(" ", x.lower())

In [8]:
# Run the cleaning steps in order: handles, links, punctuation/lowercase, digit tokens.
cleaned = data['tweet']
for step in (no_handles, no_links, punc_lower, alphanum):
    cleaned = cleaned.map(step)
data['tweet'] = cleaned

In [9]:
# Inspect the first three tweets after cleaning.
data.head(3)

Unnamed: 0,tweet,trump,biden
0,plenty of results for trumpcrimefamily and ...,1,1
1,he left washington before impeachment,0,1
2,inside a biden v trump marriage you woke me...,1,1


In [10]:
# 80% of the row count (presumably the size of an intended training split — TODO confirm).
len(data) * 0.8

703448.8

In [11]:
# Count vectorizer configured with scikit-learn's built-in English stop-word list.
cv = CountVectorizer(stop_words='english')
# Trial sample: the cleaned text of the first 50 rows.
X_train = data['tweet'].iloc[:50]

In [13]:
# Fit the vectorizer on the sample and transform it into a document-term count matrix.
X_train_cv = cv.fit_transform(X_train)

In [14]:
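# Uncomment to inspect the counts as a DataFrame
# (scikit-learn >= 1.0; older releases use cv.get_feature_names() instead):
# pd.DataFrame(X_train_cv.toarray(), columns=cv.get_feature_names_out()).head()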

## NLTK Tweet Tokenizer

In [35]:
# Tweets flagged for Biden only, capped at the first 50,000.
mask = df.trump.eq(0) & df.biden.eq(1)
biden_tweets = df.loc[mask, 'tweet'].head(50000)

# Tweets flagged for Trump only, capped at the first 50,000.
mask = df.trump.eq(1) & df.biden.eq(0)
trump_tweets = df.loc[mask, 'tweet'].head(50000)

In [36]:
# Confirm how many Trump-only tweets were kept.
trump_tweets.shape

(50000,)

In [37]:
# Peek at the first four Biden-only tweets (raw text from df, not the cleaned copy in data).
biden_tweets.head(4)

1    @Changinglenses @greger_mary @JRubinBlogger @G...
6        Loving all these Republicans endorsing Biden.
7    WATCH: 'Ballot chaser' boasts she got $55,000 ...
8                   @JoeBiden  https://t.co/qTxhMODuIH
Name: tweet, dtype: object

In [67]:
# ASCII punctuation characters, leaving out "#" and ":".
puncs = [symbol for symbol in string.punctuation if symbol not in {"#", ":"}]
print(puncs)

None


In [77]:
def tweet_tokenize(tweets, extra_stop=("”",)):
    """Tokenize, filter and stem every tweet in ``tweets``.

    Each tweet is split with NLTK's ``TweetTokenizer`` (created with
    ``strip_handles=True`` and ``reduce_len=True``). A token is dropped when
    its lowercase form is an NLTK English stop word, an ASCII punctuation
    character other than ``#`` or ``:``, or a member of ``extra_stop``.
    The remaining tokens are Porter-stemmed.

    Parameters
    ----------
    tweets : iterable of str
        Tweet texts. The input is traversed exactly once, so one-shot
        iterables such as generators are supported.
    extra_stop : iterable of str, optional
        Additional tokens to discard. Defaults to the closing curly quote,
        which is the only extra token the original implementation removed.

    Returns
    -------
    list of str
        A flat list of stemmed tokens, in tweet order and then token order.
    """
    twt = nltk.tokenize.TweetTokenizer(strip_handles=True, reduce_len=True)
    puncs = [c for c in string.punctuation if c not in ("#", ":")]
    # A set gives constant-time membership checks; the test runs once per token.
    stop = set(stopwords.words("english")) | set(puncs) | set(extra_stop)
    stemmer = PorterStemmer()
    # Single pass: the original tokenized every tweet twice and discarded the first result.
    return [
        stemmer.stem(token)
        for tweet in tweets
        for token in twt.tokenize(tweet)
        if token.lower() not in stop
    ]

In [78]:
%%time
# Tokenize, filter and stem the Trump-only tweets (timed).
trump_tokens = tweet_tokenize(trump_tweets)

CPU times: user 19.5 s, sys: 55.8 ms, total: 19.6 s
Wall time: 19.6 s


In [79]:
%%time
# Tokenize, filter and stem the Biden-only tweets (timed).
biden_tokens = tweet_tokenize(biden_tweets)

CPU times: user 16.1 s, sys: 27.9 ms, total: 16.1 s
Wall time: 16.1 s


In [80]:
%%time
# Tally how often each stemmed token occurs in the Trump-only tweets.
trump_count = Counter(trump_tokens)

CPU times: user 87.2 ms, sys: 4.01 ms, total: 91.2 ms
Wall time: 90.8 ms


In [81]:
%%time
# Tally how often each stemmed token occurs in the Biden-only tweets.
biden_count = Counter(biden_tokens)

CPU times: user 70.5 ms, sys: 35 µs, total: 70.5 ms
Wall time: 70.3 ms


In [82]:
# The 20 most frequent stemmed tokens in the Trump-only tweets.
trump_count.most_common(20)

[('trump', 51607),
 ('’', 16639),
 ('...', 5126),
 (':', 4563),
 ('vote', 4250),
 ('like', 4233),
 ('presid', 4088),
 ('peopl', 3983),
 ('get', 3351),
 ('support', 3288),
 ('say', 3110),
 ('donald', 2850),
 ("trump'", 2749),
 ('go', 2746),
 ('know', 2624),
 ('“', 2545),
 ('one', 2474),
 ('think', 2439),
 ('would', 2426),
 ('covid', 2310)]

In [83]:
# The 20 most frequent stemmed tokens in the Biden-only tweets.
biden_count.most_common(20)

[('biden', 29246),
 ('’', 13829),
 ('joe', 7578),
 ('vote', 7189),
 ('...', 4689),
 (':', 3231),
 ('like', 3072),
 ('get', 2956),
 ('peopl', 2898),
 ('hunter', 2646),
 ('go', 2637),
 ('say', 2541),
 ('would', 2442),
 ('win', 2310),
 ('know', 2298),
 ('presid', 2265),
 ('one', 2191),
 ('harri', 2078),
 ('elect', 2058),
 ('think', 2025)]

In [84]:
# Total number of tokens kept from the Biden-only tweets.
len(biden_tokens)

567732

In [85]:
# Spot-check the first 50 stemmed tokens from the Trump-only tweets.
trump_tokens[:50]

['surpris',
 'see',
 'travel',
 'way',
 'america',
 'sweden',
 'meet',
 'presid',
 'mock',
 '’',
 'want',
 'reveng',
 'trump',
 'disrespect',
 'trump',
 'littl',
 'lacki',
 '🚨',
 'trump',
 'malta',
 'mason',
 'serv',
 'chabad',
 'synagogu',
 'satan',
 ');',
 'thu',
 'recommend',
 'jump',
 'jesu',
 'train',
 'brother',
 'lot',
 'inform',
 '...',
 'lot',
 'could',
 'chang',
 'time',
 'period',
 '—',
 'adjust',
 'tax',
 'code',
 'increas',
 'revenu',
 'spend',
 'cut',
 'would',
 'alter']