In [1]:
import pickle
import re
import string
import pandas as pd
import numpy as np

from textblob import TextBlob
from collections import Counter

from langdetect import detect

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.decomposition import NMF, LatentDirichletAllocation

import nltk
from nltk.tokenize import TweetTokenizer, word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer


In [2]:
# notebook display settings: short tables, up to 40 columns, never truncate cell text
pd.options.display.max_rows = 10
pd.options.display.max_columns = 40
pd.options.display.max_colwidth = None

In [3]:
# must uncomment & run the first time to DOWNLOAD NLTK data
# I used package identifier 'popular'
# nltk.download()

In [4]:
# Load the pickled tweets DataFrame.
# NOTE: unpickling can execute arbitrary code -- only read pickle files from a trusted source.
data = pd.read_pickle("pickle/balanced_nov2_tweets.pick")
data.shape

(96000, 9)

In [5]:
# move the raw text from 'tweet' into a new 'original' column (appended last),
# leaving the name 'tweet' free for the cleaned text
data = data.assign(original=data['tweet']).drop(columns='tweet')
data.head(3)

Unnamed: 0,id,date,time,user_id,username,hashtags,trump,biden,original
181142,1323379284434669568,2020-11-02,21:39:43,2820503362,artistacriseida,[],False,True,"All these articles showing that Biden is in the lead.... IGNORE THAT AND STILL GO VOTE. All of these maps showing information that may or may not be correct won’t matter on Election Day. Hillary was also in the lead last election, just do your part. ⁽ᶠᵘᶜᵏ ᵗʳᵘᵐᵖ⁾"
0,1323414585995526144,2020-11-02,23:59:59,1312487180258820096,annapieters17,[],False,True,@FoxNews Lady Gaga’s a nobody. Can’t figure out her own life and can’t even see nobody can help Biden. He’s out of the game from the day he gets in the game.
4,1323414585232293888,2020-11-02,23:59:59,2335763630,kylechwatt,[],False,True,"@The_Grupp “It is purely a fortuity that this isn’t one of the great mass casualty events in American history,” Ron Klain, who was Biden’s chief of staff at the time, said of H1N1 in 2019.” https://t.co/Umi317supK"


## Pre-Processing Pipeline

Now it's time to tokenize our tweets. Here are our pre-processing steps:
* Remove URLs
* Remove Twitter handles
* Remove numbers
* Convert to lowercase
* Remove punctuation
* Collapse runs of three or more repeated letters so spell check will work ('aaaaand' -> 'and')
* Remove non-English words
* Remove stop words

Since we're working with so many different words, I've chosen to use **lemmatization** instead of stemming for two reasons:
1. Lemmatization accurately reduces words to true meaning
2. Increased word reduction (handles synonyms)

In [6]:
# nltk.download() # must run first time (download 'popular')

In [7]:
# custom word dictionaries
from more_words import more_words as custom_words
from stop_words import stop_words as custom_stop_words
from bigrams import bigrams

import time
import nltk
from nltk.corpus import words, stopwords

def clean_tweet(tweet):
    """
    Pre-processing pipeline: normalizes the raw text of a single tweet.

    Steps, in order:
        1. lowercase
        2. rewrite "u.s. " as "usa " (before punctuation is stripped)
        3. delete URLs
        4. replace any word containing a digit with a space
        5. replace runs of 2-6 dots with a space
        6. strip ASCII punctuation (string.punctuation)
        7. collapse a letter repeated 3-6 times in a row to a single letter
        8. squeeze whitespace to single spaces
        9. apply every (pattern, replacement) pair in `bigrams`

    Parameters:
        - tweet (str, required)

    Returns:
        the cleaned tweet (str)
    """

    tweet = tweet.lower()
    # keep the trailing space in the replacement; without it "u.s. president"
    # would be glued into the single non-word "usapresident"
    tweet = re.sub(r"u\.s\. ", "usa ", tweet)
    tweet = re.sub(r"https?:\/\/\S+", "", tweet)
    tweet = re.sub(r'\w*\d\w*', ' ', tweet)
    tweet = re.sub(r'\.{2,6}', ' ', tweet)
    # punctuation is deleted (not replaced by a space), so "don't" -> "dont"
    tweet = tweet.translate(str.maketrans('', '', string.punctuation))
    tweet = re.sub(r"([a-z])\1{2,5}", r'\1', tweet)
    tweet = ' '.join(tweet.split())

    # `old` is used as a regular expression pattern
    for old, new in bigrams:
        tweet = re.sub(old, new, tweet) # ex: ('white house', 'whitehouse')

    return tweet

def tokenize(tweet, all_words, stop_words):
    """
    Cleans one tweet and reduces it to its accepted, lemmatized tokens.

    Parameters:
        - tweet (str, required): raw tweet text
        - all_words (set, required): vocabulary of accepted words
        - stop_words (set, required): words to discard

    Returns:
        the surviving tokens joined by single spaces (str)

    Note:
        - stop words are checked against the token as written, *before*
          lemmatization
        - the vocabulary is checked against the lemma, so anything not in
          all_words (including bigrams) is dropped
    """

    cleaned = clean_tweet(tweet)

    tokenizer = TweetTokenizer()
    lemmatizer = WordNetLemmatizer()

    kept = []
    for raw_token in tokenizer.tokenize(cleaned):
        if raw_token in stop_words:
            continue
        lemma = lemmatizer.lemmatize(raw_token)
        if lemma in all_words:
            kept.append(lemma)

    return ' '.join(kept)

def clean_and_tokenize(original_tweets):
    """
    Runs the full cleaning / tokenizing / lemmatizing pipeline (including the
    customized bigrams) over a collection of tweets.

    The vocabulary is the NLTK 'en' word list plus the custom words plus every
    bigram replacement; the stop list is the NLTK English stop words plus the
    custom stop words. Both sizes are printed before processing starts.

    Parameters:
        original_tweets (required, pd.Series)

    Returns:
        cleaned, tokenized tweets, one space-joined string per tweet (list)
    """
    raw_texts = list(original_tweets)

    vocabulary = set(words.words('en'))
    vocabulary.update(custom_words)
    vocabulary.update(replacement for _, replacement in bigrams)

    ignored = set(stopwords.words('english'))
    ignored.update(custom_stop_words)

    print("num words: ", len(vocabulary))
    print("num stop words: ", len(ignored))

    results = []
    for text in raw_texts:
        results.append(tokenize(text, vocabulary, ignored))

    return results

In [8]:
%%time

# 'tweet' holds the cleaned, space-joined tokens; 'original' keeps the raw text
data['tweet'] = clean_and_tokenize(data['original'])
data.shape

num words:  236140
num stop words:  426
CPU times: user 40.7 s, sys: 126 ms, total: 40.9 s
Wall time: 40.9 s


(96000, 10)

In [9]:
# Count tokens by splitting on whitespace. Counting spaces and adding one would
# report 1 token for a tweet whose tokens were all filtered out (empty string);
# splitting reports 0 for that case and the same value everywhere else.
data['num_tokens'] = data['tweet'].str.split().str.len()

# keep only tweets with at least 5 tokens left after cleaning
mask = data['num_tokens'] >= 5
data_minlen = data[mask]
data_minlen.head()

Unnamed: 0,id,date,time,user_id,username,hashtags,trump,biden,original,tweet,num_tokens
181142,1323379284434669568,2020-11-02,21:39:43,2820503362,artistacriseida,[],False,True,"All these articles showing that Biden is in the lead.... IGNORE THAT AND STILL GO VOTE. All of these maps showing information that may or may not be correct won’t matter on Election Day. Hillary was also in the lead last election, just do your part. ⁽ᶠᵘᶜᵏ ᵗʳᵘᵐᵖ⁾",article showing joebiden lead ignore govote map showing information correct electionday hillaryclinton lead lastelection doyourpart,15
0,1323414585995526144,2020-11-02,23:59:59,1312487180258820096,annapieters17,[],False,True,@FoxNews Lady Gaga’s a nobody. Can’t figure out her own life and can’t even see nobody can help Biden. He’s out of the game from the day he gets in the game.,foxnews ladygaga nobody figure life nobody help joebiden game get game,11
4,1323414585232293888,2020-11-02,23:59:59,2335763630,kylechwatt,[],False,True,"@The_Grupp “It is purely a fortuity that this isn’t one of the great mass casualty events in American history,” Ron Klain, who was Biden’s chief of staff at the time, said of H1N1 in 2019.” https://t.co/Umi317supK",purely fortuity mass casualty event joebiden chief staff,8
5,1323414585097908224,2020-11-02,23:59:59,3282973915,medianpolitics,[],False,True,@MSNBC What? When words just don't match reality. BLM .com Antifa . com link to Biden. @ Billion worth of property damage. Innocent dinners being shouted at and forced to leave. Mobs roaming neighborhoods demanding you give them your house. This happened. Where were Dem's? https://t.co/TOMXgURZKb,msnbc word match reality blacklivesmatter link joebiden billion worth property damage innocent dinner forced leave mob roaming neighborhood demanding house,20
8,1323414583483256834,2020-11-02,23:59:59,100625142,michaelmrw,[],False,True,Biden in his Union if I win is just revolting and nauseating! He no more means Peace than Did Joe Stalin Biden has been the Kreepy!,joebiden union revolting mean peace joebiden joebiden,7


In [10]:
# rows remaining after the minimum-token filter
data_minlen.shape

(70282, 11)

In [11]:
# split into tweets flagged for exactly one candidate: Biden-only, then Trump-only
mask = data_minlen['trump'].eq(0) & data_minlen['biden'].eq(1)
biden = data_minlen.loc[mask]
mask = data_minlen['trump'].eq(1) & data_minlen['biden'].eq(0)
trump = data_minlen.loc[mask]

# Biden-only rows first, Trump-only rows after
df_limited = pd.concat([biden, trump])

print(f"\n Trump Tweets {len(trump)}\n\n Biden Tweets: {len(biden)}")




 Trump Tweets 36439

 Biden Tweets: 33843


In [12]:
# df_limited keeps only rows where exactly one of trump/biden is set;
# compare with data_minlen.shape to see how many rows were dropped
df_limited.shape

(70282, 11)

In [13]:
# persist the single-candidate, minimum-length tweets
df_limited.to_pickle("pickle/n2tk_limited.pick")

In [14]:
# spot-check raw vs. cleaned text; no random_state is passed, so the rows differ on every run
data[['original', 'tweet']].sample(15)

Unnamed: 0,original,tweet
77688,This Republican lawyer just totally exposed the Trump campaign's voter suppression efforts https://t.co/zWmrmvlcXL,lawyer totally exposed donaldtrump campaign voter suppression effort
18551,@PassItToAuston @Buds_All_Day It's why I said earlier if Trump loses we won't see the same craziness we would if he wins,donaldtrump wont craziness win
55415,"Porque todos están apoyando al otro viejo, solo para que trump pierda. Me vale verga trump, pero tampoco el otro viejo no me da confianza.",donaldtrump vale donaldtrump
16190,@CNN @JohnAvlon @CNNOpinion Trump borrowed your car. He said it was the best car in the world. Then he wrecked the car. Then he blamed the manufacturers of the car. Then the previous owner. Then he celebrated that the trunk still opens. Then told you the next person will wreck it worse,cnn donaldtrump car car world car blamed manufacturer car previous owner celebrated trunk open wreck worse
22845,"If Joe Biden wins, Universal Healthcare will force sex changes on your guns.",joebiden win universalhealthcare force sex change gun
...,...,...
36747,"@JoeBiden we love you, sir! My St. Jude candle is on for you. Sending you good vibes and prayers from Austin, Texas! #BidenHarris2020 #RidenWithBiden2020 #ElectionEve #Election2020 #BidenHarris",joebiden st candle sending prayer electioneve bidenharris
53204,Look from @maitlis says it all. Absolutely batshit crazy trump supporter@#Newsnight,say absolutely crazy
73245,"Now this coward wants to get real, but I think Biden would knock the hot air out of him. #MassMurderers",coward want real joebiden knock hot air
67898,@PrezidentTweety @markknoller That's ok. I cancelled your vote. I voted Trump,vote donaldtrump


In [15]:
# data.to_pickle("pickle/tweets_df_5000tw.pick")

In [16]:
# hand-written example to eyeball the pipeline: stop words, a gibberish word,
# extra whitespace, bigrams and plural/singular forms
sample = 'tweet tokenize me please mr. biden helloaskldjalksfj I  pence    voting rights am asking for a favor continuous breakdown American Americans'

clean_and_tokenize(pd.Series([sample]))


num words:  236140
num stop words:  426


['tweet joebiden mikepence votingrights continuous breakdown']

In [17]:
# quick check of the lemmatizer on a plural noun
lemm = WordNetLemmatizer()
lemm.lemmatize('organizations')

'organization'