In [1]:
import pickle
import re
import string
import pandas as pd
import numpy as np

from textblob import TextBlob
from collections import Counter

from langdetect import detect

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.decomposition import NMF, LatentDirichletAllocation

import nltk
from nltk.tokenize import TweetTokenizer, word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer


In [2]:
pd.set_option('display.max_rows', 10)
pd.set_option('display.max_columns', 40)
pd.set_option('display.max_colwidth', None)

In [3]:
# must uncomment & run the first time to DOWNLOAD NLTK data
# I used package identifier 'popular'
# nltk.download()

In [4]:
data = pd.read_pickle("pickle/balanced_nov2_tweets.pick")
data.shape

(96000, 9)

In [5]:
# keep only necessary columns
data['original'] = data.tweet
data.drop(columns='tweet', inplace=True)
data.head(3)

Unnamed: 0,id,date,time,user_id,username,hashtags,trump,biden,original
181142,1323379284434669568,2020-11-02,21:39:43,2820503362,artistacriseida,[],False,True,"All these articles showing that Biden is in the lead.... IGNORE THAT AND STILL GO VOTE. All of these maps showing information that may or may not be correct won’t matter on Election Day. Hillary was also in the lead last election, just do your part. ⁽ᶠᵘᶜᵏ ᵗʳᵘᵐᵖ⁾"
0,1323414585995526144,2020-11-02,23:59:59,1312487180258820096,annapieters17,[],False,True,@FoxNews Lady Gaga’s a nobody. Can’t figure out her own life and can’t even see nobody can help Biden. He’s out of the game from the day he gets in the game.
4,1323414585232293888,2020-11-02,23:59:59,2335763630,kylechwatt,[],False,True,"@The_Grupp “It is purely a fortuity that this isn’t one of the great mass casualty events in American history,” Ron Klain, who was Biden’s chief of staff at the time, said of H1N1 in 2019.” https://t.co/Umi317supK"


## Pre-Processing Pipeline

Now it's time to tokenize our tweets. Here are our pre-processing steps:
* Remove URLs
* Remove Twitter handles
* Remove numbers
* Convert to lowercase
* Remove punctuation
* Remove repeated letters so spell check will work ('aaaaand' -> 'aand')
* Remove non-English words
* Remove stop words

Since we're working with so many different words, I've chosen to use **lemmatization** instead of stemming for two reasons:
1. Lemmatization accurately reduces words to true meaning
2. Inxreased word reduction (handles synonyms)

In [6]:
# nltk.download() # must run first time (download 'popular')

In [15]:
# custom word dictionaries
from more_words import more_words as custom_words
from stop_words import stop_words as custom_stop_words
from bigrams import bigrams

import time
import nltk
from nltk.corpus import words, stopwords

def clean_tweet(tweet):
    """Pre-processing pipeline."""
    
    tweet = tweet.lower()
    tweet = re.sub(r"u\.s\. ", "usa", tweet)
    tweet = re.sub(r"https?:\/\/\S+", "", tweet)
    tweet = re.sub(r'\w*\d\w*', ' ', tweet)
    tweet = re.sub(r'\.{2,6}', ' ', tweet)
    tweet = tweet.translate(str.maketrans('', '', string.punctuation))
    tweet = re.sub(r"([a-z])\1{2,5}", r'\1', tweet)
    tweet = ' '.join(tweet.split())
    
    for old, new in bigrams:
        tweet = re.sub(old, new, tweet) # ex: ('white house', 'whitehouse')
    
    return tweet

def tokenize(tweet, all_words, stop_words):
    """
    Returns all of the tokens in a cleaned tweet
    Parameters:
        - tweet (Series, required)
        - stop_words (set, required)
        - all_words (set, required) 
        
    Note:
        - any words not included in all_words here will be
          removed from tokens (including bigrams)
    """
    
    tweet = clean_tweet(tweet)
    
    twt = TweetTokenizer()    
    lemm = WordNetLemmatizer()  
    # lemmatize tokens & remove stop words
    tokens = [lemm.lemmatize(token) for token in twt.tokenize(tweet) if token not in stop_words]
    # only include words that are in our customized list of words
    tokens = [token for token in tokens if token in all_words]
    combined_tokens = ' '.join(tokens)

    return combined_tokens


def clean_and_tokenize(original_tweets):
    """
    Parameters:
        original_tweets (required, pd.Series)
    
    Returns:
        cleaned, tokenized tweets (np.array)
    """
    tweet_array = original_tweets.to_numpy()

    all_words = list(words.words('en')) + custom_words
    stop_words = set(list(stopwords.words('english')) + custom_stop_words)
    
    for _, new in bigrams:
        all_words.append(new)

    all_words = set(all_words)
    print("num words: ", len(all_words))
    print("num stop words: ", len(stop_words))
    
    clean_tweets = [tokenize(tweet, all_words, stop_words) for tweet in tweet_array]
    
    return clean_tweets

In [16]:
%%time

data['tweet'] = clean_and_tokenize(data.original)
data.shape

num words:  236014
num stop words:  346
CPU times: user 30 s, sys: 27.9 ms, total: 30 s
Wall time: 30 s


(96000, 11)

In [9]:
data['num_tokens'] = data['tweet'].str.count(' ') + 1

mask = data['num_tokens'] >= 5
data_minlen = data[mask]
data_minlen.shape

(74248, 11)

In [10]:
mask = (data_minlen.trump == 0) & (data_minlen.biden == 1)
biden = data_minlen[mask]
mask = (data_minlen.trump ==1) & (data_minlen.biden == 0)
trump = data_minlen[mask]

print(f"\n Trump Tweets {len(trump)}\n\n Biden Tweets: {len(biden)}")




 Trump Tweets 38402

 Biden Tweets: 35846


In [11]:
data.to_pickle("pickle/n2_tokenized_eff.pick")

In [12]:
data[['original', 'tweet']].sample(15)

Unnamed: 0,original,tweet
70143,@Rational1414 @NRA Me &amp; my wife and my children all want Trump for 4 more !!! And horus paine quit drinking the kool aid !!!,wife child donaldtrump quit drinking aid
66896,"É mto bom ver mina moderninha na internet querendo revolucionar os EUA se posicionando contra o Trump, como se fosse fazer diferença",bom mina o contra donaldtrump fosse
47630,#NPO. Mensen zijn onpeilbaar en nog iets. ben t ff kwijt. Moet je nagaan hoe verheven zij zichzelf vind. Trump wint. https://t.co/NnFs51yEUs,nog ben hoe donaldtrump wint
4758,@SteadyasweDodo I know!!! I haven’t slept in days. I wake up every couple of hours since two weeks ago...and yes...with Trump nightmares.,slept day wake couple hour since two week ago donaldtrump nightmare
29262,"tomorrow could either be the icing on the cake of 2020 or the best day of 2020. you choose. #voteblue - - - @JoeBiden , we got this💪",either icing cake best choose voteblue joebiden
...,...,...
73803,"'Appalling, disturbing and criminal,' Galvin says of Trump's comments about voting https://t.co/w22yX3n72u via @Yahoo yes, it's criminal..................",appalling disturbing criminal say trump comment voting yahoo criminal
19895,"@KamalaHarris @USPS @JoeBiden should Stop with the phony narrative!! Seems like he pull the Scranton card only when it’s convenient for him, what have he done for Scranton over 47 years. BTW, his Eagles jacket is a UD Blue Hens jacket #MAGA #MAGA2020LandslideVictory #2020Election #CorruptJoeBiden https://t.co/IwBVaWWedR",kamalaharris joebiden stop phony narrative pull card convenient year eagle jacket blue hen jacket maga
16581,@Third_Witness @Jarmer16 @music_jeb @certifiedpoints @JoeBiden Lmao the green line is what you're saying caused the rise. https://t.co/JSpgGA3Mcm,joebiden green line rise
77470,"Joe is good at lies lies and more lies. Trump took DJIA from 16,000 to over 30,000 before pandemic. Obama took DJIA from 12,000 to 16,000 in 8 years. You and your moronic moments. Joe you don't give Trump and the pandemic enough credit. You just want to play the blame game.",joebiden lie lie lie donaldtrump took pandemic barackobama took year moronic moment joebiden give donaldtrump pandemic enough credit play blame game


In [13]:
# data.to_pickle("pickle/tweets_df_5000tw.pick")

In [14]:
sample = 'tweet tokenize me please mr. biden helloaskldjalksfj I  pence    voting rights am asking for a favor continuous breakdown American Americans'

tweet_tokenize(sample)

NameError: name 'tweet_tokenize' is not defined

In [None]:
data_min4.shape

In [None]:
data_min4.sample(20)

In [None]:
lemm = WordNetLemmatizer()
lemm.lemmatize('organizations')

In [None]:
# data.to_pickle("pickle/n2_tokenized_min4.pick")