In [1]:
import pickle
import re
import string
import pandas as pd
import numpy as np

from textblob import TextBlob
from collections import Counter

from langdetect import detect

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.decomposition import NMF, LatentDirichletAllocation

import nltk
from nltk.tokenize import TweetTokenizer, word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer


In [2]:
pd.set_option('display.max_rows', 10)
pd.set_option('display.max_columns', 40)
pd.set_option('display.max_colwidth', None)

In [3]:
# must uncomment & run the first time to DOWNLOAD NLTK data
# I used package identifier 'popular'
# nltk.download()

In [4]:
data = pd.read_pickle("pickle/balanced_nov2_tweets.pick")
data.shape

(96000, 9)

In [5]:
# keep only necessary columns
data['original'] = data.tweet
data.drop(columns='tweet', inplace=True)
data.head(3)

Unnamed: 0,id,date,time,user_id,username,hashtags,trump,biden,original
181142,1323379284434669568,2020-11-02,21:39:43,2820503362,artistacriseida,[],False,True,"All these articles showing that Biden is in the lead.... IGNORE THAT AND STILL GO VOTE. All of these maps showing information that may or may not be correct won’t matter on Election Day. Hillary was also in the lead last election, just do your part. ⁽ᶠᵘᶜᵏ ᵗʳᵘᵐᵖ⁾"
0,1323414585995526144,2020-11-02,23:59:59,1312487180258820096,annapieters17,[],False,True,@FoxNews Lady Gaga’s a nobody. Can’t figure out her own life and can’t even see nobody can help Biden. He’s out of the game from the day he gets in the game.
4,1323414585232293888,2020-11-02,23:59:59,2335763630,kylechwatt,[],False,True,"@The_Grupp “It is purely a fortuity that this isn’t one of the great mass casualty events in American history,” Ron Klain, who was Biden’s chief of staff at the time, said of H1N1 in 2019.” https://t.co/Umi317supK"


## Pre-Processing Pipeline

Now it's time to tokenize our tweets. Here are our pre-processing steps:
* Remove URLs
* Remove Twitter handles
* Remove numbers
* Convert to lowercase
* Remove punctuation
* Remove repeated letters so spell check will work ('aaaaand' -> 'aand')
* Remove non-English words
* Remove stop words

Since we're working with so many different words, I've chosen to use **lemmatization** instead of stemming for two reasons:
1. Lemmatization accurately reduces words to true meaning
2. Inxreased word reduction (handles synonyms)

In [6]:
# nltk.download() # must run first time (download 'popular')

In [7]:
# custom word dictionaries
from more_words import more_words as custom_words
from stop_words import stop_words as custom_stop_words
from bigrams import bigrams

import time
import nltk
from nltk.corpus import words, stopwords

def clean_tweet(tweet):
    """Pre-processing pipeline."""
    
    tweet = tweet.lower()
    tweet = re.sub(r"u\.s\. ", "usa", tweet)
    tweet = re.sub(r"https?:\/\/\S+", "", tweet)
    tweet = re.sub(r'\w*\d\w*', ' ', tweet)
    tweet = re.sub(r'\.{2,6}', ' ', tweet)
    tweet = tweet.translate(str.maketrans('', '', string.punctuation))
    tweet = re.sub(r"([a-z])\1{2,5}", r'\1', tweet)
    tweet = ' '.join(tweet.split())
    
    for old, new in bigrams:
        tweet = re.sub(old, new, tweet) # ex: ('white house', 'whitehouse')
    
    return tweet

def tokenize(tweet, all_words, stop_words):
    """
    Returns all of the tokens in a cleaned tweet
    Parameters:
        - tweet (Series, required)
        - stop_words (set, required)
        - all_words (set, required) 
        
    Note:
        - any words not included in all_words here will be
          removed from tokens (including bigrams)
    """
    
    tweet = clean_tweet(tweet)
    
    twt = TweetTokenizer()    
    lemm = WordNetLemmatizer()  
    # lemmatize tokens & remove stop words
    tokens = [lemm.lemmatize(token) for token in twt.tokenize(tweet) if token not in stop_words]
    # only include words that are in our customized list of words
    tokens = [token for token in tokens if token in all_words]
    combined_tokens = ' '.join(tokens)

    return combined_tokens

def clean_and_tokenize(original_tweets):
    """
    Efficiently cleans, tokenizes, lemmatizes, and implements customized
    bigrams on a list of tweets.
    
    Parameters:
        original_tweets (required, pd.Series)
    
    Returns:
        cleaned, tokenized tweets (np.array)
    """
    tweet_array = original_tweets.to_numpy()

    all_words = list(words.words('en')) + custom_words
    stop_words = set(list(stopwords.words('english')) + custom_stop_words)
    
    for _, new in bigrams:
        all_words.append(new)

    all_words = set(all_words)
    print("num words: ", len(all_words))
    print("num stop words: ", len(stop_words))
    
    clean_tweets = [tokenize(tweet, all_words, stop_words) for tweet in tweet_array]
    
    return clean_tweets

In [8]:
%%time

data['tweet'] = clean_and_tokenize(data.original)
data.shape

num words:  236110
num stop words:  393
CPU times: user 38.8 s, sys: 107 ms, total: 39 s
Wall time: 39 s


(96000, 10)

In [9]:
data['num_tokens'] = data['tweet'].str.count(' ') + 1

mask = data['num_tokens'] >= 5
data_minlen = data[mask]
data_minlen.shape

(60778, 11)

In [10]:
mask = (data_minlen.trump == 0) & (data_minlen.biden == 1)
biden = data_minlen[mask]
mask = (data_minlen.trump ==1) & (data_minlen.biden == 0)
trump = data_minlen[mask]

print(f"\n Trump Tweets {len(trump)}\n\n Biden Tweets: {len(biden)}")




 Trump Tweets 32084

 Biden Tweets: 28694


In [11]:
data.to_pickle("pickle/n2_tokenized_eff.pick")

In [12]:
data[['original', 'tweet']].sample(15)

Unnamed: 0,original,tweet
49154,@MeghanMcCain you must be so proud of your beloved trump trash supporters!,beloved trash supporter
12212,@lucyg127 ewwwww the people at my school where trump stuff to school all the time i’m just there like 😔,school stuff school
76582,"Trump never talks about the terrible pain that the families of the 230,000 dead from his COVID are going through.",talk pain family dead covid
60215,"NEW AD: Joe Biden uses 2 Live Crew classic ""Get the Fuck Out of My House"" to encourage voting.",us crew classic house voting
63072,“BUT BIDEN’S LEADING IN THE POLLS”.....FAMOUS LAST WORDS https://t.co/U1zMF5yMfS,leading poll famous word
...,...,...
74948,"@All_Prays @th3damage_ @LeighEllis @turbotax Ok genius; tell me if this impresses you, Lebron has a lifetime contract from Nike that will pay him a billion dollars requiring virtually nothing from him. Use whatever discount rate you want, Trump would hock all 5? of his kids for something half as valuable.",genius impress lifetime contract pay billion dollar virtually use whatever discount rate hock something half valuable
3818,@GOP @realDonaldTrump An adorable child was taught about voting and the election at school. The teacher asked do you know why your parents are voting tomorrow. The child answered “ to kick the Trump out”,adorable child taught voting school teacher parent voting child kick
71542,Imagine this scenario: President: Joe Biden Vice President: Kamala Harris Attorney General: Adam Schiff Secretary of State: Hillary Clinton Treasury Secretary: Elizabeth Warren COVID-19 Czar: Andrew Cuomo Green New Deal Board: AOC Constitution Review Board: Beto O'Rourke #VOTE,scenario vicepresident attorneygeneral secretary hillaryclinton treasury secretary warren covid czar green deal board constitution review board vote
72652,"@VotingBlueInTX Fauci has made it very clear that we are not turning the corner and in fact we couldn't be in a worse position for going into winter. This complete contradicts Trump's campaign message, and we know what he does when someone in Government takes a position against this monster.",anthonyfauci clear turning corner fact worse position winter complete trump campaign government take position monster


In [13]:
# data.to_pickle("pickle/tweets_df_5000tw.pick")

In [14]:
sample = 'tweet tokenize me please mr. biden helloaskldjalksfj I  pence    voting rights am asking for a favor continuous breakdown American Americans'

clean_and_tokenize(pd.Series(sample))


num words:  236110
num stop words:  393


['tweet votingrights continuous breakdown']

In [15]:
lemm = WordNetLemmatizer()
lemm.lemmatize('organizations')

'organization'

In [16]:
# data.to_pickle("pickle/n2_tokenized_min4.pick")