In [1]:
import pickle
import re
import string
import pandas as pd
import numpy as np

from textblob import TextBlob
from collections import Counter

from langdetect import detect

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.decomposition import NMF, LatentDirichletAllocation

import nltk
from nltk.tokenize import TweetTokenizer, word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer


In [2]:
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 40)
pd.set_option('display.max_colwidth', None)

In [3]:
# must uncomment & run the first time to DOWNLOAD NLTK data
# I used package identifier 'popular'
# nltk.download()

In [4]:
data = pd.read_pickle("pickle/df_ALL_TWEETS_raw.pick")
data.shape

(570189, 9)

In [5]:
# keep only necessary columns
data['original'] = data.tweet
data.drop(columns='tweet', inplace=True)
data.head(3)

Unnamed: 0,id,date,time,user_id,username,hashtags,trump,biden,original
1,1313630113821581312,2020-10-06,23:59:59,1299543389092884480,nutzwhat,[],False,True,"@Astraea_Muse @DontBanMeBro7 @RonColeman @JoeBiden @LLinWood @ToddMcMurtry @MarinaMedvin So concerned about teens with guns? What are your thoughts on Chicago, tough guy?"
3,1313630112609456128,2020-10-06,23:59:59,141645754,davidkgather,[],False,True,LIVE: “Battle for the Soul of the Nation”- Joe Biden Speech in Gettysbur... https://t.co/Vz7sXiAbfh via @YouTube
4,1313630112500326404,2020-10-06,23:59:59,5510112,spaceweasel,[],False,True,Biden raises eyebrows after telling 'these beautiful young ladies' he wants to 'see them dancing when they're four years older' https://t.co/DTME8CXWHr


## Pre-Processing Pipeline

Now it's time to tokenize our tweets. Here are our pre-processing steps:
* Remove URLs
* Remove Twitter handles
* Remove numbers
* Convert to lowercase
* Remove punctuation
* Remove repeated letters so spell check will work ('aaaaand' -> 'aand')
* Remove non-English words
* Remove stop words

Since we're working with so many different words, I've chosen to use **lemmatization** instead of stemming for two reasons:
1. Lemmatization accurately reduces words to true meaning
2. Inxreased word reduction (handles synonyms)

In [6]:
# nltk.download() # must run first time (download 'popular')

In [7]:
# custom word dictionaries
from more_words import more_words as custom_words
from stop_words import stop_words as custom_stop_words
from bigrams import bigrams

import time
import nltk
from nltk.corpus import words, stopwords

def clean_tweet(tweet):
    """Pre-processing pipeline."""
    
    tweet = tweet.lower()
    tweet = re.sub(r"u\.s\. ", "usa", tweet)
    tweet = re.sub(r"https?:\/\/\S+", "", tweet)
    tweet = re.sub(r'\w*\d\w*', ' ', tweet)
    tweet = re.sub(r'\.{2,6}', ' ', tweet)
    tweet = tweet.translate(str.maketrans('', '', string.punctuation))
    tweet = re.sub(r"([a-z])\1{2,5}", r'\1', tweet)
    tweet = ' '.join(tweet.split())
    
    for old, new in bigrams:
        tweet = re.sub(old, new, tweet) # ex: ('white house', 'whitehouse')
    
    return tweet

def tokenize(tweet, all_words, stop_words):
    """
    Returns all of the tokens in a cleaned tweet
    Parameters:
        - tweet (Series, required)
        - stop_words (set, required)
        - all_words (set, required) 
        
    Note:
        - any words not included in all_words here will be
          removed from tokens (including bigrams)
    """
    
    tweet = clean_tweet(tweet)
    
    twt = TweetTokenizer()    
    lemm = WordNetLemmatizer()  
    # lemmatize tokens & remove stop words
    tokens = [lemm.lemmatize(token) for token in twt.tokenize(tweet) if token not in stop_words]
    # only include words that are in our customized list of words
    tokens = [token for token in tokens if token in all_words]
    tokens = list(set(tokens))
    combined_tokens = ' '.join(tokens)

    return combined_tokens

def clean_and_tokenize(original_tweets):
    """
    Efficiently cleans, tokenizes, lemmatizes, and implements customized
    bigrams on a list of tweets.
    
    Parameters:
        original_tweets (required, pd.Series)
    
    Returns:
        cleaned, tokenized tweets (list)
    """
    tweet_array = list(original_tweets)

    all_words = set(list(words.words('en')) + custom_words + [new for _, new in bigrams])
    stop_words = set(list(stopwords.words('english')) + custom_stop_words)
    
    print("num words: ", len(all_words))
    print("num stop words: ", len(stop_words))
    
    clean_tweets = [tokenize(tweet, all_words, stop_words) for tweet in tweet_array]
    
    return clean_tweets

In [8]:
%%time

data['tweet'] = clean_and_tokenize(data.original)
data.shape

num words:  236142
num stop words:  430
CPU times: user 3min 51s, sys: 304 ms, total: 3min 51s
Wall time: 3min 51s


(570189, 10)

In [27]:
from nltk import word_tokenize
data['num_tokens'] = data['tweet'].str.count(' ') + 1

mask = data['num_tokens'] >= 7
data_minlen = data[mask]
data_minlen.shape

(389371, 11)

In [28]:
data_minlen.shape

(389371, 11)

In [29]:
mask = (data_minlen.trump == 0) & (data_minlen.biden == 1)
biden = data_minlen[mask]
mask = (data_minlen.trump ==1) & (data_minlen.biden == 0)
trump = data_minlen[mask]

print(f"\n Trump Tweets {len(trump)}\n\n Biden Tweets: {len(biden)}",)




 Trump Tweets 172455

 Biden Tweets: 116100


In [30]:
data_minlen.to_pickle("pickle/n2tk_limited.pick")

In [13]:
data[['original', 'tweet']].sample(15)

Unnamed: 0,original,tweet
576394,"@EricTrump @imwatchinU58 And what did your father do all of those years besides several bankruptcies, stealing from charities not paying his workers, even scammed Canadian agencies that he owed money to re Trump University etc. Your father isn’t qualified to lick Joe Biden’s boots. Worse president ever.",qualified agency joebiden erictrump money donaldtrump besides university several worse year lick worker boot paying bankruptcy charity stealing
901589,"@MetsWes Same a wins a win and this shows that even if Trump wins Florida, Georgia and Ohio Biden can still win",win joebiden show donaldtrump
17869,@saletan @jaketapper @KBeds I voted early for Joe Biden But you don’t have to spin the way of Kellyanne Conway &amp; her ilk Stay off the tv @KBeds I’m absolutely disgusted &amp; nauseated by any hint of condescending smirking spin! @JoeBiden @K_JeanPierre Be straight cuz he’s in the right!,joebiden spin ilk condescending disgusted absolutely hint smirking straight stay
826015,".@#The Minnesota Governor Just Gave The State To Donald Trump, Its All Outside, So What's The Difference, You Just Made The Minnesota Ppl Mad, They Will Vote Fir Trump, Thank You Governor, You Dumbass ...",governor outside donaldtrump
614868,"Hey @LindseyGrahamSC I donated $5 to @harrisonjaime because I think you’re corrupt, despicable, and so far up Trump’s ass you don’t even know how to work for the people of this country anymore! If you’re worried I’ll give you all my information.",as corrupt despicable information donated worried far hey donaldtrump
73211,@Swashbucklist @QuinnSimmons9 Only a matter of time before Trump supporters aren’t allowed to go out in public or hold public office. The Democrats playbook isn’t new.,trumpsupporter public playbook hold democrat
827821,@hitforhat @tomsegura @VicBergerIV How do you idiots look at trump like he's a pinnacle of strength? That orange shit really works...,orange pinnacle strength work idiot he donaldtrump
175739,@mindnotforrent I don't know what you mean. Ukraine side of what convo? Vindman left out what? I'm referring to the official transcript of the Trump/Zelensky call when I say Burisma wasn't mentioned.,side call official transcript wasnt
367600,@xxnezumi @JoeBiden Stay home - you'll be wasting your time and energy. Also . . . I like presidents who aren't RAPISTS. Keep safe. 👍,joebiden safe energy president wasting home arent rapist stay
619922,"Biden Botches Trump Attack, Loses Temper as Dem. Voter https://t.co/vdjxHFqAJs (Liberty Headlines) Former Vice President Joe Biden grew angry at his town hall in New Hampton, Iowa, when an 83-year-old retired farmer pressed him about his son Hunter Biden accepting a lucrative pos",headline hunterbiden attack angry lucrative grew retired botch vicepresident town po temper donaldtrump joebiden farmer hall voter liberty former


In [14]:
# data.to_pickle("pickle/tweets_df_5000tw.pick")

In [15]:
sample = 'tweet tokenize me please mr. biden helloaskldjalksfj I  pence    voting rights am asking for a favor continuous breakdown American Americans'

clean_and_tokenize(pd.Series(sample))


num words:  236142
num stop words:  430


['joebiden breakdown mikepence tweet continuous votingrights']

In [16]:
lemm = WordNetLemmatizer()
lemm.lemmatize('organizations')

'organization'