In [4]:
import pickle
import re
import string
import emoji
import pandas as pd
import numpy as np

from textblob import TextBlob
from collections import Counter

from langdetect import detect

import spacy

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.decomposition import NMF, LatentDirichletAllocation

import nltk
from nltk.tokenize import TweetTokenizer, word_tokenize
from nltk.corpus import stopwords, words
from nltk.stem import PorterStemmer, WordNetLemmatizer

import spacy
import contextualSpellCheck

nlp = spacy.load('en_core_web_sm')

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer


**Kelsey 1-1**

- cleaning
    - preprocessing until comfortable with words
   
- sentiment analysis on all tweets
    - don't need to do any splitting at this stage
    - TextBlob & VaderSentiment first, spacy if the results aren't as expected
    
- topic modeling
    - decide: use all tweets (all topics) at once
        - start here
        - then can use these as features in the dataFrame and do splitting here
    - or: split to trump/biden - then bot/not bot for each
    - point here is there are multiple ways to split it
        - no right answers

In [2]:
# Notebook-wide pandas display settings: show few rows, many columns,
# and never truncate the text inside a cell (tweets are long).
for option_name, option_value in {
    'display.max_rows': 10,
    'display.max_columns': 40,
    'display.max_colwidth': None,
}.items():
    pd.set_option(option_name, option_value)

In [3]:
# First run only: uncomment the line below and run it to DOWNLOAD the NLTK data.
# (I used the package identifier 'popular'.)
# nltk.download()

In [4]:
# Load the pickled tweet DataFrame and show its (rows, columns) shape.
# NOTE: unpickling can execute arbitrary code -- only load pickle files from a trusted source.
df = pd.read_pickle("pickle/balanced_nov2_tweets.pick")
df.shape

(144000, 9)

In [5]:
# list the columns available in the raw frame
df.columns

Index(['id', 'date', 'time', 'user_id', 'username', 'tweet', 'hashtags',
       'trump', 'biden'],
      dtype='object')

In [6]:
# number of distinct usernames in the dataset
len(df.username.unique())

102600

In [7]:
# peek at three random rows (no random_state is passed, so the rows differ on every run)
df.sample(3)

Unnamed: 0,id,date,time,user_id,username,tweet,hashtags,trump,biden
67365,1323394849429770247,2020-11-02,22:41:34,847891925394960384,boycevida,@SUBRATA30016572 @JoeBiden https://t.co/NxBdVNalsR,[],False,True
71795,1323393507692814336,2020-11-02,22:36:14,979513121541967873,lindakost53,"Why Is Australia Reporting This Biden Child Porn Story, But The Media In America Hasn’t Reported It❓❓❓ Joe &amp; Hunter Biden Need To Be Investigated For Child Porn &amp; Sexual Assault Of Minors‼️",[],False,True
103198,1323383725716111363,2020-11-02,21:57:22,1203560977746276352,changed100times,@JoeBiden @KamalaHarris Where's Hunter Biden? can anyone tell me pleaseeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee,[],False,True


Now let's create a subset containing the same number of Trump tweets as Biden tweets. We will exclude tweets that mention both candidates.

In [8]:
# keep only necessary columns
data = df.loc[:,['tweet', 'trump', 'biden', 'hashtags', 'user_id']]
data['original'] = data.tweet
data.drop(columns='tweet', inplace=True)
data.head(3)

Unnamed: 0,trump,biden,hashtags,user_id,original
181142,False,True,[],2820503362,"All these articles showing that Biden is in the lead.... IGNORE THAT AND STILL GO VOTE. All of these maps showing information that may or may not be correct won’t matter on Election Day. Hillary was also in the lead last election, just do your part. ⁽ᶠᵘᶜᵏ ᵗʳᵘᵐᵖ⁾"
0,False,True,[],1312487180258820096,@FoxNews Lady Gaga’s a nobody. Can’t figure out her own life and can’t even see nobody can help Biden. He’s out of the game from the day he gets in the game.
4,False,True,[],2335763630,"@The_Grupp “It is purely a fortuity that this isn’t one of the great mass casualty events in American history,” Ron Klain, who was Biden’s chief of staff at the time, said of H1N1 in 2019.” https://t.co/Umi317supK"


## Pre-Processing Pipeline

Now it's time to tokenize our tweets. Here are our pre-processing steps:
* Remove URLs
* Remove Twitter handles
* Remove numbers
* Convert to lowercase
* Remove punctuation
* Collapse runs of three or more repeated letters so spell check will work ('aaaaand' -> 'and')
* Remove non-English words
* Remove stop words

Since we're working with so many different words, I've chosen to use **lemmatization** instead of stemming for two reasons:
1. Lemmatization accurately reduces words to their true meaning
2. Increased word reduction (handles synonyms)

In [None]:
# Scratch cell: try each cleaning step on a sample tweet before moving the
# steps into a function.


tweet = 'this is ! SAMPLE text...    @joebiden @donaldtrump #2020electionusa #2020ELECTIONUSA'

# remove urls
tweet = re.sub(r"https?:\/\/\S+", "", tweet)
# remove twitter handles
tweet = re.sub(r"@[\d\w_]+", "", tweet)
# remove numbers (drops every token that contains a digit)
tweet = re.sub(r'\w*\d\w*', ' ', tweet)
# convert to lowercase and replace punctuation with spaces.
# '%' interpolation is required here: str.format() ignores '%s', which left
# the pattern as the literal class [%s] and stripped the letter 's' instead.
tweet = re.sub('[%s]' % re.escape(string.punctuation), ' ', tweet.lower())
# collapse runs of 3+ repeated letters so spell check will work (ex: 'aaaand' --> 'and')
tweet = re.sub(r"([a-z])\1{2,}", r'\1', tweet)
# replace consecutive spaces with one
tweet = ' '.join(tweet.split())

tweet


In [10]:
# from autocorrect import Speller # TOO SLOW...TRY PYSPELLCHECKER

# Pre-compiled patterns used by clean_tweets.
_URL_RE = re.compile(r"https?:\/\/\S+")
_DIGIT_TOKEN_RE = re.compile(r"\w*\d\w*")       # any run of word chars containing a digit
_REPEATED_LETTER_RE = re.compile(r"([a-z])\1{2,}")
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# Campaign phrases folded into a single token.
_PHRASE_REWRITES = tuple(
    (re.compile(pattern), replacement) for pattern, replacement in (
        (r"make america great again", "maga"),
        (r"makeamericagreatagain", "maga"),
    )
)

# Candidate-name normalisation; ORDER MATTERS (fused forms first, bare
# surnames last so they do not re-match inside an already rewritten name).
_NAME_REWRITES = tuple(
    (re.compile(pattern), replacement) for pattern, replacement in (
        # must run before the generic 'donaldtrump' rule, which would
        # otherwise turn the handle into 'realdonald_trump'
        (r"realdonaldtrump", "donald_trump"),
        (r"joebiden", "joe_biden"),
        (r"kamalaharris", "kamala_harris"),
        (r"donaldtrump", "donald_trump"),
        (r"mikepence", "mike_pence"),
        (r"joe biden", "joe_biden"),
        (r"kamala harris", "kamala_harris"),
        (r"donald trump", "donald_trump"),
        (r"mike pence", "mike_pence"),
        # '_' is a word character, so \b does not match inside 'joe_biden'
        (r"\bbiden\b", "joe_biden"),
        (r"\bpence\b", "mike_pence"),
        (r"\bharris\b", "kamala_harris"),
        (r"\btrump\b", "donald_trump"),
    )
)


def clean_tweets(tweet):
    """Normalise the raw text of one tweet.

    Steps, in order: strip URLs; drop every token containing a digit
    (so 'covid19' or 'election2020' disappear entirely); delete ASCII
    punctuation (handles keep their text, only the '@' is removed);
    lowercase; collapse whitespace; fold "make america great again" into
    'maga'; collapse runs of three or more identical letters to one; and
    rewrite candidate names to a single underscore-joined token
    (e.g. 'biden', 'joe biden', 'joebiden' -> 'joe_biden').

    Parameters:
        - tweet (str, required): raw tweet text

    Returns:
        str: the cleaned, lowercased, single-spaced text
    """
    # remove urls
    tweet = _URL_RE.sub("", tweet)
    # remove numbers (whole token, replaced by a space)
    tweet = _DIGIT_TOKEN_RE.sub(' ', tweet)
    # remove punctuation and convert to lowercase
    tweet = tweet.translate(_PUNCTUATION_TABLE).lower()
    # replace consecutive whitespace with one space *before* phrase matching,
    # so phrases split by double spaces or newlines are still recognised
    tweet = ' '.join(tweet.split())

    # popular campaign phrases
    for pattern, replacement in _PHRASE_REWRITES:
        tweet = pattern.sub(replacement, tweet)

    # collapse repeated letters so spell check will work (ex: 'aaaand' --> 'and')
    tweet = _REPEATED_LETTER_RE.sub(r'\1', tweet)

    # normalise candidate names to one token each
    for pattern, replacement in _NAME_REWRITES:
        tweet = pattern.sub(replacement, tweet)

    return tweet

# Election-related tokens kept even though they are not in the NLTK word list.
_ELECTION_WORDS = (
    'trump', 'biden', 'maga', 'bidenharris',
    'kamala', 'pence', 'harris', 'mike',
    'bidenharris2020', 'trumppence',
    'trumppence2020', 'usa', 'election2020',
    'ivoted', 'joe_biden', 'realdonaldtrump',
    'donald_trump',
    'mike_pence', 'kamala_harris',
)

# Stop words added on top of NLTK's English stop-word list.
_EXTRA_STOP_WORDS = (
    'fxhedg', 'fyck', 'fy', 'fxxking', 'give', 'go',
    'going', 'gonna', 'get', 'one', 'de', 'la', 'el', 'en', 'un', 'ha',
    'would', 'dont', 'know', 'time', 'think', 'want', 'via',
)

# Filled on first use by _tokenizer_resources().
_TOKENIZER_RESOURCES = {}


def _tokenizer_resources():
    """Build the vocabulary, stop-word set, tokenizer and lemmatizer once and reuse them."""
    if not _TOKENIZER_RESOURCES:
        _TOKENIZER_RESOURCES.update(
            vocabulary=frozenset(words.words()).union(_ELECTION_WORDS),
            stop_words=frozenset(stopwords.words('english')).union(_EXTRA_STOP_WORDS),
            tokenizer=TweetTokenizer(),
            lemmatizer=WordNetLemmatizer(),
        )
    return _TOKENIZER_RESOURCES


def tweet_tokenize(tweet, more_stop=None, more_words=None):
    """Tokenize one tweet and return its kept tokens as a single string.

    A token is kept when it is in the vocabulary (NLTK ``words`` corpus plus
    the election-related terms above) and its lowercase form is not a stop
    word; kept tokens are lemmatized. The vocabulary check is case-sensitive,
    so the text is expected to be lowercased beforehand.

    Parameters:
        - tweet (str, required): text of a single tweet

        - more_stop (List, optional): additional stop words to exclude

        - more_words (List, optional): additional words to INCLUDE in dictionary

    Returns:
        str: the surviving, lemmatized tokens joined by single spaces
    """
    resources = _tokenizer_resources()

    # Caller-supplied lists extend the defaults (they used to be overwritten
    # and therefore silently ignored).
    vocabulary = resources['vocabulary']
    if more_words:
        vocabulary = vocabulary.union(more_words)

    stop_words = resources['stop_words']
    if more_stop:
        stop_words = stop_words.union(more_stop)

    lemmatize = resources['lemmatizer'].lemmatize

    # dictionary filter, stop-word removal and lemmatization in one pass
    tokens = [
        lemmatize(token)
        for token in resources['tokenizer'].tokenize(tweet)
        if token in vocabulary and token.lower() not in stop_words
    ]

    return ' '.join(tokens)

In [9]:
# Quick experiment with the WordNet lemmatizer.
# NOTE: the whole sentence is passed as ONE string here, and the output below
# is that string unchanged -- presumably lemmatize() expects a single word per
# call (TODO confirm against the NLTK docs and retry word by word).
lemm = WordNetLemmatizer()
lemm.lemmatize('good great decent ok medicore miserable unhappy')

'good great decent ok medicore miserable unhappy'

In [11]:
# Clean each raw tweet first, then tokenize it. tweet_tokenize on its own
# neither lowercases nor creates the 'joe_biden' / 'donald_trump' tokens seen
# in the results below -- those come from clean_tweets.
data['tweet'] = data['original'].map(clean_tweets).map(tweet_tokenize)

In [2]:
# Persist the tokenized DataFrame to disk. This cell needs `data` from the
# cells above; the NameError shown below means it was run in a kernel where
# `data` had not been created yet.
data.to_pickle("pickle/n2_tokenized.pick")

NameError: name 'data' is not defined

In [13]:
# spot-check ten random rows: raw text ('original') next to the processed text ('tweet')
data.sample(10)

Unnamed: 0,trump,biden,hashtags,user_id,original,tweet
12665,True,False,['cult'],71560607,"@sapphiresdust #Cult Hard to imagine, but they see God's Will in everything he says and does or demands be done. Trump may as well be the Second Coming.",cult hard imagine see everything done donald_trump may well second coming
89888,False,True,[],1190786220407885824,@JessIsAngry @JoeBiden Biden is a trojan horse. He would be removed on day 1 and Kamala takes over. I know socialism exists. That's what I am saying. These politicians have been implementing socialist policies for decades and they have brought America to the brink of destruction.,joe_biden joe_biden horse removed day kamala socialism thats saying socialist brought brink destruction
78167,False,True,[],1166050885526589440,@sunlorrie Biden doesnt know if hes married to his sister or his wife.,joe_biden doesnt married sister wife
39887,True,False,[],1486117886,Only 6 major US newspapers have endorsed Trump for reelection https://t.co/mjXyC3fSAx,major u endorsed donald_trump
20532,False,True,[],354435309,"@JustLouG @TheFrostyMac Absolutely. I’m voting Biden, and it’s gonna leave a bad taste in my mouth, and I know Indiana is going red, but at least I’m not voting for 4 more years of blatant xenophobia, sexism, and restrictions of my sisters’ rights.",absolutely voting joe_biden leave bad taste mouth red least voting blatant xenophobia
4205,True,False,[],3245363547,"Imagine your best friend is Latino and you straight in their face say you voted for Trump who is ripping their families apart and holding them in cages. Then call them soft because they can't your ""point"".",imagine best friend straight face say donald_trump ripping apart holding call soft cant point
7495,False,True,[],44785696,@monicasloves @AmyAThatcher Pray for the greater good. Visualize Joe Biden &amp; Kamala Harris in the White House!,pray greater good visualize joe_biden kamala_harris white house
12419,False,True,[],1240751653143248896,Vote Biden. https://t.co/tJwTeJIBDN,vote joe_biden
59883,False,True,[],562988689,@rcmahoney @JoeBiden Thank you!!!!!!!!,joe_biden thank
75403,True,False,[],830988262328066048,@MorganCribbs1 @Angelic19801673 @larrayxo Oh I’ve been seeing trump has 28 allegations,oh seeing donald_trump


In [14]:
# data.to_pickle("pickle/tweets_df_5000tw.pick")