In [1]:
import pickle
import re
import string
import emoji
import pandas as pd
import numpy as np

from textblob import TextBlob
from collections import Counter

import spacy

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.decomposition import NMF

import nltk
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

from spellchecker import SpellChecker

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer


**Kelsey 1-1**

- cleaning
    - preprocessing until comfortable with words
   
- sentiment analysis on all tweets
    - don't need to do any splitting at this stage
    - TextBlob & VaderSentiment first, spacy if the results aren't as expected
    
- topic modeling
    - decide: use all tweets (all topics) at once
        - start here
        - then can use these as features in the dataFrame and do splitting here
    - or: split to trump/biden - then bot/not bot for each
    - point here is there are multiple ways to split it
        - no right answers

In [2]:
pd.set_option('display.max_rows', 10)
pd.set_option('display.max_columns', 40)

In [3]:
# must uncomment & run the first time to DOWNLOAD NLTK data
# I used package identifier 'popular'
# nltk.download()

In [4]:
df = pd.read_pickle("pickle/df_t_raw.pick")
df.shape

(879311, 38)

In [5]:
df.sample(3)

Unnamed: 0,id,conversation_id,created_at,date,time,timezone,user_id,username,name,place,tweet,language,mentions,urls,photos,replies_count,retweets_count,likes_count,hashtags,cashtags,link,retweet,quote_url,video,thumbnail,near,geo,source,user_rt_id,user_rt,retweet_id,reply_to,retweet_date,translate,trans_src,trans_dest,biden,trump
4677,1315801617401368577,1315801617401368577,2020-10-12 23:48:46 UTC,2020-10-12,23:48:46,0,1108447939,mac_ovelli,The Dude,,Oh no.. the Hispanic/Black man lefties call a ...,en,[],[],[],0,0,0,[],[],https://twitter.com/Mac_ovelli/status/13158016...,False,https://twitter.com/Timcast/status/13157961163...,0,,,,,,,,[],,,,,1,0
71845,1323398087965069312,1323383130435260419,2020-11-02 22:54:26 UTC,2020-11-02,22:54:26,0,4130935702,bickfordmelinda,MB,,@JoeBiden We need trump out and we need some s...,en,[],[],[],0,0,0,[],[],https://twitter.com/BickfordMelinda/status/132...,False,,0,,,,,,,,"[{'screen_name': 'JoeBiden', 'name': 'Joe Bide...",,,,,1,1
1367,1323051805576732672,1322936987347611648,2020-11-01 23:58:26 UTC,2020-11-01,23:58:26,0,1538892457,happybeachplace,BB13,,@AynRandPaulRyan Trumps kids are losers and ev...,en,[],[],[],0,0,0,[],[],https://twitter.com/happybeachplace/status/132...,False,,0,,,,,,,,"[{'screen_name': 'AynRandPaulRyan', 'name': ""H...",,,,,0,1


Now let's create a subset, containing the same amount of Trump tweets as Biden tweets.

In [6]:
mask = (df.trump == 0) & (df.biden == 1)
biden_tweets = df[mask]

mask = (df.trump == 1) & (df.biden == 0)
trump_tweets = df[mask]

In [7]:
subset = pd.DataFrame(columns=df.columns)
subset = subset.append(biden_tweets.sample(2000))
subset = subset.append(trump_tweets.sample(2000))

subset.sample(3)

Unnamed: 0,id,conversation_id,created_at,date,time,timezone,user_id,username,name,place,tweet,language,mentions,urls,photos,replies_count,retweets_count,likes_count,hashtags,cashtags,link,retweet,quote_url,video,thumbnail,near,geo,source,user_rt_id,user_rt,retweet_id,reply_to,retweet_date,translate,trans_src,trans_dest,biden,trump
41590,1323407029701062661,1323407029701062661,2020-11-02 23:29:58 UTC,2020-11-02,23:29:58,0,823157329,d_kuehn,Daniel Kuehn,,Taking a closer look at this &amp; playing wit...,en,[],['https://projects.fivethirtyeight.com/electio...,[],2,1,1,[],[],https://twitter.com/D_Kuehn/status/13234070297...,False,,0,,,,,,,,[],,,,,1,0
3415,1319789146592956417,1319785862138351617,2020-10-23 23:53:47 UTC,2020-10-23,23:53:47,0,1154918693127819264,mary047318411,something bout Mary,,@KarrieK817 Sorry I meant to tweet that elsewh...,en,[],[],[],0,0,1,[],[],https://twitter.com/Mary047318411/status/13197...,False,,0,,,,,,,,"[{'screen_name': 'KarrieK817', 'name': 'Karrie...",,,,,0,1
10664,1312539293945131009,1312539293945131009,2020-10-03 23:45:28 UTC,2020-10-03,23:45:28,0,375792620,redpepperjellie,ellie poole,,i like to refer to our collective lack of fait...,en,[],[],[],0,0,4,[],[],https://twitter.com/redpepperjellie/status/131...,False,,0,,,,,,,,[],,,,,0,1


In [8]:
# keep only necessary columns
data = subset.loc[:,['tweet', 'trump', 'biden']]
data.head(3)

Unnamed: 0,tweet,trump,biden
12717,@KingRezizt @JoeBiden Ty. You're one of the go...,0,1
9344,@BerserkerCooki2 @lindoyle2 @ScooterCasterNY 1...,0,1
14162,"@chipfranklin Decency, integrity, moral value,...",0,1


## Preprocessing

We'll start small by simply removing numbers & punctuation and converting each tweet to lowercase. 

In [9]:
# remove urls
no_links = lambda x: re.sub(r"https?:\/\/\S+", "", x)
# remove twitter handles
no_handles = lambda x: re.sub(r"@[\d\w_]+", "", x)
# remove numbers
alphanum = lambda x: re.sub('\w*\d\w*', ' ', x)
# convert to lowercase
punc_lower = lambda x: re.sub('[%s]' % re.escape(string.punctuation), ' ', x.lower())
# remove repeated letters so spell check will work (ex: 'aaaand' --> 'aand')
no_repeats = lambda x: re.sub(r"([a-z])\1{2,}", r'\1', x)
# replace consecutive spaces with one
no_dup_spaces = lambda x: ' '.join(x.split())



In [10]:
data['original'] = data.tweet

data['tweet'] = (data['tweet']
                 .map(no_handles)
                 .map(no_links)
                 .map(punc_lower)
                 .map(alphanum)
                 .map(no_repeats)
                 .map(no_dup_spaces))


In [11]:
data.head(3)

Unnamed: 0,tweet,trump,biden,original
12717,ty you re one of the goods bro 💞,0,1,@KingRezizt @JoeBiden Ty. You're one of the go...
9344,neither do you fact you don’t speak for black ...,0,1,@BerserkerCooki2 @lindoyle2 @ScooterCasterNY 1...
14162,decency integrity moral value compassion all t...,0,1,"@chipfranklin Decency, integrity, moral value,..."


In [12]:
# Fix spelling after we create the tokens?

In [13]:
sample = 'helloo ambiguos this spell checker thingy'
spell = SpellChecker(tokenizer='word')
spell.correction(sample)

'helloo ambiguos this spell checker thingy'

## Tokenization

Now it's time to tokenize our tweets. Here, we'll implement NLTK's tokenizer, stop word removal, Porter Stemming, and spell correction