In [1]:
import pickle
import re
import string
import emoji
import pandas as pd
import numpy as np

from textblob import TextBlob
from collections import Counter

import spacy

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.decomposition import NMF

import nltk
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

import spacy
import contextualSpellCheck

nlp = spacy.load('en_core_web_sm')

from spellchecker import SpellChecker

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer


**Kelsey 1-1**

- cleaning
    - preprocessing until comfortable with words
   
- sentiment analysis on all tweets
    - don't need to do any splitting at this stage
    - TextBlob & VaderSentiment first, spacy if the results aren't as expected
    
- topic modeling
    - decide: use all tweets (all topics) at once
        - start here
        - then can use these as features in the dataFrame and do splitting here
    - or: split to trump/biden - then bot/not bot for each
    - point here is there are multiple ways to split it
        - no right answers

In [2]:
pd.set_option('display.max_rows', 10)
pd.set_option('display.max_columns', 40)

In [3]:
# must uncomment & run the first time to DOWNLOAD NLTK data
# I used package identifier 'popular'
# nltk.download()

In [4]:
df = pd.read_pickle("pickle/df_t_raw.pick")
df.shape

(879311, 38)

In [5]:
df.sample(3)

Unnamed: 0,id,conversation_id,created_at,date,time,timezone,user_id,username,name,place,tweet,language,mentions,urls,photos,replies_count,retweets_count,likes_count,hashtags,cashtags,link,retweet,quote_url,video,thumbnail,near,geo,source,user_rt_id,user_rt,retweet_id,reply_to,retweet_date,translate,trans_src,trans_dest,biden,trump
646,1321602263044022272,1321602263044022272,2020-10-28 23:58:28 UTC,2020-10-28,23:58:28,0,349115403,sirtitan45,Pontiff,,Trump and Biden they have your best interests ...,en,[],[],[],0,0,0,[],[],https://twitter.com/SirTitan45/status/13216022...,False,,0,,,,,,,,[],,,,,1,1
6031,1315801979793403905,1315801979793403905,2020-10-12 23:50:13 UTC,2020-10-12,23:50:13,0,11611052,foxnewsradio,FOX News Radio,,President Trump back on the campaign trail in ...,en,[],['https://fxn.ws/hourly'],[],5,6,8,[],[],https://twitter.com/foxnewsradio/status/131580...,False,,0,,,,,,,,[],,,,,0,1
2166,1321601410211995650,1321601410211995650,2020-10-28 23:55:05 UTC,2020-10-28,23:55:05,0,525750162,rafawkes,Richard Fawkes,,Gutfeld on media coverage of latest Hunter Bid...,en,"[{'screen_name': 'youtube', 'name': 'youtube',...",['https://youtu.be/bSTQWOnZjS0'],[],0,0,0,[],[],https://twitter.com/rafawkes/status/1321601410...,False,,0,,,,,,,,[],,,,,1,0


Now let's create a subset, containing the same amount of Trump tweets as Biden tweets.

In [6]:
mask = (df.trump == 0) & (df.biden == 1)
biden_tweets = df[mask]

mask = (df.trump == 1) & (df.biden == 0)
trump_tweets = df[mask]

In [7]:
subset = pd.DataFrame(columns=df.columns)
subset = subset.append(biden_tweets.sample(75))
subset = subset.append(trump_tweets.sample(75))

subset.sample(3)

Unnamed: 0,id,conversation_id,created_at,date,time,timezone,user_id,username,name,place,tweet,language,mentions,urls,photos,replies_count,retweets_count,likes_count,hashtags,cashtags,link,retweet,quote_url,video,thumbnail,near,geo,source,user_rt_id,user_rt,retweet_id,reply_to,retweet_date,translate,trans_src,trans_dest,biden,trump
375,1320152848991043585,1320151335199756288,2020-10-24 23:59:01 UTC,2020-10-24,23:59:01,0,1312444769604235265,inspire0419,inspire,,"@JoeBiden Old man, based on your sins,surrende...",en,[],[],[],0,0,0,[],[],https://twitter.com/inspire0419/status/1320152...,False,,0,,,,,,,,"[{'screen_name': 'JoeBiden', 'name': 'Joe Bide...",,,,,1,0
3253,1321600846937948160,1321576224624414721,2020-10-28 23:52:50 UTC,2020-10-28,23:52:50,0,1032262359061090304,younglibertari,Young Libertarian,,@mallorymonster8 @JoeBiden Are you for the sta...,en,[],[],[],2,0,0,[],[],https://twitter.com/YoungLibertari/status/1321...,False,,0,,,,,,,,"[{'screen_name': 'mallorymonster8', 'name': 'M...",,,,,1,0
1783,1318340183188770826,1318340183188770826,2020-10-19 23:56:07 UTC,2020-10-19,23:56:07,0,1247159216676253697,mesickrandy,Randy Mesick,,"Hunter Biden Emails Confirmed, DNC PANIC! htt...",en,[],['https://rumble.com/var0c7-hunter-biden-email...,[],1,0,0,[],[],https://twitter.com/MesickRandy/status/1318340...,False,,0,,,,,,,,[],,,,,1,0


In [8]:
# keep only necessary columns
data = subset.loc[:,['tweet', 'trump', 'biden']]
data.head(3)

Unnamed: 0,tweet,trump,biden
4134,@KamalaHarris Hunter Biden U.S. presidential c...,0,1
3396,"@JoeBiden I will vote, but not for a single De...",0,1
7885,@donwinslow @JoeBiden Dear @JoeBiden call into...,0,1


## Preprocessing

We'll start small by simply removing numbers & punctuation and converting each tweet to lowercase. 

In [9]:
# remove urls
no_links = lambda x: re.sub(r"https?:\/\/\S+", "", x)
# remove twitter handles
no_handles = lambda x: re.sub(r"@[\d\w_]+", "", x)
# remove numbers
alphanum = lambda x: re.sub('\w*\d\w*', ' ', x)
# convert to lowercase
punc_lower = lambda x: re.sub('[%s]' % re.escape(string.punctuation), ' ', x.lower())
# remove repeated letters so spell check will work (ex: 'aaaand' --> 'aand')
no_repeats = lambda x: re.sub(r"([a-z])\1{2,}", r'\1', x)
# replace consecutive spaces with one
no_dup_spaces = lambda x: ' '.join(x.split())



In [13]:
data['original'] = data.tweet

data['tweet'] = (data['tweet']
                 .map(no_handles)
                 .map(no_links)
                 .map(punc_lower)
                 .map(alphanum)
                 .map(no_repeats)
                 .map(no_dup_spaces))

list(data.tweet)[80:100]

['that’s exactly why trump hired him',
 'you heard it here first it s clear they ve let trump take control of his own treatment the lies the over treatment saying he knows all about covid today s publicity stunt dr trump will handle his illness the same way he handled the pandemic watch sciencemeetskarma',
 '🚨🚨new anti trump ad dystopia we cannot allow trump’s dystopia trumpcrimefamily',
 'to paraphrase trump i like presidents who haven t gotten sick ok',
 'icymi trump confessed to putting a hit on a protester in portland having him assassinated by us marshals this puts trump squarely into the same category as the murderous dictators he so admires such as kim jong un and vladimir putin amp is yet another parallel to adolf hitler',
 'oh i didn’t realize it was multiple choice i was gonna say “all of the face fucking” but given the choices i’d initially have said fuck but the fact that you’re a trump girl i might marry you 🇺🇸😉',
 'here s another trump comedy video be sure to subscribe le

In [11]:
data.tweet.iloc[12
            ]

'talk to joe biden about the incarceration rate'

In [12]:
sample="Income was $9.4 million compared to the prior year of $2.7 million."


In [14]:
tweets = data.tweet
contextualSpellCheck.add_to_pipe(nlp)
docs = list(nlp.pipe(tweets))
    
    

In [31]:
docs == list(tweets)

False

In [40]:
list(tweets)[90:100]

['trump iowa enough is enough via vote votehimout',
 'or trump was stupid enough to catch the fake china vitus and spread it to a bunch of people and then proceeded to not care who else caught with reckless crap btw are you all planning to pretend that you don’t realize there are a bunch of mail in ballots that must be counted',
 'trump cai nas pesquisas eleitorais após pegar covid depois de três dias internado com covid donald trump teve alta de hospital em washington após ser diagnosticado com coronavírus o presidente dos estados unidos amarga nova queda nas pesquisas eleitorais',
 'donald j trump is an renaissance man',
 'donald trump is most likely sitting bricks right now',
 'after years of wiping trumps ass the toilet paper complaints that it s sick of this shit right well i guess better late than never right',
 'i m starting to hate all the mealy mouthed morally bankrupt hypocrites holding trump s hand more than the guy himself and especially this racist enabler',
 'yes hurry am

In [42]:
docs[91]

or trump was stupid enough to catch the fake china vitus and spread it to a bunch of people and then proceeded to not care who else caught with reckless crap btw are you all planning to pretend that you don’t realize there are a bunch of mail in ballots that must be counted

In [16]:
print(nlp.pipeline)

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec object at 0x7f8d82ccd4f0>), ('tagger', <spacy.pipeline.tagger.Tagger object at 0x7f8d82ceea90>), ('parser', <spacy.pipeline.dep_parser.DependencyParser object at 0x7f8d82a72ee0>), ('ner', <spacy.pipeline.ner.EntityRecognizer object at 0x7f8d82a728e0>), ('attribute_ruler', <spacy.pipeline.attributeruler.AttributeRuler object at 0x7f8d82cd6a00>), ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer object at 0x7f8d82d6dc00>), ('contextual spellchecker', <contextualSpellCheck.contextualSpellCheck.ContextualSpellCheck object at 0x7f8d2e08fe20>)]


In [17]:
# from autocorrect import Speller # TOO SLOW...TRY PYSPELLCHECKER
def tweet_tokenize(tweets):
    """Get all of the tokens in a set of tweets"""
    twt = nltk.tokenize.TweetTokenizer(strip_handles=True, reduce_len=True)
    tokens = [token for tweet in tweets for token in twt.tokenize(tweet)]
    # combine stop words and punctuation
    puncs = [c for c in string.punctuation if c not in ["#", ":"]]
    stop = stopwords.words("english") + puncs + ['”']
    stemmer = PorterStemmer()
    tokens = [ stemmer.stem(token) for tweet in tweets
              for token in twt.tokenize(tweet)
              if token.lower() not in stop]
#     spell = Speller(lang='en')
#     tokens = [spell(t) for t in tokens]

    
    
    return tokens

In [18]:
data.tweet.iloc[22]

'i am voting biden harris bluewave'

In [19]:
# Fix spelling after we create the tokens?

## Tokenization

Now it's time to tokenize our tweets. Here, we'll implement NLTK's tokenizer, stop word removal, Porter Stemming, and spell correction