In [1]:
import pickle
import re
import string
import emoji
import pandas as pd
import numpy as np

from textblob import TextBlob
from collections import Counter

from langdetect import detect

import spacy

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.decomposition import NMF, LatentDirichletAllocation

import nltk
from nltk.tokenize import TweetTokenizer, word_tokenize
from nltk.corpus import stopwords, words
from nltk.stem import PorterStemmer, WordNetLemmatizer

import spacy
import contextualSpellCheck

nlp = spacy.load('en_core_web_sm')

from spellchecker import SpellChecker

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer


**Kelsey 1-1**

- cleaning
    - preprocessing until comfortable with words
   
- sentiment analysis on all tweets
    - don't need to do any splitting at this stage
    - TextBlob & VaderSentiment first, spacy if the results aren't as expected
    
- topic modeling
    - decide: use all tweets (all topics) at once
        - start here
        - then can use these as features in the dataFrame and do splitting here
    - or: split to trump/biden - then bot/not bot for each
    - point here is there are multiple ways to split it
        - no right answers

In [2]:
pd.set_option('display.max_rows', 10)
pd.set_option('display.max_columns', 40)
pd.set_option('display.max_colwidth', None)

In [3]:
# must uncomment & run the first time to DOWNLOAD NLTK data
# I used package identifier 'popular'
# nltk.download()

In [4]:
df = pd.read_pickle("pickle/df_t_raw.pick")
df.shape

(879311, 38)

In [24]:
df.columns

Index(['id', 'conversation_id', 'created_at', 'date', 'time', 'timezone',
       'user_id', 'username', 'name', 'place', 'tweet', 'language', 'mentions',
       'urls', 'photos', 'replies_count', 'retweets_count', 'likes_count',
       'hashtags', 'cashtags', 'link', 'retweet', 'quote_url', 'video',
       'thumbnail', 'near', 'geo', 'source', 'user_rt_id', 'user_rt',
       'retweet_id', 'reply_to', 'retweet_date', 'translate', 'trans_src',
       'trans_dest', 'biden', 'trump'],
      dtype='object')

In [23]:
len(df.username.unique())

438106

In [5]:
df.sample(3)

Unnamed: 0,id,conversation_id,created_at,date,time,timezone,user_id,username,name,place,tweet,language,mentions,urls,photos,replies_count,retweets_count,likes_count,hashtags,cashtags,link,retweet,quote_url,video,thumbnail,near,geo,source,user_rt_id,user_rt,retweet_id,reply_to,retweet_date,translate,trans_src,trans_dest,biden,trump
9997,1312539517358878720,1312403670269652992,2020-10-03 23:46:21 UTC,2020-10-03,23:46:21,0,825727643958046720,arch1com,Wayne Ferrell,,@AndrewPollackFL @realDonaldTrump @andrewcuomo Next parade. Prayers for Trump and family and suporters https://t.co/Kw0BinQTrx,en,[],[],[],0,1,1,[],[],https://twitter.com/arch1com/status/1312539517358878720,False,,1,https://pbs.twimg.com/ext_tw_video_thumb/1312539428909445120/pu/img/aHrK6EI7XWtNYwOa.jpg,,,,,,,"[{'screen_name': 'AndrewPollackFL', 'name': 'Andrew Pollack', 'id': '983766825610694656'}, {'screen_name': 'realDonaldTrump', 'name': 'Donald J. Trump', 'id': '25073877'}, {'screen_name': 'andrewcuomo', 'name': 'Andrew Cuomo', 'id': '2513761008'}]",,,,,0,1
10412,1315074585495298049,1314952057053474821,2020-10-10 23:39:48 UTC,2020-10-10,23:39:48,0,3379991093,drowningheaven1,Paraluman Mariposa,,"@RosevilleCindy @edieconstantine @LauraHennesse11 THIS! Exactly! Even if there is no violations, per se, re the Hatch Act. What Trump violated was the ethics and tradition of not doing so, especially during an election year.",en,[],[],[],0,2,3,[],[],https://twitter.com/drowningheaven1/status/1315074585495298049,False,,0,,,,,,,,"[{'screen_name': 'RosevilleCindy', 'name': 'Cindy', 'id': '829486775928098816'}, {'screen_name': 'edieconstantine', 'name': 'Mom No Longer Needs To Resist! WE WON!', 'id': '2243886992'}, {'screen_name': 'LauraHennesse11', 'name': 'Laura Hennessey 🆘✍️', 'id': '1010891010384322561'}]",,,,,0,1
8244,1314714395516309508,1314714395516309508,2020-10-09 23:48:32 UTC,2020-10-09,23:48:32,0,715961737133047810,devineearth,"Isolated, Internet took over by another idiot",,"@JoeBiden I got it Trump is Threatning to blow up any country that does not do as he says,as he asset strips them, he has a total media courage so he has prob bombed some where and showed all the countries leaders. then dictate what they have to do All the news is fake!",en,[],[],[],0,0,0,[],[],https://twitter.com/DevineEarth/status/1314714395516309508,False,,0,,,,,,,,[],,,,,1,1


Now let's create a subset, containing the same amount of Trump tweets as Biden tweets as tweets mentioning both candidates.

In [6]:
mask = (df.trump == 0) & (df.biden == 1)
biden_tweets = df[mask].tail(2500)

mask = (df.trump == 1) & (df.biden == 0)
trump_tweets = df[mask].tail(2500)

mask = (df.trump == 1) & (df.biden == 1)
both_tweets = df[mask].tail(2500)

In [7]:
subset = pd.DataFrame(columns=df.columns)
subset = subset.append(biden_tweets)
subset = subset.append(trump_tweets)
subset = subset.append(both_tweets)

subset.sample(3)

Unnamed: 0,id,conversation_id,created_at,date,time,timezone,user_id,username,name,place,tweet,language,mentions,urls,photos,replies_count,retweets_count,likes_count,hashtags,cashtags,link,retweet,quote_url,video,thumbnail,near,geo,source,user_rt_id,user_rt,retweet_id,reply_to,retweet_date,translate,trans_src,trans_dest,biden,trump
12769,1312531719669465089,1312531257394307072,2020-10-03 23:15:22 UTC,2020-10-03,23:15:22,0,14840308,nickcalder,Nicholas,,"@kinsellawarren @charlesadler @JoeBiden Bingo. I want him in the best health of his septuagenarian life, as he heads off to prison.",en,[],[],[],0,0,1,[],[],https://twitter.com/nickcalder/status/1312531719669465089,False,,0,,,,,,,,"[{'screen_name': 'kinsellawarren', 'name': 'Warren Kinsella', 'id': '16106522'}, {'screen_name': 'charlesadler', 'name': 'Charles Adler', 'id': '16219502'}, {'screen_name': 'JoeBiden', 'name': 'Joe Biden', 'id': '939091'}]",,,,,1,0
13615,1319784481360756737,1319460648431046658,2020-10-23 23:35:15 UTC,2020-10-23,23:35:15,0,2897845807,strongglenna,glenna jp strong,,@suzyluves @dyckmanDD @InsuranceRuby @Realblonde9 @TheGenerationME @davidhogg111 These policies were Inherited by Trump.,en,[],[],[],1,0,3,[],[],https://twitter.com/StrongGlenna/status/1319784481360756737,False,,0,,,,,,,,"[{'screen_name': 'suzyluves', 'name': 'Summer Cook', 'id': '1441827319'}, {'screen_name': 'dyckmanDD', 'name': 'Lone Patriot 🇺🇸', 'id': '256569484'}, {'screen_name': 'InsuranceRuby', 'name': 'Inappropriate Laughter', 'id': '1130492664623841280'}, {'screen_name': 'Realblonde9', 'name': 'Realblonde', 'id': '1313898096485707779'}, {'screen_name': 'TheGenerationME', 'name': '🏳️\u200d🌈🇲🇽🇺🇸 👉🏼TheGenerationMe 🇺🇸', 'id': '408239560'}, {'screen_name': 'davidhogg111', 'name': 'David Hogg', 'id': '1915033663'}]",,,,,0,1
12705,1319784882210570246,1319784666249990145,2020-10-23 23:36:50 UTC,2020-10-23,23:36:50,0,373347464,mattflammable,MaTT FLaMMaBLe,,@Politidope BREAKING: Trump’s female base.,en,[],[],[],0,0,1,[],[],https://twitter.com/MaTTFLaMMaBLe/status/1319784882210570246,False,,0,,,,,,,,"[{'screen_name': 'Politidope', 'name': 'Matt Rogers 🗳', 'id': '113724715'}]",,,,,0,1


In [8]:
# keep only necessary columns
data = subset.loc[:,['tweet', 'trump', 'biden']]
data['original'] = data.tweet
data.head(3)

Unnamed: 0,tweet,trump,biden,original
11029,"👇 Exactly.... Bernie bros &amp; Ilhan Omar will takeover Biden administration They have their own pet issues ... Modi , Netanyahu &amp; Sisi are targets of izlamists wing of Democrats https://t.co/qay1mCGCmc",0,1,"👇 Exactly.... Bernie bros &amp; Ilhan Omar will takeover Biden administration They have their own pet issues ... Modi , Netanyahu &amp; Sisi are targets of izlamists wing of Democrats https://t.co/qay1mCGCmc"
11030,@theodore8675309 @JoeBiden @DrBiden pack it up theodore,0,1,@theodore8675309 @JoeBiden @DrBiden pack it up theodore
11031,"@RealOmarNavarro Almost sounds like the stupid woman had it set up for her...or is that going to be the excuse in another week when Biden goes down with it...and a heroine steps up to be his replacement? wife. obama, clinton...any of those nutters",0,1,"@RealOmarNavarro Almost sounds like the stupid woman had it set up for her...or is that going to be the excuse in another week when Biden goes down with it...and a heroine steps up to be his replacement? wife. obama, clinton...any of those nutters"


## Pre-Processing Pipeline

Now it's time to tokenize our tweets. Here are our pre-processing steps:
* Remove URLs
* Remove Twitter handles
* Remove numbers
* Convert to lowercase
* Remove punctuation
* Remove repeated letters so spell check will work ('aaaaand' -> 'aand')
* Remove non-English words
* Remove stop words

Since we're working with so many different words, I've chosen to use **lemmatization** instead of stemming for two reasons:
1. Lemmatization accurately reduces words to true meaning
2. Inxreased word reduction (handles synonyms)

In [9]:
# Testing these before putting them in func


tweet = '@JoeBiden how is it   GOING?! Looking forward to seeing you #maga'

# remove urls
tweet = re.sub(r"https?:\/\/\S+", "", tweet)
# remove twitter handles
tweet = re.sub(r"@[\d\w_]+", "", tweet)
# remove numbers
tweet = re.sub('\w*\d\w*', ' ', tweet)
# convert to lowercase
tweet = re.sub('[%s]'.format(re.escape(string.punctuation)), ' ', tweet.lower())
# remove repeated letters so spell check will work (ex: 'aaaand' --> 'aand')
tweet = re.sub(r"([a-z])\1{2,}", r'\1', tweet)
# replace consecutive spaces with one
tweet = ' '.join(tweet.split())

tweet


'how i it going?! looking forward to eeing you #maga'

In [10]:
# from autocorrect import Speller # TOO SLOW...TRY PYSPELLCHECKER
def tweet_tokenize(tweet, more_stop=None, more_words=None):
    """Get all of the tokens in a set of tweets.
    
    Parameters:
        - tweets (Series, required)
        
        - more_stop (List, optional): additional stop words to exclude
        
        - more_words (List, optional): additional words to INCLUDE in dictionary
    
    """
    # pre-processing pipeline
    
    # remove urls
    tweet = re.sub(r"https?:\/\/\S+", "", tweet)
    # remove numbers
    tweet = re.sub('\w*\d\w*', ' ', tweet)
    # remove punctuation
    tweet = tweet.translate(str.maketrans('', '', string.punctuation))
    # convert to lowercase
    tweet = tweet.lower()

    # and other popular campaign phrases
    tweet = re.sub(r"make america great again", "maga", tweet)
    tweet = re.sub(r"makeamericagreatagain", "maga", tweet)
    tweet = re.sub(r"sleepyjoe", "sleepy joe", tweet)
    tweet = re.sub(r"sleepyjoebiden", "sleepy joe biden", tweet)
    
    # remove spaces in candidate names
    tweet = re.sub(r"joebiden", "joe_biden", tweet)
    tweet = re.sub(r"kamalaharris", "kamala_harris", tweet)
    tweet = re.sub(r"donaldtrump", "donald_trump", tweet)
    tweet = re.sub(r"mikepence", "mike_pence", tweet)
    tweet = re.sub(r"joe biden", "joe_biden", tweet)
    tweet = re.sub(r"kamala harris", "kamala_harris", tweet)
    tweet = re.sub(r"donald trump", "donald_trump", tweet)
    tweet = re.sub(r"mike pence", "mike_pence", tweet)
    # replace 'biden' with 'joebiden' (do for all candidates)
    tweet = re.sub(r"\bbiden\b", "joe_biden", tweet)
    tweet = re.sub(r"\bpence\b", "mike_pence", tweet)
    tweet = re.sub(r"\bharris\b", "kamala_harris", tweet)
    tweet = re.sub(r"\btrump\b", "donald_trump", tweet)
    
    # remove repeated letters so spell check will work (ex: 'aaaand' --> 'aand')
    tweet = re.sub(r"([a-z])\1{2,}", r'\1', tweet)
    # replace consecutive spaces with one
    tweet = ' '.join(tweet.split())
    
    more_words = ['trump', 'biden', 'maga', 'bidenharris', 
                  'kamala', 'pence', 'harris', 'mike',
                  'bidenharris2020', 'trumppence',
                  'trumppence2020', 'usa', 'election2020',
                  'ivoted', 'joe_biden', 'realdonaldtrump',
                  'donald_trump', 'sleepy_joe',
                  'mike_pence', 'kamala_harris']
    
    dictionary = list(words.words()) + more_words
    dictionary = set(dictionary)
    
    twt = TweetTokenizer()
    tokens = [token for token in twt.tokenize(tweet) if token in dictionary]
    
    # initiate stop word removal and lemmatization    
    more_stop = ['fxhedg','fyck','fy','fxxking','give','go',
                 'going','gonna','get','one','de','la','el','en','un','ha',
                 'would','dont','know','time','think','want','via','dont']
    
    stop_words = list(stopwords.words('english')) + more_stop
    stop = stop_words
    stop = set(stop)
    
    lemm = WordNetLemmatizer()
    
    # implement lemmatization and stop word removal
    tokens = [lemm.lemmatize(token) for token in tokens
              if token.lower() not in stop]
#     spell = Speller(lang='en')
#     tokens = [spell(t) for t in tokens]

    combined_tokens = ' '.join(tokens)

    return combined_tokens

In [11]:
data['tweet'] = data['original'].map(tweet_tokenize)

In [12]:
data.sample(10)

Unnamed: 0,tweet,trump,biden,original
11471,joe_biden looking like lawsuit need file,0,1,@JoeBiden Are you looking into this?? Sounds like a lawsuit you need to file. @DNC
11472,joe_biden dare question joe_biden gentleman support clown make doubt joe_biden gentleman next president usa,1,1,"@gnuseibeh @realDonaldTrump @JoeBiden LOLOLOLOLOLOLOL!!!!!!! I don’t think so! How dare you question if Mr Biden is a gentleman when you support a classless clown! Make no doubt about it, Mr Biden is a gentleman and the next president of the USA!"
12817,donald_trump keep great check,1,0,DONALD TRUMP FOR 2020 KEEP AMERICA 🇺🇸 GREAT 👍 FACTS CHECK.........
13440,joe_biden forget joe showman trust,0,1,@JoeBiden DONT FORGET JOE HE IS A SHOWMAN DONT TRUST HIM
12853,joe_biden tested negative correct contact symptomatic,0,1,@JoeBiden You tested negative correct? You were in contact when he was symptomatic
14297,lying hypocrite racist k supporter thats blundering joe_biden,0,1,"lying hypocrite racist, KKK supporter. That's Blundering Biden"
11454,joe_biden kamala_harris u word,0,1,"@nycsouthpaw @SpeakerPelosi @SenSchumer @JoeBiden @KamalaHarris @RepAdamSchiff @joelockhart @TheRickWilson @MichaelSteele @SteveSchmidtSES Did he us the word ""condolences""?"
10295,joe_biden day j donald_trump win election lord j donald_trump walter reed military hospital,1,1,"@JoeBiden In 31 days, DONALD J TRUMP WILL WIN THE ELECTION!!! LORD is with Donald J Trump at Walter Reed Military Hospital."
11751,doubt talk supporter headquarters also talk aggression joe_biden,1,1,"@davenewworld_2 a doubt, do you talk about Trump's supporter headquarters, or do you also talk about aggression by supporters of Biden?"
5533,correction donald_trump afraid,1,1,@meiselasb Correction: trump is afraid of women. #EspeciallyFemale #VoteBidenHarrisToSaveAmerica


In [13]:
# data.to_pickle("pickle/tweets_df_5000tw.pick")

## Count Vectorizer

In [14]:
cv = CountVectorizer(min_df=0.05)
doc_words = cv.fit_transform(data.tweet)

In [15]:
data.iloc[5]

tweet                                                                                                                                    lo e con e
trump                                                                                                                                             0
biden                                                                                                                                             1
original    @VOANoticias Ese lo que es rolintranco de basura, uno de los promotores de &lt;venezolanos con Biden&gt; es un comunista enclosetado!!!
Name: 11034, dtype: object

## NMF

In [16]:
nmf_model = NMF(2, max_iter=115000)
doc_topic = nmf_model.fit_transform(doc_words)
print(f"Shape: {doc_topic.shape}")
print(f"Number of iterations used: {nmf_model.n_iter_}")

Shape: (7500, 2)
Number of iterations used: 11


From lecture: The **doc_topic** matrix shows us the documents we started with, and how each document is made up of the 2 resulting topics. We don't know yet what the topics are.

In [17]:
topic_word = nmf_model.components_
topic_word.shape

(2, 7)

From lecture: The **topic_word** matrix shows us the 2 resulting topics, and the terms that are associated with each topic. By looking at the words below, we an figure out what the topics are.


In [18]:
words = cv.get_feature_names()
t = nmf_model.components_.argsort(axis=1)[:,-1:-7:-1]
topic_words = [[words[e] for e in l] for l in t]
topic_words

[['joe_biden', 'vote', 'president', 'people', 'like', 'covid'],
 ['donald_trump', 'people', 'vote', 'president', 'covid', 'like']]

In [19]:
doc_topic

array([[0.10854188, 0.        ],
       [0.10854188, 0.        ],
       [0.11344354, 0.00492369],
       ...,
       [0.00357752, 0.13932251],
       [0.        , 0.13099543],
       [0.        , 0.        ]])

## LDA

In [20]:
lda_model = LatentDirichletAllocation(n_components=2)
doc_topic = lda_model.fit_transform(doc_words)
doc_topic.shape

(7500, 2)

In [21]:
words = cv.get_feature_names()
t = lda_model.components_.argsort(axis=1)[:,-1:-7:-1]
topic_words = [[words[e] for e in l] for l in t]
topic_words

[['donald_trump', 'president', 'joe_biden', 'people', 'covid', 'like'],
 ['joe_biden', 'vote', 'people', 'like', 'covid', 'donald_trump']]