In [1]:
import pickle
import re
import string
import emoji
import pandas as pd
import numpy as np

from textblob import TextBlob
from collections import Counter

import spacy

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.decomposition import NMF

import nltk
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

import spacy
import contextualSpellCheck

nlp = spacy.load('en_core_web_sm')

from spellchecker import SpellChecker

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer


**Kelsey 1-1**

- cleaning
    - preprocessing until comfortable with words
   
- sentiment analysis on all tweets
    - don't need to do any splitting at this stage
    - TextBlob & VaderSentiment first, spacy if the results aren't as expected
    
- topic modeling
    - decide: use all tweets (all topics) at once
        - start here
        - then can use these as features in the dataFrame and do splitting here
    - or: split to trump/biden - then bot/not bot for each
    - point here is there are multiple ways to split it
        - no right answers

In [2]:
pd.set_option('display.max_rows', 10)
pd.set_option('display.max_columns', 40)

In [3]:
# must uncomment & run the first time to DOWNLOAD NLTK data
# I used package identifier 'popular'
# nltk.download()

In [4]:
df = pd.read_pickle("pickle/df_t_raw.pick")
df.shape

(879311, 38)

In [5]:
df.sample(3)

Unnamed: 0,id,conversation_id,created_at,date,time,timezone,user_id,username,name,place,tweet,language,mentions,urls,photos,replies_count,retweets_count,likes_count,hashtags,cashtags,link,retweet,quote_url,video,thumbnail,near,geo,source,user_rt_id,user_rt,retweet_id,reply_to,retweet_date,translate,trans_src,trans_dest,biden,trump
12376,1321595921688350721,1321595556448202752,2020-10-28 23:33:16 UTC,2020-10-28,23:33:16,0,467067892,billy_el_flaco,Billy Graves (Nasty Nine Podcast) ⚾🎙️,,"@CWFHMarquez @JoeBiden Yes, David. They do. 🙄",en,[],[],[],0,0,0,[],[],https://twitter.com/Billy_el_flaco/status/1321...,False,,0,,,,,,,,"[{'screen_name': 'CWFHMarquez', 'name': 'David...",,,,,1,0
7608,1316163255958532096,1315768094921576448,2020-10-13 23:45:48 UTC,2020-10-13,23:45:48,0,2857463333,iliketsheila,Sheila Schlicht,,@RobertMaguire_ @Sentient_Onion A Trump presid...,en,[],[],[],0,0,0,[],[],https://twitter.com/IliketSheila/status/131616...,False,,0,,,,,,,,"[{'screen_name': 'RobertMaguire_', 'name': 'Ro...",,,,,0,1
8808,1315073157456420864,1314932938455355395,2020-10-10 23:34:08 UTC,2020-10-10,23:34:08,0,805102198577315840,deb_bee_2016,Bee,,@Biden_Army @JoeBiden No one expected him to s...,en,[],[],[],1,0,1,"['crybabytrump', 'trumpisaloser']",[],https://twitter.com/deb_bee_2016/status/131507...,False,,0,,,,,,,,"[{'screen_name': 'Biden_Army', 'name': 'Biden ...",,,,,1,1


Now let's create a subset, containing the same amount of Trump tweets as Biden tweets.

In [6]:
mask = (df.trump == 0) & (df.biden == 1)
biden_tweets = df[mask]

mask = (df.trump == 1) & (df.biden == 0)
trump_tweets = df[mask]

In [7]:
subset = pd.DataFrame(columns=df.columns)
subset = subset.append(biden_tweets.tail(10000))
subset = subset.append(trump_tweets.tail(10000))

subset.sample(3)

Unnamed: 0,id,conversation_id,created_at,date,time,timezone,user_id,username,name,place,tweet,language,mentions,urls,photos,replies_count,retweets_count,likes_count,hashtags,cashtags,link,retweet,quote_url,video,thumbnail,near,geo,source,user_rt_id,user_rt,retweet_id,reply_to,retweet_date,translate,trans_src,trans_dest,biden,trump
2323,1312540751730028545,1312460408192724992,2020-10-03 23:51:15 UTC,2020-10-03,23:51:15,0,958208381088493569,naname1961,Naname 🌊🌊🌊 🆘,,@JoeBiden @minnielobban Can’t come fast enough!,en,[],[],[],0,0,0,[],[],https://twitter.com/Naname1961/status/13125407...,False,,0,,,,,,,,"[{'screen_name': 'JoeBiden', 'name': 'Joe Bide...",,,,,1,0
5763,1312537654248968193,1312537654248968193,2020-10-03 23:38:57 UTC,2020-10-03,23:38:57,0,881307775355068416,la_krag,🇺🇸LA_KRAG🇺🇸,,@JoeBiden will destroy America as we know it!,en,[],[],[],0,0,0,[],[],https://twitter.com/LA_Krag/status/13125376542...,False,https://twitter.com/TarekFatah/status/13124403...,0,,,,,,,,[],,,,,1,0
13343,1312531236405956609,1312460408192724992,2020-10-03 23:13:27 UTC,2020-10-03,23:13:27,0,1141498926,bison_stew,Bison Stew,,@colleenwmobile @DeanSpicyReacts @Himod8583386...,en,[],[],[],0,0,1,[],[],https://twitter.com/bison_stew/status/13125312...,False,,0,,,,,,,,"[{'screen_name': 'colleenwmobile', 'name': 'Co...",,,,,1,0


In [8]:
# keep only necessary columns
data = df.loc[:,['tweet', 'trump', 'biden']]
data.head(3)

Unnamed: 0,tweet,trump,biden
0,@capriaaf @JoeBiden Plenty of results for #Tru...,1,1
1,@Changinglenses @greger_mary @JRubinBlogger @G...,0,1
2,"Inside a Biden v. Trump marriage: ""you woke me...",1,1


## Preprocessing

We'll start small by simply removing numbers & punctuation and converting each tweet to lowercase. 

In [9]:
# remove urls
no_links = lambda x: re.sub(r"https?:\/\/\S+", "", x)
# remove twitter handles
no_handles = lambda x: re.sub(r"@[\d\w_]+", "", x)
# remove numbers
alphanum = lambda x: re.sub('\w*\d\w*', ' ', x)
# convert to lowercase
punc_lower = lambda x: re.sub('[%s]' % re.escape(string.punctuation), ' ', x.lower())
# remove repeated letters so spell check will work (ex: 'aaaand' --> 'aand')
no_repeats = lambda x: re.sub(r"([a-z])\1{2,}", r'\1', x)
# replace consecutive spaces with one
no_dup_spaces = lambda x: ' '.join(x.split())



In [10]:
data['original'] = data.tweet

data['tweet'] = (data['tweet']
                 .map(no_handles)
                 .map(no_links)
                 .map(punc_lower)
                 .map(alphanum)
                 .map(no_repeats)
                 .map(no_dup_spaces))

data.head(3)

Unnamed: 0,tweet,trump,biden,original
0,plenty of results for trumpcrimefamily and tru...,1,1,@capriaaf @JoeBiden Plenty of results for #Tru...
1,he left washington before impeachment,0,1,@Changinglenses @greger_mary @JRubinBlogger @G...
2,inside a biden v trump marriage you woke me up...,1,1,"Inside a Biden v. Trump marriage: ""you woke me..."


## Tokenization

Now it's time to tokenize our tweets. Here, we'll implement NLTK's tokenizer, stop word removal, Porter Stemming, and spell correction

In [43]:
# from autocorrect import Speller # TOO SLOW...TRY PYSPELLCHECKER
def tweet_tokenize(tweets, more_stop=None):
    """Get all of the tokens in a set of tweets.
    
    Parameters:
        - tweets (Series, required)
        
        - more_stop (List, optional): additional stop words to exclude
    
    """
    twt = nltk.tokenize.TweetTokenizer(strip_handles=True, reduce_len=True)
    tokens = [token for tweet in tweets for token in twt.tokenize(tweet)]
    # combine stop words and punctuation
    puncs = [c for c in string.punctuation if c not in ["#", ":"]]
    stop_words = stopwords.words("english")
    if more_stop is not None:
        [stop_words.append(w) for w in more_stop]
        
    stop = stop_words + puncs + ['”']
    
    lemm = WordNetLemmatizer()
    tokens = [ lemm.lemmatize(token) for tweet in tweets
              for token in twt.tokenize(tweet)
              if token.lower() not in stop]
#     spell = Speller(lang='en')
#     tokens = [spell(t) for t in tokens]

    return tokens

In [46]:
more_stop = [
    'fxhedg',
    'fyck',
    'fy',
    'fxxking',
]

In [47]:
tokens = tweet_tokenize(data.tweet,more_stop=more_stop)

In [48]:
len(tokens)

11127421

In [49]:
len(set(tokens))

178862

## Count Vectorizer

In [56]:
cv = CountVectorizer(max_df=0.05, min_df=100)
doc_words = cv.fit_transform(tokens)
doc_words.shape



(11127421, 7609)

## NMF

In [None]:
nmf_model = NMF(2, max_iter=1500)
doc_topic = nmf_model.fit_transform(doc_words)
doc_topic.shape

In [None]:
print(f"Number of iterations used: {nmf_model.n_iter_}")

From lecture: The **doc_topic** matrix shows us the documents we started with, and how each document is made up of the 2 resulting topics. We don't know yet what the topics are.

In [None]:
topic_word = nmf_model.components_
topic_word.shape

From lecture: The **topic_word** matrix shows us the 2 resulting topics, and the terms that are associated with each topic. By looking at the words below, we an figure out what the topics are.


In [None]:
words = cv.get_feature_names()
t = nmf_model.components_.argsort(axis=1)[:,-1:-7:-1]
topic_words = [[words[e] for e in l] for l in t]
topic_words

In [None]:
doc_topic