In [93]:
import pickle
import re
import string
import emoji
import pandas as pd
import numpy as np

from textblob import TextBlob
from collections import Counter

from langdetect import detect

import spacy

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.decomposition import NMF

import nltk
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

import spacy
import contextualSpellCheck

nlp = spacy.load('en_core_web_sm')

from spellchecker import SpellChecker

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer


**Kelsey 1-1**

- cleaning
    - preprocessing until comfortable with words
   
- sentiment analysis on all tweets
    - don't need to do any splitting at this stage
    - TextBlob & VaderSentiment first, spacy if the results aren't as expected
    
- topic modeling
    - decide: use all tweets (all topics) at once
        - start here
        - then can use these as features in the dataFrame and do splitting here
    - or: split to trump/biden - then bot/not bot for each
    - point here is there are multiple ways to split it
        - no right answers

In [94]:
# Display limits for rendered frames: at most 10 rows and 40 columns.
pd.set_option(
    'display.max_rows', 10,
    'display.max_columns', 40,
)

In [95]:
# must uncomment & run the first time to DOWNLOAD NLTK data
# I used package identifier 'popular'
# nltk.download()

In [96]:
# Load the raw tweet frame from the project's pickle folder and show its size.
# NOTE(review): unpickling can execute arbitrary code -- only load files you produced yourself.
raw_pickle_path = "pickle/df_t_raw.pick"
df = pd.read_pickle(raw_pickle_path)
df.shape

(879311, 38)

In [97]:
# peek at three random rows (no seed is set, so the rows differ between runs)
df.sample(n=3)

Unnamed: 0,id,conversation_id,created_at,date,time,timezone,user_id,username,name,place,tweet,language,mentions,urls,photos,replies_count,retweets_count,likes_count,hashtags,cashtags,link,retweet,quote_url,video,thumbnail,near,geo,source,user_rt_id,user_rt,retweet_id,reply_to,retweet_date,translate,trans_src,trans_dest,biden,trump
6116,1317250561788530689,1315638155840716800,2020-10-16 23:46:21 UTC,2020-10-16,23:46:21,0,2450503854,jmurray179,jean murray,,@ArelyKelly26 @Progress99vs1 Biden/Harris,eu,[],[],[],0,0,2,[],[],https://twitter.com/jmurray179/status/13172505...,False,,0,,,,,,,,"[{'screen_name': 'ArelyKelly26', 'name': 'Arel...",,,,,1,0
2866,1323413754294554625,1323413754294554625,2020-11-02 23:56:41 UTC,2020-11-02,23:56:41,0,3169861473,bostoninsouth,💋Carolyn Bartholomew✌️,,💙💙💙💙💙💙💙💙💙 BIDEN HARRIS 2020 🌊🌊🌊🌊🌊🌊🌊🌊🌊🌊🌊🌊 💙💙💙💙💙...,eu,[],[],[],0,0,1,['votebidentoendthisnightmare'],[],https://twitter.com/BostonInSouth/status/13234...,False,https://twitter.com/donwinslow/status/13234135...,0,,,,,,,,[],,,,,1,0
4669,1321600063471149056,1321600063471149056,2020-10-28 23:49:43 UTC,2020-10-28,23:49:43,0,1060303980574629888,sadies8ie,Sadie,,Coming out as non binary to your lib dem paren...,en,[],[],[],0,0,2,[],[],https://twitter.com/sadies8ie/status/132160006...,False,,0,,,,,,,,[],,,,,1,0


Now let's create a subset containing the same number of Trump tweets as Biden tweets: the last 10,000 tweets flagged for only one candidate, for each candidate.

In [98]:
# rows flagged biden == 1 and trump == 0
mask = df['trump'].eq(0) & df['biden'].eq(1)
biden_tweets = df.loc[mask]

# rows flagged trump == 1 and biden == 0
mask = df['trump'].eq(1) & df['biden'].eq(0)
trump_tweets = df.loc[mask]

In [99]:
# Balanced subset: the last 10,000 Biden-only rows followed by the last
# 10,000 Trump-only rows (original index labels are kept).
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in 2.0. It also avoids seeding the result with an empty all-object
# frame, which can upcast the column dtypes.
subset = pd.concat([biden_tweets.tail(10000), trump_tweets.tail(10000)])

subset.sample(3)

Unnamed: 0,id,conversation_id,created_at,date,time,timezone,user_id,username,name,place,tweet,language,mentions,urls,photos,replies_count,retweets_count,likes_count,hashtags,cashtags,link,retweet,quote_url,video,thumbnail,near,geo,source,user_rt_id,user_rt,retweet_id,reply_to,retweet_date,translate,trans_src,trans_dest,biden,trump
5186,1319788357552115723,1319788357552115723,2020-10-23 23:50:39 UTC,2020-10-23,23:50:39,0,1431031490,arthur59611540,#TheRealSpiderMan - Trump Impeached -2nd Time!,,Just a heads-up that the US has horrendous num...,en,[],[],['https://pbs.twimg.com/media/ElDT_r2X0AEVBpf....,0,0,0,[],[],https://twitter.com/Arthur59611540/status/1319...,False,,1,https://pbs.twimg.com/media/ElDT_r2X0AEVBpf.png,,,,,,,[],,,,,0,1
14291,1312530389655666690,1312177040561328129,2020-10-03 23:10:05 UTC,2020-10-03,23:10:05,0,1276694407278989313,charlesdamel,charli fan 🤍,,@PunsUnfunny @jimmarsouza @JoeBiden so she sho...,en,[],[],[],1,0,1,[],[],https://twitter.com/charlesdamel/status/131253...,False,,0,,,,,,,,"[{'screen_name': 'PunsUnfunny', 'name': 'Mr. B...",,,,,1,0
4027,1312539162139070465,1312539162139070465,2020-10-03 23:44:56 UTC,2020-10-03,23:44:56,0,1268794234817007616,ce4biden,Clean Energy for Biden,,The choice is clear. @JoeBiden will follow the...,en,"[{'screen_name': 'joebiden', 'name': 'joe bide...",['https://www.utilitydive.com/news/climate-cha...,[],0,3,16,"['cleanenergy', 'cleanenergyforbiden', 'ce4b',...",[],https://twitter.com/ce4biden/status/1312539162...,False,,0,,,,,,,,[],,,,,1,0


In [100]:
# narrow the subset to the tweet text and the two candidate flag columns
keep_cols = ['tweet', 'trump', 'biden']
data = subset.loc[:, keep_cols]
data.head(3)

Unnamed: 0,tweet,trump,biden
14520,@TomiLahren Is it a coincidence that all the p...,0,1
14523,@JoeBiden @LucianFerguso15 https://t.co/7CRGX...,0,1
14524,"@YoungLibertari @pjwcnc @JoeBiden Right, I use...",0,1


## Preprocessing

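We'll start small by simply removing numbers and punctuation and converting each tweet to lowercase. The helpers below also strip URLs and Twitter handles, collapse letters repeated three or more times, and squeeze repeated whitespace.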

In [101]:
def no_links(x):
    """Remove http(s) URLs from ``x``."""
    return re.sub(r"https?:\/\/\S+", "", x)


def no_handles(x):
    """Remove Twitter @handles from ``x``."""
    return re.sub(r"@[\d\w_]+", "", x)


def alphanum(x):
    """Replace every run of word characters that contains a digit with a space."""
    return re.sub(r"\w*\d\w*", " ", x)


def punc_lower(x):
    """Lowercase ``x`` and replace each punctuation character with a space."""
    # '%s' is a printf-style placeholder and must be filled with the % operator.
    # Calling str.format() on it left the pattern as the literal class [%s],
    # which stripped '%' and 's' characters instead of punctuation.
    return re.sub('[%s]' % re.escape(string.punctuation), ' ', x.lower())


def no_repeats(x):
    """Collapse a lowercase letter repeated 3+ times in a row to a single letter.

    Intended to help spell checking on elongated words (ex: 'aaaand' --> 'and').
    """
    return re.sub(r"([a-z])\1{2,}", r"\1", x)


def no_dup_spaces(x):
    """Trim ``x`` and replace every run of whitespace with one space."""
    return ' '.join(x.split())



In [102]:
# keep an untouched copy of each tweet next to the working column
data['original'] = data['tweet']

# data['tweet'] = (data['tweet']
#                  .map(no_handles)
#                  .map(no_links)
#                  .map(punc_lower)
#                  .map(alphanum)
#                  .map(no_repeats)
#                  .map(no_dup_spaces))

data.head(3)

Unnamed: 0,tweet,trump,biden,original
14520,@TomiLahren Is it a coincidence that all the p...,0,1,@TomiLahren Is it a coincidence that all the p...
14523,@JoeBiden @LucianFerguso15 https://t.co/7CRGX...,0,1,@JoeBiden @LucianFerguso15 https://t.co/7CRGX...
14524,"@YoungLibertari @pjwcnc @JoeBiden Right, I use...",0,1,"@YoungLibertari @pjwcnc @JoeBiden Right, I use..."


## Tokenization

Now it's time to tokenize our tweets. Here, we'll use NLTK's `TweetTokenizer`, stop word removal, and WordNet lemmatization. Porter stemming and spell correction are not applied yet; the spell checker tried so far was too slow.

In [128]:
# Spell correction is not applied: autocorrect's Speller was too slow here
# (pyspellchecker is the candidate to try next).

# Lazily-built cache of NLTK's English word list. Building the set is
# expensive, so it must not be rebuilt for every tweet.
_ENGLISH_VOCAB = None


def _english_vocab():
    """Return NLTK's English word list as a set, loading it on first use only."""
    global _ENGLISH_VOCAB
    if _ENGLISH_VOCAB is None:
        _ENGLISH_VOCAB = set(nltk.corpus.words.words())
    return _ENGLISH_VOCAB


def _preprocess_tweet(tweet):
    """Strip URLs, handles, numbers and punctuation; lowercase; tidy whitespace."""
    # remove urls
    tweet = re.sub(r"https?:\/\/\S+", "", tweet)
    # remove twitter handles
    tweet = re.sub(r"@[\d\w_]+", "", tweet)
    # remove numbers (any run of word characters containing a digit)
    tweet = re.sub(r"\w*\d\w*", " ", tweet)
    # convert to lowercase
    tweet = tweet.lower()
    # remove punctuation ('%s' needs the % operator; str.format() left it unfilled)
    tweet = re.sub('[%s]' % re.escape(string.punctuation), ' ', tweet)
    # collapse letters repeated 3+ times so spell check can work (ex: 'aaaand' --> 'and')
    tweet = re.sub(r"([a-z])\1{2,}", r"\1", tweet)
    # replace consecutive spaces with one
    return ' '.join(tweet.split())


def tweet_tokenize(tweet, more_stop=None):
    """Clean one tweet and return its kept tokens joined by single spaces.

    The tweet is pre-processed (URLs, handles, numbers and punctuation removed,
    lowercased, elongated letters collapsed), tokenized with NLTK's
    TweetTokenizer, restricted to tokens found in NLTK's English word list,
    stripped of stop words, and lemmatized with WordNetLemmatizer.

    Parameters:
        - tweet (str, required): text of a single tweet; a non-string value
          (e.g. NaN) yields an empty string

        - more_stop (List, optional): additional stop words to exclude

    Returns:
        str: the remaining lemmatized tokens separated by single spaces

    """
    if not isinstance(tweet, str):
        return ''

    # The original body only *defined* its clean-up lambdas and discarded
    # them, so no cleaning ran; apply the pipeline for real.
    cleaned = _preprocess_tweet(tweet)

    vocab = _english_vocab()
    tokenizer = nltk.tokenize.TweetTokenizer(strip_handles=True)
    tokens = [token for token in tokenizer.tokenize(cleaned) if token in vocab]

    # stop set: NLTK English stop words, punctuation except '#' and ':',
    # the curly closing quote, and any caller-supplied extras
    stop = set(stopwords.words("english"))
    stop.update(c for c in string.punctuation if c not in ("#", ":"))
    stop.add('”')
    if more_stop is not None:
        stop.update(more_stop)

    lemm = WordNetLemmatizer()

    # stop-word check is made on the raw token, lemmatization on survivors
    tokens = [lemm.lemmatize(token) for token in tokens
              if token.lower() not in stop]

    return ' '.join(tokens)

In [129]:
# testing langdetect on a single short word
# NOTE(review): detection on very short strings is presumably unreliable and may
# vary between runs unless langdetect's DetectorFactory.seed is set -- confirm.



text = 'una'

detect(text)

'es'

In [130]:
# extra stop words: noise tokens first, then generic verbs/fillers
more_stop = (
    ['fxhedg', 'fyck', 'fy', 'fxxking']
    + ['give', 'go', 'going', 'gonna', 'get', 'one']
)

In [None]:
# Clean every tweet. The extra stop words in `more_stop` were defined but never
# handed to the tokenizer, so pass them explicitly.
data['clean'] = data['original'].map(
    lambda tweet: tweet_tokenize(tweet, more_stop=more_stop)
)

In [None]:
# first three rows, to compare the cleaned text with the original
data.head(n=3)

In [None]:
# the cleaned tweets are the documents fed to the vectorizer
tokenized = data['clean']

In [None]:
# first five cleaned tweets
tokenized.head(n=5)

## Count Vectorizer

In [None]:
# Bag-of-words counts over the cleaned tweets. max_df=0.05 drops terms found in
# more than 5% of tweets; min_df=100 drops terms found in fewer than 100 tweets.
vectorizer_params = {'max_df': 0.05, 'min_df': 100}
cv = CountVectorizer(**vectorizer_params)
doc_words = cv.fit_transform(tokenized)



## NMF

In [None]:
# Factor the document-term counts into 2 topics.
# random_state is pinned so the fit (and the topics read off it) is
# reproducible across kernel restarts; it was previously left unseeded.
nmf_model = NMF(n_components=2, max_iter=115000, random_state=42)
doc_topic = nmf_model.fit_transform(doc_words)
doc_topic.shape

In [None]:
# n_iter_ is read from the fitted model; compare it with the max_iter passed to
# NMF to see whether the solver stopped before hitting the cap.
print(f"Number of iterations used: {nmf_model.n_iter_}")

From lecture: The **doc_topic** matrix shows us the documents we started with, and how each document is made up of the 2 resulting topics. We don't know yet what the topics are.

In [None]:
# components_ comes from the fitted NMF model; its shape is presumably
# (n_topics, n_terms) -- confirm against the vectorizer's vocabulary size.
topic_word = nmf_model.components_
topic_word.shape

From lecture: The **topic_word** matrix shows us the 2 resulting topics, and the terms that are associated with each topic. By looking at the words below, we can figure out what the topics are.


In [None]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
if hasattr(cv, "get_feature_names_out"):
    words = list(cv.get_feature_names_out())
else:
    words = cv.get_feature_names()

# column indices of the 6 largest weights per topic, largest first
t = nmf_model.components_.argsort(axis=1)[:, -1:-7:-1]
topic_words = [[words[idx] for idx in topic_row] for topic_row in t]
topic_words

In [None]:
# per-document topic weights returned by nmf_model.fit_transform
doc_topic