In [83]:
import pickle
import re
import string
import emoji
import pandas as pd
import numpy as np

from textblob import TextBlob
from collections import Counter

from langdetect import detect

import spacy

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.decomposition import NMF

import nltk
from nltk.tokenize import TweetTokenizer, word_tokenize
from nltk.corpus import stopwords, words
from nltk.stem import PorterStemmer, WordNetLemmatizer

import spacy
import contextualSpellCheck

nlp = spacy.load('en_core_web_sm')

from spellchecker import SpellChecker

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer


**Kelsey 1-1**

- cleaning
    - preprocessing until comfortable with words
   
- sentiment analysis on all tweets
    - don't need to do any splitting at this stage
    - TextBlob & VaderSentiment first, spacy if the results aren't as expected
    
- topic modeling
    - decide: use all tweets (all topics) at once
        - start here
        - then can use these as features in the dataFrame and do splitting here
    - or: split to trump/biden - then bot/not bot for each
    - point here is there are multiple ways to split it
        - no right answers

In [2]:
pd.set_option('display.max_rows', 10)
pd.set_option('display.max_columns', 40)

In [3]:
# must uncomment & run the first time to DOWNLOAD NLTK data
# I used package identifier 'popular'
# nltk.download()

In [4]:
df = pd.read_pickle("pickle/df_t_raw.pick")
df.shape

(879311, 38)

In [5]:
df.sample(3)

Unnamed: 0,id,conversation_id,created_at,date,time,timezone,user_id,username,name,place,tweet,language,mentions,urls,photos,replies_count,retweets_count,likes_count,hashtags,cashtags,link,retweet,quote_url,video,thumbnail,near,geo,source,user_rt_id,user_rt,retweet_id,reply_to,retweet_date,translate,trans_src,trans_dest,biden,trump
10370,1322684318125350914,1322683862175109120,2020-10-31 23:38:10 UTC,2020-10-31,23:38:10,0,941439774,centrist_phone,Dr Moderate,,@NateSilver538 Something odd happening in this...,en,[],[],[],2,1,12,[],[],https://twitter.com/centrist_phone/status/1322...,False,,0,,,,,,,,"[{'screen_name': 'NateSilver538', 'name': 'Nat...",,,,,1,0
9449,1314711646837694464,1314706514670612480,2020-10-09 23:37:37 UTC,2020-10-09,23:37:37,0,1137629164326887426,acetoyourhead,Acetoyourhead,,@SteveSisolak @JoeBiden @KamalaHarris @JoeForN...,en,"[{'screen_name': 'govsisolak', 'name': 'govern...",[],['https://pbs.twimg.com/tweet_video_thumb/Ej7L...,5,0,15,[],[],https://twitter.com/acetoyourhead/status/13147...,False,,1,https://pbs.twimg.com/tweet_video_thumb/Ej7LRX...,,,,,,,"[{'screen_name': 'SteveSisolak', 'name': 'Stev...",,,,,1,0
3918,1318339055642443776,1318339055642443776,2020-10-19 23:51:39 UTC,2020-10-19,23:51:39,0,1179130056112824322,newguardsrising,Greg Wilson,,@ashlie_weeks as a former political appointee ...,en,[],[],[],0,0,0,['dumptrump2020'],[],https://twitter.com/NewGuardsRising/status/131...,False,https://twitter.com/ashlie_weeks/status/131833...,0,,,,,,,,[],,,,,1,1


Now let's create a subset, containing the same amount of Trump tweets as Biden tweets.

In [6]:
mask = (df.trump == 0) & (df.biden == 1)
biden_tweets = df[mask]

mask = (df.trump == 1) & (df.biden == 0)
trump_tweets = df[mask]

In [7]:
subset = pd.DataFrame(columns=df.columns)
subset = subset.append(biden_tweets.tail(50))
subset = subset.append(trump_tweets.tail(50))

subset.sample(3)

Unnamed: 0,id,conversation_id,created_at,date,time,timezone,user_id,username,name,place,tweet,language,mentions,urls,photos,replies_count,retweets_count,likes_count,hashtags,cashtags,link,retweet,quote_url,video,thumbnail,near,geo,source,user_rt_id,user_rt,retweet_id,reply_to,retweet_date,translate,trans_src,trans_dest,biden,trump
14987,1312529747100930050,1312230127112204289,2020-10-03 23:07:31 UTC,2020-10-03,23:07:31,0,2600787139,brh_pino,Orange is the New Stupid,,@jrgaillot @JoeBiden No.,und,[],[],[],0,0,0,[],[],https://twitter.com/BRH_Pino/status/1312529747...,False,,0,,,,,,,,"[{'screen_name': 'jrgaillot', 'name': 'JR Gail...",,,,,1,0
15031,1319783792391892992,1319783792391892992,2020-10-23 23:32:31 UTC,2020-10-23,23:32:31,0,1011057087386914817,cd_smithy,CdSmithy,,Obviously Mr. Jenkins has need of more educati...,en,"[{'screen_name': 'wsj', 'name': 'the wall stre...",['https://apple.news/AMoFESa8NQci-vLmRnRj_jQ'],[],0,0,0,[],[],https://twitter.com/cd_smithy/status/131978379...,False,,0,,,,,,,,[],,,,,0,1
14968,1312529761139187712,1312518819370291202,2020-10-03 23:07:35 UTC,2020-10-03,23:07:35,0,801253065684881408,seeseerider2,vps,,@BreitbartNews Just watched the Bork confirmat...,en,[],[],[],0,0,0,[],[],https://twitter.com/Seeseerider2/status/131252...,False,,0,,,,,,,,"[{'screen_name': 'BreitbartNews', 'name': 'Bre...",,,,,1,0


In [8]:
# keep only necessary columns
data = subset.loc[:,['tweet', 'trump', 'biden']]
data['original'] = data.tweet
data.head(3)

Unnamed: 0,tweet,trump,biden,original
14945,@CMWooly @JoeBiden @jimdicker He called for th...,0,1,@CMWooly @JoeBiden @jimdicker He called for th...
14946,@MissyPDX @HKrassenstein @JoeBiden Got a probl...,0,1,@MissyPDX @HKrassenstein @JoeBiden Got a probl...
14948,@CarolyneMas @BCStevens77 @JoeBiden Not only w...,0,1,@CarolyneMas @BCStevens77 @JoeBiden Not only w...


## Pre-Processing Pipeline

Now it's time to tokenize our tweets. Here are our pre-processing steps:
* Remove URLs
* Remove Twitter handles
* Remove numbers
* Convert to lowercase
* Remove punctuation
* Remove repeated letters so spell check will work ('aaaaand' -> 'aand')
* Remove non-English words
* Remove stop words

Since we're working with so many different words, I've chosen to use **lemmatization** instead of stemming for two reasons:
1. Lemmatization accurately reduces words to true meaning
2. Inxreased word reduction (handles synonyms)

In [9]:
x = '@JoeBiden how is it GOING?! Looking forward to seeing you #maga'

lambda x: re.sub(r"https?:\/\/\S+", "", x)
# remove twitter handles
lambda x: re.sub(r"@[\d\w_]+", "", x)
# remove numbers
lambda x: re.sub('\w*\d\w*', ' ', x)
# convert to lowercase
lambda x: re.sub('[%s]'.format(re.escape(string.punctuation)), ' ', x.lower())
# remove repeated letters so spell check will work (ex: 'aaaand' --> 'aand')
lambda x: re.sub(r"([a-z])\1{2,}", r'\1', x)
# replace consecutive spaces with one
lambda x: ' '.join(x.split())

<function __main__.<lambda>(x)>

In [10]:
# Testing these before putting them in func


tweet = '@JoeBiden how is it GOING?! Looking forward to seeing you #maga'

# remove urls
tweet = re.sub(r"https?:\/\/\S+", "", tweet)
# remove twitter handles
tweet = re.sub(r"@[\d\w_]+", "", tweet)
# remove numbers
tweet = re.sub('\w*\d\w*', ' ', tweet)
# convert to lowercase
tweet = re.sub('[%s]'.format(re.escape(string.punctuation)), ' ', tweet.lower())
# remove repeated letters so spell check will work (ex: 'aaaand' --> 'aand')
tweet = re.sub(r"([a-z])\1{2,}", r'\1', tweet)
# replace consecutive spaces with one
' '.join(tweet.split())

# data['tweet'] = (data['tweet']
#                  .map(no_handles)
#                  .map(no_links)
#                  .map(punc_lower)
#                  .map(alphanum)
#                  .map(no_repeats)
#                  .map(no_dup_spaces))
tweet


' how i  it going?! looking forward to  eeing you #maga'

In [94]:
# from autocorrect import Speller # TOO SLOW...TRY PYSPELLCHECKER
def tweet_tokenize(tweet, more_stop=None, more_words=None):
    """Get all of the tokens in a set of tweets.
    
    Parameters:
        - tweets (Series, required)
        
        - more_stop (List, optional): additional stop words to exclude
        
        - more_words (List, optional): additional words to INCLUDE in dictionary
    
    """
    
    
    # pre-processing pipeline
    
    # remove urls
    tweet = re.sub(r"https?:\/\/\S+", "", tweet)
    # remove twitter handles
#     tweet = re.sub(r"@[\d\w_]+", "", tweet)
    # remove numbers
    tweet = re.sub('\w*\d\w*', ' ', tweet)
    
    # remove punctuation
    tweet = tweet.translate(str.maketrans('', '', string.punctuation))
    # convert to lowercase
    tweet = tweet.lower()
    # remove spaces in nominee names
    tweet = re.sub(r"joebiden", "joe_biden", tweet)
    tweet = re.sub(r"kamalaharris", "kamala_harris", tweet)
    tweet = re.sub(r"donaldtrump", "donald_trump", tweet)
    tweet = re.sub(r"mikepence", "mike_pence", tweet)
    
    # and other popular campaign phrases
    tweet = re.sub(r"make america great again", "maga", tweet)
    # remove repeated letters so spell check will work (ex: 'aaaand' --> 'aand')
    tweet = re.sub(r"([a-z])\1{2,}", r'\1', tweet)
    # replace consecutive spaces with one
    ' '.join(tweet.split())
    
    more_words = ['trump', 'biden', 'maga', 'bidenharris', 
                  'kamala', 'pence', 'harris', 'mike',
                  'bidenharris2020', 'trumppence',
                  'trumppence2020', 'usa', 'election2020',
                  'ivoted', 'joe_biden', 'realdonaldtrump',
                  'donald_trump', 'sleepy_joe', 'donald_trump',
                  'mike_pence', 'kamala_harris']
    
    dictionary = list(words.words()) + more_words
    dictionary = set(dictionary)
    
    twt = TweetTokenizer()
    tokens = [token for token in twt.tokenize(tweet)]
    tokens = [token for token in tokens if token in dictionary]
    
    # initiate stop word removal and lemmatization    
    more_stop = ['fxhedg','fyck','fy','fxxking','give','go',
                 'going','gonna','get','one']
    
    stop_words = list(stopwords.words('english')) + more_stop
    stop = stop_words
    stop = set(stop)
    
    lemm = WordNetLemmatizer()
    
    # implement lemmatization and stop word removal
    tokens = [lemm.lemmatize(token) for token in tokens
              if token.lower() not in stop]
#     spell = Speller(lang='en')
#     tokens = [spell(t) for t in tokens]

    combined_tokens = ' '.join(tokens)

    return combined_tokens

In [95]:
lemm = WordNetLemmatizer()
lemm.lemmatize('alksfjaehf')

'alksfjaehf'

In [96]:
data['tweet'] = data['original'].map(tweet_tokenize)

In [97]:
data.head(3)

Unnamed: 0,tweet,trump,biden,original
14945,joe_biden death penalty much le responsible,0,1,@CMWooly @JoeBiden @jimdicker He called for th...
14946,joe_biden got problem missy wear mask everyday...,0,1,@MissyPDX @HKrassenstein @JoeBiden Got a probl...
14948,joe_biden personally taking ton stock better t...,0,1,@CarolyneMas @BCStevens77 @JoeBiden Not only w...


In [98]:
data.to_pickle("pickle/tweets_df_1000tw.pick")

## Count Vectorizer

In [38]:
cv = CountVectorizer(max_df=0.05, min_df=10)
doc_words = cv.fit_transform(data.tweet)

ValueError: max_df corresponds to < documents than min_df

In [None]:
data.iloc[5]

## NMF

In [None]:
nmf_model = NMF(2, max_iter=115000)
doc_topic = nmf_model.fit_transform(doc_words)
doc_topic.shape

In [None]:
print(f"Number of iterations used: {nmf_model.n_iter_}")

From lecture: The **doc_topic** matrix shows us the documents we started with, and how each document is made up of the 2 resulting topics. We don't know yet what the topics are.

In [None]:
topic_word = nmf_model.components_
topic_word.shape

From lecture: The **topic_word** matrix shows us the 2 resulting topics, and the terms that are associated with each topic. By looking at the words below, we an figure out what the topics are.


In [None]:
words = cv.get_feature_names()
t = nmf_model.components_.argsort(axis=1)[:,-1:-7:-1]
topic_words = [[words[e] for e in l] for l in t]
topic_words

In [None]:
doc_topic