In [1]:
import pickle
import re
import string
import emoji
import pandas as pd
import numpy as np

from textblob import TextBlob
from collections import Counter

from langdetect import detect

import spacy

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.decomposition import NMF

import nltk
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

import spacy
import contextualSpellCheck

nlp = spacy.load('en_core_web_sm')

from spellchecker import SpellChecker

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer


**Kelsey 1-1**

- cleaning
    - preprocessing until comfortable with words
   
- sentiment analysis on all tweets
    - don't need to do any splitting at this stage
    - TextBlob & VaderSentiment first, spacy if the results aren't as expected
    
- topic modeling
    - decide: use all tweets (all topics) at once
        - start here
        - then can use these as features in the dataFrame and do splitting here
    - or: split to trump/biden - then bot/not bot for each
    - point here is there are multiple ways to split it
        - no right answers

In [2]:
# Notebook display limits: rendered frames are truncated to 10 rows and 40 columns.
pd.options.display.max_rows = 10
pd.options.display.max_columns = 40

In [3]:
# must uncomment & run the first time to DOWNLOAD NLTK data
# I used package identifier 'popular'
# nltk.download()

In [4]:
# Load the raw tweet DataFrame from a local pickle file.
# NOTE(review): unpickling can execute arbitrary code — only load pickle files from a trusted source.
df = pd.read_pickle("pickle/df_t_raw.pick")
# Show (rows, columns) as the cell output.
df.shape

(879311, 38)

In [5]:
# Peek at three randomly chosen rows (no seed is set, so the rows differ between runs).
df.sample(3)

Unnamed: 0,id,conversation_id,created_at,date,time,timezone,user_id,username,name,place,tweet,language,mentions,urls,photos,replies_count,retweets_count,likes_count,hashtags,cashtags,link,retweet,quote_url,video,thumbnail,near,geo,source,user_rt_id,user_rt,retweet_id,reply_to,retweet_date,translate,trans_src,trans_dest,biden,trump
11859,1316157376463073281,1316155757511966721,2020-10-13 23:22:26 UTC,2020-10-13,23:22:26,0,75595415,zhepon,Justice For All 🗽🇺🇸,,@JoeBiden Joe you forgot !you are running for ...,en,[],[],[],0,0,5,[],[],https://twitter.com/Zhepon/status/131615737646...,False,,0,,,,,,,,"[{'screen_name': 'JoeBiden', 'name': 'Joe Bide...",,,,,1,0
9442,1313262374892326913,1313262374892326913,2020-10-05 23:38:44 UTC,2020-10-05,23:38:44,0,1094027680138313728,coolc04490919,🌊🌊CoolC🌊🌊,,"Well Joe Biden, Trump is out of the hospital. ...",en,[],[],[],0,2,2,[],[],https://twitter.com/CoolC04490919/status/13132...,False,,0,,,,,,,,[],,,,,1,1
2469,1322326466634264577,1322323661441896452,2020-10-30 23:56:11 UTC,2020-10-30,23:56:11,0,1225087611384893440,phillipwinsto13,MixedTape,,@nprpolitics Lol when the toss-up states were ...,en,[],[],[],0,0,1,[],[],https://twitter.com/PhillipWinsto13/status/132...,False,,0,,,,,,,,"[{'screen_name': 'nprpolitics', 'name': 'NPR P...",,,,,0,1


Now let's create a subset containing the same number of Trump tweets as Biden tweets.

In [6]:
# Rows flagged for Biden only (biden == 1, trump == 0).
biden_tweets = df.loc[df.biden.eq(1) & df.trump.eq(0)]

# Rows flagged for Trump only (trump == 1, biden == 0); `mask` stays bound to this selector.
mask = df.trump.eq(1) & df.biden.eq(0)
trump_tweets = df.loc[mask]

In [25]:
# Build a balanced subset from the last rows of each single-candidate frame.
# `DataFrame.append` was removed in pandas 2.0, so both pieces are combined with one
# `pd.concat` call, which also keeps the source column dtypes and the original index.
N_PER_CANDIDATE = 100000
# Cap at the smaller group so both candidates always contribute the same number of rows.
n_each = min(N_PER_CANDIDATE, len(biden_tweets), len(trump_tweets))

subset = pd.concat([biden_tweets.tail(n_each), trump_tweets.tail(n_each)])

subset.sample(3)

Unnamed: 0,id,conversation_id,created_at,date,time,timezone,user_id,username,name,place,tweet,language,mentions,urls,photos,replies_count,retweets_count,likes_count,hashtags,cashtags,link,retweet,quote_url,video,thumbnail,near,geo,source,user_rt_id,user_rt,retweet_id,reply_to,retweet_date,translate,trans_src,trans_dest,biden,trump
2384,1312540688920326148,1312533640392314882,2020-10-03 23:51:00 UTC,2020-10-03,23:51:00,0,1419592358,jessett36330,jessett,,@JoeBiden https://t.co/YfNuO6IAse,und,[],['https://twitter.com/scoobidoo76/status/13125...,[],1,0,2,[],[],https://twitter.com/jessett36330/status/131254...,False,https://twitter.com/scoobidoo76/status/1312535...,0,,,,,,,,"[{'screen_name': 'JoeBiden', 'name': 'Joe Bide...",,,,,1,0
12273,1318336989582643201,1318336989582643201,2020-10-19 23:43:26 UTC,2020-10-19,23:43:26,0,3183642870,sirrockstone,Sir Rock Stone,,Trump,en,[],[],[],0,0,1,[],[],https://twitter.com/sirrockstone/status/131833...,False,https://twitter.com/AlexPhD1/status/1318171406...,0,,,,,,,,[],,,,,0,1
3157,1315440410920194048,1315411495874965504,2020-10-11 23:53:28 UTC,2020-10-11,23:53:28,0,31119062,richal13,🏳️‍🌈🐬👬🕷🖖WandaVision,,@DaxGigandet Donald tRump is president,en,[],[],[],0,0,0,[],[],https://twitter.com/richal13/status/1315440410...,False,,0,,,,,,,,"[{'screen_name': 'DaxGigandet', 'name': 'Dax G...",,,,,0,1


In [26]:
# Narrow the frame to the tweet text plus the two candidate flag columns.
data = subset[['tweet', 'trump', 'biden']].copy()
data.head(3)

Unnamed: 0,tweet,trump,biden
40246,"If Joe Biden wins, homosexuals will destroy fo...",0,1
40253,@JoeBiden #trunalimunumaprzure,0,1
40254,Traitor dude voted Biden dude calls himself pr...,0,1


## Preprocessing

We'll start small by simply removing numbers & punctuation and converting each tweet to lowercase. 

In [27]:
# Text normalisers for tweets. Each helper takes one string and returns a new string,
# so they can be chained with `Series.map`.


def no_links(x):
    """Remove http/https URLs."""
    return re.sub(r"https?:\/\/\S+", "", x)


def no_handles(x):
    """Remove Twitter @handles."""
    return re.sub(r"@[\d\w_]+", "", x)


def alphanum(x):
    """Replace every run of word characters that contains a digit with a single space."""
    # Raw string: '\w' / '\d' are invalid escapes in a normal string literal.
    return re.sub(r'\w*\d\w*', ' ', x)


# Character class matching any ASCII punctuation mark. The pattern must be built with
# `%` interpolation: calling `.format()` on '[%s]' substitutes nothing and leaves a
# class that matches only the literal characters '%' and 's'.
_PUNCTUATION_RE = re.compile('[%s]' % re.escape(string.punctuation))


def punc_lower(x):
    """Lowercase the text and replace each punctuation character with a space."""
    return _PUNCTUATION_RE.sub(' ', x.lower())


def no_repeats(x):
    """Collapse runs of 3+ identical lowercase letters to two ('aaaand' -> 'aand').

    Keeping two letters (rather than one) leaves legitimate double letters intact,
    so a later spell check still has a chance of recognising the word.
    """
    return re.sub(r"([a-z])\1{2,}", r'\1\1', x)


def no_dup_spaces(x):
    """Collapse every whitespace run to one space and trim both ends."""
    return ' '.join(x.split())



In [28]:
# Keep an untouched copy of the tweet text in its own column.
data['original'] = data['tweet']

# The regex helper chain (no_handles -> no_links -> punc_lower -> alphanum ->
# no_repeats -> no_dup_spaces) is not applied in this cell, so `tweet` is left as-is.

data.head(3)

Unnamed: 0,tweet,trump,biden,original
40246,"If Joe Biden wins, homosexuals will destroy fo...",0,1,"If Joe Biden wins, homosexuals will destroy fo..."
40253,@JoeBiden #trunalimunumaprzure,0,1,@JoeBiden #trunalimunumaprzure
40254,Traitor dude voted Biden dude calls himself pr...,0,1,Traitor dude voted Biden dude calls himself pr...


## Pre-Processing Pipeline

Now it's time to tokenize our tweets. Here are our pre-processing steps:
* Remove URLs
* Remove Twitter handles
* Remove numbers
* Convert to lowercase
* Remove punctuation
* Remove repeated letters so spell check will work ('aaaaand' -> 'aand')
* Remove non-English words
* Remove stop words

Since we're working with so many different words, I've chosen to use **lemmatization** instead of stemming for two reasons:
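1. Lemmatization reduces each word to its dictionary form (lemma), which preserves the word's true meaning
2. Increased word reduction (irregular forms such as 'geese' -> 'goose' collapse to a single lemma)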

In [33]:
# Spell correction with `autocorrect.Speller` was tried here and dropped as too slow;
# pyspellchecker is the suggested alternative.

# Per-kernel cache for the expensive NLTK resources, so mapping `tweet_tokenize` over
# a large Series does not rebuild them for every row.
_TWEET_TOKENIZE_CACHE = {}


def _cached_resource(key, factory):
    """Return the cached value for `key`, building it with `factory()` on first use."""
    if key not in _TWEET_TOKENIZE_CACHE:
        _TWEET_TOKENIZE_CACHE[key] = factory()
    return _TWEET_TOKENIZE_CACHE[key]


def _normalize_tweet_text(tweet):
    """Apply the regex clean-up steps and return lower-cased, single-spaced text."""
    # remove urls (before the digit step, which would otherwise chew up parts of links)
    text = re.sub(r"https?:\/\/\S+", "", tweet)
    # remove twitter handles
    text = re.sub(r"@[\d\w_]+", "", text)
    # remove numbers, including any word that contains a digit
    text = re.sub(r'\w*\d\w*', ' ', text)
    # convert to lowercase (must precede the [a-z]-only repeat collapse below)
    text = text.lower()
    # collapse 3+ repeated letters to two so spell check can work ('aaaand' -> 'aand')
    text = re.sub(r"([a-z])\1{2,}", r'\1\1', text)
    # replace consecutive whitespace with one space
    return ' '.join(text.split())


def tweet_tokenize(tweet, more_stop=None, more_words=None):
    """Clean, tokenize and lemmatize a single tweet.

    Pipeline: strip URLs, handles and digit-bearing words; lowercase; collapse
    repeated letters; tokenize with NLTK's TweetTokenizer; keep only tokens found
    in the English vocabulary (NLTK `words` corpus plus `more_words`); drop stop
    words and punctuation tokens; lemmatize what is left.

    Parameters:
        - tweet (str, required): raw tweet text. Non-string values (e.g. NaN)
          produce an empty string.

        - more_stop (List, optional): additional stop words to exclude

        - more_words (List, optional): additional words to INCLUDE in dictionary

    Returns:
        str: the surviving lemmas joined by single spaces ('' if none survive).

    NOTE(review): words containing digits are removed before the vocabulary
    lookup, so `more_words` entries such as 'election2020' can never match.
    Hashtag tokens presumably keep their leading '#', in which case they only
    match vocabulary entries that include the '#' — confirm the intended handling.
    """
    if not isinstance(tweet, str):
        return ''

    text = _normalize_tweet_text(tweet)

    # Vocabulary is lower-cased because the text has been lower-cased; a frozenset
    # gives constant-time membership tests instead of scanning a list per token.
    vocab = _cached_resource(
        'vocab',
        lambda: frozenset(w.lower() for w in nltk.corpus.words.words()))
    extra_vocab = frozenset(w.lower() for w in more_words) if more_words else frozenset()

    # Stop tokens: English stop words plus punctuation, except '#' and ':'.
    base_stop = _cached_resource(
        'stop',
        lambda: frozenset(
            stopwords.words("english")
            + [c for c in string.punctuation if c not in ["#", ":"]]
            + ['”']))
    extra_stop = frozenset(w.lower() for w in more_stop) if more_stop else frozenset()

    tokenizer = _cached_resource(
        'tokenizer', lambda: nltk.tokenize.TweetTokenizer(strip_handles=True))
    lemm = _cached_resource('lemmatizer', WordNetLemmatizer)

    # Keep in-vocabulary, non-stop tokens and reduce each one to its lemma.
    tokens = [
        lemm.lemmatize(token)
        for token in tokenizer.tokenize(text)
        if (token in vocab or token in extra_vocab)
        and token not in base_stop
        and token not in extra_stop
    ]

    return ' '.join(tokens)

In [30]:
# testing langdetect: detect() is called on one short word; the recorded output is 'es'



text = 'una'

detect(text)

'es'

In [34]:
# Additional tokens to drop as stop words during tokenization.
more_stop = [
    'fxhedg', 'fyck', 'fy', 'fxxking',
    'give', 'go', 'going', 'gonna', 'get',
    'one',
]

# Additional words to treat as valid vocabulary during tokenization.
more_words = [
    'trump',
    'biden',
    'maga',
    'bidenharris',
    'kamala',
    'pence',
    'harris',
    'mike',
    'bidenharris2020',
    'trumppence',
    'trumppence2020',
    'usa',
    'election2020',
    'ivoted',
]

In [None]:
# Tokenize every raw tweet into a space-joined string of cleaned tokens.
# The custom stop words and extra vocabulary defined above are passed explicitly;
# a bare `.map(tweet_tokenize)` would leave both lists unused.
data['clean'] = data['original'].map(
    lambda tweet: tweet_tokenize(tweet, more_stop=more_stop, more_words=more_words)
)

In [None]:
# Spot-check the first three rows, which now include the `clean` column.
data.head(3)

In [None]:
# Series of cleaned, space-joined tokens — the input documents for vectorization.
tokenized = data['clean']

In [None]:
# Preview the first few cleaned tweets.
tokenized.head()

In [None]:
# Persist the processed frame (tweet, flags, original, clean) to a local pickle file.
data.to_pickle("pickle/tweets_df_v1.pick")

## Count Vectorizer

In [None]:
# Bag-of-words counts over the cleaned tweets.
# Vocabulary pruning: max_df=0.05 drops terms that occur in more than 5% of tweets,
# min_df=300 drops terms that occur in fewer than 300 tweets.
vectorizer_options = {'max_df': 0.05, 'min_df': 300}
cv = CountVectorizer(**vectorizer_options)
doc_words = cv.fit_transform(tokenized)



## NMF

In [None]:
# Factorize the document-term counts into two topics.
# `random_state` is pinned so the factorization (and therefore the topics) is
# reproducible across kernel restarts; `n_components` is passed by keyword for clarity.
N_TOPICS = 2
nmf_model = NMF(n_components=N_TOPICS, max_iter=115000, random_state=42)
doc_topic = nmf_model.fit_transform(doc_words)
doc_topic.shape

In [None]:
# Report how many iterations the solver actually ran (compare against max_iter).
iterations_run = nmf_model.n_iter_
print(f"Number of iterations used: {iterations_run}")

From lecture: The **doc_topic** matrix shows us the documents we started with, and how each document is made up of the 2 resulting topics. We don't know yet what the topics are.

In [None]:
# Topic-term weights learned by the fitted NMF model.
topic_word = nmf_model.components_
# NOTE(review): expected to be (2 topics, vocabulary size) — confirm against the output.
topic_word.shape

From lecture: The **topic_word** matrix shows us the 2 resulting topics, and the terms that are associated with each topic. By looking at the words below, we can figure out what the topics are.


In [None]:
# Vocabulary terms, in the column order of the document-term matrix.
# `get_feature_names` was removed in scikit-learn 1.2; use its replacement when it
# exists and fall back to the old name on older installations.
if hasattr(cv, "get_feature_names_out"):
    words = cv.get_feature_names_out()
else:
    words = cv.get_feature_names()

# For each topic row, argsort ascending and walk backwards from the end to get the
# column indices of the 6 highest-weighted terms, strongest first.
top_term_idx = nmf_model.components_.argsort(axis=1)[:, -1:-7:-1]
topic_words = [[str(words[col]) for col in row] for row in top_term_idx]
topic_words

In [None]:
# Display the document-topic weight matrix returned by nmf_model.fit_transform.
doc_topic