In [1]:
import pickle
import re
import string
import pandas as pd
import numpy as np

from textblob import TextBlob
from collections import Counter

from langdetect import detect

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.decomposition import NMF, LatentDirichletAllocation

import nltk
from nltk.tokenize import TweetTokenizer, word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer


**Kelsey 1-1**

- cleaning
    - preprocessing until comfortable with words
   
- sentiment analysis on all tweets
    - don't need to do any splitting at this stage
    - TextBlob & VaderSentiment first, spacy if the results aren't as expected
    
- topic modeling
    - decide: use all tweets (all topics) at once
        - start here
        - then can use these as features in the dataFrame and do splitting here
    - or: split to trump/biden - then bot/not bot for each
    - point here is there are multiple ways to split it
        - no right answers

In [2]:
# Notebook-wide pandas display settings: at most 10 rows and 40 columns are
# rendered, and cell text is never truncated (max_colwidth=None).
for _option, _value in (
    ('display.max_rows', 10),
    ('display.max_columns', 40),
    ('display.max_colwidth', None),
):
    pd.set_option(_option, _value)

In [3]:
# must uncomment & run the first time to DOWNLOAD NLTK data
# I used package identifier 'popular'
# nltk.download()

In [4]:
# Load the pickled tweet DataFrame (path is relative to the notebook's working directory).
# NOTE(review): unpickling can execute arbitrary code -- only load pickle files you created or trust.
data = pd.read_pickle("pickle/balanced_nov2_tweets.pick")
# (rows, columns) of the loaded frame
data.shape

(96000, 9)

In [5]:
# Show 3 randomly chosen rows; no random_state is set, so the rows differ on every run.
data.sample(3)

Unnamed: 0,id,date,time,user_id,username,tweet,hashtags,trump,biden
31507,1323405540945498116,2020-11-02,23:24:03,1026844796395180032,ryan_kelley_,@danjlevy @Realeugenelevy @alxsupertramp_ and I voted early for @JoeBiden and are watching some schitts creek to distract us tonight!,[],False,True
106211,1323382795352244224,2020-11-02,21:53:40,261487376,alinecvt,"@villa_manzoni @ShironRedshift @JoeBiden Let’s stay positive, I know there’s justice in this world. #BidenHarris2020 🙏🏻",['bidenharris2020'],False,True
8431,1323412191157039104,2020-11-02,23:50:28,1117284155241435137,pineorange10,@Sagesseforever @sebastianjonasw @evan_graham04 @JoeBiden There's evidence lady https://t.co/SgIYyXKccN,[],False,True


Now let's create a subset containing the same number of Trump tweets as Biden tweets. Tweets that mention both candidates are excluded.

In [6]:
# keep only necessary columns
data['original'] = data.tweet
data.drop(columns='tweet', inplace=True)
data.head(3)

Unnamed: 0,id,date,time,user_id,username,hashtags,trump,biden,original
181142,1323379284434669568,2020-11-02,21:39:43,2820503362,artistacriseida,[],False,True,"All these articles showing that Biden is in the lead.... IGNORE THAT AND STILL GO VOTE. All of these maps showing information that may or may not be correct won’t matter on Election Day. Hillary was also in the lead last election, just do your part. ⁽ᶠᵘᶜᵏ ᵗʳᵘᵐᵖ⁾"
0,1323414585995526144,2020-11-02,23:59:59,1312487180258820096,annapieters17,[],False,True,@FoxNews Lady Gaga’s a nobody. Can’t figure out her own life and can’t even see nobody can help Biden. He’s out of the game from the day he gets in the game.
4,1323414585232293888,2020-11-02,23:59:59,2335763630,kylechwatt,[],False,True,"@The_Grupp “It is purely a fortuity that this isn’t one of the great mass casualty events in American history,” Ron Klain, who was Biden’s chief of staff at the time, said of H1N1 in 2019.” https://t.co/Umi317supK"


## Pre-Processing Pipeline

Now it's time to tokenize our tweets. Here are our pre-processing steps:
* Remove URLs
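* Remove Twitter handles (there is no explicit handle-stripping step: the "@" is removed with the punctuation, and the remaining handle text is dropped later only if it is not a known vocabulary word)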
* Remove numbers
* Convert to lowercase
* Remove punctuation
* Remove repeated letters so spell check will work ('aaaaand' -> 'aand')
* Remove non-English words
* Remove stop words

Since we're working with so many different words, I've chosen to use **lemmatization** instead of stemming for two reasons:
1. Lemmatization accurately reduces words to their true meaning
2. Increased word reduction (handles synonyms)

In [7]:
# nltk.download() # must run first time (download 'popular')

In [8]:
# from autocorrect import Speller # TOO SLOW...TRY PYSPELLCHECKER
# custom word dictionaries
from more_words import more_words as custom_words
from stop_words import stop_words as custom_stop_words
from multi_words import multi_words

import nltk
from nltk.corpus import words, stopwords

# Vocabulary cache, filled by the first clean_tweet() call and reused afterwards.
# Re-running this cell resets it, which is required after editing the custom word lists.
_VOCAB_CACHE = {}


def _load_vocab():
    """Build the stop-word set, vocabulary set and multi-word rules once, then reuse them."""
    if not _VOCAB_CACHE:
        stop_words = set(stopwords.words('english'))
        stop_words.update(custom_stop_words)

        all_words = set(words.words('en'))
        all_words.update(custom_words)
        # the replacement tokens (e.g. 'white_house') must count as known words
        all_words.update(new for _, new in multi_words)

        rules = [(re.compile(old), new) for old, new in multi_words]
        _VOCAB_CACHE.update(stop_words=stop_words, all_words=all_words, multi_words=rules)
    return _VOCAB_CACHE['stop_words'], _VOCAB_CACHE['all_words'], _VOCAB_CACHE['multi_words']


def clean_tweet(tweet):
    """
    Normalize the text of a single tweet.

    Steps, in order: lowercase, 'u.s.' -> 'usa', drop URLs, drop every token
    containing a digit, turn runs of dots into a space, strip ASCII punctuation,
    squeeze letters repeated 3+ times down to two ('aaaand' -> 'aand'),
    collapse whitespace, then apply the multi_words replacements
    (tuples such as ('white house', 'white_house')).

    Parameters:
        - tweet (str, required): raw tweet text

    Returns:
        (cleaned_tweet, stop_words, all_words) where stop_words and all_words are sets.
        Both sets are built once and shared between calls -- treat them as read-only.

    NOTE(review): no step strips '@handle' mentions; only the '@' is removed
    together with the rest of the punctuation -- confirm this is intended.
    """
    stop_words, all_words, multi_word_rules = _load_vocab()

    # convert to lowercase
    tweet = tweet.lower()
    # convert 'U.S.' --> 'usa' (the lookahead keeps the following space, so words are not glued together)
    tweet = re.sub(r"\bu\.s\.(?!\w)", "usa", tweet)
    # remove urls
    tweet = re.sub(r"https?://\S+", "", tweet)
    # remove numbers, and any token that contains a digit
    tweet = re.sub(r"\w*\d\w*", " ", tweet)
    # replace '...' with ' '
    tweet = re.sub(r"\.{2,}", " ", tweet)
    # remove punctuation
    tweet = tweet.translate(str.maketrans('', '', string.punctuation))
    # squeeze repeated letters so spell check will work (ex: 'aaaand' --> 'aand');
    # keeping two letters preserves genuine doubles ('goood' --> 'good')
    tweet = re.sub(r"([a-z])\1{2,}", r"\1\1", tweet)
    # replace consecutive spaces with one
    tweet = ' '.join(tweet.split())

    # custom multi-word replacements, applied after punctuation removal so the '_' survives
    for pattern, new in multi_word_rules:
        tweet = pattern.sub(new, tweet)

    return tweet, stop_words, all_words

def tweet_tokenize(tweet, more_stop=None, more_words=None):
    """
    Clean a single tweet and return its lemmatized tokens joined by spaces.

    The tweet is normalized with clean_tweet(), split with TweetTokenizer, and
    only tokens that are in the vocabulary and are not stop words are kept;
    each kept token is lemmatized with WordNetLemmatizer.

    Parameters:
        - tweet (str, required): raw tweet text
        - more_stop (List, optional): additional stop words to exclude
        - more_words (List, optional): additional words to INCLUDE in dictionary

    Returns:
        str: the kept, lemmatized tokens joined by single spaces ('' if none survive)
    """

    tweet, stop_words, all_words = clean_tweet(tweet)

    # Extra words are lowercased because the tweet text is lowercased before lookup.
    # New sets are built so the sets returned by clean_tweet are never mutated.
    if more_stop:
        stop_words = set(stop_words).union(word.lower() for word in more_stop)
    if more_words:
        all_words = set(all_words).union(word.lower() for word in more_words)

    tokenizer = TweetTokenizer()
    lemmatizer = WordNetLemmatizer()

    # filter on the surface form first, then lemmatize what is kept
    tokens = [
        lemmatizer.lemmatize(token)
        for token in tokenizer.tokenize(tweet)
        if token in all_words and token not in stop_words
    ]

    return ' '.join(tokens)

In [None]:
%%time
# Run tweet_tokenize on every raw tweet in 'original' and store the result in a new 'tweet' column.
data['tweet'] = data['original'].map(tweet_tokenize)

In [None]:
# Write the DataFrame to disk as a pickle file (path is relative to the working directory).
data.to_pickle("pickle/n2_tokenized.pick")

In [None]:
# Show 15 random rows with the 'original' and 'tweet' columns side by side (unseeded, so rows vary per run).
data[['original', 'tweet']].sample(15)

In [None]:
# data.to_pickle("pickle/tweets_df_5000tw.pick")

In [None]:
# Print every entry of the imported multi_words list, one per line.
for mw in multi_words:
    print(mw)

In [None]:
# Hand-written test string: includes a nonsense word, runs of spaces, and capitalized words.
sample = 'tweet tokenize me please mr. biden helloaskldjalksfj I  pence    voting rights am asking for a favor continuous breakdown American Americans'

# Display the tokenizer output for the test string.
tweet_tokenize(sample)

In [None]:
# Apply each (old, new) pair from multi_words to a hand-written phrase with re.sub;
# `old` is passed to re.sub as a regular-expression pattern.
text = 'left wing hispanic vote white house'
for old, new in multi_words:
    text = re.sub(old, new, text)
text


In [None]:
# data.to_pickle("pickle/n2_tokenized.pick")

In [None]:
# Show 10 random rows of the DataFrame (unseeded, so rows vary per run).
data.sample(10)

In [None]:
# (rows, columns) of the DataFrame at this point in the notebook.
data.shape