In [None]:
import pickle
import re
import string
import pandas as pd
import numpy as np

from textblob import TextBlob
from collections import Counter

from langdetect import detect

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.decomposition import NMF, LatentDirichletAllocation

import spacy
import nltk
from nltk.tokenize import TweetTokenizer, word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer


In [None]:
# spaCy's English stop-word set. `spacy.lang` is a package, not a callable, so the
# set has to be imported from the English language-data module.
from spacy.lang.en.stop_words import STOP_WORDS as spacy_stopwords

**Kelsey 1-1**

- cleaning
    - preprocessing until comfortable with words
   
- sentiment analysis on all tweets
    - don't need to do any splitting at this stage
    - TextBlob & VaderSentiment first, spacy if the results aren't as expected
    
- topic modeling
    - decide: use all tweets (all topics) at once
        - start here
        - then can use these as features in the dataFrame and do splitting here
    - or: split to trump/biden - then bot/not bot for each
    - point here is there are multiple ways to split it
        - no right answers

In [None]:
# Notebook display limits: show at most 10 rows and 40 columns, and never truncate cell text.
pd.set_option(
    'display.max_rows', 10,
    'display.max_columns', 40,
    'display.max_colwidth', None,
)

In [None]:
# must uncomment & run the first time to DOWNLOAD NLTK data
# I used package identifier 'popular'
# nltk.download()

In [None]:
# Load the pickled tweet data (path is relative to the notebook's working directory).
# NOTE: unpickling can execute arbitrary code -- only load pickle files from a trusted source.
data = pd.read_pickle("pickle/balanced_nov2_tweets.pick")
# (rows, columns) -- assumes the pickle holds a DataFrame
data.shape

In [None]:
# list the available columns
data.columns

In [None]:
# number of distinct usernames (a missing value, if present, counts as one entry)
len(data.username.unique())

In [None]:
# random peek at 3 rows; no random_state is set, so the rows differ on every run
data.sample(3)

Now let's create a subset containing the same number of Trump tweets as Biden tweets. We will exclude tweets that mention both candidates.

In [None]:
# Move the raw text from 'tweet' to a new 'original' column (appended as the last
# column); the 'tweet' name is reused later for the cleaned, tokenized text.
data['original'] = data.pop('tweet')
data.head(3)

## Pre-Processing Pipeline

Now it's time to tokenize our tweets. Here are our pre-processing steps:
* Remove URLs
* Strip the `@` from Twitter handles (the handle text itself is kept, e.g. `@joebiden` becomes `joe_biden`)
* Remove numbers
* Convert to lowercase
* Remove punctuation
* Collapse letters repeated three or more times into one so spell check will work ('aaaaand' -> 'and')
* Remove non-English words
* Remove stop words

Since we're working with so many different words, I've chosen to use **lemmatization** instead of stemming for two reasons:
1. Lemmatization reduces each word to its dictionary form (lemma), so the result is always a real word
2. Increased word reduction (handles synonyms)

In [None]:
# Scratch cell: try every cleaning step on one hand-written tweet before
# moving the steps into a function.

tweet = 'this is ! SAMPLE text...  blm united states of america U.S.A usa U.S.  @joebiden @donaldtrump #2020electionusa #2020ELECTIONUSA'

# pre-processing pipeline

# convert to lowercase
tweet = tweet.lower()

# convert 'U.S.' --> 'usa'. The lookahead leaves the following whitespace in place
# (so 'u.s. economy' does not fuse into 'usaeconomy') and also matches at the end of
# the text. 'u.s.a' is skipped here and becomes 'usa' once punctuation is stripped.
tweet = re.sub(r"\bu\.s\.(?!\w)", "usa", tweet)
# remove urls
tweet = re.sub(r"https?:\/\/\S+", "", tweet)
# remove numbers: every token containing a digit is dropped as a whole (e.g. '2020electionusa')
tweet = re.sub(r'\w*\d\w*', ' ', tweet)
# replace '...' with ' '
tweet = re.sub(r'\.{2,}', ' ', tweet)

# remove punctuation
tweet = tweet.translate(str.maketrans('', '', string.punctuation))

# collapse letters repeated 3+ times into one so spell check will work (ex: 'aaaand' --> 'and')
tweet = re.sub(r"([a-z])\1{2,}", r'\1', tweet)
# replace consecutive spaces with one
tweet = ' '.join(tweet.split())

# Ordered (regex pattern, replacement) pairs. ORDER MATTERS: a longer phrase must come
# before any shorter phrase it contains, otherwise the shorter rule rewrites the text
# first and the longer rule can never match.
sample_replacements = [
    # join candidate names / titles into single underscore tokens
    (r"joebiden", "joe_biden"),
    (r"kamalaharris", "kamala_harris"),
    (r"president trump", "donald_trump"),
    (r"president donald trump", "donald_trump"),
    (r"vice president mike pence", "mike_pence"),
    (r"vice president pence", "mike_pence"),
    (r"vice president kamala harris", "kamala_harris"),
    (r"vice president harris", "kamala_harris"),
    (r"democratic presidential nominee", "democratic_presidential_nominee"),
    (r"republican presidential nominee", "republican_presidential_nominee"),
    (r"red state", "red_state"),
    (r"blue state", "blue_state"),
    (r"absentee ballot", "absentee_ballot"),
    (r"voting rights", "voting_rights"),

    (r"donaldtrump", "donald_trump"),
    (r"mikepence", "mike_pence"),
    # must run before "joe biden", which would otherwise turn the phrase into
    # 'sleepy joe_biden' and leave the first rule unreachable
    (r"sleepy joe biden", "sleepy_joe"),
    (r"sleepy joe", "sleepy_joe"),
    (r"joe biden", "joe_biden"),
    (r"kamala harris", "kamala_harris"),
    (r"donald trump", "donald_trump"),
    (r"mike pence", "mike_pence"),
    (r"nancy pelosi", "nancy_pelosi"),
    (r"mitch mcconnell", "mitch_mcconnell"),

    # bare surnames --> full name token ('_' is a word character, so \b does not
    # match inside tokens that were already joined, e.g. 'joe_biden')
    (r"\bbiden\b", "joe_biden"),
    (r"\bpence\b", "mike_pence"),
    (r"\bharris\b", "kamala_harris"),
    (r"\btrump\b", "donald_trump"),

    # other replacements
    (r"united states of america", "usa"),
    (r"pro life", "pro_life"),
    (r"pro choice", "pro_choice"),
    (r"black lives matter", "black_lives_matter"),
    (r"blm", "black_lives_matter"),
    (r"blacklivesmatter", "black_lives_matter"),
    (r"mailin ballots", "mail_in_ballots"),
    (r"mailin", "mail_in_ballots"),
    (r"united states", "usa"),
    (r"attorney general", "attorney_general"),
    (r"white house", "white_house"),
    (r"make america great again", "maga"),
    (r"makeamericagreatagain", "maga"),
    (r"election fraud", "election_fraud"),
    (r"presidential election", "election"),
    (r"running mate", "running_mate"),
    (r"voting machine", "voting_machine"),
    (r"cast your ballot", "cast_your_ballot"),
    (r"foreign policy", "foreign_policy"),
    (r"election day", "election_day"),
    (r"voting booth", "voting_booth"),
    (r"radical left", "radical_left"),
    (r"free speech", "free_speech"),
    (r"first amendment", "first_amendment"),
    (r"racial injustice", "racial_injustice"),
    (r"social inequality", "social_inequality"),
    (r"russian interference", "russian_interference"),
    (r"electoral college", "electoral_college"),
    (r"right wing", "right_wing"),
    (r"left wing", "left_wing"),
    (r"far right", "far_right"),
    (r"far left", "far_left"),
    (r"conspiracy theory", "conspiracy_theory"),
    (r"domestic terrorism", "domestic_terrorism"),
    # last, so the longer 'vice president <name>' rules above get the first chance
    (r"vice president", "vice_president"),
]

for old, new in sample_replacements:
    tweet = re.sub(old, new, tweet)

tweet

In [None]:
# nltk.download()

In [None]:
import nltk
from nltk.corpus import stopwords, words

# Quick look at the first ten entries of NLTK's English word list.
# (stop words can be inspected the same way: print(stopwords.words('english')))
print([*words.words('en')][:10])

In [None]:
# from autocorrect import Speller # TOO SLOW...TRY PYSPELLCHECKER
# custom word dictionaries
from more_words import more_words as custom_words
from stop_words import stop_words as custom_stop_words
from multi_words import multi_words

import nltk
from nltk.corpus import words, stopwords


def clean_tweet(tweet, replacements=None):
    """
    Normalize the raw text of a single tweet.

    Steps, in order: lowercase; 'u.s.' -> 'usa'; drop URLs; drop every token that
    contains a digit; turn runs of dots into a space; strip punctuation; collapse
    letters repeated 3+ times into one; squeeze whitespace; apply the multi-word
    replacements.

    Parameters:
        - tweet (str, required): raw tweet text
        - replacements (iterable of (pattern, replacement) tuples, optional):
          regex substitutions applied last, in the given order. Defaults to the
          imported `multi_words` list, e.g. ('white house', 'white_house').
    Returns:
        str: the cleaned text
    """
    # pre-processing pipeline

    # convert to lowercase
    tweet = tweet.lower()
    # convert 'U.S.' --> 'usa'. The lookahead keeps the whitespace that follows
    # (so 'u.s. economy' does not fuse into 'usaeconomy') and also matches at the
    # end of the text; 'u.s.a' becomes 'usa' when punctuation is stripped below.
    tweet = re.sub(r"\bu\.s\.(?!\w)", "usa", tweet)
    # remove urls
    tweet = re.sub(r"https?:\/\/\S+", "", tweet)
    # remove numbers: every token containing a digit is dropped as a whole
    tweet = re.sub(r'\w*\d\w*', ' ', tweet)
    # replace '...' with ' '
    tweet = re.sub(r'\.{2,}', ' ', tweet)
    # remove punctuation
    tweet = tweet.translate(str.maketrans('', '', string.punctuation))
    # collapse letters repeated 3+ times into one so spell check will work (ex: 'aaaand' --> 'and')
    tweet = re.sub(r"([a-z])\1{2,}", r'\1', tweet)
    # replace consecutive spaces with one
    tweet = ' '.join(tweet.split())

    # custom replacements; looked up lazily so an explicit list never needs multi_words
    if replacements is None:
        replacements = multi_words
    for old, new in replacements:
        tweet = re.sub(old, new, tweet)
    return tweet

# Shared resources for tweet_tokenize, filled on first use. Re-running this cell
# resets the cache, which is how edits to the custom word lists get picked up.
_TOKENIZE_RESOURCES = {}


def _tokenize_resources():
    """Return the shared stop-word set, dictionary set, tokenizer and lemmatizer, building them once."""
    if not _TOKENIZE_RESOURCES:
        _TOKENIZE_RESOURCES['stop_words'] = (
            frozenset(stopwords.words('english')) | frozenset(custom_stop_words)
        )
        _TOKENIZE_RESOURCES['vocabulary'] = (
            frozenset(words.words('en')) | frozenset(custom_words)
        )
        _TOKENIZE_RESOURCES['tokenizer'] = TweetTokenizer()
        _TOKENIZE_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _TOKENIZE_RESOURCES


def tweet_tokenize(tweet, more_stop=None, more_words=None):
    """
    Clean one tweet and return its lemmatized tokens as a single string.

    The tweet is cleaned with clean_tweet(), split with TweetTokenizer, filtered to
    tokens that are in the dictionary (NLTK word list + custom words) and are not
    stop words (NLTK English stop words + custom stop words), then lemmatized.
    The filter runs on the token as written, before lemmatization.

    Parameters:
        - tweet (str, required): raw text of a single tweet
        - more_stop (List, optional): additional stop words to exclude
        - more_words (List, optional): additional words to INCLUDE in dictionary
    Returns:
        str: the kept, lemmatized tokens joined by single spaces
    """
    shared = _tokenize_resources()
    stop_words = shared['stop_words']
    vocabulary = shared['vocabulary']
    tokenizer = shared['tokenizer']
    lemmatizer = shared['lemmatizer']

    # Per-call extras are kept as small separate sets so the large shared sets
    # are never copied.
    extra_stop = frozenset(more_stop or ())
    extra_words = frozenset(more_words or ())

    tokens = [
        lemmatizer.lemmatize(token)
        for token in tokenizer.tokenize(clean_tweet(tweet))
        if (token in vocabulary or token in extra_words)
        and token not in stop_words
        and token not in extra_stop
    ]

    return ' '.join(tokens)

In [None]:
# Quick check that the WordNet lemmatizer works on a single word.
lemm = WordNetLemmatizer()
lemm.lemmatize('word')

In [None]:
%%time
# Clean and tokenize every raw tweet; each result is a space-joined string of tokens
# stored in a new 'tweet' column next to the untouched 'original' text.
data['tweet'] = data['original'].map(tweet_tokenize)

In [None]:
# Save the frame, including the tokenized 'tweet' column, so the tokenizing step need not be re-run.
data.to_pickle("pickle/n2_tokenized.pick")

In [None]:
# spot-check 10 random rows ('original' vs. tokenized 'tweet'); rows differ on every run
data.sample(10)

In [None]:
# data.to_pickle("pickle/tweets_df_5000tw.pick")