In [1]:
import pickle
import re
import string
import pandas as pd
import numpy as np

from textblob import TextBlob
from collections import Counter

from langdetect import detect

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.decomposition import NMF, LatentDirichletAllocation

import nltk
from nltk.tokenize import TweetTokenizer, word_tokenize
from nltk.corpus import stopwords, words
from nltk.stem import PorterStemmer, WordNetLemmatizer

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer


**Kelsey 1-1**

- cleaning
    - preprocessing until comfortable with words
   
- sentiment analysis on all tweets
    - don't need to do any splitting at this stage
    - TextBlob & VaderSentiment first, spacy if the results aren't as expected
    
- topic modeling
    - decide: use all tweets (all topics) at once
        - start here
        - then can use these as features in the dataFrame and do splitting here
    - or: split to trump/biden - then bot/not bot for each
    - point here is there are multiple ways to split it
        - no right answers

In [2]:
# Notebook display limits: show at most 10 rows and 40 columns, and never
# truncate cell text (tweets need to be readable in full).
pd.options.display.max_rows = 10
pd.options.display.max_columns = 40
pd.options.display.max_colwidth = None

In [3]:
# must uncomment & run the first time to DOWNLOAD NLTK data
# I used package identifier 'popular'
# nltk.download()

In [4]:
# Load the pickled tweet DataFrame and show its (rows, columns) shape.
# NOTE: unpickling can execute arbitrary code -- only load pickle files you trust.
tweets_pickle_path = "pickle/balanced_nov2_tweets.pick"
data = pd.read_pickle(tweets_pickle_path)
data.shape

(96000, 9)

In [5]:
# List the column labels of the loaded DataFrame.
data.columns

Index(['id', 'date', 'time', 'user_id', 'username', 'tweet', 'hashtags',
       'trump', 'biden'],
      dtype='object')

In [6]:
# Number of distinct usernames in the data set.
len(data["username"].unique())

69839

In [7]:
# Peek at three random rows (no random_state is passed, so the rows differ per run).
data.sample(3)

Unnamed: 0,id,date,time,user_id,username,tweet,hashtags,trump,biden
68721,1323403683305005059,2020-11-02,23:16:40,935285552601067520,broc_universe,@DanicMendonca Brasileira que apoia o Trump KKKKKKKKKKKKKKKKKKKKKKKKKKK,[],True,False
80516,1323390816895815681,2020-11-02,22:25:32,1173375597042688000,magicjo68173373,"@TeamJoe @JoeBiden @ladygaga @KamalaHarris @DrBiden As a 15 year old I know that Joe Biden will save people's lives, fix racism, give health care to those who can't afford it, and help the planets atmosphere. A vote for Joe is a vote for a better future",[],False,True
6098,1323412844759711749,2020-11-02,23:53:04,938850153372536832,stevenp30730282,@CGasparino @LJMoynihan @JoeBiden Was it not 2 days ago you were saying a GOP “insider” was very worried about PA and like 3 other states?,[],False,True


Now let's create a subset containing the same number of Trump tweets as Biden tweets. We will exclude tweets that mention both candidates.

In [8]:
# keep only necessary columns
data['original'] = data.tweet
data.drop(columns='tweet', inplace=True)
data.head(3)

Unnamed: 0,id,date,time,user_id,username,hashtags,trump,biden,original
181142,1323379284434669568,2020-11-02,21:39:43,2820503362,artistacriseida,[],False,True,"All these articles showing that Biden is in the lead.... IGNORE THAT AND STILL GO VOTE. All of these maps showing information that may or may not be correct won’t matter on Election Day. Hillary was also in the lead last election, just do your part. ⁽ᶠᵘᶜᵏ ᵗʳᵘᵐᵖ⁾"
0,1323414585995526144,2020-11-02,23:59:59,1312487180258820096,annapieters17,[],False,True,@FoxNews Lady Gaga’s a nobody. Can’t figure out her own life and can’t even see nobody can help Biden. He’s out of the game from the day he gets in the game.
4,1323414585232293888,2020-11-02,23:59:59,2335763630,kylechwatt,[],False,True,"@The_Grupp “It is purely a fortuity that this isn’t one of the great mass casualty events in American history,” Ron Klain, who was Biden’s chief of staff at the time, said of H1N1 in 2019.” https://t.co/Umi317supK"


## Pre-Processing Pipeline

Now it's time to tokenize our tweets. Here are our pre-processing steps:
* Remove URLs
* Remove Twitter handles
* Remove numbers
* Convert to lowercase
* Remove punctuation
* Collapse runs of three or more repeated letters into one so spell check will work ('aaaaand' -> 'and')
* Remove non-English words
* Remove stop words

Since we're working with so many different words, I've chosen to use **lemmatization** instead of stemming for two reasons:
1. Lemmatization accurately reduces words to their true meaning (their dictionary form)
2. Increased word reduction (handles synonyms)

In [None]:
# Testing these before putting them in func:
# run every cleaning step on one sample tweet and display the result.

tweet = 'this is ! SAMPLE text...    @joebiden @donaldtrump #2020electionusa #2020ELECTIONUSA'

# remove urls
tweet = re.sub(r"https?:\/\/\S+", "", tweet)
# remove numbers (any run of word characters that contains a digit)
tweet = re.sub(r'\w*\d\w*', ' ', tweet)
# replace '...' with ' '
tweet = re.sub(r'\.{2,6}', ' ', tweet)
# remove punctuation
tweet = tweet.translate(str.maketrans('', '', string.punctuation))
# convert to lowercase
tweet = tweet.lower()

# and other popular campaign phrases
tweet = re.sub(r"make america great again", "maga", tweet)
tweet = re.sub(r"makeamericagreatagain", "maga", tweet)

# collapse runs of 3+ identical letters into one so spell check will work
# (ex: 'aaaand' --> 'and')
tweet = re.sub(r"([a-z])\1{2,}", r'\1', tweet)
# replace consecutive spaces with one
tweet = ' '.join(tweet.split())

# join candidate names (and their run-together handle forms) with '_'
tweet = re.sub(r"joebiden", "joe_biden", tweet)
tweet = re.sub(r"kamalaharris", "kamala_harris", tweet)
tweet = re.sub(r"donaldtrump", "donald_trump", tweet)
tweet = re.sub(r"mikepence", "mike_pence", tweet)
tweet = re.sub(r"joe biden", "joe_biden", tweet)
tweet = re.sub(r"kamala harris", "kamala_harris", tweet)
tweet = re.sub(r"donald trump", "donald_trump", tweet)
tweet = re.sub(r"mike pence", "mike_pence", tweet)
# replace a bare surname with the full name (do for all candidates);
# '_' is a word character, so \b does not re-match inside e.g. 'joe_biden'
tweet = re.sub(r"\bbiden\b", "joe_biden", tweet)
tweet = re.sub(r"\bpence\b", "mike_pence", tweet)
tweet = re.sub(r"\bharris\b", "kamala_harris", tweet)
tweet = re.sub(r"\btrump\b", "donald_trump", tweet)
# other replacements
tweet = re.sub(r"attorney general", "attorney_general", tweet)
tweet = re.sub(r"white house", "white_house", tweet)

tweet


In [10]:
# from autocorrect import Speller # TOO SLOW...TRY PYSPELLCHECKER

# Translation table that deletes every ASCII punctuation character
# (built once instead of on every call).
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# Popular campaign phrases, applied right after lowercasing.
_CAMPAIGN_PHRASES = (
    (r"make america great again", "maga"),
    (r"makeamericagreatagain", "maga"),
)

# Ordered (pattern, replacement) rules applied to the lowercased,
# space-normalised text. Order matters: longer forms come before shorter ones.
_ENTITY_RULES = (
    # '@realDonaldTrump' must be handled before the generic 'donaldtrump' rule,
    # which would otherwise turn it into the meaningless 'realdonald_trump'.
    (r"realdonaldtrump", "donald_trump"),
    # run-together names / handles
    (r"joebiden", "joe_biden"),
    (r"kamalaharris", "kamala_harris"),
    (r"donaldtrump", "donald_trump"),
    (r"mikepence", "mike_pence"),
    # names written with a space
    (r"joe biden", "joe_biden"),
    (r"kamala harris", "kamala_harris"),
    (r"donald trump", "donald_trump"),
    (r"mike pence", "mike_pence"),
    # bare surnames; '_' is a word character, so \b never re-matches inside
    # an already joined name such as 'joe_biden'
    (r"\bbiden\b", "joe_biden"),
    (r"\bpence\b", "mike_pence"),
    (r"\bharris\b", "kamala_harris"),
    (r"\btrump\b", "donald_trump"),
    # other replacements
    (r"attorney general", "attorney_general"),
    (r"white house", "white_house"),
)


def clean_tweets(tweet):
    """Normalise the raw text of one tweet (pre-processing pipeline).

    Steps, in order: remove URLs, remove every token containing a digit,
    delete ASCII punctuation, lowercase, shorten campaign phrases to 'maga',
    collapse runs of 3+ identical letters into one, collapse whitespace, and
    rewrite candidate names and a few fixed phrases as single '_'-joined tokens.

    Parameters:
        - tweet (str): raw tweet text

    Returns:
        str: the cleaned text; '' when `tweet` is not a string (e.g. None/NaN)
    """
    # missing values in a DataFrame column arrive as None / float('nan')
    if not isinstance(tweet, str):
        return ""

    # remove urls
    tweet = re.sub(r"https?:\/\/\S+", "", tweet)
    # remove numbers (any run of word characters that contains a digit)
    tweet = re.sub(r"\w*\d\w*", " ", tweet)
    # remove punctuation (this also strips '@', '#' and '_')
    tweet = tweet.translate(_PUNCTUATION_TABLE)
    # convert to lowercase
    tweet = tweet.lower()

    for pattern, replacement in _CAMPAIGN_PHRASES:
        tweet = re.sub(pattern, replacement, tweet)

    # collapse runs of 3+ identical letters into one (ex: 'aaaand' --> 'and')
    tweet = re.sub(r"([a-z])\1{2,}", r"\1", tweet)
    # replace consecutive spaces with one (also trims both ends)
    tweet = " ".join(tweet.split())

    for pattern, replacement in _ENTITY_RULES:
        tweet = re.sub(pattern, replacement, tweet)

    return tweet

# Election-related tokens that are kept even though they are missing from the
# NLTK `words` corpus.
_ELECTION_WORDS = frozenset([
    'trump', 'biden', 'maga', 'bidenharris',
    'kamala', 'pence', 'harris', 'mike',
    'bidenharris2020', 'trumppence', 'white_house',
    'trumppence2020', 'usa', 'election2020',
    'ivoted', 'joe_biden', 'realdonaldtrump',
    'donald_trump', 'attorney_general',
    'mike_pence', 'kamala_harris',
])

# Filled on first use so the large corpus set is built once, not per tweet.
_TOKENIZE_CACHE = {}


def _tokenize_resources():
    """Build (once) and return the vocabulary, stop words, tokenizer and lemmatizer."""
    if not _TOKENIZE_CACHE:
        _TOKENIZE_CACHE.update(
            vocabulary=frozenset(words.words()) | _ELECTION_WORDS,
            stop_words=frozenset(stopwords.words('english')),
            tokenizer=TweetTokenizer(),
            lemmatizer=WordNetLemmatizer(),
        )
    return _TOKENIZE_CACHE


def tweet_tokenize(tweet, more_stop=None, more_words=None):
    """Tokenize one tweet and return its kept, lemmatized tokens as a string.

    The text is split with NLTK's TweetTokenizer. A token is kept when it is in
    the vocabulary (NLTK `words` corpus plus the election terms above plus
    `more_words`) and is not a stop word (NLTK English stop words plus
    `more_stop`). Kept tokens are lemmatized with WordNetLemmatizer.

    Parameters:
        - tweet (str, required): text of a single tweet

        - more_stop (List, optional): additional stop words to exclude

        - more_words (List, optional): additional words to INCLUDE in dictionary

    Returns:
        str: the remaining tokens joined by single spaces; '' when `tweet` is
        not a string (e.g. None/NaN)
    """
    if not isinstance(tweet, str):
        return ''

    resources = _tokenize_resources()
    vocabulary = resources['vocabulary']
    stop_words = resources['stop_words']
    # Small per-call sets; the big cached sets are never copied.
    extra_words = frozenset(more_words or ())
    extra_stop = frozenset(more_stop or ())

    kept = [
        token for token in resources['tokenizer'].tokenize(tweet)
        if (token in vocabulary or token in extra_words)
        and token not in stop_words
        and token not in extra_stop
    ]

    # lemmatize text
    lemmatize = resources['lemmatizer'].lemmatize
    return ' '.join(lemmatize(token) for token in kept)

In [11]:
# Module-level lemmatizer instance.
# NOTE(review): tweet_tokenize creates its own lemmatizer, so this looks unused
# in the cells visible here -- confirm before removing.
lemm = WordNetLemmatizer()

In [12]:
# Run the pre-processing pipeline on every raw tweet before tokenizing it:
# clean_tweets strips URLs, digits and punctuation, lowercases and joins
# candidate names; tweet_tokenize then keeps and lemmatizes vocabulary tokens.
data['tweet'] = data['original'].map(clean_tweets).map(tweet_tokenize)

KeyboardInterrupt: 

In [None]:
# Save the DataFrame (including the processed 'tweet' column) to disk.
data.to_pickle("pickle/n2_tokenized.pick")

In [None]:
# Spot-check ten random rows (no random_state is passed, so the rows differ per run).
data.sample(10)

In [None]:
# data.to_pickle("pickle/tweets_df_5000tw.pick")