# 4. Preprocessing and NLP Modeling
*Author: Boom*

## Import Packages and Data

In [1]:
# Load data manipulation libraries
import numpy as np
import pandas as pd

# Load text processing libraries
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import gensim

# Load visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Miscellaneous
%matplotlib inline
import warnings
warnings.simplefilter(action='ignore')



In [2]:
# Read clean data
clean_tweets_df = pd.read_csv("./datasets/clean_tweets_20180101-20181231.csv")

## Preprocessing

### Tweet Tokenization

Tokenization is the process of turning a body of text into its constituent words (i.e. tokens). To make processing easier, will create a nested list of lists which we will later iterate through.

To clarify, each tweet will have a list of cleaned tokens (inner list) and will have length equal to number of tokens in that tweet. The outer list will contain these inner lists and will have length equal to the number of tweets in our sample.

In [3]:
# Instantiate Tokenizer
tokenizer = RegexpTokenizer(r'\b[^\d\W]+\b') # Regex here says to get full words but exclude digits

# Tokenizer Tweets
dirty_tweet_tokens = [tokenizer.tokenize(token.lower().strip()) for token in clean_tweets_df['text']]

# Initialize master list to populate
master_tweet_tokens = []

# For each dirty tweet, get list of clean tokens and append to master_tweet_tokens
for tweet in dirty_tweet_tokens:
    # Create template for clean tokens to populate for this particular tweet
    clean_tweet_tokens = []
    # For each word/token in tweet, make sure it is not a stopword or URL piece or main search term 
    for word in tweet:
        if ((word not in stopwords.words('english')) and (word not in ['http','https','www','com','@','...','…','power','outage','outages','blackout'])):
            # Now extract it into clean token list
            clean_tweet_tokens.append(word)
    # Append this list of tokens to the master list
    master_tweet_tokens.append(clean_tweet_tokens)

### Word2Vec Vectorization

Word2Vec is a model based on shallow neural networks that are trained to reconstruct the linguistic context of words. The model takes in a large corpus of text and converts it to a high-dimensional vector (typically several hundred dimensions). Each word in the corpus is assigned a corresponding vector positioned in such a way that words sharing common contexts in the corpus are close to each other in the space.

#### Instantiate Word2Vec Model from Google News

The model is trained using the GoogleNews vector. Note however because the `GoogleNews-vectors-negative300.bin.gz` file is too large to be put on GitHub, you will need to save it down in your local repository one folder level up. 

In your terminal (iOS) or GitBash (windows), make sure you are one folder level up then run
`curl -O https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz`

In [4]:
# This takes at least 10 minutes to load
w2v_model = gensim.models.KeyedVectors.load_word2vec_format('../GoogleNews-vectors-negative300.bin.gz', binary=True)

#### Vectorization Step

- We begin by defining a list of words that indicate serious/true/electrical blackout vs. list of words that refer to other non-serious blackouts (e.g. Netflix, internet, drunken night)
- We will use these words to create vectors for word2vec.
- Subject matter experts should refine these word lists as they see fit.

In [5]:
# List of words for SERIOUS (actual electrical) outages
serious = ['electricity', 'electrical', 'conedison', 'con edison',
           'generator', 'generators', 'failure', 'malfunction', 'fuse',
           'blow', 'explode', 'power grid', 'breaker', 'loss',
           'dark', 'darkness', 'pitch black', 'blind', 'massive',
           'major', 'serious', 'inclement', 'surge', 'storm',
           'solar flare', 'alert', 'light', 'lights']

# List of words for NON-SERIOUS (non-electrical such as movie streaming, internet, phone, non-current "outages")
non_serious = ['netflix', 'hulu', 'time warner', 'twc', 'at&t',
               'att', 'verizon', 'tmobile', 't-mobile', 'phone',
               'internet', 'wireless' 'conference', 'prepare', 'meeting',
               'strength', 'training', 'discipline', 'vicpowers', 'baseball']

For the corpus of words in each category, we take the word vector of each of the words and average them to gauge the overall "sentiment".

In [6]:
# Define corpus vectorization function
def vectorize_corpus(keyword_list):    
    
    # Instantiate counter for number of words in keyword_list that exists in GoogleNews word list
    n_words = 0
    
    # Create template for cumulative corpus vector sum
    corpus_vec_sum = np.zeros((1,300))                 # 300 dimensions is GoogleNews' default vector length
    
    # Scan through each word in corpus
    for word in keyword_list:
        if word in w2v_model.vocab:                    # Check if word exists in GoogleNews word list
            word_vec = w2v_model.word_vec(word)        # If yes, vectorize it: get its 300-dimensional word vector
            n_words +=1                                # Update counter
            corpus_vec_sum = corpus_vec_sum + word_vec # Update cumulative vector sum for corpus

    # Compute average vector by taking cumulative vector sum and dividing it by number of words traced
    corpus_avg_vec = corpus_vec_sum/n_words
    
    # Squeeze this messy N-dimensional nested array object into a 1-D array to streamline future processing
    corpus_avg_vec = np.squeeze(corpus_avg_vec)
    
    return(corpus_avg_vec)

In [7]:
# Apply function to vectorize two corpora
serious_vec     = vectorize_corpus(serious)
non_serious_vec = vectorize_corpus(non_serious)

### Modeling: NLP Classification

In an ideal world, someone could manually go through each Tweet and classify whether or not it is in response to a live electrical blackout. However with thousands of tweets, this isn't a viable option. Thus, the absence of a target variable to compare means we must employ an unsupervised learning model.

To tackle this classification problem, we use cosine similarity. Each tweet is vectorized in a similar manner to how we vectorized the category corpus above (i.e. vectorize each word and average them), then compare the cosine similarity scores of:
- `tweet_avg_vec` vs. `serious_vec`
- `tweet_avg_vec` vs. `non_serious_vec`

A higher cosine similiarity score means the tweet is more likely to belong to that category. We assign a target variable value accordingly:
- If the cosine_sim(`tweet_avg_vec`,`serious_vector`) $\ge$  cosine_sim(`tweet_avg_vec`,`non_serious_vector`), then assign `serious_blackout = 1`.
- If the cosine_sim(`tweet_avg_vec`,`serious_vector`) $<$  cosine_sim(`tweet_avg_vec`,`non_serious_vector`), then assign `serious_blackout = 0`.

#### Define cosine similarity function

In [8]:
## Define function to compute cosine similarity score
def cosine_sim(u, v):
    dot_product = np.dot(u, v)
    magnitude_product = np.sqrt(np.dot(u, u)) * np.sqrt(np.dot(v, v))
    return (dot_product/magnitude_product)

#### Classify Tweets with  `serious_blackout` binary target variable

In [9]:
# For each tweet in our list of tweets' list of tokens
for idx, tweet in enumerate(master_tweet_tokens):
    
    # Call our function to get the tweet average vector
    tweet_avg_vec = vectorize_corpus(tweet)
        
    # Compare cosine similarity then assign target variable value
    clean_tweets_df['score_serious'] = cosine_sim(tweet_avg_vec, serious_vec)
    clean_tweets_df['score_non_serious'] = cosine_sim(tweet_avg_vec, non_serious_vec)
    
    if cosine_sim(tweet_avg_vec, serious_vec) >= cosine_sim(tweet_avg_vec, non_serious_vec):
        clean_tweets_df.loc[idx,'serious_blackout'] = 1
    else:
        clean_tweets_df.loc[idx, 'serious_blackout'] = 0

In [10]:
# Check that there are no errors in values filled
print("There are", clean_tweets_df[['serious_blackout']].isnull().sum()[0],"null or missing values in the target variable column")
print(" ")
print("Here are some summary statistics:")
clean_tweets_df[['serious_blackout']].describe()

There are 0 null or missing values in the target variable column
 
Here are some summary statistics:


Unnamed: 0,serious_blackout
count,954.0
mean,0.379455
std,0.485506
min,0.0
25%,0.0
50%,0.0
75%,1.0
max,1.0


In [13]:
# clean_tweets_df.to_csv("./datasets/finalized_learned_tweets_df.csv")