In [2]:
# ! pip install imbalanced-learn
# ! pip install lime
# ! pip install textblob
# ! pip install contractions
# ! pip install spacy
# ! python -m spacy download en_core_web_sm
# ! pip install python-Levenshtein
# ! pip install gensim



Collecting en-core-web-sm==3.0.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.0.0/en_core_web_sm-3.0.0-py3-none-any.whl (13.7 MB)
[K     |████████████████████████████████| 13.7 MB 9.3 MB/s eta 0:00:01
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
Collecting gensim
  Downloading gensim-4.0.1-cp36-cp36m-manylinux1_x86_64.whl (23.9 MB)
[K     |████████████████████████████████| 23.9 MB 9.5 MB/s eta 0:00:01


Installing collected packages: gensim
Successfully installed gensim-4.0.1


In [53]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import seaborn as sns

import pickle 
import imblearn
from imblearn.over_sampling import SMOTE
from collections import Counter
import time

import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from gensim.models import Word2Vec
import gensim
# import lime
# import lime.lime_tabular

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
# from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score, plot_confusion_matrix

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA, IncrementalPCA, LatentDirichletAllocation

## for deep learning
from tensorflow.keras import models, layers, preprocessing as kprocessing
from tensorflow.keras import backend as K


from helpers import *

[nltk_data] Downloading package punkt to /home/ec2-user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ec2-user/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Load cleaned data
combine title and body of text 

In [14]:
aita_w2v = pd.read_csv('data/aita_w2v.csv')

# 1 - train test split

In [65]:
X = aita_w2v['combo_clean']
y = aita_w2v['is_asshole']
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.01, random_state=21)
print(X_train.shape, X_test.shape)

(75267,) (761,)


In [122]:
Counter(y)

Counter({1: 20795, 0: 55233})

# 2 - word2vec

### 2.1 Create tokenized list, detect unigrams, bigrams, trigrams
### 2.2 Train word2vec model on list of list of unigrams, bigrams, trigrams

In [72]:
lst_corpus = [post.split() for post in X_train]
bigrams_detector = gensim.models.Phrases(lst_corpus, min_count=5, threshold=10)
trigrams_detector = gensim.models.Phrases(bigrams_detector[lst_corpus], min_count=5, threshold=10)
lst_corpus = list(bigrams_detector[lst_corpus])
lst_corpus = list(trigrams_detector[lst_corpus])
nlp = gensim.models.word2vec.Word2Vec(lst_corpus, vector_size=200, window=4, min_count=3, sg=1)

In [76]:
## use fitted dectors on test set
lst_corpus_test = [post.split() for post in X_test]
## detect common bigrams and trigrams using the fitted detectors
lst_corpus_test = list(bigrams_detector[lst_corpus_test])
lst_corpus_test = list(trigrams_detector[lst_corpus_test])

In [79]:
# nlp.save("models/nlp.model")

761

In [None]:
## tokenize text
tokenizer = kprocessing.text.Tokenizer(lower=True, split=' ', 
                                       oov_token="NaN", 
                                       filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n')
tokenizer.fit_on_texts(lst_corpus)
dic_vocabulary = tokenizer.word_index

### 2.3 tokenize text, create padded sequence

In [114]:
## create sequence
lst_text2seq= tokenizer.texts_to_sequences(lst_corpus)
## padding sequence
X_train_p = kprocessing.sequence.pad_sequences(lst_text2seq, maxlen=600, padding="post", truncating="post")

In [84]:
## text to sequence with the fitted tokenizer
lst_text2seq_test = tokenizer.texts_to_sequences(lst_corpus_test)
## padding sequence
X_test_p = kprocessing.sequence.pad_sequences(lst_text2seq_test, maxlen=600, padding="post", truncating="post")

In [117]:
print(len(lst_corpus),len(X_train_p))
print(len(lst_corpus_test),len(X_test_p))

75267 75267
761 761


In [93]:
## start the matrix (length of vocabulary x vector size) with all 0s
embeddings = np.zeros((len(dic_vocabulary)+1, 200))
for word,idx in dic_vocabulary.items():
    ## update the row with vector
    try:
        embeddings[idx] =  nlp.wv[word]
    ## if word not in model then skip and the row stays all 0s
    except:
        pass

In [103]:
## code attention layer
def attention_layer(inputs, neurons):
    x = layers.Permute((2,1))(inputs)
    x = layers.Dense(neurons, activation="softmax")(x)
    x = layers.Permute((2,1), name="attention")(x)
    x = layers.multiply([inputs, x])
    return x

## input
x_in = layers.Input(shape=(600,))
## embedding
x = layers.Embedding(input_dim=embeddings.shape[0],  
                     output_dim=embeddings.shape[1], 
                     weights=[embeddings],
                     input_length=600, trainable=False)(x_in)
## apply attention
x = attention_layer(x, neurons=600)
## 2 layers of bidirectional lstm
x = layers.Bidirectional(layers.LSTM(units=600, dropout=0.2, 
                         return_sequences=True))(x)
x = layers.Bidirectional(layers.LSTM(units=600, dropout=0.2))(x)
## final dense layers
x = layers.Dense(64, activation='relu')(x)
y_out = layers.Dense(3, activation='softmax')(x)
## compile
model = models.Model(x_in, y_out)
model.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam', metrics=['accuracy'])

model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            [(None, 600)]        0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 600, 200)     29763600    input_3[0][0]                    
__________________________________________________________________________________________________
permute_2 (Permute)             (None, 200, 600)     0           embedding_2[0][0]                
__________________________________________________________________________________________________
dense_2 (Dense)                 (None, 200, 600)     360600      permute_2[0][0]                  
______________________________________________________________________________________________

In [118]:
X_train_p.shape,y_train.size

(75267, 600)

In [121]:
## encode y
dic_y_mapping = {n:label for n,label in 
                 enumerate(np.unique(y_train))}
inverse_dic = {v:k for k,v in dic_y_mapping.items()}
y_train_i = np.array([inverse_dic[y] for y in y_train])

## train
training = model.fit(x=X_train_p, y=y_train_i, batch_size=200, 
                     epochs=10, shuffle=True, verbose=0, 
                     validation_split=0.3)

ResourceExhaustedError:  [_Derived_]  OOM when allocating tensor with shape[200,600] and type float on /job:localhost/replica:0/task:0/device:CPU:0 by allocator mklcpu
	 [[{{node while_20/body/_1/split}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.

	 [[model/bidirectional/forward_lstm/StatefulPartitionedCall]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.
 [Op:__inference_distributed_function_11744]

Function call stack:
distributed_function -> distributed_function -> distributed_function


In [None]:
## plot loss and accuracy
metrics = [k for k in training.history.keys() if ("loss" not in k) and ("val" not in k)]
fig, ax = plt.subplots(nrows=1, ncols=2, sharey=True)
ax[0].set(title="Training")
ax11 = ax[0].twinx()
ax[0].plot(training.history['loss'], color='black')
ax[0].set_xlabel('Epochs')
ax[0].set_ylabel('Loss', color='black')
for metric in metrics:
    ax11.plot(training.history[metric], label=metric)
ax11.set_ylabel("Score", color='steelblue')
ax11.legend()
ax[1].set(title="Validation")
ax22 = ax[1].twinx()
ax[1].plot(training.history['val_loss'], color='black')
ax[1].set_xlabel('Epochs')
ax[1].set_ylabel('Loss', color='black')
for metric in metrics:
     ax22.plot(training.history['val_'+metric], label=metric)
ax22.set_ylabel("Score", color="steelblue")
plt.show()

In [6]:
def remove_punc_stop(s):
    s = re.sub('[%s]' % re.escape(string.punctuation), '', s)
    s = re.sub('[‘’“”…]', '', s)
    s = re.sub('\w*\d\w*', '', s)
    return s

In [105]:
# tokens = [nltk.word_tokenize(sentences) for sentences in aita_w2v['combo_clean']]
# aita_w2v['tokens'] = tokens

In [108]:
# model = gensim.models.Word2Vec(tokens, min_count=2, workers=4, window=4)

In [5]:
# model.save("models/word2vec.model")
model = Word2Vec.load("models/word2vec.model")
print(model)

Word2Vec(vocab=57802, vector_size=100, alpha=0.025)


In [7]:
def make_feature_vec(words, model, num_features):
    """
    Average the word vectors for a set of words
    """
    feature_vec = np.zeros((num_features,),dtype="float32")  # pre-initialize (for speed)
    nwords = 0.
    index2word_set = set(model.wv.index_to_key)  # words known to the model
    stop = set(stopwords.words('english')).union(['aita','wibta'])
    for word in words:
        if word in index2word_set and word not in stop: 
            nwords = nwords + 1.
            feature_vec = np.add(feature_vec,model.wv[word])
    feature_vec = np.divide(feature_vec, nwords)
    return feature_vec
def get_avg_feature_vecs(texts, model, num_features):
    """
    Calculate average feature vectors for all reviews
    """
    feature_vecs = np.zeros((len(texts),num_features), dtype='float32')  # pre-initialize (for speed)
    for ix, text in enumerate(texts):
        feature_vecs[ix] = make_feature_vec(text, model, num_features)
    return feature_vecs

In [135]:
# feature_vecs = get_avg_feature_vecs(aita_w2v['tokens'], model, 100)
# feature_vecs_df = pd.DataFrame(feature_vecs, columns=[f'w2v_{i}' for i in range(1, 101)])
# aita_w2v_1 = pd.concat([aita_w2v.reset_index(drop=True),feature_vecs_df], axis=1)

In [8]:
# aita_w2v_1.to_csv('data/aita_w2v_1.csv', index=False)
# aita_w2v_1 = pd.read_csv('data/aita_w2v_1.csv')

In [12]:
model.wv.most_similar(positive=['york'])

[('zealand', 0.9204254150390625),
 ('orleans', 0.8691218495368958),
 ('jersey', 0.744202733039856),
 ('mexico', 0.7416518330574036),
 ('england', 0.7132971286773682),
 ('hires', 0.7053131461143494),
 ('hampshire', 0.6276334524154663),
 ('boston', 0.6179364323616028),
 ('wales', 0.6043822765350342),
 ('city', 0.5941430926322937)]

## 4 Oversample minority class with SMOTE

In [162]:
# sentiment analysis + vectors
X_train_smote, y_train_smote = SMOTE().fit_resample(X_train, y_train)
X_test_smote, y_test_smote = SMOTE().fit_resample(X_test, y_test)
Counter(y_train_smote), Counter(y_test_smote)

(Counter({0: 54680, 1: 54680}), Counter({1: 553, 0: 553}))