In [8]:
## Save and load
import pickle

import numpy as np
import pandas as pd

## Metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

## Keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, Input, GRU, Embedding, Dropout, Activation
from tensorflow.keras.layers import Bidirectional, GlobalMaxPool1D
from tensorflow.keras.models import Model
from tensorflow.keras import initializers, regularizers, constraints, optimizers, layers

### Load data and GloVe vectors

In [2]:
# Load train & valid splits
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load a file produced by this project's own preprocessing step.
# The context manager closes the file handle as soon as the splits are read.
with open('../data/preprocessed.pkl', 'rb') as data_file:
    train, valid = pickle.load(data_file)
# Every column from the third one onward is treated as a label column
labels = train.columns[2:]
# Get Ys
y_train = train[labels].values
y_valid = valid[labels].values

In [3]:
## BUILD GLOVE EMBEDDINGS DICTIONARY
# Maps each word to its vector as a float32 numpy array.
embeddings_dict = dict()
# The context manager closes the file even if a line fails to parse.
with open(r'../data/glove.6B.300d.txt', encoding='utf8') as glove_file:
    for line in glove_file:
        values = line.split()
        if not values:
            continue  # skip blank lines instead of failing on values[0]
        word = values[0]  # first token on the line is the word
        vec = np.asarray(values[1:], dtype='float32')  # remaining tokens are the vector components
        embeddings_dict[word] = vec
print('Extracted {} word vectors'.format(len(embeddings_dict)))

## Additional params
embed_size = len(embeddings_dict[next(iter(embeddings_dict))]) # length of vector representation for each word (read from the first entry)
max_features = 50000 # number of unique words to use (upper bound on rows in the embedding matrix)
max_len = 100 # number of words in a comment to use

Extracted 400000 word vectors


### Standard Keras Preprocessing

In [4]:
# Raw comment strings for each split, as plain Python lists
train_list = [text for text in train['comment_text'].values]
valid_list = [text for text in valid['comment_text'].values]
# The tokenizer vocabulary is fitted on the training comments only
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(train_list)
# Convert both splits to integer sequences (train first, then valid)
train_tokenized, valid_tokenized = [
    tokenizer.texts_to_sequences(texts) for texts in (train_list, valid_list)
]
# Pad every sequence with maxlen=max_len
X_train, X_valid = [
    pad_sequences(seqs, maxlen=max_len) for seqs in (train_tokenized, valid_tokenized)
]

### Use GloVe vectors to create the embedding matrix. Words that are not in the GloVe dictionary get a random initialization, drawn from a normal distribution with the same mean and std as the GloVe vectors.

In [5]:
## Get mean and std from Glove vectors
# Stack the vectors once and reuse the result: np.stack expects a real sequence
# (list/tuple) rather than a dict view, and stacking separately for the mean and
# the std built the same large array twice.
glove_vectors = np.stack(list(embeddings_dict.values()))
# Scalar mean / std taken over every component of every vector
glove_mean, glove_std = glove_vectors.mean(), glove_vectors.std()
del glove_vectors  # the stacked copy is only needed for these two statistics

In [6]:
wordidx_dict = tokenizer.word_index # get word indices
# Keras word indices start at 1 (index 0 is never assigned to a word), so a
# vocabulary of V words needs V + 1 rows. Capped at max_features; when the
# vocabulary has fewer words than that, the + 1 keeps the highest index in range.
num_words = min(len(wordidx_dict) + 1, max_features)
# Random initialization of weights, using the GloVe mean and std; rows whose
# word has no GloVe vector keep these random values.
embedding_matrix = np.random.normal(glove_mean,
                                    glove_std,
                                    size=(num_words, embed_size))
# Update embedding matrix with GloVe vectors
for word, idx in wordidx_dict.items():
    if idx >= num_words:
        continue  # index falls outside the matrix (beyond the feature cap)
    # Grab GloVe vector if it exists
    vec = embeddings_dict.get(word)
    # If a GloVe vector exists, it replaces the random initialization
    if vec is not None:
        embedding_matrix[idx] = vec

### Build RNN
- 1 bidirectional GRU layer, followed by global max pooling and 2 FC layers (a hidden layer with dropout, then the sigmoid output layer)

In [9]:
# Named `inputs` rather than `input` so the Python builtin is not shadowed
inputs = Input(shape=(max_len,))
# Embedding layer to obtain vectors for words; initialized from embedding_matrix and frozen
x = Embedding(*embedding_matrix.shape, weights=[embedding_matrix], trainable=False)(inputs)
x = Bidirectional(GRU(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x) # bidirectional GRU layer
x = GlobalMaxPool1D()(x)
x = Dense(50, activation="relu")(x) # 1st FC layer
x = Dropout(0.1)(x)
# Output layer: one sigmoid unit per label column (multi-label classification),
# sized from `labels` so it always matches the width of y_train
x = Dense(len(labels), activation="sigmoid")(x)
model = Model(inputs=inputs, outputs=x)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

### Fit Model

In [10]:
# Train for 2 epochs in batches of 32; validation_split=0.1 asks Keras to
# validate on a fraction of the arrays passed in here.
model.fit(x=X_train, y=y_train, validation_split=0.1, epochs=2, batch_size=32)

Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x1482892a448>

In [11]:
## Persist the trained model under the artifacts directory
model.save(filepath='../artifacts/glove_gru')

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
INFO:tensorflow:Assets written to: ../artifacts/glove_gru\assets


### Check performance

In [12]:
## Get predictions
# Raw model outputs are kept in their own variable so thresholding does not
# overwrite them.
pred_probs = model.predict(X_valid, batch_size=32)
# Threshold at 0.5: outputs above 0.5 become 1, everything else 0. A new array
# is built instead of writing into the prediction array through an alias.
preds = (pred_probs > 0.5).astype(pred_probs.dtype)
# `preds_t` stays an alias of `preds`, so code reading either name gets the
# same 0/1 values.
preds_t = preds

In [13]:
## Print results
# One line of metrics per label column, computed on the thresholded (0/1)
# predictions. The loop is sized from `preds_t`, so it indexes `preds_t` too.
# The sklearn metric functions are imported in the notebook's import cell.
for i in range(preds_t.shape[1]):
    i_preds = preds_t[:, i]
    i_true = y_valid[:, i]
    label = labels[i]
    print('Results for {0} comments: Accuracy - {1:.2f}; Precision - {2:.2f}; Recall - {3:.2f}; F1 - {4:.2f}'.format(
                                    label,
                                    accuracy_score(i_true, i_preds),
                                    precision_score(i_true, i_preds),
                                    recall_score(i_true, i_preds),
                                    f1_score(i_true, i_preds)))

Results for toxic comments: Accuracy - 0.96; Precision - 0.87; Recall - 0.72; F1 - 0.79
Results for severe_toxic comments: Accuracy - 0.99; Precision - 0.72; Recall - 0.06; F1 - 0.12
Results for obscene comments: Accuracy - 0.98; Precision - 0.87; Recall - 0.75; F1 - 0.81
Results for threat comments: Accuracy - 1.00; Precision - 0.25; Recall - 0.01; F1 - 0.01
Results for insult comments: Accuracy - 0.97; Precision - 0.79; Recall - 0.66; F1 - 0.72
Results for identity_hate comments: Accuracy - 0.99; Precision - 0.69; Recall - 0.28; F1 - 0.40
