In [10]:
import numpy as np
import pandas as pd

## Keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from tensorflow.keras.layers import Bidirectional, GlobalMaxPool1D
from tensorflow.keras.models import Model
from tensorflow.keras import initializers, regularizers, constraints, optimizers, layers
import tensorflow as tf

## Save and load
import pickle

### Load data and word2vec vectors

In [11]:
# Load train & valid splits
f = open('../data/preprocessed.pkl','rb')
train, valid = pickle.load(f)
labels = train.columns[2:]
# Get Ys
y_train = train[labels].values
y_valid = valid[labels].values

In [12]:
## BUILD WORD2VEC EMBEDDINGS DICTIONARY
embeddings_dict = dict()
f = open(r'../data/GoogleNews-vectors-negative300.txt',encoding='utf8')
for line in f:
    values = line.split()
    word = values[0]
    vec = np.asarray(values[1:], dtype='float32')
    if len(vec) == 300: 
        embeddings_dict[word] = vec
    else:
        print('word {0} has a vector length of {1}'.format(word,len(vec))) # got some problematic vectors
f.close()
print('Extracted {} word vectors'.format(len(embeddings_dict)))

word 3000000 has a vector length of 1
word 0.3828125 has a vector length of 299
word 0.049804688 has a vector length of 299
word 0.037597656 has a vector length of 299
Extracted 2999996 word vectors


In [13]:
## Additional params
embed_size = 300 # length of vector representation for each word
max_features = 50000 # # of unique words to use (number of rows in embedding vector)
max_len = 100 # # of words in a comment to use

### Standard keras preprocessing

In [14]:
# Turn into list of strings
train_list = list(train['comment_text'].values)
valid_list = list(valid['comment_text'].values)
# Create and fit tokenizer
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(train_list)
# Tokenize train and valid
train_tokenized = tokenizer.texts_to_sequences(train_list)
valid_tokenized = tokenizer.texts_to_sequences(valid_list)
# Padd
X_train = pad_sequences(train_tokenized, maxlen=max_len)
X_valid = pad_sequences(valid_tokenized, maxlen=max_len)

### Use w2c vectors to create embedding matrix. If words are not in w2c dictionary, then use random initialization. To generate random initialization, use the same mean and std as w2c vectors 

In [15]:
## Get mean and std from w2c vectors
w2c_mean, w2c_std = np.stack(embeddings_dict.values()).mean(), np.stack(embeddings_dict.values()).std()

In [16]:
wordidx_dict = tokenizer.word_index # get word indices
num_words = min(len(wordidx_dict), max_features) # should be max 50000, but could be less if wordidx dict contains <50000 words
# random initliaization of weights
embedding_matrix = np.random.normal(w2c_mean, 
                                    w2c_std, 
                                    size = (num_words, embed_size)) 
# update embedding matrix with w2c vectors
for word, idx in wordidx_dict.items():
    if idx < max_features: # stay within max # of features
        # grab w2c vector if exists
        vec = embeddings_dict.get(word) 
        # if w2c vector exists, add to embedding matrix (i.e., replace random initialization)
        if vec is not None: embedding_matrix[idx] = vec 

### Build RNN
- 1 bidirectional LSTM layer with 2 FC layers and dropouts

In [17]:
input = Input(shape=(max_len,))
x = Embedding(*embedding_matrix.shape, weights=[embedding_matrix], trainable=False)(input) # embedding layer to obtain vectors for words
x = Bidirectional(LSTM(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x) # bidirectional lstm layer
x = GlobalMaxPool1D()(x)
x = Dense(50, activation="relu")(x) # 1st FC layer
x = Dropout(0.1)(x)
x = Dense(6, activation="sigmoid")(x) # Output label (6 outputs, 1 for each class for multi-label classification)
model = Model(inputs=input, outputs=x)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

### Fit model

In [18]:
model.fit(X_train, 
          y_train, 
          batch_size=32, 
          epochs=2,
          validation_split=0.1)

Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x12a40b9b488>

In [19]:
## Save model
model.save('../artifacts/w2c_lstm')

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
INFO:tensorflow:Assets written to: ../artifacts/w2c_lstm\assets


### Check performance

In [20]:
model = tf.keras.models.load_model('../artifacts/w2c_lstm')

In [21]:
## Get predictions
preds = model.predict(X_valid, batch_size=32)
preds_t = preds
preds_t[preds<=0.5] = 0
preds_t[preds>0.5] = 1

In [22]:
results = pd.DataFrame(columns=['Label','Accuracy', 'Recall', 'Precision', 'F1', 'Vectorizer', 'model'])

## Print results
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
for i in range(preds_t.shape[1]):
    i_preds = preds_t[:,i]
    i_true = y_valid[:,i]
    label = labels[i]

    # Evaluate predictions
    acc, prec, recall, f1 = (accuracy_score(i_true, i_preds), 
                            precision_score(i_true, i_preds), 
                            recall_score(i_true, i_preds), 
                            f1_score(i_true, i_preds))
    
    # Save results to dataframe
    results = results.append({'Label': label,
                            'Accuracy':acc,
                            'Recall':recall,
                            'Precision':prec,
                            'F1':f1,
                            'Vectorizer':'w2c',
                            'model': 'lstm'}, 
                            ignore_index = True)
    
    # print results
    print('Results for {0} comments: Accuracy - {1:.2f}; Precision - {2:.2f}; Recall - {3:.2f}; F1 - {4:.2f}'.format(
                                    label, 
                                    acc, 
                                    prec, 
                                    recall,
                                    f1))

Results for toxic comments: Accuracy - 0.96; Precision - 0.83; Recall - 0.78; F1 - 0.80
Results for severe_toxic comments: Accuracy - 0.99; Precision - 0.48; Recall - 0.41; F1 - 0.44
Results for obscene comments: Accuracy - 0.98; Precision - 0.84; Recall - 0.82; F1 - 0.83
Results for threat comments: Accuracy - 1.00; Precision - 0.00; Recall - 0.00; F1 - 0.00
Results for insult comments: Accuracy - 0.97; Precision - 0.77; Recall - 0.67; F1 - 0.72
Results for identity_hate comments: Accuracy - 0.99; Precision - 0.77; Recall - 0.19; F1 - 0.30


In [23]:
## SAVE RESULTS
results.to_csv('../artifacts/w2c_lstm.csv', index=False)