In [1]:
## Save and load
import pickle

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

## Keras
import tensorflow as tf
from tensorflow.keras import initializers, regularizers, constraints, optimizers, layers
from tensorflow.keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from tensorflow.keras.layers import Bidirectional, GlobalMaxPool1D
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

### Load preprocessed data & combine labels

In [2]:
## Load the pickled (train, valid) pair; the context manager guarantees the handle is closed.
## NOTE: pickle.load can execute arbitrary code -- only load files from a trusted source.
with open('../data/preprocessed.pkl', 'rb') as f:
    train, valid = pickle.load(f)
## Every column from the third onwards is treated as a label column
## (presumably the individual toxicity categories -- confirm against the preprocessing step).
labels = train.columns[2:]
ys_train = train[labels]
ys_valid = valid[labels]
## COMBINE TOXIC CATEGORIES
## Sum the label columns per row, then cap at 1 so that a row with several
## positive labels still yields a single binary target.
y_train = ys_train.sum(axis=1).clip(upper=1)
y_valid = ys_valid.sum(axis=1).clip(upper=1)

### Get embeddings

In [3]:
## BUILD GLOVE EMBEDDINGS DICTIONARY
## Each line of the file is "<word> <v1> <v2> ..."; map word -> float32 vector.
embeddings_dict = dict()
# `with` closes the file even if parsing a line raises
with open(r'../data/glove.6B.300d.txt', encoding='utf8') as f:
    for line in f:
        values = line.split()
        if not values:
            # tolerate blank lines (e.g. a trailing newline) instead of failing on values[0]
            continue
        word = values[0]
        vec = np.asarray(values[1:], dtype='float32')
        embeddings_dict[word] = vec
print('Extracted {} word vectors'.format(len(embeddings_dict)))

Extracted 400000 word vectors


In [4]:
## Additional params
# Sequence length: number of words of each comment that are used
max_len = 100
# Vocabulary cap: number of unique words to use (rows of the embedding matrix)
max_features = 50000
# Length of the vector representation of each word
embed_size = 300

In [5]:
# Collect the raw comment strings of each split into plain Python lists
train_list = [comment for comment in train['comment_text'].values]
valid_list = [comment for comment in valid['comment_text'].values]
# The tokenizer is fitted on the training comments only
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(train_list)
# Convert both splits to integer sequences (train first, then valid)
train_tokenized, valid_tokenized = [
    tokenizer.texts_to_sequences(texts) for texts in (train_list, valid_list)
]
# Pad so that every row has exactly max_len entries
X_train, X_valid = [
    pad_sequences(sequences, maxlen=max_len)
    for sequences in (train_tokenized, valid_tokenized)
]

In [6]:
## Get mean and std from Glove vectors
# np.stack requires a real sequence: a dict view is deprecated and raises TypeError on
# newer NumPy, so materialise it as a list. Stack once and reuse the matrix for both
# statistics instead of building the full array twice.
glove_vectors = np.stack(list(embeddings_dict.values()))
glove_mean, glove_std = glove_vectors.mean(), glove_vectors.std()
# The stacked copy is only needed for the two statistics; release it
del glove_vectors

In [7]:
wordidx_dict = tokenizer.word_index # word -> integer index
# Keras Tokenizer indices start at 1 (0 is the padding value), so the matrix needs
# (largest index + 1) rows. Capped at max_features; when the vocabulary is smaller than
# the cap, sizing by len(wordidx_dict) alone would leave the last index out of bounds.
num_words = min(len(wordidx_dict) + 1, max_features)
# Random initialisation drawn from the GloVe mean/std, used for words without a GloVe vector
embedding_matrix = np.random.normal(glove_mean,
                                    glove_std,
                                    size=(num_words, embed_size))
# Overwrite the random rows with the GloVe vector wherever one exists
for word, idx in wordidx_dict.items():
    if idx >= num_words:
        # index falls outside the matrix (beyond the vocabulary cap)
        continue
    vec = embeddings_dict.get(word)
    if vec is not None:
        embedding_matrix[idx] = vec

### Build model and fit

In [8]:
# Named `inputs` rather than `input` so the Python builtin input() is not shadowed
inputs = Input(shape=(max_len,))
# Frozen embedding layer initialised from the prepared embedding matrix
x = Embedding(*embedding_matrix.shape, weights=[embedding_matrix], trainable=False)(inputs)
x = Bidirectional(LSTM(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x) # bidirectional lstm layer
x = GlobalMaxPool1D()(x) # max over the sequence axis
x = Dense(50, activation="relu")(x) # 1st FC layer
x = Dropout(0.1)(x)
x = Dense(1, activation="sigmoid")(x) # single sigmoid output for the combined binary label
model = Model(inputs=inputs, outputs=x)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [9]:
## Train for 2 epochs in batches of 32, holding out 10% of the training data for validation
model.fit(x=X_train, y=y_train, epochs=2, batch_size=32, validation_split=0.1)

Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x17627d40108>

In [10]:
## Persist the trained model to the artifacts directory
model.save(filepath='../artifacts/simple/glove_lstm')

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
INFO:tensorflow:Assets written to: ../artifacts/simple/glove_lstm\assets


### Eval model

In [11]:
## Reload the saved model from disk; `model` now refers to the reloaded copy
model = tf.keras.models.load_model(filepath='../artifacts/simple/glove_lstm')

In [19]:
## Get predictions
preds = model.predict(X_valid, batch_size=32)
# NOTE(review): the column is labelled 'glove_gru' although this notebook trains the LSTM
# model and writes glove_lstm.csv -- looks like a copy-paste leftover. Left unchanged here
# because downstream readers of the CSV may rely on the name; confirm and rename.
preds_df = pd.DataFrame(data=preds, columns=['glove_gru'])
preds_df.to_csv('../artifacts/simple/preds/glove_lstm.csv')
# Threshold at 0.5 into a NEW array. Assigning `preds_t = preds` and masking in place
# would also overwrite the raw probabilities held in `preds`.
preds_t = (preds > 0.5).astype(preds.dtype)

In [15]:
## Print results
i_preds = preds_t
i_true = y_valid

# Evaluate the thresholded predictions against the combined validation labels
acc, prec, recall, f1 = (accuracy_score(i_true, i_preds),
                        precision_score(i_true, i_preds),
                        recall_score(i_true, i_preds),
                        f1_score(i_true, i_preds))

# Save results to dataframe.
# Built directly from a record: DataFrame.append was removed in pandas 2.0.
# `columns` pins the same column order as before.
results = pd.DataFrame([{'Label': 'Toxic_Combined',
                         'Accuracy': acc,
                         'Recall': recall,
                         'Precision': prec,
                         'F1': f1,
                         'Vectorizer': 'glove',
                         'model': 'lstm'}],
                       columns=['Label', 'Accuracy', 'Recall', 'Precision', 'F1', 'Vectorizer', 'model'])

# print results
print('Results for {0} comments: Accuracy - {1:.2f}; Precision - {2:.2f}; Recall - {3:.2f}; F1 - {4:.2f}'.format(
                                'Toxic_Combined',
                                acc,
                                prec,
                                recall,
                                f1))

Results for Toxic_Combined comments: Accuracy - 0.96; Precision - 0.91; Recall - 0.72; F1 - 0.80
