# Improved LSTM baseline¶

This kernel is a somewhat improved version of Keras - Bidirectional LSTM baseline along with some additional documentation of the steps. (NB: this notebook has been re-run on the new test set.)


In [51]:
import sys, os, re, csv, codecs, numpy as np, pandas as pd

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers

We include the GloVe word vectors in our input files. To include these in your kernel, simple click 'input files' at the top of the notebook, and search 'glove' in the 'datasets' section.

More about GloVe:

A matrix where rows are different words, and columns are ratios of co-ocurrence probabilities. A high number implies a stronger co-occurrence probability.

In [52]:
#path = 'data/'
#comp = 'jigsaw-toxic-comment-classification-challenge/'
EMBEDDING_FILE='data/glove.6B.50d.txt'
TRAIN_DATA_FILE='data/train.csv'
TEST_DATA_FILE='data/test.csv'

Set some basic config parameters:

In [53]:
embed_size = 50 # how big is each word vector
max_features = 20000 # how many unique words to use (i.e num rows in embedding vector)
maxlen = 100 # max number of words in a comment to use

Read in our data and replace missing values:

In [54]:
train = pd.read_csv(TRAIN_DATA_FILE)
test = pd.read_csv(TEST_DATA_FILE)

list_sentences_train = train["comment_text"].fillna("_na_").values
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
y = train[list_classes].values
list_sentences_test = test["comment_text"].fillna("_na_").values

Standard keras preprocessing, to turn each comment into a list of word indexes of equal length (with truncation or padding as needed).

tokenizer.fit_on_texts = creates a vector of all possible words appearing in the training data

list_tokenized_train = index each word that occures in a sentence according to the word collection got from the training data

X_t = makes each sequence the same length by padding 0's to the beginning of the sequence

In [55]:
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(list_sentences_train))
list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)
X_t = pad_sequences(list_tokenized_train, maxlen=maxlen)
X_te = pad_sequences(list_tokenized_test, maxlen=maxlen)

In [56]:
print(tokenizer.word_index.popitem())

('certified', 7324)


In [57]:
print(X_t[0])

[    0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0   688    75     1   126   130   177    29
   672  4511 12052  1116    86   331    51  2278 11448    50  6864    15
    60  2756   148     7  2937    34   117  1221 15190  2825     4    45
    59   244     1   365    31     1    38    27   143    73  3462    89
  3085  4583  2273   985]


In [58]:
list_sentences_train[0]

"Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27"

In [59]:
list_tokenized_train[0]

[688,
 75,
 1,
 126,
 130,
 177,
 29,
 672,
 4511,
 12052,
 1116,
 86,
 331,
 51,
 2278,
 11448,
 50,
 6864,
 15,
 60,
 2756,
 148,
 7,
 2937,
 34,
 117,
 1221,
 15190,
 2825,
 4,
 45,
 59,
 244,
 1,
 365,
 31,
 1,
 38,
 27,
 143,
 73,
 3462,
 89,
 3085,
 4583,
 2273,
 985]

Read the glove word vectors (space delimited strings) into a dictionary from word->vector.

In [60]:
def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')

embeddings_index = dict(get_coefs(*o.strip().split()) for o in open(EMBEDDING_FILE))


Use these vectors to create our embedding matrix, with random initialization for words that aren't in GloVe. We'll use the same mean and stdev of embeddings the GloVe has when generating the random init.

In [61]:
all_embs = np.stack(embeddings_index.values())
emb_mean,emb_std = all_embs.mean(), all_embs.std()
emb_mean,emb_std

(0.020940488, 0.6441035)

If the word can be found from the GloVe initialize its correlation values according to it? 

In [62]:
word_index = tokenizer.word_index
nb_words = min(max_features, len(word_index))
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    if i >= max_features: continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None: embedding_matrix[i] = embedding_vector

In [63]:
print(embedding_matrix.shape)

(20000, 50)


Simple bidirectional LSTM with two fully connected layers. We add some dropout to the LSTM since even 2 epochs is enough to overfit.

Input: 100 numbers, that represent the place in the embedding matrix

6 hidden layers: Embedding LSTM, Bidirectional, MaxPoolingID, Dense, Dropout

Embedded layer: takes 100 word indexes as input for every word index tracks correlation vector. as an output givees (100, 50) matrix that includes correlation matricies. 

output: probabilities for the six classes

In [64]:
inp = Input(shape=(maxlen,))
x = Embedding(max_features, embed_size, weights=[embedding_matrix])(inp)
x = Bidirectional(LSTM(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)
x = GlobalMaxPool1D()(x)
x = Dense(50, activation="relu")(x)
x = Dropout(0.1)(x)
x = Dense(6, activation="sigmoid")(x)
model = Model(inputs=inp, outputs=x)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [65]:
embed_size

50

# Visualize

In [66]:
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot
def show_me(model):
    return SVG(model_to_dot(model, show_shapes=True, show_layer_names=True).create(prog='dot', format='svg'))

In [67]:
show_me(model)

ImportError: Failed to import pydot. You must install pydot and graphviz for `pydotprint` to work.

In [41]:
#model.fit(X_t, y, batch_size=32, epochs=2, validation_split=0.1)

And finally, get predictions for the test set and prepare a submission CSV:

In [None]:
y_test = model.predict([X_te], batch_size=1024, verbose=1)
#sample_submission = pd.read_csv(f'{path}{comp}sample_submission.csv')
#sample_submission[list_classes] = y_test
#sample_submission.to_csv('submission.csv', index=False)