# Improved LSTM baseline

This kernel is a somewhat improved version of [Keras - Bidirectional LSTM baseline](https://www.kaggle.com/CVxTz/keras-bidirectional-lstm-baseline-lb-0-051) along with some additional documentation of the steps. (NB: this notebook has been re-run on the new test set.)

In [41]:
import sys, os, re, csv, codecs, numpy as np, pandas as pd

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers.convolutional import Conv1D
from keras.models import Sequential
from keras.layers.normalization import BatchNormalization
from keras.layers import Bidirectional, GlobalMaxPool1D, MaxPooling1D, GlobalAveragePooling1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers

import matplotlib.pyplot as plt
%matplotlib inline  

In [14]:
class ConvBlockLayer(object):
    """
    two layer ConvNet. Apply batch_norm and relu after each layer
    """

    def __init__(self, input_shape, num_filters):
        self.model = Sequential()
        # first conv layer
        self.model.add(Conv1D(filters=num_filters, kernel_size=3, strides=1, padding="same", input_shape=input_shape))
        self.model.add(BatchNormalization())
        self.model.add(Activation('relu'))

        # second conv layer
        self.model.add(Conv1D(filters=num_filters, kernel_size=3, strides=1, padding="same"))
        self.model.add(BatchNormalization())
        self.model.add(Activation('relu'))

    def __call__(self, inputs):
        return self.model(inputs)

We include the GloVe word vectors in our input files. To include these in your kernel, simple click 'input files' at the top of the notebook, and search 'glove' in the 'datasets' section.

In [2]:
path = 'data/'
EMBEDDING_FILE=f'wv/glove.6B.100d.txt'
TRAIN_DATA_FILE=f'{path}train.csv'
TEST_DATA_FILE=f'{path}test.csv'

Set some basic config parameters:

In [46]:
embed_size = 20 # char embed
maxlen = 1024 # max number of words in a comment to use

Read in our data and replace missing values:

In [5]:
train = pd.read_csv(TRAIN_DATA_FILE)
# test = pd.read_csv(TEST_DATA_FILE)

list_sentences_train = train["comment_text"].fillna("_na_").values
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
y = train[list_classes].values
# list_sentences_test = test["comment_text"].fillna("_na_").values

In [5]:
# from nltk.corpus import stopwords
# cachedStop =  stopwords.words('english')
# pattern = re.compile(r'\b(' + r'|'.join(stopwords.words('english')) + r')\b\s*')
# def cleanwords(sent):
#     return ' '.join([word.lower() for word in sent.lower().split() if word not in cachedStop ])
    # return pattern.sub('', sent.lower())

# def cleanchars(sent):
#     return sent.translate(translator)


Standard keras preprocessing, to turn each comment into a list of word indexes of equal length (with truncation or padding as needed).

In [47]:
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(list(list_sentences_train))
list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
# list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)
X_t = pad_sequences(list_tokenized_train, maxlen=maxlen)
# X_te = pad_sequences(list_tokenized_test, maxlen=maxlen)

In [48]:
max_features = len(tokenizer.word_counts)+1

In [49]:
max_features

2336

Read the glove word vectors (space delimited strings) into a dictionary from word->vector.

In [51]:
word_index = tokenizer.word_index
nb_words = min(max_features, len(word_index))+1
# embedding_matrix = np.random.normal(0, 1, (nb_words, embed_size))
# for word, i in word_index.items():
#     if i >= max_features: continue # greater than max word features
#     embedding_vector = embeddings_index.get(word) # out of word vocabulary
#     if embedding_vector is not None: embedding_matrix[i] = embedding_vector

Simple bidirectional LSTM with two fully connected layers. We add some dropout to the LSTM since even 2 epochs is enough to overfit.

In [52]:
def build_model(num_filters, num_classes, sequence_max_length=1024, num_quantized_chars=71, embedding_size=16, learning_rate=0.001, top_k=3, model_path=None):

    inputs = Input(shape=(sequence_max_length, ), dtype='int32', name='inputs')

    embedded_sent = Embedding(num_quantized_chars, embedding_size, input_length=sequence_max_length)(inputs)

    # First conv layer
    conv = Conv1D(filters=64, kernel_size=3, strides=2, padding="same")(embedded_sent)

    # Each ConvBlock with one MaxPooling Layer
#     for i in range(len(num_filters)):
#         conv = ConvBlockLayer(get_conv_shape(conv), num_filters[i])(conv)
#         conv = MaxPooling1D(pool_size=3, strides=2, padding="same")(conv)

    for i in range(len(num_filters)):
        conv = Conv1D(filters=num_filters[i], kernel_size=3, strides=1, padding="same")(conv)
        conv = BatchNormalization()(conv)
        conv = Activation('relu')(conv)

        conv = Conv1D(filters=num_filters[i], kernel_size=3, strides=1, padding="same")(conv)
        conv = BatchNormalization()(conv)
        conv = Activation('relu')(conv)

        conv = MaxPooling1D(pool_size=3, strides=2, padding="same")(conv)
    
    conv = GlobalAveragePooling1D()(conv)
    # k-max pooling (Finds values and indices of the k largest entries for the last dimension)
#     def _top_k(x):
#         x = tf.transpose(x, [0, 2, 1])
#         k_max = tf.nn.top_k(x, k=top_k)
#         return tf.reshape(k_max[0], (-1, num_filters[-1] * top_k))
#     k_max = Lambda(_top_k, output_shape=(num_filters[-1] * top_k,))(conv)

    # 3 fully-connected layer with dropout regularization
    fc1 = Dropout(0.2)(Dense(512, activation='relu', kernel_initializer='he_normal')(conv))
    fc2 = Dropout(0.2)(Dense(512, activation='relu', kernel_initializer='he_normal')(fc1))
    fc3 = Dense(num_classes, activation='sigmoid')(fc2)

    # define optimizer
#     sgd = SGD(lr=learning_rate, decay=1e-6, momentum=0.9, nesterov=False)
    model = Model(inputs=inputs, outputs=fc3)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    if model_path is not None:
        model.load_weights(model_path)

    return model

In [53]:
num_filters = [64, 128, 256, 512]
model = build_model(num_filters=num_filters, num_classes=6, embedding_size=embed_size, learning_rate=0.001)

In [54]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
inputs (InputLayer)          (None, 1024)              0         
_________________________________________________________________
embedding_13 (Embedding)     (None, 1024, 20)          1420      
_________________________________________________________________
conv1d_52 (Conv1D)           (None, 512, 64)           3904      
_________________________________________________________________
conv1d_53 (Conv1D)           (None, 512, 64)           12352     
_________________________________________________________________
batch_normalization_39 (Batc (None, 512, 64)           256       
_________________________________________________________________
activation_39 (Activation)   (None, 512, 64)           0         
_________________________________________________________________
conv1d_54 (Conv1D)           (None, 512, 64)           12352     
__________

Now we're ready to fit out model! Use `validation_split` when not submitting.

In [None]:
model.fit(X_t, y, batch_size=32, epochs=2, validation_split=0.1);

Train on 143613 samples, validate on 15958 samples
Epoch 1/2
 18400/143613 [==>...........................] - ETA: 1:48 - loss: 0.1249 - acc: 0.9645

And finally, get predictions for the test set and prepare a submission CSV:

In [16]:
y_test = model.predict([X_te], batch_size=1024, verbose=1)
sample_submission = pd.read_csv('data/sample_submission.csv')
sample_submission[list_classes] = y_test
sample_submission.to_csv('glove300.csv', index=False)



In [4]:
# sample_submission.to_csv('base_test.csv',index=False)

In [19]:
# test_submission = pd.read_csv('data/sample_submission.csv')
# len(test_submission)

In [None]:
# Baseline Score
# loss: 0.0417 - acc: 0.9840 - val_loss: 0.0451 - val_acc: 0.9829 --> AUC : 0.9787

