# Baseline LSTM Model using GloVe Embeddings

In [1]:
import sys,os, re, csv, codecs
import numpy as np
import pandas as pd
import keras

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.models import Model
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras import initializers, regularizers, constraints, optimizers,layers
from keras.callbacks import EarlyStopping
from time import time

Using TensorFlow backend.


Embedding File : Wikipedia GloVe dataset with 200-Dimensional representation -> embed_size

Maximum number of unique words to use: 20000 -> max_features

Maximum number of words to use in a comment: 100 -> maxlen

In [11]:
#training and testing files with the data augmentation sets 
train_file_en= '../input/jigsaw-toxic-comment-classification-challenge/train.csv'
train_file_de= '../input/toxic-comments-french-spanish-german-train/train_de.csv'
train_file_fr= '../input/toxic-comments-french-spanish-german-train/train_fr.csv'
train_file_es= '../input/toxic-comments-french-spanish-german-train/train_es.csv'

test_file= '../input/jigsaw-toxic-comment-classification-challenge/test.csv'
test_label_file = '../input/jigsaw'

#Using the GloVe embedding file
embedding_file = '../input/glove6b200d/glove.6B.200d.txt'

train_en = pd.read_csv(train_file_en)
train_es = pd.read_csv(train_file_es)
train_fr = pd.read_csv(train_file_fr)
train_de = pd.read_csv(train_file_de)

#Augmented Dataset
train = train_en.append(train_es.append(train_de, ignore_index=True), ignore_index=True)
test = pd.read_csv(test_file)

#Only English
train1 = train_en

In [13]:
embed_size = 200 #Embedding dimension
max_features = 200000 #Maximum number of words to keep based on frequency
maxlen = 100 #Maximum Length of comments

In [15]:
#Extracting the comments and removing the Empty comments
list_sentences_train = train["comment_text"].fillna("_na_").values
list_sentences_train1 = train1["comment_text"].fillna("_na_").values

list_sentences_test = test["comment_text"].fillna("_na_").values

#Categories
list_classes = ["toxic","severe_toxic", "obscene", "threat", "insult", "identity_hate"]

#Category Tags
y_t = train[list_classes].values
y_t1 = train1[list_classes].values

#np.shape(y_t1)

# Preparing the Text Data

In [34]:
#Using Keras' Tokenizer function on augmented dataset
#Vectorize a text corpus, by turning each text into either a sequence of integers 
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(list_sentences_train)) 
#Takes the 200000 max frequency words in the dataset and creates tokens out of them

list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)

X_t = pad_sequences(list_tokenized_train, maxlen=maxlen)
X_te = pad_sequences(list_tokenized_test, maxlen=maxlen)

#Word Index holds the index integer values for all the words based on the Tokenizer aas a dictionary
word_index = tokenizer.word_index
print('Found %s unique tokens.' %len(word_index))

Found 246556 unique tokens.


In [36]:
#Word Index Dictionary where each 
word_index

{'the': 1,
 'to': 2,
 'of': 3,
 'and': 4,
 'a': 5,
 'i': 6,
 'you': 7,
 'is': 8,
 'that': 9,
 'in': 10,
 'not': 11,
 'it': 12,
 'for': 13,
 'this': 14,
 'on': 15,
 'be': 16,
 'have': 17,
 'are': 18,
 'as': 19,
 'your': 20,
 'if': 21,
 'with': 22,
 'do': 23,
 'article': 24,
 'was': 25,
 'or': 26,
 'but': 27,
 'wikipedia': 28,
 'an': 29,
 'can': 30,
 'page': 31,
 'my': 32,
 'from': 33,
 'by': 34,
 'about': 35,
 'me': 36,
 'will': 37,
 'what': 38,
 'so': 39,
 'has': 40,
 'there': 41,
 'at': 42,
 'they': 43,
 'he': 44,
 'would': 45,
 'all': 46,
 'no': 47,
 'talk': 48,
 'like': 49,
 'should': 50,
 'who': 51,
 'we': 52,
 'please': 53,
 'one': 54,
 'just': 55,
 'more': 56,
 'other': 57,
 'which': 58,
 'think': 59,
 'see': 60,
 'been': 61,
 'any': 62,
 'also': 63,
 'here': 64,
 'because': 65,
 'his': 66,
 'some': 67,
 "i'm": 68,
 'know': 69,
 'people': 70,
 'did': 71,
 'how': 72,
 'only': 73,
 'why': 74,
 'use': 75,
 "it's": 76,
 'does': 77,
 'when': 78,
 'time': 79,
 'discussion': 80,
 'artic

In [37]:
#Every comment in the dataset is now a sequence of integers of unequal length
list_tokenized_train[0]

[653,
 74,
 1,
 231,
 131,
 204,
 32,
 626,
 4544,
 11994,
 1046,
 82,
 723,
 43,
 4999,
 18265,
 55,
 6579,
 15,
 67,
 2717,
 141,
 6,
 3018,
 42,
 112,
 1147,
 8506,
 2772,
 4,
 53,
 159,
 273,
 1,
 358,
 33,
 1,
 48,
 31,
 122,
 68,
 3438,
 92,
 3067,
 4621,
 2275,
 969]

In [19]:
#Using Keras' Tokenizer function on English dataset
tokenizer1 = Tokenizer(num_words=max_features)
tokenizer1.fit_on_texts(list(list_sentences_train1))

list_tokenized_train1 = tokenizer1.texts_to_sequences(list_sentences_train1)
list_tokenized_test1 = tokenizer1.texts_to_sequences(list_sentences_test)

X_t1 = pad_sequences(list_tokenized_train1, maxlen=maxlen)
X_te1 = pad_sequences(list_tokenized_test1, maxlen=maxlen)

word_index1 = tokenizer.word_index
print('Found %s unique tokens.' %len(word_index1))
np.shape(X_t1)

Found 246556 unique tokens.


(159571, 100)

In [38]:
#Every comment is a sequence of indices
list_tokenized_train1[0]

[688,
 75,
 1,
 126,
 130,
 177,
 29,
 672,
 4511,
 12052,
 1116,
 86,
 331,
 51,
 2278,
 11448,
 50,
 6864,
 15,
 60,
 2756,
 148,
 7,
 2937,
 34,
 117,
 1221,
 15190,
 2825,
 4,
 45,
 59,
 244,
 1,
 365,
 31,
 1,
 38,
 27,
 143,
 73,
 3462,
 89,
 3085,
 4583,
 2273,
 985]

# GloVe Word Embeddings

We will be using GloVe embeddings, which you can read about here. GloVe stands for "Global Vectors for Word Representation". It's a somewhat popular embedding technique based on factorizing a matrix of word co-occurence statistics.

Specifically, we will use the **200-dimensional GloVe embeddings of 400k words computed on a 2014 dump of English Wikipedia.** 

## Preparing Embedding Layer

Returns a 20-dimensional vector space coordinate for each word in the embedding, i.e. 400k words each have a 200-d representation.

In [17]:
#Dictionary of word and their embedding values
#There are embeddings for 400k words and each embedding is a 200-d vector
embeddings_index={}

f = open(embedding_file)
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:],dtype='float32')
    embeddings_index[word] = coefs

f.close()
print('Found %s word vectors.'%len(embeddings_index))

Found 400000 word vectors.


## Preparing Embedding Matrix

Words that are not in GloVe are randomly initialized
Creating an embedding matrix for 20000 words where each word has a 200 dimensional representation.

In [28]:
#Embedding matrix for 400k words
all_embeddings = np.stack(embeddings_index.values())

emb_mean= all_embeddings.mean()
emb_stddev = all_embeddings.std()

np.shape(all_embeddings)

  


(400000, 200)

In [29]:
#Embedding Matrix for Augmented Dataset
#Number of words is assigned as the minimum of either 400
nb_words = min(max_features, len(word_index)+1)

#Creating a matrix of length of the number of words we choose to represent
embedding_matrix = np.random.normal(emb_mean, emb_stddev, (nb_words, embed_size))

#For augmented data
for word, i in word_index.items():
    if i>=max_features: continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

np.shape(embedding_matrix)

(200000, 200)

In [39]:
#Creating a matrix of length of the number of words we choose to represent
embedding_matrix1 = np.random.normal(emb_mean, emb_stddev, (nb_words, embed_size))

#For augmented data
for word, i in word_index1.items():
    if i>=max_features: continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix1[i] = embedding_vector

np.shape(embedding_matrix1)

(200000, 200)

In [40]:
#Embedding Matrix for Augmented Dataset

nb_words = min(max_features, len(word_index1)+1)
embedding_matrix1 = np.random.normal(emb_mean, emb_stddev, (nb_words, embed_size))
for word, i in word_index1.items():
    if i>=max_features: continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix1[i] = embedding_vector

np.shape(embedding_matrix)

(200000, 200)

## Model

Simple bidirectional LSTM with two fully connected layers and adding some dropout is necessary.

In [41]:
#Early Stopping 

es = keras.callbacks.EarlyStopping(monitor='val_loss',
                              min_delta=0,
                              patience=0,
                              verbose=0, mode='auto')

from sklearn.metrics import roc_auc_score
from keras.callbacks import Callback
class roc_callback(Callback):
    def __init__(self,training_data,validation_data):
        self.x = training_data[0]
        self.y = training_data[1]
        self.x_val = validation_data[0]
        self.y_val = validation_data[1]


    def on_train_begin(self, logs={}):
        return

    def on_train_end(self, logs={}):
        return

    def on_epoch_begin(self, epoch, logs={}):
        return

    def on_epoch_end(self, epoch, logs={}):
        y_pred = self.model.predict(self.x)
        roc = roc_auc_score(self.y, y_pred)
        y_pred_val = self.model.predict(self.x_val)
        roc_val = roc_auc_score(self.y_val, y_pred_val)
        print('\rroc-auc: %s - roc-auc_val: %s' % (str(round(roc,4)),str(round(roc_val,4))),end=100*' '+'\n')
        return

    def on_batch_begin(self, batch, logs={}):
        return

    def on_batch_end(self, batch, logs={}):
        return

In [42]:
#Model with Data Augmentation 

inp = Input(shape=(maxlen,))
x = Embedding(max_features, embed_size, weights = [embedding_matrix])(inp)
x = Bidirectional(LSTM(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)
x = GlobalMaxPool1D()(x)
x = Dense(50, activation="relu")(x)
x = Dropout(0.1)(x)
x = Dense(6, activation="sigmoid")(x)
model = Model(inputs=inp, outputs=x)
model.compile(loss='binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

In [43]:
#Model without Data Augmentation 

inp = Input(shape=(maxlen,))
x = Embedding(max_features, embed_size, weights = [embedding_matrix1])(inp)
x = Bidirectional(LSTM(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)
x = GlobalMaxPool1D()(x)
x = Dense(50, activation="relu")(x)
x = Dropout(0.1)(x)
x = Dense(6, activation="sigmoid")(x)
model1 = Model(inputs=inp, outputs=x)
model1.compile(loss='binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

In [44]:
#Model fitting with Augmented Data
history = model.fit(X_t, y_t, batch_size=256, epochs=5, validation_split=0.2,callbacks=[es]);

Train on 382970 samples, validate on 95743 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [45]:
#Model fitting without Augmented Data
history1 = model1.fit(X_t1, y_t1, batch_size=256, epochs=5, validation_split=0.2,callbacks=[es]);

Train on 127656 samples, validate on 31915 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5


In [46]:
y_test = model.predict([X_te],batch_size = 1024, verbose=1)
y_test1 = model1.predict([X_te],batch_size = 1024, verbose=1)


'''
sample_submission = pd.read_csv('../input/jigsaw-toxic-comment-classification-challenge/sample_submission.csv')
sample_submission[list_classes] = y_test
sample_submission.to_csv('submission.csv',index = False)
'''



"\nsample_submission = pd.read_csv('../input/jigsaw-toxic-comment-classification-challenge/sample_submission.csv')\nsample_submission[list_classes] = y_test\nsample_submission.to_csv('submission.csv',index = False)\n"

In [None]:
model.summary()

In [None]:
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
#print(history.history.keys())


plt.figure(figsize=(16,12))
plt.suptitle('Accuracy and Loss Comparison')

gridspec.GridSpec(2,2)
plt.subplot2grid((2,2),(0,0))
plt.plot(history.history['acc'])
plt.plot(history.history['val_acc'])
plt.title('Augmented Model accuracy')
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.legend(['train','test'],loc='upper left')


plt.subplot2grid((2,2),(0,1))
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Augmented Model loss')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend(['train','test'],loc='upper left')

plt.subplot2grid((2,2),(1,0))
plt.plot(history1.history['acc'])
plt.plot(history1.history['val_acc'])
plt.title('Model accuracy')
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.legend(['train','test'],loc='upper left')


plt.subplot2grid((2,2),(1,1))
plt.plot(history1.history['loss'])
plt.plot(history1.history['val_loss'])
plt.title('Model loss')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend(['train','test'],loc='upper left')

plt.show()