# Baseline LSTM Model using GloVe Embeddings

In [None]:
import sys,os, re, csv, codecs
import numpy as np
import pandas as pd
import keras

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.models import Model
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras import initializers, regularizers, constraints, optimizers,layers
from time import time

Embedding File : Wikipedia GloVe dataset with 200-Dimensional representation -> embed_size

Maximum number of unique words to use: 200000 -> max_features

Maximum number of words to use in a comment: 100 -> maxlen

In [None]:
# Input locations: the English training set plus the German, French and Spanish training files.
train_file_en = '../input/jigsaw-toxic-comment-classification-challenge/train.csv'
train_file_de = '../input/toxic-comments-french-spanish-german-train/train_de.csv'
train_file_fr = '../input/toxic-comments-french-spanish-german-train/train_fr.csv'
train_file_es = '../input/toxic-comments-french-spanish-german-train/train_es.csv'

test_file = '../input/jigsaw-toxic-comment-classification-challenge/test.csv'
# NOTE(review): not referenced by any visible cell and the path looks truncated — confirm.
test_label_file = '../input/jigsaw'

embedding_file = '../input/glove6b200d/glove.6B.200d.txt'

train_en = pd.read_csv(train_file_en)
train_es = pd.read_csv(train_file_es)
train_fr = pd.read_csv(train_file_fr)
train_de = pd.read_csv(train_file_de)

# DataFrame.append was removed in pandas 2.0; a single pd.concat builds the same
# frame. Row order (en, es, de, fr) matches the original nested append calls.
train = pd.concat([train_en, train_es, train_de, train_fr], ignore_index=True)
test = pd.read_csv(test_file)

In [None]:
# Tunable sizes shared by the tokenizer, the embedding matrix and the model.
embed_size = 200          # dimensionality of each GloVe vector
max_features = 200_000    # cap on the number of unique words kept by the tokenizer
maxlen = 100              # every comment is padded/truncated to this many tokens

In [None]:
# Label columns, and the target matrix built from them (one column per label).
list_classes = [
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]
y_t = train[list_classes].values

# Raw comment text for both splits; missing comments become the placeholder "_na_".
list_sentences_train = train["comment_text"].fillna("_na_").values
list_sentences_test = test["comment_text"].fillna("_na_").values

In [None]:
# Sanity check: dimensions of the label matrix.
y_t.shape

# Preparing the Text Data

In [None]:
# Fit the vocabulary on the training comments only, keeping at most max_features words.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(list_sentences_train))
word_index = tokenizer.word_index

# Turn each comment into a sequence of integer word ids.
list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)

# Bring every sequence to exactly maxlen ids so the model sees fixed-size input.
X_t = pad_sequences(list_tokenized_train, maxlen=maxlen)
X_te = pad_sequences(list_tokenized_test, maxlen=maxlen)

print('Found %s unique tokens.' % len(word_index))

In [None]:
# Sanity check: dimensions of the padded training sequences.
X_t.shape

# GloVe Word Embeddings

We will be using GloVe embeddings, which you can read about here. GloVe stands for "Global Vectors for Word Representation". It's a somewhat popular embedding technique based on factorizing a matrix of word co-occurrence statistics.

Specifically, we will use the **200-dimensional GloVe embeddings of 400k words computed on a 2014 dump of English Wikipedia.** 

## Preparing Embedding Layer

Builds a lookup from each word in the GloVe file to its 200-dimensional vector, i.e. 400k words each have a 200-d representation.

In [None]:
# Parse the GloVe text file into a dict: word -> float32 coefficient vector.
# Each line holds a word followed by its whitespace-separated coefficients.
embeddings_index = {}

# Read as UTF-8 explicitly so the result does not depend on the platform's default
# encoding; the context manager closes the file even if a line fails to parse.
with open(embedding_file, encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines (e.g. a trailing newline) instead of raising IndexError.
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.'%len(embeddings_index))

## Preparing Embedding Matrix

Words that are not in GloVe are randomly initialized.
Creating an embedding matrix for up to 200000 words (`max_features`), where each word has a 200-dimensional representation.

In [None]:
# Stack every GloVe vector into a single array so global statistics can be computed.
# np.stack needs a real sequence; passing the dict view directly is rejected by
# recent NumPy releases, so materialise it as a list first.
all_embeddings = np.stack(list(embeddings_index.values()))

# Mean and standard deviation over all coefficients; used to draw random vectors
# for words that have no GloVe entry.
emb_mean = all_embeddings.mean()
emb_stddev = all_embeddings.std()

In [None]:
# Sanity check: dimensions of the stacked GloVe vectors.
all_embeddings.shape

In [None]:
# Number of rows: one per word index, capped at max_features.
# (The +1 presumably leaves room for index 0, which the tokenizer does not assign to a word.)
nb_words = min(max_features, len(word_index) + 1)

# Start from random vectors drawn with GloVe's overall mean/std; rows for words
# that GloVe does not know simply keep this random initialisation.
embedding_matrix = np.random.normal(emb_mean, emb_stddev, (nb_words, embed_size))

for token, idx in word_index.items():
    if idx < max_features:
        glove_vector = embeddings_index.get(token)
        if glove_vector is not None:
            # Known word: overwrite the random row with its pretrained vector.
            embedding_matrix[idx] = glove_vector


In [None]:
# Sanity check: dimensions of the embedding matrix.
embedding_matrix.shape

## Model

A simple bidirectional LSTM followed by two fully connected layers, with some dropout added for regularization.

In [None]:
# Bidirectional LSTM classifier: embedding -> BiLSTM -> max-pool over time -> dense -> 6 sigmoid outputs.
inp = Input(shape=(maxlen,))

# Size the embedding layer from the weight matrix itself. The matrix has
# min(max_features, len(word_index)+1) rows, so hard-coding max_features as the
# input dimension mismatches the weights whenever the vocabulary is smaller.
x = Embedding(embedding_matrix.shape[0], embedding_matrix.shape[1], weights=[embedding_matrix])(inp)

# return_sequences=True keeps one output per timestep so the pooling layer can
# take the maximum across the whole sequence.
x = Bidirectional(LSTM(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)
x = GlobalMaxPool1D()(x)
x = Dense(50, activation="relu")(x)
x = Dropout(0.1)(x)

# One independent sigmoid per label, trained with binary cross-entropy.
x = Dense(6, activation="sigmoid")(x)
model = Model(inputs=inp, outputs=x)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Keras takes the validation_split fraction from the END of the arrays, before any
# shuffling. The training frame is several language files concatenated in order,
# so without a shuffle the validation set would come only from the trailing file(s).
# Permute the rows first, with a fixed seed so the split is reproducible.
shuffle_idx = np.random.RandomState(42).permutation(len(X_t))
history = model.fit(X_t[shuffle_idx], y_t[shuffle_idx], batch_size=50, epochs=10, validation_split=0.3);

In [None]:
# Predict on the padded test sequences.
y_test = model.predict([X_te], batch_size=1024, verbose=1)

# Load the sample submission, overwrite its label columns with the predictions,
# and write the result out as the submission file.
sample_submission = pd.read_csv(
    '../input/jigsaw-toxic-comment-classification-challenge/sample_submission.csv'
)
sample_submission[list_classes] = y_test
sample_submission.to_csv('submission.csv', index=False)

In [None]:
# Print the layer-by-layer summary of the model.
model.summary()