In [None]:
import sys, os, re, csv, codecs, numpy as np, pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers

In [None]:
# Label columns used as the prediction targets.
classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

# Read the train and test splits (train first, then test).
train, test = (
    pd.read_csv(csv_path) for csv_path in ('../input/train.csv', '../input/test.csv')
)

# Comment text for each split, plus the label matrix taken from the
# `classes` columns of the training frame (column order follows `classes`).
sentences_train, sentences_test = train["comment_text"], test["comment_text"]
y_train = train[classes].values

In [None]:
# Vocabulary cap handed to the tokenizer as `num_words`.
max_features = 20000

# Fit the tokenizer on the training comments only, then use that same
# fitted tokenizer to convert both splits into token sequences.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(sentences_train.tolist())
tokenized_train, tokenized_test = (
    tokenizer.texts_to_sequences(texts) for texts in (sentences_train, sentences_test)
)

In [None]:
# Sequence length used for every model input. Tunable: per the original
# author's note, most comments are under 50 words and 200 covers roughly
# 98% of them.
maxlen = 200

# Bring both tokenized splits to the common `maxlen` length.
X_train, X_test = (
    pad_sequences(token_ids, maxlen=maxlen)
    for token_ids in (tokenized_train, tokenized_test)
)

In [None]:
embed_size = 128  # output dimension of the Embedding layer

inp = Input(shape=(maxlen, ))  # maxlen=200 as defined earlier

# Layers in the order they are applied to the input:
# embedding -> LSTM (returns the full sequence) -> max-pool over the sequence
# -> dropout -> dense(50, relu) -> dropout -> dense(6, sigmoid).
layer_stack = [
    Embedding(max_features, embed_size),
    LSTM(60, return_sequences=True, name='lstm_layer'),
    GlobalMaxPool1D(),
    Dropout(0.1),
    Dense(50, activation="relu"),
    Dropout(0.1),
    Dense(6, activation="sigmoid"),
]

# Thread the tensor through the stack; `x` ends up as the model output.
x = inp
for layer in layer_stack:
    x = layer(x)

model = Model(inputs=inp, outputs=x)
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Training hyperparameters.
batch_size, epochs = 32, 2

# Train on the padded sequences, holding out 10% of the training data
# for validation.
model.fit(
    X_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.1,
)

In [None]:
# Save the trained model to 'lstm.h5' in the working directory.
model.save(filepath='lstm.h5')

In [None]:
# Predict scores for the test set. Column i of `testPred` corresponds to
# classes[i], because the model was fitted on y_train = train[classes].values.
testPred = model.predict(X_test)

# Build the submission frame with the plain DataFrame constructor.
# pd.DataFrame.from_items (used here previously) was deprecated in pandas 0.23
# and removed in pandas 1.0, so it fails on current pandas versions.
# Reusing `classes` for the column names keeps them tied to the label order
# the model was trained with.
data_to_submit = pd.DataFrame(testPred, columns=classes)
# Keep 'id' as the first column, as before; .values avoids index alignment.
data_to_submit.insert(0, 'id', test["id"].values)

data_to_submit.to_csv('csv_to_submit.csv', index = False)
data_to_submit.head()

In [None]:
# Bidirectional LSTM
# Same architecture as the first model, except the recurrent layer is an
# LSTM wrapped in Bidirectional.
embed_size = 128  # output dimension of the Embedding layer

inp = Input(shape=(maxlen, ))  # maxlen=200 as defined earlier

# Layers in the order they are applied to the input.
layer_stack = [
    Embedding(max_features, embed_size),
    Bidirectional(LSTM(60, return_sequences=True)),
    GlobalMaxPool1D(),
    Dropout(0.1),
    Dense(50, activation="relu"),
    Dropout(0.1),
    Dense(6, activation="sigmoid"),
]

# Thread the tensor through the stack; `x` ends up as the model output.
x = inp
for layer in layer_stack:
    x = layer(x)

model2 = Model(inputs=inp, outputs=x)
model2.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Training hyperparameters.
batch_size, epochs = 32, 2

# Train with 10% of the training data held out for validation, then save
# the trained model to 'Bilstm.h5'.
model2.fit(
    X_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.1,
)
model2.save('Bilstm.h5')

# Predict scores for the test set with the bidirectional model. Column i of
# `testPred` corresponds to classes[i], because the model was fitted on
# y_train = train[classes].values.
testPred = model2.predict(X_test)

# Build the submission frame with the plain DataFrame constructor.
# pd.DataFrame.from_items (used here previously) was deprecated in pandas 0.23
# and removed in pandas 1.0, so it fails on current pandas versions.
# Reusing `classes` for the column names keeps them tied to the label order
# the model was trained with.
data_to_submit2 = pd.DataFrame(testPred, columns=classes)
# Keep 'id' as the first column, as before; .values avoids index alignment.
data_to_submit2.insert(0, 'id', test["id"].values)

data_to_submit2.to_csv('csv_to_submit2.csv', index = False)
data_to_submit2.head()