In [1]:
import logging
import pickle
import gensim
from sklearn import metrics
from sklearn.metrics import f1_score
import keras
from keras.models import Model, Sequential
from keras.layers import *
from keras.preprocessing.text import one_hot
from keras.optimizers import Adam
from tqdm import tnrange, tqdm_notebook
from keras.layers.advanced_activations import LeakyReLU
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from sklearn.utils import class_weight

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
import tensorflow as tf
from keras.backend.tensorflow_backend import set_session
config = tf.ConfigProto()
config.gpu_options.per_process_gpu_memory_fraction = 0.3
set_session(tf.Session(config=config))

In [3]:
MODEL_PATH = "../models/"

DATA_PATH = "../data/"
X_train = pickle.load(open(DATA_PATH + "X_train.p", "rb"))
X_dev = pickle.load(open(DATA_PATH + "X_dev.p", "rb"))
y_train = pickle.load(open(DATA_PATH + "y_train.p", "rb"))
y_dev = pickle.load(open(DATA_PATH + "y_dev.p", "rb"))

In [4]:
y_train.shape

(111699, 6)

In [5]:
train_text = X_train['comment_text']

In [6]:
tokenizer = Tokenizer(num_words=18400)
tokenizer.fit_on_texts(train_text)
sequences = tokenizer.texts_to_sequences(train_text)
word_index = tokenizer.word_index

In [7]:
embedding_layer = Embedding(len(word_index) + 1,
                            300,
                            input_length=200, trainable = True)

In [8]:
max_length = 200

x_train_texts = tokenizer.texts_to_sequences(X_train['comment_text'])
x_train_texts = pad_sequences(x_train_texts, maxlen=max_length, padding='post')

x_dev_texts = tokenizer.texts_to_sequences(X_dev['comment_text'])
x_dev_texts = pad_sequences(x_dev_texts, maxlen=max_length, padding='post')

In [9]:
from keras.callbacks import ModelCheckpoint

In [10]:
filepath="../models/lstm_model_custom-embeddings-{epoch:02d}-{val_loss:.2f}.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='max')
callbacks_list = [checkpoint]

In [11]:
sequence_input = Input(shape=(max_length,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
lstm = keras.layers.CuDNNLSTM(10, return_sequences=True)(embedded_sequences)
lstm2 = keras.layers.CuDNNLSTM(2)(lstm)
output = Dense(units=6, activation='sigmoid')(lstm2)

model = Model(sequence_input, output)
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['acc'])

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 200)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 200, 300)          50649300  
_________________________________________________________________
cu_dnnlstm_1 (CuDNNLSTM)     (None, 200, 10)           12480     
_________________________________________________________________
cu_dnnlstm_2 (CuDNNLSTM)     (None, 2)                 112       
_________________________________________________________________
dense_1 (Dense)              (None, 6)                 18        
Total params: 50,661,910
Trainable params: 50,661,910
Non-trainable params: 0
_________________________________________________________________


In [12]:
model.fit(x_train_texts, y_train, validation_data=(x_dev_texts, y_dev),
          epochs=20, batch_size=20, callbacks=callbacks_list)

Train on 111699 samples, validate on 47872 samples
Epoch 1/20

Epoch 00001: val_loss improved from -inf to 0.14057, saving model to ../models/lstm_model_custom-embeddings-01-0.14.hdf5
Epoch 2/20

Epoch 00002: val_loss did not improve
Epoch 3/20

Epoch 00003: val_loss did not improve
Epoch 4/20

Epoch 00004: val_loss did not improve
Epoch 5/20

Epoch 00005: val_loss did not improve
Epoch 6/20

Epoch 00006: val_loss did not improve
Epoch 7/20

Epoch 00007: val_loss did not improve
Epoch 8/20

Epoch 00008: val_loss did not improve
Epoch 9/20

Epoch 00009: val_loss did not improve
Epoch 10/20

Epoch 00010: val_loss did not improve
Epoch 11/20

Epoch 00011: val_loss did not improve
Epoch 12/20

Epoch 00012: val_loss did not improve
Epoch 13/20

Epoch 00013: val_loss did not improve
Epoch 14/20

Epoch 00014: val_loss did not improve
Epoch 15/20

Epoch 00015: val_loss did not improve
Epoch 16/20

Epoch 00016: val_loss did not improve
Epoch 17/20

Epoch 00017: val_loss did not improve
Epoch 18

<keras.callbacks.History at 0x7f59a048a630>

In [13]:
model_json = model.to_json()
with open(MODEL_PATH + "keras_lstm_custom-embeddings_classification_model.json", "w") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5
model.save_weights(MODEL_PATH + "keras_lstm_custom-embeddings_classification_model.h5")
print("Saved model to disk")

Saved model to disk


In [14]:
model.evaluate(x_dev_texts, y_dev)



[0.09024629942191957, 0.9758905893102049]

In [15]:
pred_dev = model.predict(x_dev_texts)

In [16]:
def mean_roc_auc(y_true, y_pred):
    roc_auc_scores = []
    for i in range(0, y_true.shape[1]):
        roc_auc_scores.append(metrics.roc_auc_score(y_true[:, i], y_pred[:, i]))
    print(roc_auc_scores)
    return np.mean(roc_auc_scores)

In [17]:
mean_roc_auc(y_dev, pred_dev)

[0.9365083563905039, 0.9803120732122266, 0.978528468700612, 0.8611177345498279, 0.9683603238085808, 0.8471583745159578]


0.9286642218629515

In [18]:
model_name = 'keras_lstm_custom-embeddings_classification_model'

In [19]:
import pandas as pd

In [20]:
X_official_test = pd.read_csv("../data/raw/test.csv")
x_dev_texts = tokenizer.texts_to_sequences(X_official_test['comment_text'])
x_dev_texts = pad_sequences(x_dev_texts, maxlen=max_length, padding='post')
pred = model.predict(x_dev_texts)
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
sample_submission = pd.read_csv('../submissions/sample_submission.csv')
sample_submission[list_classes] = pred
sample_submission.to_csv("../submissions/" + model_name + ".csv", index=False)