In [1]:
import logging
import pickle

import numpy as np
from sklearn import metrics
from sklearn.metrics import f1_score
import keras
from keras.models import Model, Sequential
from keras.layers import *
from keras.preprocessing.text import one_hot
from keras.optimizers import Adam
from tqdm import tnrange, tqdm_notebook
from keras.layers.advanced_activations import LeakyReLU
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from sklearn.utils import class_weight
from keras.callbacks import *

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
import gensim

In [3]:
import tensorflow as tf
from keras.backend.tensorflow_backend import set_session
# Create a TensorFlow session whose GPU options set
# per_process_gpu_memory_fraction to 0.1, then register it as the session
# Keras uses for everything that follows.
config = tf.ConfigProto()
config.gpu_options.per_process_gpu_memory_fraction = 0.1
set_session(tf.Session(config=config))

In [4]:
# Directory prefix under which the model architecture (.json) and weights (.h5)
# are written by the save cell.
MODEL_PATH = "../models/"

In [5]:
# Directory prefix of the pickled train/dev splits.
DATA_PATH = "../data/"


def _load_pickle(name):
    """Unpickle the file ``DATA_PATH + name`` and return the stored object.

    The file is opened in a ``with`` block so the handle is closed even when
    unpickling fails.
    """
    # NOTE: pickle.load can execute arbitrary code; only load files you trust.
    with open(DATA_PATH + name, "rb") as handle:
        return pickle.load(handle)


X_train = _load_pickle("X_train.p")
X_dev = _load_pickle("X_dev.p")
y_train = _load_pickle("y_train.p")
y_dev = _load_pickle("y_dev.p")

In [6]:
# Sanity check of the label array's dimensions (rows, label columns).
y_train.shape

(111699, 6)

In [7]:
# The 'comment_text' column of the training split; used to fit the tokenizer.
train_text = X_train['comment_text']

In [10]:
# Character-level tokenizer (char_level=True) fitted on the training comments.
# NOTE(review): oov_token = 0 passes an int rather than a token string, which
#   looks unintended -- confirm how the installed Keras version treats it.
# NOTE(review): `sequences` is not read by any later cell visible here; the
#   padded inputs are re-encoded from the raw text further down.
tokenizer = Tokenizer(num_words=18400, char_level=True, oov_token = 0)
tokenizer.fit_on_texts(train_text)
sequences = tokenizer.texts_to_sequences(train_text)

In [15]:
# Tokenizer vocabulary: maps each entry (a character, since the tokenizer is
# char-level) to its integer index.
char_index = tokenizer.word_index

In [41]:
# Parse the pretrained character embeddings into {character: float32 vector}.
# Each line is expected to be "<char> v1 v2 ...". split() discards leading
# whitespace, so a line that starts with ' ' is treated as the vector for the
# space character itself.
embeddings_index = {}
# `with` guarantees the file is closed even if a line fails to parse.
with open("../../../embeddings/pretrained_character_embeddings.txt") as f:
    for line in f:
        values = line.split()
        if not values:
            # Blank line: nothing to parse (previously raised IndexError on
            # values[0] or stored an empty vector for ' ').
            continue
        if line[0] != ' ':
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
        else:
            word = ' '
            coefs = np.asarray(values, dtype='float32')
        embeddings_index[word] = coefs
# The entry keyed by the two characters backslash + 'n' is re-keyed to a real
# newline. pop() raises KeyError if that entry is absent, as before.
embeddings_index['\n'] = embeddings_index.pop('\\n')

In [42]:
# Inspect which characters have a pretrained vector.
embeddings_index.keys()

dict_keys(['#', ']', '-', '{', 'O', '[', '/', 'S', 'l', '?', '$', 'x', '~', 'm', 't', 'I', 'b', 'F', 'h', 'Y', 'n', '"', 'U', ',', 'g', '0', 'k', 'T', '+', 'p', '!', 'y', 'Z', 's', 'N', '6', '5', '4', 'f', '\n', '1', '3', 'o', '8', 'q', 'V', '9', '}', 'C', 'e', '@', 'X', '7', 'c', 'w', 'L', '^', 'i', 'j', 'a', '2', "'", '&', 'R', 'E', 'r', ' ', '*', 'd', 'H', 'Q', 'z', ';', 'W', ')', 'D', 'A', 'v', 'G', '_', '|', 'K', 'J', 'M', '.', ':', '%', 'u', 'B', '(', 'P'])

In [None]:
# Fixed sequence length: used as the Embedding/Input length of the models and
# as `maxlen` when padding the encoded comments.
max_length = 300

In [45]:
print('Found %s word vectors.' % len(embeddings_index))
# Width of each embedding row; defined before first use so the matrix and the
# Embedding layer cannot drift apart.
embedding_size = 300
# Row i holds the pretrained vector of the character whose tokenizer index is
# i. The "+ 1" leaves room for index 0, which the Embedding layer below treats
# as the mask value (mask_zero = True).
embedding_matrix = np.zeros((len(char_index) + 1, embedding_size))
# Characters known to the tokenizer that have no pretrained vector.
oov = []
for char, i in char_index.items():
    try:
        embedding_vector = embeddings_index[char]
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector
    except KeyError:
        # Only a missing key counts as out-of-vocabulary. Any other error
        # (e.g. a vector of the wrong length) now surfaces instead of being
        # silently recorded as OOV.
        oov.append(char)
# Frozen (trainable = False) lookup layer initialised from the pretrained
# matrix and shared by every model built from it.
embedding_layer = Embedding(embedding_matrix.shape[0],
                            embedding_size,
                            weights=[embedding_matrix],
                            input_length=max_length, trainable = False, mask_zero = True)

Found 91 word vectors.


In [48]:
# Number of tokenizer entries without a pretrained vector (their embedding
# rows stay all-zero).
len(oov)

1972

In [50]:
# Encode each split's comments as character-index sequences and bring them to
# exactly max_length entries, padding at the end (padding='post').
x_train_texts = pad_sequences(
    tokenizer.texts_to_sequences(X_train['comment_text']),
    maxlen=max_length,
    padding='post',
)

x_dev_texts = pad_sequences(
    tokenizer.texts_to_sequences(X_dev['comment_text']),
    maxlen=max_length,
    padding='post',
)

In [51]:
# NOTE(review): `batch_norm` is not read by any code visible in this notebook.
batch_norm = False
# Number of LSTM models built for the ensemble.
num_ensembles = 11

def create_lstm_model():
    """Build one (uncompiled) character-level LSTM classifier.

    Architecture: int32 input of length ``max_length`` -> the module-level
    ``embedding_layer`` -> LSTM(20, returning the full sequence) -> LSTM(5)
    -> Dense(6, sigmoid).

    Returns:
        A Keras ``Model`` mapping the integer input to six sigmoid outputs.
    """
    char_ids = Input(shape=(max_length,), dtype='int32')
    hidden = embedding_layer(char_ids)
    hidden = LSTM(20, return_sequences=True)(hidden)
    hidden = LSTM(5)(hidden)
    label_probs = Dense(units=6, activation='sigmoid')(hidden)
    return Model(char_ids, label_probs)

# Build the ensemble members. Every call to create_lstm_model() creates new
# LSTM/Dense layers; all members reuse the same module-level embedding_layer.
lstm_models = [create_lstm_model() for _ in range(num_ensembles)]

# Feed one shared input to every member and concatenate their 6-way outputs.
ensemble_input = Input(shape=(max_length,))
concat = keras.layers.Concatenate()([lstm_model(ensemble_input) for lstm_model in lstm_models])
# averaged = keras.layers.average([lstm_model(ensemble_input) for lstm_model in lstm_models])
# Learned combination of the member outputs. The head needs one sigmoid unit
# per label (6, matching each member's output and the 6 label columns): a
# single linear unit cannot be fit against 6-column targets, and
# binary_crossentropy expects probabilities in [0, 1].
weighted = Dense(units=6, activation='sigmoid')(concat)
ensemble_lstm = Model(inputs=[ensemble_input], outputs=[weighted])
ensemble_lstm.compile(loss='binary_crossentropy',
                      optimizer='adam',
                      metrics=['acc'])

ensemble_lstm.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_12 (InputLayer)           (None, 300)          0                                            
__________________________________________________________________________________________________
model_1 (Model)                 (None, 6)            645436      input_12[0][0]                   
__________________________________________________________________________________________________
model_2 (Model)                 (None, 6)            645436      input_12[0][0]                   
__________________________________________________________________________________________________
model_3 (Model)                 (None, 6)            645436      input_12[0][0]                   
__________________________________________________________________________________________________
model_4 (M

In [52]:
# Base filename (without extension) used when saving the model's .json/.h5 files.
model_name = "keras_lstm_char_ensemble_classification_model"

In [None]:
# Disabled experiment: a ModelCheckpoint callback that would keep the best
# weights by val_loss. Left commented out; not used by the loop below.
# model_name = "cnn_hannes"
# check_point_path= "../models/" + model_name + ".h5"
# callbacks = [
#      ModelCheckpoint(check_point_path, monitor='val_loss', verbose=1, save_best_only=True, mode='auto')
# ]
# Compile and fit every ensemble member separately on the same train/dev data.
# NOTE(review): the loop variable `model` stays bound after the loop, so later
#   cells that reference `model` operate on the LAST member of lstm_models only.
for model in lstm_models:
    model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['acc'])
    # New callback objects for each member: halve the learning rate after 1
    # epoch without val_loss improvement, stop after 2 such epochs.
    callbacks = [
        keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=1),
        EarlyStopping(monitor='val_loss', min_delta=0, patience=2, verbose=0, mode='auto')
    ]
    model.fit(x_train_texts, y_train, validation_data=(x_dev_texts, y_dev),
              epochs=15, batch_size=50, callbacks = callbacks)

Train on 111699 samples, validate on 47872 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Train on 111699 samples, validate on 47872 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 9/15
Epoch 3/15

In [55]:
# Persist the architecture as JSON and the weights as HDF5 under
# MODEL_PATH + model_name.
# NOTE(review): `model` here is whatever the training loop left bound (the last
#   element of lstm_models), not `ensemble_lstm`, even though model_name says
#   "ensemble" -- confirm this is intended.
model_json = model.to_json()
with open(MODEL_PATH + model_name + ".json", "w") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5
model.save_weights(MODEL_PATH + model_name + ".h5")
print("Saved model to disk")
# Disabled: would reload checkpointed weights. Neither `ensemble_cnn` nor
# `check_point_path` is defined in the active code of this notebook.
# ensemble_cnn.load_weights(check_point_path)

Saved model to disk


In [57]:
# Evaluate `model` on the dev split; it was compiled with binary_crossentropy
# loss and the 'acc' metric, so the result is [loss, accuracy].
model.evaluate(x_dev_texts, y_dev)



[0.06006679061911054, 0.9800231369501129]

In [58]:
# Model outputs for the dev split; scored with mean_roc_auc below.
pred_dev = model.predict(x_dev_texts)

In [59]:
def mean_roc_auc(y_true, y_pred):
    """Return the mean of the per-column ROC AUC scores.

    One score is computed for each column index of ``y_true`` by calling
    ``metrics.roc_auc_score`` on that column of ``y_true`` and ``y_pred``.
    The list of per-column scores is printed before the mean is returned.

    Args:
        y_true: 2-D array of true labels, indexable as ``y_true[:, i]``.
        y_pred: 2-D array of scores with at least as many columns as ``y_true``.

    Returns:
        ``np.mean`` of the per-column scores.
    """
    roc_auc_scores = [
        metrics.roc_auc_score(y_true[:, col], y_pred[:, col])
        for col in range(y_true.shape[1])
    ]
    print(roc_auc_scores)
    return np.mean(roc_auc_scores)

In [60]:
# Mean per-label ROC AUC of the dev-set predictions.
mean_roc_auc(y_dev, pred_dev)

[0.9418627723516417, 0.9806486401237782, 0.9729367663499064, 0.928991961918, 0.9643282237926842, 0.939841603038536]


0.954768327929091

In [61]:
# Base filename for the submission CSV written below. Same value as the
# model_name assigned in the earlier cell, re-assigned here.
model_name = 'keras_lstm_char_ensemble_classification_model'

In [62]:
import pandas as pd

In [63]:
# Predict on the official test set and write a submission CSV.
X_official_test = pd.read_csv("../data/raw/test.csv")
# Use a dedicated name for the encoded test comments. Storing them in
# `x_dev_texts` (as before) overwrote the dev inputs, so re-running the
# evaluation cells afterwards paired test inputs with y_dev.
x_test_texts = tokenizer.texts_to_sequences(X_official_test['comment_text'])
x_test_texts = pad_sequences(x_test_texts, maxlen=max_length, padding='post')
pred = model.predict(x_test_texts)
# Label columns, in the order of the model's six outputs.
# NOTE(review): this assumes y_train's columns are in this same order -- confirm.
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
# Fill the label columns of the sample submission and save it under model_name.
sample_submission = pd.read_csv('../submissions/sample_submission.csv')
sample_submission[list_classes] = pred
sample_submission.to_csv("../submissions/" + model_name + ".csv", index=False)