In [1]:
import logging
import pickle

# numpy is used directly as `np` below; import it explicitly instead of relying on
# the name leaking in through one of the Keras star-imports.
import numpy as np
from sklearn import metrics
from sklearn.metrics import f1_score
import keras
from keras.models import Model, Sequential
from keras.layers import *
from keras.preprocessing.text import one_hot
from keras.optimizers import Adam
from tqdm import tnrange, tqdm_notebook
from keras.layers.advanced_activations import LeakyReLU
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from sklearn.utils import class_weight
from keras.callbacks import *

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
import tensorflow as tf
from keras.backend.tensorflow_backend import set_session

# Build a TensorFlow session whose per-process GPU memory fraction is 0.3 and
# register it as the session Keras uses.
config = tf.ConfigProto(
    gpu_options=tf.GPUOptions(per_process_gpu_memory_fraction=0.3)
)
set_session(tf.Session(config=config))

Things to include (TODO):
* Bias initialization with 1
* Dropout (recurrent?): https://github.com/tensorflow/tensorflow/blob/v1.3.0/tensorflow/contrib/keras/python/keras/layers/recurrent.py#L140
* Adam / SGD with learning-rate annealing + momentum
* Self-attention
* Bayesian optimization for hyperparameters
* Gradient-norm clipping

In [3]:
# Directory prefix under which the model architecture (JSON) and weights (HDF5)
# are written by the save cell further down.
MODEL_PATH = "../models/"

In [4]:
DATA_PATH = "../data/"


def _load_pickle(file_name):
    """Unpickle `DATA_PATH + file_name` and return the stored object.

    The file is opened in a `with` block so the handle is closed as soon as the
    object has been read, instead of being left open until garbage collection.

    NOTE: `pickle.load` can execute arbitrary code; only use it on files
    produced by this project's own preprocessing.
    """
    with open(DATA_PATH + file_name, "rb") as handle:
        return pickle.load(handle)


# Train / dev inputs and their label arrays, as produced by an earlier
# preprocessing step (not shown in this notebook).
X_train = _load_pickle("X_train.p")
X_dev = _load_pickle("X_dev.p")
y_train = _load_pickle("y_train.p")
y_dev = _load_pickle("y_dev.p")

In [5]:
# Sanity check: display the shape of the training target array.
y_train.shape

(111699, 6)

In [6]:
# Training comments; these are what the tokenizer is fitted on in the next cell.
train_text = X_train['comment_text']

In [7]:
# Fit a Keras tokenizer on the training comments. `num_words=18400` limits the
# vocabulary used when texts are converted to index sequences (presumably the
# most frequent words are kept -- confirm against the Keras Tokenizer docs).
tokenizer = Tokenizer(num_words=18400)
tokenizer.fit_on_texts(train_text)
# NOTE(review): `sequences` is not referenced again in the visible cells; the
# model inputs are rebuilt from `tokenizer.texts_to_sequences` in a later cell.
sequences = tokenizer.texts_to_sequences(train_text)

In [8]:
word_index = tokenizer.word_index

# Map each GloVe token to its 300-d vector. Every line of the file has the form
# "<token> <v1> <v2> ... <v300>" separated by whitespace.
embeddings_index = {}
# Open with an explicit UTF-8 encoding so parsing does not depend on the
# platform's default locale, and use `with` so the handle is closed even if a
# malformed line makes the parsing raise.
with open('../../../embeddings/glove.6B.300d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

In [9]:
print('Found %s word vectors.' % len(embeddings_index))

embedding_size = 300

# One row per tokenizer index plus one extra row; rows whose word has no
# pretrained vector keep their all-zero initialisation.
embedding_matrix = np.zeros((len(word_index) + 1, embedding_size))
for token, row in word_index.items():
    pretrained = embeddings_index.get(token)
    if pretrained is None:
        continue
    embedding_matrix[row] = pretrained

# Frozen embedding layer initialised from the pretrained matrix. It is created
# once here and reused by every model built later in the notebook.
embedding_layer = Embedding(
    embedding_matrix.shape[0],
    embedding_size,
    weights=[embedding_matrix],
    input_length=200,
    trainable=False,
)

Found 400000 word vectors.


In [10]:
max_length = 200


def _to_padded_sequences(texts):
    """Convert `texts` to index sequences and post-pad them to `max_length`."""
    return pad_sequences(tokenizer.texts_to_sequences(texts),
                         maxlen=max_length, padding='post')


# Fixed-length integer inputs for the train and dev splits.
x_train_texts = _to_padded_sequences(X_train['comment_text'])
x_dev_texts = _to_padded_sequences(X_dev['comment_text'])

In [11]:
# NOTE(review): `batch_norm` is not read anywhere in the visible cells.
batch_norm = False
# Number of LSTM models that are built and averaged into the ensemble below.
num_ensembles = 11

def create_lstm_model():
    """Build one (uncompiled) ensemble member.

    The network takes `max_length` integer token ids, looks them up in the
    shared `embedding_layer`, runs two stacked CuDNN LSTMs (20 then 5 units,
    both returning the full sequence), flattens the result and maps it to six
    sigmoid outputs.

    Returns:
        A Keras `Model` from the token-id input to the six sigmoid outputs.
    """
    token_ids = Input(shape=(max_length,), dtype='int32')
    hidden = embedding_layer(token_ids)
    # Stacked recurrent layers; each keeps the per-timestep outputs.
    for units in (20, 5):
        hidden = CuDNNLSTM(units, return_sequences=True)(hidden)
    flattened = Flatten()(hidden)
    probabilities = Dense(units=6, activation='sigmoid')(flattened)
    return Model(token_ids, probabilities)

# Build `num_ensembles` separate members; each call creates its own LSTM and
# dense layers (the embedding layer is shared).
lstm_models = [create_lstm_model() for _ in range(num_ensembles)]

# Feed one common input to every member and average their outputs.
# NOTE: the variable is called `ensemble_cnn`, but its members are LSTM models.
ensemble_input = Input(shape=(max_length,))
member_outputs = [lstm_model(ensemble_input) for lstm_model in lstm_models]
averaged = keras.layers.average(member_outputs)
ensemble_cnn = Model(inputs=[ensemble_input], outputs=[averaged])
ensemble_cnn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])

ensemble_cnn.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_12 (InputLayer)           (None, 200)          0                                            
__________________________________________________________________________________________________
model_1 (Model)                 (None, 6)            50661446    input_12[0][0]                   
__________________________________________________________________________________________________
model_2 (Model)                 (None, 6)            50661446    input_12[0][0]                   
__________________________________________________________________________________________________
model_3 (Model)                 (None, 6)            50661446    input_12[0][0]                   
__________________________________________________________________________________________________
model_4 (M

In [12]:
# model_name = "cnn_hannes"
# check_point_path= "../models/" + model_name + ".h5"
# callbacks = [
#      ModelCheckpoint(check_point_path, monitor='val_loss', verbose=1, save_best_only=True, mode='auto')
# ]
# Train every ensemble member separately on the same train/dev split.
for model in lstm_models:
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])

    # Fresh callback instances per member so no state carries over between runs:
    # halve the learning rate after 1 epoch without val_loss improvement and
    # stop after 2 such epochs.
    lr_on_plateau = keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=1)
    early_stopping = EarlyStopping(monitor='val_loss', min_delta=0, patience=2, verbose=0, mode='auto')
    callbacks = [lr_on_plateau, early_stopping]

    model.fit(
        x_train_texts,
        y_train,
        validation_data=(x_dev_texts, y_dev),
        epochs=15,
        batch_size=50,
        callbacks=callbacks,
    )

Train on 111699 samples, validate on 47872 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Train on 111699 samples, validate on 47872 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Train on 111699 samples, validate on 47872 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Train on 111699 samples, validate on 47872 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Train on 111699 samples, validate on 47872 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Train on 111699 samples, validate on 47872 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Train on 111699 samples, validate on 47872 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch

In [13]:
# Serialize the architecture of the ensemble itself. `model` is only the loop
# variable left over from training (the last single member), so its JSON would
# not match the ensemble weights written below.
model_json = ensemble_cnn.to_json()
with open(MODEL_PATH + "keras_lstm_ensemble_classification_model.json", "w") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5
ensemble_cnn.save_weights(MODEL_PATH + "keras_lstm_ensemble_classification_model.h5")
print("Saved model to disk")
# ensemble_cnn.load_weights(check_point_path)

Saved model to disk


In [14]:
# Score the averaged ensemble on the dev split; the result lists the loss
# followed by the 'acc' metric it was compiled with.
ensemble_cnn.evaluate(x_dev_texts, y_dev)



[0.047678700197372276, 0.9823104860310886]

In [15]:
# Ensemble predictions for the dev split, used for the ROC AUC check below.
pred_dev = ensemble_cnn.predict(x_dev_texts)

In [16]:
def mean_roc_auc(y_true, y_pred):
    """Print the per-column ROC AUC scores and return their mean.

    Args:
        y_true: 2-D array of ground-truth labels, indexed as `y_true[:, column]`.
        y_pred: 2-D array of predicted scores with the same column layout.

    Returns:
        The mean of the column-wise ROC AUC scores.
    """
    per_column_auc = [
        metrics.roc_auc_score(y_true[:, column], y_pred[:, column])
        for column in range(y_true.shape[1])
    ]
    print(per_column_auc)
    return np.mean(per_column_auc)

In [17]:
# Column-wise and mean ROC AUC of the ensemble predictions on the dev split.
mean_roc_auc(y_dev, pred_dev)

[0.9754723279166753, 0.9884952389090977, 0.9863664084279117, 0.9768081833663581, 0.9814469156121579, 0.9754583762725628]


0.9806745750841274

In [18]:
# Base file name of the submission CSV written in the final cell.
# NOTE(review): this differs from the "keras_lstm_ensemble_..." stem used when
# the model files were saved -- confirm whether that is intentional.
model_name = 'keras_ensemble_lstm_classification_model'

In [19]:
import pandas as pd

In [21]:
X_official_test = pd.read_csv("../data/raw/test.csv")

# Keep the test inputs in their own variable. Reusing `x_dev_texts` would
# silently replace the dev inputs, so re-running the dev evaluation cells
# afterwards would score test comments against dev labels.
x_test_texts = tokenizer.texts_to_sequences(X_official_test['comment_text'])
x_test_texts = pad_sequences(x_test_texts, maxlen=max_length, padding='post')

# Predict with the averaged ensemble. `model` is only the loop variable left
# over from training, i.e. the last single ensemble member.
pred = ensemble_cnn.predict(x_test_texts)

# Column order of the six model outputs in the submission file.
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
sample_submission = pd.read_csv('../submissions/sample_submission.csv')
sample_submission[list_classes] = pred
sample_submission.to_csv("../submissions/" + model_name + ".csv", index=False)