In [1]:
import logging
import pickle

import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.metrics import f1_score
from sklearn.utils import class_weight
from tqdm import tnrange, tqdm_notebook

import keras
from keras.models import Model, Sequential
from keras.layers import *
from keras.preprocessing.text import one_hot
from keras.optimizers import Adam
from keras.layers.advanced_activations import LeakyReLU
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.callbacks import *

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
import tensorflow as tf
from keras.backend.tensorflow_backend import set_session

# Cap this process at a 0.3 fraction of GPU memory and hand the resulting
# TensorFlow session to Keras.
config = tf.ConfigProto(
    gpu_options=tf.GPUOptions(per_process_gpu_memory_fraction=0.3)
)
set_session(tf.Session(config=config))

Things to include:
* bias initialization with 1
* dropout (recurrent?) https://github.com/tensorflow/tensorflow/blob/v1.3.0/tensorflow/contrib/keras/python/keras/layers/recurrent.py#L140 
* Adam / SGD with learning rate annealing + momentum
* self-attention
* Bayesian Optimization for hyperparameters
* Gradient norm clipping

In [3]:
# Directory prefix for the serialized model files (architecture JSON and HDF5 weights).
MODEL_PATH = "../models/"

In [4]:
DATA_PATH = "../data/"


def _load_pickle(name):
    """Unpickle ``DATA_PATH + name`` and close the file handle afterwards.

    NOTE: unpickling can execute arbitrary code, so this must only be used on
    files produced by this project's own preprocessing, never on untrusted data.
    """
    with open(DATA_PATH + name, "rb") as handle:
        return pickle.load(handle)


# Pre-split features and labels for the train and dev sets.
X_train = _load_pickle("X_train.p")
X_dev = _load_pickle("X_dev.p")
y_train = _load_pickle("y_train.p")
y_dev = _load_pickle("y_dev.p")

In [5]:
# Sanity check: dimensions of the training targets.
y_train.shape

(111699, 6)

In [6]:
# The 'comment_text' column of the training split; the tokenizer is fitted on it.
train_text = X_train['comment_text']

In [7]:
# Fit the vocabulary on the training comments only; num_words=18400 limits
# how many words the tokenizer uses when converting text to index sequences.
tokenizer = Tokenizer(num_words=18400)
tokenizer.fit_on_texts(train_text)
# Index sequences for the training comments (the padded model inputs are
# derived separately from the same tokenizer).
sequences = tokenizer.texts_to_sequences(train_text)

In [8]:
word_index = tokenizer.word_index

# Parse the GloVe text file into {token: float32 vector}. Each line holds the
# token first, followed by its vector components, all whitespace-separated.
# The context manager guarantees the file is closed even if a line fails to parse.
# NOTE(review): no explicit encoding is passed, so the platform default is
# used -- consider encoding='utf-8' if this only ever runs on Python 3.
embeddings_index = {}
with open('../../../embeddings/glove.6B.300d.txt') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

In [9]:
print('Found %s word vectors.' % len(embeddings_index))

# Dimensionality of the pretrained vectors.
embedding_size = 300

# One row per tokenizer index, plus one extra row (presumably because
# word_index values start at 1 -- confirm). Rows start as zeros.
embedding_matrix = np.zeros((len(word_index) + 1, embedding_size))
for token, row in word_index.items():
    vector = embeddings_index.get(token)
    if vector is None:
        # No pretrained vector for this token: its row stays all-zeros.
        continue
    embedding_matrix[row] = vector

# Frozen embedding layer initialised from the pretrained matrix.
embedding_layer = Embedding(
    embedding_matrix.shape[0],
    embedding_size,
    weights=[embedding_matrix],
    input_length=200,
    trainable=False,
)

Found 400000 word vectors.


In [10]:
max_length = 200


def _to_padded_ids(texts):
    """Convert texts to tokenizer index sequences, post-padded to ``max_length``."""
    id_sequences = tokenizer.texts_to_sequences(texts)
    return pad_sequences(id_sequences, maxlen=max_length, padding='post')


# Model-ready integer inputs for the train and dev splits.
x_train_texts = _to_padded_ids(X_train['comment_text'])
x_dev_texts = _to_padded_ids(X_dev['comment_text'])

In [29]:
# When True, create_conv_model() inserts BatchNormalization after its pooling stages.
batch_norm = False
# Number of separately constructed CNNs whose outputs are averaged in the ensemble.
num_ensembles = 21

def create_conv_model():
    """Build one (uncompiled) 1-D CNN classifier on top of the shared embedding.

    The network applies five Conv1D(tanh) + MaxPooling1D(2) stages to the
    embedded sequence, flattens the result, and passes it through a
    1024-unit tanh Dense layer followed by a 6-unit sigmoid output layer.
    When the module-level ``batch_norm`` flag is true, BatchNormalization is
    added after every pooling stage except the last.

    Reads the module-level ``max_length``, ``embedding_size``,
    ``embedding_layer`` and ``batch_norm``. Every call reuses the same
    ``embedding_layer`` instance.

    Returns:
        A Keras ``Model`` mapping an int32 ``(max_length,)`` input to 6 sigmoid outputs.
    """
    sequence_input = Input(shape=(max_length,), dtype='int32')
    features = embedding_layer(sequence_input)

    # (filters, kernel_size, extra Conv1D kwargs) for each stage, in order.
    conv_specs = [
        (64, 5, {'padding': 'same', 'input_shape': (max_length, embedding_size)}),
        (128, 5, {}),
        (128, 3, {}),
        (128, 2, {}),
        (128, 2, {}),
    ]
    last_stage = len(conv_specs) - 1
    for stage, (filters, kernel_size, extra) in enumerate(conv_specs):
        features = Conv1D(filters, kernel_size, activation='tanh', **extra)(features)
        features = MaxPooling1D(pool_size=2)(features)
        # The final stage feeds Flatten directly and never gets batch norm.
        if batch_norm and stage != last_stage:
            features = BatchNormalization()(features)

    flattened = Flatten()(features)
    hidden = Dense(1024, activation='tanh')(flattened)
    output = Dense(units=6, activation='sigmoid')(hidden)
    return Model(sequence_input, output)

# Instantiate the individual ensemble members.
conv_models = [create_conv_model() for _idx in range(num_ensembles)]

# Feed a single shared input to every member and average their outputs.
ensemble_input = Input(shape=(max_length,))
member_outputs = [member(ensemble_input) for member in conv_models]
averaged = keras.layers.average(member_outputs)

ensemble_cnn = Model(inputs=[ensemble_input], outputs=[averaged])
ensemble_cnn.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['acc'],
)

ensemble_cnn.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_90 (InputLayer)           (None, 200)          0                                            
__________________________________________________________________________________________________
model_67 (Model)                (None, 6)            51564058    input_90[0][0]                   
__________________________________________________________________________________________________
model_68 (Model)                (None, 6)            51564058    input_90[0][0]                   
__________________________________________________________________________________________________
model_69 (Model)                (None, 6)            51564058    input_90[0][0]                   
__________________________________________________________________________________________________
model_70 (

In [30]:
# model_name = "cnn_hannes"
# check_point_path= "../models/" + model_name + ".h5"
# callbacks = [
#      ModelCheckpoint(check_point_path, monitor='val_loss', verbose=1, save_best_only=True, mode='auto')
# ]

# Train each ensemble member on its own, using the same train/dev data.
# After the loop, `model` stays bound to the last member of conv_models.
for model in conv_models:
    model.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['acc'],
    )

    # Callbacks are constructed anew for every member.
    # NOTE(review): ReduceLROnPlateau is given no patience; if the library
    # default is not smaller than the 2-epoch early-stopping patience, the
    # learning-rate reduction may never trigger -- confirm.
    lr_on_plateau = keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5)
    early_stop = EarlyStopping(monitor='val_loss', min_delta=0, patience=2, verbose=0, mode='auto')
    callbacks = [lr_on_plateau, early_stop]

    model.fit(
        x_train_texts,
        y_train,
        validation_data=(x_dev_texts, y_dev),
        epochs=10,
        batch_size=50,
        callbacks=callbacks,
    )

Train on 111699 samples, validate on 47872 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Train on 111699 samples, validate on 47872 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Train on 111699 samples, validate on 47872 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Train on 111699 samples, validate on 47872 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Train on 111699 samples, validate on 47872 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Train on 111699 samples, validate on 47872 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Train on 111699 samples, validate on 47872 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Train on 111699 samples, validate on 47872 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Train on 111699 samples, validate on 47872 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Tr

Epoch 7/10
Train on 111699 samples, validate on 47872 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Train on 111699 samples, validate on 47872 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Train on 111699 samples, validate on 47872 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Train on 111699 samples, validate on 47872 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Train on 111699 samples, validate on 47872 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Train on 111699 samples, validate on 47872 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Train on 111699 samples, validate on 47872 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Train on 111699 samples, validate on 47872 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Train on 111699 samples, validate on 47872 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Ep

Epoch 5/10
Train on 111699 samples, validate on 47872 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10


In [31]:
# Persist the ensemble: architecture as JSON, weights as HDF5.
# Both files are taken from ensemble_cnn so that they describe the same
# network; `model` at this point is only the last member left over from the
# training loop and would not match the ensemble weights.
model_json = ensemble_cnn.to_json()
with open(MODEL_PATH + "keras_cnn_hannes_classification_model.json", "w") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5
ensemble_cnn.save_weights(MODEL_PATH + "keras_cnn_hannes_classification_model.h5")
print("Saved model to disk")
# ensemble_cnn.load_weights(check_point_path)

Saved model to disk


In [32]:
# Evaluate the averaged ensemble on the dev split; the model was compiled with
# binary cross-entropy loss and the 'acc' metric.
ensemble_cnn.evaluate(x_dev_texts, y_dev)



[0.049636824942616634, 0.9817290753126144]

In [33]:
# Ensemble predictions for the dev inputs, used for the ROC AUC computation.
pred_dev = ensemble_cnn.predict(x_dev_texts)

In [34]:
def mean_roc_auc(y_true, y_pred):
    """Print the per-column ROC AUC scores and return their mean.

    Args:
        y_true: 2-D array-like of ground-truth labels, one column per class.
        y_pred: 2-D array-like of predicted scores with matching columns.

    Returns:
        The mean of the column-wise ROC AUC values.
    """
    # Coerce to ndarrays so that DataFrames and nested lists also support
    # the [:, col] column slicing used below.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    roc_auc_scores = [
        metrics.roc_auc_score(y_true[:, col], y_pred[:, col])
        for col in range(y_true.shape[1])
    ]
    print(roc_auc_scores)
    return np.mean(roc_auc_scores)

In [35]:
# Per-column and mean ROC AUC of the ensemble predictions on the dev split.
mean_roc_auc(y_dev, pred_dev)

[0.9731841249300874, 0.9871194006182306, 0.9848754006123026, 0.9650801528504815, 0.9801902029421672, 0.9631875156142653]


0.9756061329279224

In [39]:
# Base file name (without extension) of the submission CSV.
model_name = 'keras_ensemble_cnn_classification_model'

In [37]:
import pandas as pd

In [40]:
# Score the official test set and write the submission file.
X_official_test = pd.read_csv("../data/raw/test.csv")

# Encode the test comments under their own name so the dev inputs in
# x_dev_texts are not overwritten; re-running the dev evaluation stays valid.
x_test_texts = tokenizer.texts_to_sequences(X_official_test['comment_text'])
x_test_texts = pad_sequences(x_test_texts, maxlen=max_length, padding='post')

# Predict with the averaged ensemble, which is what model_name advertises.
# `model` here would only be the last single member from the training loop.
pred = ensemble_cnn.predict(x_test_texts)

list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
sample_submission = pd.read_csv('../submissions/sample_submission.csv')
sample_submission[list_classes] = pred
sample_submission.to_csv("../submissions/" + model_name + ".csv", index=False)