In [1]:
import logging
import pickle
from sklearn import metrics
from sklearn.metrics import f1_score
import keras
from keras.models import Model, Sequential
from keras.layers import *
from keras.preprocessing.text import one_hot
from keras.optimizers import Adam
from tqdm import tnrange, tqdm_notebook
from keras.layers.advanced_activations import LeakyReLU
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from sklearn.utils import class_weight
from keras.callbacks import *

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
import gensim

In [3]:
import tensorflow as tf
from keras.backend.tensorflow_backend import set_session
# Cap the fraction of GPU memory this process may allocate at 0.2, then
# register the resulting TensorFlow session as the one Keras uses.
config = tf.ConfigProto()
config.gpu_options.per_process_gpu_memory_fraction = 0.2
set_session(tf.Session(config=config))

In [4]:
# Directory prefix for the saved model architecture (.json) and weights (.h5).
MODEL_PATH = "../models/"

In [5]:
DATA_PATH = "../data/"


def _load_pickle(file_name):
    """Unpickle ``DATA_PATH + file_name`` and close the file afterwards.

    NOTE: unpickling can execute arbitrary code, so only use this on files
    produced by this project.
    """
    # The context manager closes the handle even if unpickling fails.
    with open(DATA_PATH + file_name, "rb") as pickle_file:
        return pickle.load(pickle_file)


# Pre-split train/dev inputs and labels.
X_train = _load_pickle("X_train.p")
X_dev = _load_pickle("X_dev.p")
y_train = _load_pickle("y_train.p")
y_dev = _load_pickle("y_dev.p")

In [6]:
# Sanity check of the training labels' dimensions.
y_train.shape

(111699, 6)

In [7]:
# Training comments; the tokenizer vocabulary is fitted on these.
train_text = X_train['comment_text']

In [8]:
# Fit the vocabulary on the training comments only.
# num_words caps the vocabulary used when converting texts to sequences;
# tokenizer.word_index still lists every word seen during fitting (the
# embedding matrices built from it have far more than 18400 rows).
tokenizer = Tokenizer(num_words=18400)
tokenizer.fit_on_texts(train_text)
# NOTE(review): `sequences` is not read again in this notebook; the padded
# model inputs are rebuilt from X_train / X_dev in a later cell.
sequences = tokenizer.texts_to_sequences(train_text)

In [9]:
# word -> integer index mapping; used to place pretrained vectors into the
# rows of the embedding matrices.
word_index = tokenizer.word_index

In [10]:
from gensim.models.wrappers import FastText
# Load the fastText crawl vectors from a word2vec-format file (non-binary,
# since `binary` is left at its default).
embeddings_index_fasttext = gensim.models.KeyedVectors.load_word2vec_format('../../../embeddings/crawl-300d-2M.vec')

In [11]:
# Load the GoogleNews word2vec vectors from the binary word2vec format.
embeddings_index_w2v = gensim.models.KeyedVectors.load_word2vec_format('../../../embeddings/GoogleNews-vectors-negative300.bin', binary = True)

In [12]:
# Fixed number of token ids per comment fed to the model.
max_length = 200


def _texts_to_padded(texts):
    """Convert raw texts to token-id sequences of exactly ``max_length`` ids.

    Uses the already fitted ``tokenizer``; padding is added at the end of
    each sequence (``padding='post'``).
    """
    token_ids = tokenizer.texts_to_sequences(texts)
    return pad_sequences(token_ids, maxlen=max_length, padding='post')


x_train_texts = _texts_to_padded(X_train['comment_text'])
x_dev_texts = _texts_to_padded(X_dev['comment_text'])

In [13]:
# numpy is never imported explicitly in this notebook; import it here instead
# of relying on a wildcard import to leak `np` into the namespace.
import numpy as np

# Dimensionality of both pretrained embeddings.
embedding_size = 300


def _build_embedding(keyed_vectors):
    """Build a frozen Keras Embedding layer from pretrained word vectors.

    Row ``i`` of the weight matrix holds the pretrained vector of the word
    whose index in ``word_index`` is ``i``. Words without a pretrained
    vector keep an all-zero row.

    Args:
        keyed_vectors: gensim KeyedVectors supporting ``vocab`` and
            ``keyed_vectors[word]`` lookup.

    Returns:
        ``(layer, matrix, missing)``: the non-trainable Embedding layer, the
        weight matrix it was initialised with, and the list of words that
        were not found in ``keyed_vectors``.
    """
    print('Found %s word vectors.' % len(keyed_vectors.vocab))
    matrix = np.zeros((len(word_index) + 1, embedding_size))
    missing = []
    for word, i in word_index.items():
        try:
            matrix[i] = keyed_vectors[word]
        except KeyError:
            # Only "word not in vocabulary" is expected here; any other
            # error should surface instead of being counted as OOV.
            missing.append(word)
    layer = Embedding(matrix.shape[0],
                      embedding_size,
                      weights=[matrix],
                      input_length=max_length, trainable=False)
    return layer, matrix, missing


# Build the word2vec layer first, then the fastText layer.
embedding_layer_w2v, _, _ = _build_embedding(embeddings_index_w2v)
# embedding_matrix and oov describe the fastText lookup, which ran last.
embedding_layer_fasttext, embedding_matrix, oov = _build_embedding(embeddings_index_fasttext)

Found 3000000 word vectors.
Found 2000000 word vectors.


In [14]:
# Number of distinct tokenizer words without a pretrained vector.
# `oov` holds the misses of the fastText lookup only, because it is the last
# embedding processed in the previous cell.
len(set(oov))

80572

In [22]:
# Both non-trainable embedding layers read the same padded token-id input.
sequence_input = Input(shape=(max_length,), dtype='int32')
embedded_sequences_w2v = embedding_layer_w2v(sequence_input)
embedded_sequences_fasttext = embedding_layer_fasttext(sequence_input)
# Join the two 300-d embeddings per token: (None, 200, 600) in model.summary().
concat = keras.layers.Concatenate()([embedded_sequences_w2v, embedded_sequences_fasttext])
flatten = Flatten()(concat)
# No activation argument is given, so this is a plain projection to 20 units.
dense = Dense(20)(flatten)
# Present the 20 dense outputs to the LSTM as 20 steps of one feature each.
reshape = Reshape((-1, 1))(dense)
lstm = LSTM(20, return_sequences=True)(reshape)
flatten_lstm = Flatten()(lstm)
# Six sigmoid outputs, one per label column, trained with binary cross-entropy.
output = Dense(units=6, activation='sigmoid')(flatten_lstm)
model = Model(inputs=sequence_input, outputs=output)
model.compile(loss='binary_crossentropy',
          optimizer='adam',
          metrics=['acc'])    
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, 200)          0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 200, 300)     50649300    input_2[0][0]                    
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 200, 300)     50649300    input_2[0][0]                    
__________________________________________________________________________________________________
concatenate_2 (Concatenate)     (None, 200, 600)     0           embedding_1[1][0]                
                                                                 embedding_2[1][0]                
__________

In [16]:
# Base file name for the saved architecture, weights and submission CSV.
# NOTE(review): the name says "cnn", but the model built in this notebook has
# no convolutional layers; confirm whether the name should change.
model_name = "keras_cnn_embeddings-ensemble_model"

In [17]:
from sklearn.metrics import roc_auc_score

In [18]:
class RocAucEvaluation(Callback):
    """Keras callback that prints the ROC-AUC on a held-out set during training.

    Args:
        validation_data: ``(X_val, y_val)`` pair; ``X_val`` is passed to
            ``model.predict`` and ``y_val`` to ``roc_auc_score``.
        interval: the score is computed when the 0-based epoch index is a
            multiple of ``interval``.

    Raises:
        ValueError: if ``validation_data`` cannot be unpacked into two items
            (this includes the empty default).
    """

    def __init__(self, validation_data=(), interval=1):
        # Name this class, not Callback: super(Callback, self) would start the
        # lookup *after* Callback and skip Callback.__init__ entirely.
        super(RocAucEvaluation, self).__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs=None):
        """Print the ROC-AUC of the current model every ``interval`` epochs.

        ``logs`` is accepted for the Keras callback interface and not used;
        it defaults to None to avoid a shared mutable default argument.
        """
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            # epoch is 0-based; report it 1-based like the Keras progress output.
            print("\n ROC-AUC - epoch: %d - score: %.6f \n" % (epoch+1, score))

In [None]:
# The dev split is used both for Keras' own validation metrics and for the
# per-epoch ROC-AUC printout.
dev_data = (x_dev_texts, y_dev)
RocAuc = RocAucEvaluation(validation_data=dev_data, interval=1)
model.fit(
    x_train_texts,
    y_train,
    batch_size=5,
    epochs=15,
    validation_data=dev_data,
    callbacks=[RocAuc],
)

Train on 111699 samples, validate on 47872 samples
Epoch 1/15

 ROC-AUC - epoch: 1 - score: 0.966530 

Epoch 2/15

 ROC-AUC - epoch: 2 - score: 0.962169 

Epoch 3/15

 ROC-AUC - epoch: 3 - score: 0.956504 

Epoch 4/15

 ROC-AUC - epoch: 4 - score: 0.946031 

Epoch 5/15

 ROC-AUC - epoch: 5 - score: 0.944926 

Epoch 6/15

 ROC-AUC - epoch: 6 - score: 0.939071 

Epoch 7/15

 ROC-AUC - epoch: 7 - score: 0.940198 

Epoch 8/15

 ROC-AUC - epoch: 8 - score: 0.939484 

Epoch 9/15

 ROC-AUC - epoch: 9 - score: 0.940458 

Epoch 10/15

 ROC-AUC - epoch: 10 - score: 0.927821 

Epoch 11/15

 ROC-AUC - epoch: 11 - score: 0.925480 

Epoch 12/15

In [None]:
# Save the architecture as JSON and the weights as HDF5; both files are named
# MODEL_PATH + model_name plus the respective extension.
model_json = model.to_json()
with open(MODEL_PATH + model_name + ".json", "w") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5
model.save_weights(MODEL_PATH + model_name + ".h5")
print("Saved model to disk")
# NOTE(review): leftover snippet; `ensemble_cnn` and `check_point_path` are
# not defined anywhere in this notebook.
# ensemble_cnn.load_weights(check_point_path)

In [24]:
# Dev-split loss followed by the 'acc' metric the model was compiled with.
model.evaluate(x_dev_texts, y_dev)



[0.17409613444422264, 0.9758279218393213]

In [25]:
# Sigmoid outputs for the dev split, one column per label.
pred_dev = model.predict(x_dev_texts)

In [26]:
def mean_roc_auc(y_true, y_pred):
    """Return the mean of the column-wise ROC-AUC scores.

    Both arguments are indexed as ``arr[:, col]``, so they must be 2-D
    arrays with one column per label. The individual per-column scores are
    printed before the mean is returned.
    """
    n_labels = y_true.shape[1]
    per_label_auc = [
        metrics.roc_auc_score(y_true[:, col], y_pred[:, col])
        for col in range(n_labels)
    ]
    print(per_label_auc)
    return np.mean(per_label_auc)

In [27]:
# Per-label and mean ROC-AUC of the dev-split predictions.
mean_roc_auc(y_dev, pred_dev)

[0.9194586462700822, 0.9729842416688508, 0.9362496427452032, 0.8994808456313645, 0.9448475024412867, 0.9398699282914871]


0.9354818011747125

In [29]:
import pandas as pd

In [30]:
# Score the official test set and write the predictions as a submission CSV.
X_official_test = pd.read_csv("../data/raw/test.csv")
# The test inputs get their own name so the dev-split array x_dev_texts is not
# overwritten: re-running the evaluation cells after this one must still pair
# dev inputs with dev labels.
x_test_texts = tokenizer.texts_to_sequences(X_official_test['comment_text'])
# Pad with the same max_length used for the training inputs.
x_test_texts = pad_sequences(x_test_texts, maxlen=max_length, padding='post')
pred = model.predict(x_test_texts)
# Column order of the submission file; pred is written into these columns.
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
sample_submission = pd.read_csv('../submissions/sample_submission.csv')
sample_submission[list_classes] = pred
sample_submission.to_csv("../submissions/" + model_name + ".csv", index=False)