In [1]:
import logging
import pickle

# Third-party imports keep their original relative order because the wildcard
# imports below can rebind names.
from sklearn.metrics import roc_auc_score
from sklearn import metrics
from sklearn.metrics import f1_score
import keras
from keras.models import Model, Sequential
from keras.layers import *
from keras.preprocessing.text import one_hot
from keras.optimizers import Adam
from tqdm import tnrange, tqdm_notebook
from keras.layers.advanced_activations import LeakyReLU
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from sklearn.utils import class_weight
from keras.callbacks import *
import gensim
# `np` is used later in the notebook; import it explicitly instead of relying on
# it leaking in through one of the wildcard imports above.
import numpy as np
import tensorflow as tf
import pandas as pd
from keras.backend.tensorflow_backend import set_session
# Cap TensorFlow's GPU allocation for this process at a 0.2 fraction of GPU
# memory, then register the configured session as the Keras backend session.
config = tf.ConfigProto(
    gpu_options=tf.GPUOptions(per_process_gpu_memory_fraction=0.2)
)
set_session(tf.Session(config=config))

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Base name used for the saved architecture/weights files and the submission CSV.
model_name = "keras_cnn_model_new"

In [3]:
# Directory prefix (including the trailing slash) under which model files are written.
MODEL_PATH = "../models/"

In [4]:
# Directory prefix (including the trailing slash) holding the pickled train/dev splits.
DATA_PATH = "../data/"


def _load_pickle(file_name):
    """Unpickle ``DATA_PATH + file_name`` and return the stored object.

    The file is opened in a ``with`` block so its handle is closed as soon as
    loading finishes (or fails), instead of being left open.
    """
    # NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
    with open(DATA_PATH + file_name, "rb") as pickle_file:
        return pickle.load(pickle_file)


X_train = _load_pickle("X_train.p")
X_dev = _load_pickle("X_dev.p")
y_train = _load_pickle("y_train.p")
y_dev = _load_pickle("y_dev.p")

In [5]:
# Show the shape of the training labels as the cell output.
y_train.shape

(111699, 6)

In [6]:
# Comment text of the training split; the tokenizer is fitted on this column.
train_text = X_train['comment_text']

In [7]:
# Fit a Keras tokenizer on the training comments; num_words=18400 caps the
# vocabulary that texts_to_sequences will emit.
tokenizer = Tokenizer(num_words=18400)
tokenizer.fit_on_texts(train_text)
# NOTE(review): `sequences` is not referenced by any later cell shown here; the
# padded model inputs are rebuilt from X_train['comment_text'] in a later cell.
sequences = tokenizer.texts_to_sequences(train_text)

In [8]:
# Word -> integer index mapping learned by the tokenizer; used to build the embedding matrix.
word_index = tokenizer.word_index

In [9]:
# Load pretrained word2vec vectors (binary word2vec format) from a path relative to the notebook.
embeddings_index_w2v = gensim.models.KeyedVectors.load_word2vec_format('../../../embeddings/GoogleNews-vectors-negative300.bin', binary = True)

In [10]:
# Fixed length (in tokens) of every model input sequence.
max_length = 200


def _to_padded_sequences(texts):
    """Encode `texts` with the fitted tokenizer and pad each sequence at the end to `max_length`."""
    token_ids = tokenizer.texts_to_sequences(texts)
    return pad_sequences(token_ids, maxlen=max_length, padding='post')


x_train_texts = _to_padded_sequences(X_train['comment_text'])
x_dev_texts = _to_padded_sequences(X_dev['comment_text'])

In [11]:
print('Found %s word vectors.' % len(embeddings_index_w2v.vocab))

# Dimensionality of the pretrained vectors; shared by the matrix and the layer below.
embedding_size = 300

# Row i holds the pretrained vector for the word whose tokenizer index is i.
# The "+ 1" adds one extra row at index 0, which no word fills here
# (presumably the padding index -- confirm against the tokenizer).
embedding_matrix = np.zeros((len(word_index) + 1, embedding_size))

# Words without a pretrained vector; their rows stay all-zeros.
oov = []
for word, i in word_index.items():
    # Explicit membership test instead of a blanket `except Exception`, so that
    # genuine errors (e.g. a vector of the wrong dimensionality) are not silently
    # counted as out-of-vocabulary words.
    if word in embeddings_index_w2v:
        embedding_matrix[i] = embeddings_index_w2v[word]
    else:
        oov.append(word)

# Frozen embedding layer initialised from the pretrained matrix. input_length is
# tied to max_length so it cannot drift from the padded sequence length.
embedding_layer = Embedding(embedding_matrix.shape[0],
                            embedding_size,
                            weights=[embedding_matrix],
                            input_length=max_length, trainable=False)

Found 3000000 word vectors.


NameError: name 'embeddings_index_fasttext' is not defined

In [13]:
# Number of distinct tokenizer words for which no pretrained vector was found.
len(set(oov))

109501

In [15]:
class RocAucEvaluation(Callback):
    """Keras callback that prints the ROC-AUC on held-out data during training.

    Args:
        validation_data: ``(X_val, y_val)`` pair that is scored. The default
            ``()`` cannot be unpacked into two values, so a pair must be
            supplied in practice.
        interval: the score is computed on epochs whose zero-based index is a
            multiple of this value.
    """

    def __init__(self, validation_data=(), interval=1):
        # `super(Callback, self)` would start the lookup *after* Callback and
        # therefore skip Callback.__init__; name this class so the base
        # initialiser actually runs.
        super(RocAucEvaluation, self).__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs=None):
        """Predict on the held-out inputs and print their ROC-AUC.

        Args:
            epoch: zero-based epoch index passed by Keras.
            logs: metrics dict passed by Keras; not used here. Defaults to
                None rather than a shared mutable ``{}``.
        """
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            print("\n ROC-AUC - epoch: %d - score: %.6f \n" % (epoch+1, score))

In [23]:
# --- Hyperparameters of the multi-branch text CNN ---------------------------
# Tied to the constants used to pad the inputs and build `embedding_layer`,
# so the Reshape below cannot drift out of sync with them.
sequence_length = max_length
embedding_dim = embedding_size
# Number of pretrained word2vec vectors; informational only, not used by the model.
vocabulary_size = len(embeddings_index_w2v.vocab)
filter_sizes = [3,4,5]
num_filters = 512
drop = 0.5

# NOTE(review): the model.fit call later in the notebook passes its own literal
# epochs/batch_size values and does not read these two.
epochs = 100
batch_size = 30

print("Creating Model...")
# A single input feeds the shared embedding layer. The earlier extra
# Input/embedding call was never connected to the model and only added an
# orphan node to the layer, so it has been dropped.
inputs = Input(shape=(sequence_length,), dtype='int32')
embedding = embedding_layer(inputs)
# Add a trailing channel axis so the embedded sequence can be fed to Conv2D.
reshape = Reshape((sequence_length,embedding_dim,1))(embedding)

# One convolution per filter size; each kernel spans the full embedding width.
conv_outputs = [
    Conv2D(num_filters, kernel_size=(size, embedding_dim), padding='valid',
           kernel_initializer='normal', activation='relu')(reshape)
    for size in filter_sizes
]

# Max-pool each branch over its whole remaining sequence axis.
pooled_outputs = [
    MaxPool2D(pool_size=(sequence_length - size + 1, 1), strides=(1,1), padding='valid')(conv)
    for size, conv in zip(filter_sizes, conv_outputs)
]

concatenated_tensor = Concatenate(axis=1)(pooled_outputs)
flatten = Flatten()(concatenated_tensor)
dropout = Dropout(drop)(flatten)
# Six independent sigmoid outputs, one per label column.
output = Dense(units=6, activation='sigmoid')(dropout)

model = Model(inputs=inputs, outputs=output)

Creating Model...


In [24]:
# Compile with binary cross-entropy and the Adam optimizer, tracking accuracy,
# then print the layer-by-layer summary.
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_8 (InputLayer)            (None, 200)          0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 200, 300)     50649300    input_8[0][0]                    
__________________________________________________________________________________________________
reshape_3 (Reshape)             (None, 200, 300, 1)  0           embedding_1[6][0]                
__________________________________________________________________________________________________
conv2d_4 (Conv2D)               (None, 198, 1, 512)  461312      reshape_3[0][0]                  
__________________________________________________________________________________________________
conv2d_5 (

In [25]:
# Train on the padded training sequences, validating on the dev split and
# printing the dev ROC-AUC after every epoch via the callback.
# NOTE(review): the recorded run was interrupted during epoch 10; its logged dev
# ROC-AUC was highest after epoch 1 (0.964135) and mostly declined afterwards.
RocAuc = RocAucEvaluation(validation_data=(x_dev_texts, y_dev), interval=1)
model.fit(x_train_texts, y_train, validation_data=(x_dev_texts, y_dev),
              epochs=15, batch_size=5, callbacks=[RocAuc])

Train on 111699 samples, validate on 47872 samples
Epoch 1/15

 ROC-AUC - epoch: 1 - score: 0.964135 

Epoch 2/15

 ROC-AUC - epoch: 2 - score: 0.948982 

Epoch 3/15

 ROC-AUC - epoch: 3 - score: 0.947544 

Epoch 4/15

 ROC-AUC - epoch: 4 - score: 0.942824 

Epoch 5/15

 ROC-AUC - epoch: 5 - score: 0.945325 

Epoch 6/15

 ROC-AUC - epoch: 6 - score: 0.941825 

Epoch 7/15

 ROC-AUC - epoch: 7 - score: 0.935729 

Epoch 8/15

 ROC-AUC - epoch: 8 - score: 0.935964 

Epoch 9/15

 ROC-AUC - epoch: 9 - score: 0.932680 

Epoch 10/15
 22370/111699 [=====>........................] - ETA: 3:17 - loss: 0.0444 - acc: 0.9872

KeyboardInterrupt: 

In [None]:
# NOTE(review): this cell has no execution count and cannot run on a fresh kernel
# as written: `model_type` and `vocabulary_inv` are not defined in any cell shown
# in this notebook. It also rebinds `model`, `embedding_dim`, `filter_sizes`,
# `num_filters`, `batch_size` and `sequence_length` from the earlier CNN cell, and
# its output layer has 1 unit whereas the earlier model has 6. Confirm whether
# this cell should be kept.

# Model Hyperparameters
embedding_dim = 50
filter_sizes = (3, 8)
num_filters = 10
dropout_prob = (0.5, 0.8)
hidden_dims = 50

# Training parameters
batch_size = 64
num_epochs = 10

# Preprocessing parameters
sequence_length = 400
max_words = 5000

# Word2Vec parameters (see train_word2vec)
min_word_count = 1
context = 10

# Build model
# The "CNN-static" variant takes already-embedded input; any other value takes token ids.
if model_type == "CNN-static":
    input_shape = (sequence_length, embedding_dim)
else:
    input_shape = (sequence_length,)

model_input = Input(shape=input_shape)

# Static model does not have embedding layer
if model_type == "CNN-static":
    z = model_input
else:
    z = Embedding(len(vocabulary_inv), embedding_dim, input_length=sequence_length, name="embedding")(model_input)

z = Dropout(dropout_prob[0])(z)

# Convolutional block: one Conv1D -> MaxPooling1D -> Flatten branch per filter size
conv_blocks = []
for sz in filter_sizes:
    conv = Convolution1D(filters=num_filters,
                         kernel_size=sz,
                         padding="valid",
                         activation="relu",
                         strides=1)(z)
    conv = MaxPooling1D(pool_size=2)(conv)
    conv = Flatten()(conv)
    conv_blocks.append(conv)
# Concatenate needs at least two inputs, so a single branch is used directly.
z = Concatenate()(conv_blocks) if len(conv_blocks) > 1 else conv_blocks[0]

z = Dropout(dropout_prob[1])(z)
z = Dense(hidden_dims, activation="relu")(z)
model_output = Dense(1, activation="sigmoid")(z)

model = Model(model_input, model_output)
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])


In [None]:
# Persist the current model as two files named after `model_name`:
# the architecture as JSON and the weights as HDF5.
architecture_path = MODEL_PATH + model_name + ".json"
weights_path = MODEL_PATH + model_name + ".h5"

model_json = model.to_json()
with open(architecture_path, "w") as json_file:
    json_file.write(model_json)

model.save_weights(weights_path)
print("Saved model to disk")

In [None]:
# Evaluate the current `model` on the dev split; the result is shown as the cell output.
model.evaluate(x_dev_texts, y_dev)

In [None]:
# Model predictions for the dev split, scored by mean_roc_auc in a later cell.
pred_dev = model.predict(x_dev_texts)

In [None]:
def mean_roc_auc(y_true, y_pred):
    """Print the per-column ROC-AUC scores and return their mean.

    Args:
        y_true: 2-D array-like of true labels, one column per label.
        y_pred: 2-D array-like of predicted scores with the same layout.

    Returns:
        The mean of the column-wise ROC-AUC scores.
    """
    # Coerce to ndarrays so positional `[:, i]` column slicing also works when a
    # DataFrame or nested list is passed in.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    roc_auc_scores = [
        metrics.roc_auc_score(y_true[:, i], y_pred[:, i])
        for i in range(y_true.shape[1])
    ]
    print(roc_auc_scores)
    return np.mean(roc_auc_scores)

In [None]:
# Mean column-wise ROC-AUC of the dev predictions (per-column scores are printed too).
mean_roc_auc(y_dev, pred_dev)

In [None]:
# Score the official test set and write the submission file.
X_official_test = pd.read_csv("../data/raw/test.csv")
# read_csv turns empty / NA-like fields into NaN (a float), which the tokenizer
# cannot process as text; replace those with an empty string first.
official_test_text = X_official_test['comment_text'].fillna("")
# Use dedicated test-set names instead of overwriting `x_dev_texts`, so the
# dev-set evaluation cells still see dev data if they are re-run afterwards.
x_test_texts = tokenizer.texts_to_sequences(official_test_text)
x_test_texts = pad_sequences(x_test_texts, maxlen=max_length, padding='post')
pred = model.predict(x_test_texts)
# Label columns, in the order the predictions are written to the submission.
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
sample_submission = pd.read_csv('../submissions/sample_submission.csv')
sample_submission[list_classes] = pred
sample_submission.to_csv("../submissions/" + model_name + ".csv", index=False)