In [1]:
import numpy as np
import pandas as pd
np.random.seed(8)
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

from keras.models import Model
from keras.layers import Input, Embedding, Dense, Conv2D, MaxPool2D, Conv1D, MaxPooling1D, GlobalMaxPooling1D 
from keras.layers import Reshape, Flatten, Concatenate, Dropout, SpatialDropout1D
from keras.preprocessing import text, sequence
from keras.callbacks import Callback

import warnings
warnings.filterwarnings('ignore')

import os
os.environ['OMP_NUM_THREADS'] = '4'

# Read the training set, plus the test comments and their separately stored labels.
train = pd.read_csv('train.csv')
test_cm = pd.read_csv('test.csv')
test_lb = pd.read_csv('test_labels.csv')
# Attach the labels to the test comments through the shared 'id' column.
test_all = test_cm.merge(test_lb, on='id')
# Drop test rows whose 'toxic' value is -1
# (presumably rows without a usable label -- confirm against the dataset description).
test = test_all.loc[test_all['toxic'] != -1]

Using TensorFlow backend.


In [3]:
# The raw comment strings are the inputs; the six indicator columns are the targets.
label_cols = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
X_train, y_train = train["comment_text"].values, train[label_cols].values
X_test, y_test = test["comment_text"].values, test[label_cols].values

In [4]:
# Vocabulary size: passed to the tokenizer as num_words and used as the number
# of rows of the embedding matrix.
vocab = 100000
# Length that every encoded comment is padded to by pad_sequences.
maxlen = 200
# Width of one embedding vector (number of columns of the embedding matrix).
embed_size = 300

In [5]:
# Build the word index from the training and the test comments together.
# NOTE(review): including the test text lets test vocabulary influence the
# preprocessing -- confirm this is intentional.
t = text.Tokenizer(num_words=vocab)
t.fit_on_texts(list(X_train) + list(X_test))

In [6]:
# Encode every comment as a list of integer word indices with the fitted tokenizer.
# The encoded data gets its own names instead of overwriting X_train / X_test, so
# this cell (and the tokenizer cell before it) can be re-run without feeding
# already-encoded sequences back into functions that expect raw text.
train_sequences = t.texts_to_sequences(X_train)
test_sequences = t.texts_to_sequences(X_test)
# Bring every sequence to a common length of maxlen.
x_train = sequence.pad_sequences(train_sequences, maxlen=maxlen)
x_test = sequence.pad_sequences(test_sequences, maxlen=maxlen)

In [7]:
EMBEDDING_FILE = 'crawl-300d-2M.vec'
embeddings_index = dict()
# Use a context manager so the handle is closed even if parsing fails half-way,
# and an explicit encoding so the result does not depend on the platform default.
with open(EMBEDDING_FILE, encoding='utf-8') as f:
    for line in f:
        values = line.split()
        # Keep only well-formed "<word> <embed_size numbers>" lines. In particular
        # this skips a "<count> <dim>" header line, which would otherwise be stored
        # as a 1-element vector and later broadcast across a whole embedding row
        # if the tokenizer happens to contain that count as a word.
        if len(values) != embed_size + 1:
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Loaded %s pre-trained words' % len(embeddings_index))


Loaded 2000000 pre-trained words


In [8]:
# Weight matrix for the Embedding layer: row k holds the pre-trained vector of the
# word with tokenizer index k. Rows of words without a pre-trained vector stay zero,
# and words whose index does not fit into the matrix are ignored.
embedding_matrix = np.zeros((vocab, embed_size))
for token, row in t.word_index.items():
    if row < vocab:
        vector = embeddings_index.get(token)
        if vector is not None:
            embedding_matrix[row] = vector

In [9]:
def make_model(kernel, activation, filters, Sdroprate, droprate=0.1):
    """Build and compile a single-branch convolutional text classifier.

    The network reads the module-level ``maxlen``, ``vocab``, ``embed_size`` and
    ``embedding_matrix``. The embedding layer is initialised from
    ``embedding_matrix`` and is not trained.

    Args:
        kernel: Number of consecutive sequence positions covered by one
            convolution filter; must satisfy ``1 <= kernel <= maxlen``.
        activation: Activation passed to the convolution layer.
        filters: Number of convolution filters.
        Sdroprate: Rate of the SpatialDropout1D layer applied to the embeddings.
        droprate: Rate of the Dropout layer in front of the output layer.
            Defaults to 0.1, the value that used to be hard-coded.

    Returns:
        A compiled Keras ``Model`` mapping inputs of shape ``(maxlen,)`` to six
        sigmoid outputs (binary cross-entropy loss, Adam optimizer).

    Raises:
        ValueError: If ``kernel`` is outside ``1..maxlen``.
    """
    if not 1 <= kernel <= maxlen:
        raise ValueError("kernel must be between 1 and maxlen (%d), got %r" % (maxlen, kernel))

    inp = Input(shape=(maxlen, ))
    x = Embedding(vocab, embed_size, weights=[embedding_matrix], input_length=maxlen, trainable=False)(inp)
    x = SpatialDropout1D(Sdroprate)(x)
    # Append a channel axis of size 1 so the embedded sequence can go through Conv2D.
    x = Reshape((maxlen, embed_size, 1))(x)

    # Each filter spans `kernel` positions and the full embedding width, so it only
    # slides along the sequence axis.
    conv_0 = Conv2D(filters, kernel_size=(kernel, embed_size), activation=activation)(x)
    # The pool window covers all maxlen - kernel + 1 positions left by the
    # convolution, i.e. one maximum per filter.
    maxpool_0 = MaxPool2D(pool_size=(maxlen - kernel + 1, 1))(conv_0)
    # An earlier variant concatenated several such branches with fixed kernel sizes
    # (3 to 6); the kernel size is now a parameter explored by the grid search.

    y = Flatten()(maxpool_0)
    y = Dropout(droprate)(y)

    outp = Dense(6, activation="sigmoid")(y)

    model = Model(inputs=inp, outputs=outp)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    return model
#model = make_model()
#model.summary()

In [15]:
# Hold out 10% of the padded training data as a validation set (seeded split).
# NOTE(review): x_trainS / x_val / y_trainS / y_val are not referenced by any of
# the later visible cells -- the grid search is fitted on x_train / y_train.
x_trainS, x_val, y_trainS, y_val = train_test_split(x_train, y_train, train_size=0.90, random_state=8)

In [None]:
class RocAuc(Callback):
    """Keras callback that prints the ROC AUC on the validation data after each epoch."""

    def on_epoch_end(self, epoch, logs=None):
        """Predict on the validation inputs and print the resulting ROC AUC.

        Args:
            epoch: Zero-based epoch index; it is printed one-based.
            logs: Metrics dict handed over by Keras; not used here. Defaults to
                ``None`` rather than a shared mutable ``{}``.
        """
        # NOTE(review): relies on `self.validation_data` being populated by fit();
        # confirm that the installed Keras version still sets this attribute.
        y_pred = self.model.predict(self.validation_data[0], verbose=0)
        score = roc_auc_score(self.validation_data[1], y_pred)
        print("\n roc_auc - epoch: %d - score: %.6f \n" % (epoch+1, score))

In [15]:
from keras.callbacks import ModelCheckpoint
batch_size=256
epochs = 1
checkpoint_path = 'saved_models/weights.best.pre_trained.conv2.testscore.hdf5'
# Make sure the target directory exists before any checkpoint is written to it.
os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)
checkpointer = ModelCheckpoint(filepath=checkpoint_path, verbose=1, save_best_only=True)
# Bind the callback instance to its own name: rebinding `RocAuc` to an instance
# would shadow the class and make this cell fail when it is run a second time.
roc_auc_callback = RocAuc()

In [10]:
from sklearn.model_selection import ShuffleSplit
# One shuffled 90/10 split. The seed is pinned (same value as the other seeds in
# this notebook) so the hold-out set does not depend on how much of the global
# NumPy random state earlier cells have consumed, or on re-running this cell.
cv = ShuffleSplit(n_splits=1, test_size=0.1, random_state=8)

In [11]:
from sklearn.model_selection import GridSearchCV
from keras.wrappers.scikit_learn import KerasClassifier
# Wrap the model builder so scikit-learn can drive it; every fit uses batches of 256.
my_classifier = KerasClassifier(make_model, batch_size=256)

# Only the convolution kernel size is swept (1 to 10); every other setting is
# pinned to a single value.
search_space = {
    'kernel': list(range(1, 11)),
    'activation': ['elu'],
    'epochs': [3],
    'filters': [32],
    'Sdroprate': [0.4],
}
validator = GridSearchCV(
    my_classifier,
    param_grid=search_space,
    cv=cv,
    scoring='roc_auc',
    n_jobs=1,
)

In [12]:
# Run the grid search on the full padded training set; the splits come from `cv`.
grid_result = validator.fit(x_train, y_train)

Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [13]:
# Report the winning parameter set, followed by the mean and standard deviation
# of the validation score for every candidate.
print('The parameters of the best model are: ')
print(grid_result.best_params_)
cv_results = grid_result.cv_results_
means = cv_results['mean_test_score']
stds = cv_results['std_test_score']
params = cv_results['params']
for row in zip(means, stds, params):
    print("%f (%f) with: %r" % row)

The parameters of the best model are: 
{'Sdroprate': 0.4, 'activation': 'elu', 'epochs': 3, 'filters': 32, 'kernel': 7}
0.981575 (0.000000) with: {'Sdroprate': 0.4, 'activation': 'elu', 'epochs': 3, 'filters': 32, 'kernel': 1}
0.982148 (0.000000) with: {'Sdroprate': 0.4, 'activation': 'elu', 'epochs': 3, 'filters': 32, 'kernel': 2}
0.983750 (0.000000) with: {'Sdroprate': 0.4, 'activation': 'elu', 'epochs': 3, 'filters': 32, 'kernel': 3}
0.983134 (0.000000) with: {'Sdroprate': 0.4, 'activation': 'elu', 'epochs': 3, 'filters': 32, 'kernel': 4}
0.982115 (0.000000) with: {'Sdroprate': 0.4, 'activation': 'elu', 'epochs': 3, 'filters': 32, 'kernel': 5}
0.984456 (0.000000) with: {'Sdroprate': 0.4, 'activation': 'elu', 'epochs': 3, 'filters': 32, 'kernel': 6}
0.984867 (0.000000) with: {'Sdroprate': 0.4, 'activation': 'elu', 'epochs': 3, 'filters': 32, 'kernel': 7}
0.983697 (0.000000) with: {'Sdroprate': 0.4, 'activation': 'elu', 'epochs': 3, 'filters': 32, 'kernel': 8}
0.983629 (0.000000) with

In [15]:
# Summary: best score and parameters first, then one line per candidate.
results = grid_result.cv_results_
means, stds, params = results['mean_test_score'], results['std_test_score'], results['params']
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
for idx, param in enumerate(params):
    print("%f (%f) with: %r" % (means[idx], stds[idx], param))

Best: 0.984867 using {'Sdroprate': 0.4, 'activation': 'elu', 'epochs': 3, 'filters': 32, 'kernel': 7}
0.981575 (0.000000) with: {'Sdroprate': 0.4, 'activation': 'elu', 'epochs': 3, 'filters': 32, 'kernel': 1}
0.982148 (0.000000) with: {'Sdroprate': 0.4, 'activation': 'elu', 'epochs': 3, 'filters': 32, 'kernel': 2}
0.983750 (0.000000) with: {'Sdroprate': 0.4, 'activation': 'elu', 'epochs': 3, 'filters': 32, 'kernel': 3}
0.983134 (0.000000) with: {'Sdroprate': 0.4, 'activation': 'elu', 'epochs': 3, 'filters': 32, 'kernel': 4}
0.982115 (0.000000) with: {'Sdroprate': 0.4, 'activation': 'elu', 'epochs': 3, 'filters': 32, 'kernel': 5}
0.984456 (0.000000) with: {'Sdroprate': 0.4, 'activation': 'elu', 'epochs': 3, 'filters': 32, 'kernel': 6}
0.984867 (0.000000) with: {'Sdroprate': 0.4, 'activation': 'elu', 'epochs': 3, 'filters': 32, 'kernel': 7}
0.983697 (0.000000) with: {'Sdroprate': 0.4, 'activation': 'elu', 'epochs': 3, 'filters': 32, 'kernel': 8}
0.983629 (0.000000) with: {'Sdroprate': 0.

In [17]:
# Take the estimator selected by the grid search and unwrap it:
# validator.best_estimator_.model returns the (unwrapped) keras model
best_model = validator.best_estimator_.model

In [18]:
# Score the padded test comments with the selected model, 256 samples per batch.
y_pred = best_model.predict(x_test, batch_size=256)

In [19]:
# ROC AUC of the test predictions against the six test label columns.
score = roc_auc_score(y_test, y_pred)
print("\n roc_auc score: %.6f \n" % (score))


 roc_auc score: 0.973500 



In [20]:
# Print the layer-by-layer architecture of the selected model.
best_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_11 (InputLayer)        (None, 200)               0         
_________________________________________________________________
embedding_11 (Embedding)     (None, 200, 300)          30000000  
_________________________________________________________________
spatial_dropout1d_11 (Spatia (None, 200, 300)          0         
_________________________________________________________________
reshape_11 (Reshape)         (None, 200, 300, 1)       0         
_________________________________________________________________
conv2d_11 (Conv2D)           (None, 194, 1, 32)        67232     
_________________________________________________________________
max_pooling2d_11 (MaxPooling (None, 1, 1, 32)          0         
_________________________________________________________________
flatten_11 (Flatten)         (None, 32)                0         
__________