In [19]:
import numpy as np
import pandas as pd
np.random.seed(8)
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

from keras.models import Model
from keras.layers import Input, Embedding, Dense, Conv2D, MaxPool2D, Conv1D, MaxPooling1D, GlobalMaxPooling1D 
from keras.layers import Reshape, Flatten, Concatenate, Dropout, SpatialDropout1D
from keras.preprocessing import text, sequence
from keras.callbacks import Callback

import warnings
warnings.filterwarnings('ignore')

# Read the training set, the test comments and the separately stored test labels.
train = pd.read_csv('train.csv')
test_cm = pd.read_csv('test.csv')
test_lb = pd.read_csv('test_labels.csv')

# Join comments with their labels on the shared id column, then keep only the
# rows whose 'toxic' label is not -1 (presumably -1 marks unlabelled rows — confirm).
test_all = test_cm.merge(test_lb, on='id')
has_label = test_all['toxic'] != -1
test = test_all[has_label]

In [20]:
# Dimensions (rows, columns) of the training frame.
train.shape

(159571, 8)

In [21]:
# Per-column dtype and non-null count, plus the frame's memory usage.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159571 entries, 0 to 159570
Data columns (total 8 columns):
id               159571 non-null object
comment_text     159571 non-null object
toxic            159571 non-null int64
severe_toxic     159571 non-null int64
obscene          159571 non-null int64
threat           159571 non-null int64
insult           159571 non-null int64
identity_hate    159571 non-null int64
dtypes: int64(6), object(2)
memory usage: 9.7+ MB


In [22]:
# Number of missing values in each column of the training frame.
train.isnull().sum()

id               0
comment_text     0
toxic            0
severe_toxic     0
obscene          0
threat           0
insult           0
identity_hate    0
dtype: int64

In [23]:
# The raw comment text is the model input; the six label columns form the
# multi-label target, extracted identically for the train and test frames.
label_columns = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
X_train, y_train = train["comment_text"].values, train[label_columns].values
X_test, y_test = test["comment_text"].values, test[label_columns].values

In [24]:
# Sizes shared by the tokenizer, the padding step and the embedding layer.
# vocab:      passed as num_words to the Tokenizer and used as the row count of embedding_matrix.
# maxlen:     length every token sequence is padded/truncated to (also the model input length).
# embed_size: width of each embedding vector, i.e. the column count of embedding_matrix.
vocab = 100000
maxlen = 200
embed_size = 300

In [25]:
# Build the word index from the train AND test comments combined.
# NOTE(review): fitting on the test text lets test vocabulary influence
# preprocessing — confirm this is intended for the evaluation being reported.
t = text.Tokenizer(num_words=vocab)
t.fit_on_texts(list(X_train) + list(X_test))

In [26]:
# Convert every comment into a list of word indices, then pad/truncate each
# list to exactly maxlen entries.
# The raw text arrays X_train / X_test are intentionally NOT overwritten:
# reusing those names for the integer sequences made this cell (and the
# tokenizer-fitting cell) operate on the wrong data when re-run.
train_sequences = t.texts_to_sequences(X_train)
test_sequences = t.texts_to_sequences(X_test)
x_train = sequence.pad_sequences(train_sequences, maxlen=maxlen)
x_test = sequence.pad_sequences(test_sequences, maxlen=maxlen)

In [27]:
EMBEDDING_FILE = 'crawl-300d-2M.vec'
embeddings_index = dict()
# Parse the text-format vector file into {word: float32 vector}.
# - The context manager guarantees the file is closed even if parsing fails.
# - The encoding is given explicitly so the result does not depend on the
#   platform's default encoding.
# - Splitting from the right keeps a token that itself contains whitespace
#   intact instead of shifting its pieces into the numeric columns.
with open(EMBEDDING_FILE, encoding='utf-8') as f:
    for line in f:
        values = line.rstrip().rsplit(' ', embed_size)
        if len(values) != embed_size + 1:
            # Skip the "<count> <dim>" header line and any blank or malformed
            # line: storing them would put wrong-length vectors in the index.
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Loaded %s pre-trained words' % len(embeddings_index))


Loaded 2000000 pre-trained words


In [28]:
# create a weight matrix for words in training docs
# Row i holds the pre-trained vector of the word with tokenizer index i; rows
# for words without a usable pre-trained vector stay all-zero.
embedding_matrix = np.zeros((vocab, embed_size))
for word, i in t.word_index.items():
    if i >= vocab:
        # Index falls outside the matrix (word is beyond the kept vocabulary).
        continue
    embedding_vector = embeddings_index.get(word)
    # Copy only well-formed vectors. A wrong-length entry (e.g. a file header
    # parsed as a word with a 1-element vector) would otherwise be silently
    # broadcast across the whole row, or raise for other lengths.
    if embedding_vector is not None and embedding_vector.shape == (embed_size,):
        embedding_matrix[i] = embedding_vector

In [29]:
def make_model(activation, filters, Sdroprate, droprate, layers):
    """Build and compile a multi-branch text CNN on top of frozen pre-trained embeddings.

    The padded token ids are embedded with the non-trainable ``embedding_matrix``,
    passed through spatial dropout and reshaped into a single-channel 2-D map.
    Several parallel ``Conv2D`` branches are then applied to that map. Every
    branch spans the full embedding width; branch ``k`` covers ``k`` consecutive
    tokens (k = 2, 3, 4, ...) and is followed by a max-pool whose window is
    ``maxlen - k + 1`` rows tall. The pooled branches are concatenated,
    flattened, passed through dropout and fed to a 6-unit sigmoid output layer.

    Parameters
    ----------
    activation : activation passed to every convolution branch.
    filters : number of filters in every convolution branch.
    Sdroprate : rate of the ``SpatialDropout1D`` applied to the embeddings.
    droprate : rate of the ``Dropout`` applied before the output layer.
    layers : number of parallel convolution branches. Values 2, 3, 4 and 5
        build exactly that many branches; ANY other value (including 1) builds
        six branches with kernel heights 2 through 7.

    Returns
    -------
    A Keras ``Model`` compiled with binary cross-entropy loss, the Adam
    optimizer and the accuracy metric.

    Notes
    -----
    Reads the module-level ``maxlen``, ``vocab``, ``embed_size`` and
    ``embedding_matrix``.
    """
    # Resolve the branch count with the same equality tests the explicit
    # if/elif chain used; everything that is not 2..5 falls back to six.
    n_branches = next((count for count in (2, 3, 4, 5) if layers == count), 6)

    inp = Input(shape=(maxlen, ))
    x = Embedding(vocab, embed_size, weights=[embedding_matrix], input_length=maxlen, trainable=False)(inp)
    x = SpatialDropout1D(Sdroprate)(x)
    # Add a trailing channel axis so the sequence can be fed to Conv2D.
    x = Reshape((maxlen, embed_size, 1))(x)

    # One conv + max-pool branch per kernel height, created in ascending order
    # (conv, pool, conv, pool, ...) exactly as the unrolled version did.
    pooled_branches = []
    for kernel_height in range(2, n_branches + 2):
        conv = Conv2D(filters, kernel_size=(kernel_height, embed_size), activation=activation)(x)
        pooled = MaxPool2D(pool_size=(maxlen - kernel_height + 1, 1))(conv)
        pooled_branches.append(pooled)

    y = Concatenate(axis=1)(pooled_branches)
    y = Flatten()(y)
    y = Dropout(droprate)(y)

    # Six independent sigmoid outputs: one per label column.
    outp = Dense(6, activation="sigmoid")(y)

    model = Model(inputs=inp, outputs=outp)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    return model
#model = make_model()
#model.summary()

In [30]:
# Validation scheme for the grid search: a single shuffled split that holds out
# 10% of the rows (n_splits=1, so every parameter setting is scored on one fold only).
# NOTE(review): no random_state is passed here; presumably the split relies on the
# np.random.seed(8) call in the first cell for reproducibility — confirm.
from sklearn.model_selection import ShuffleSplit
cv = ShuffleSplit(n_splits = 1, test_size = 0.1)

In [31]:
from sklearn.model_selection import GridSearchCV
from keras.wrappers.scikit_learn import KerasClassifier

# Wrap the Keras model builder so scikit-learn can drive it as an estimator.
my_classifier = KerasClassifier(make_model, batch_size=256)

# Search space: only the two dropout rates vary (5 x 5 = 25 combinations);
# every other hyper-parameter is pinned to a single value.
param_grid = {
    'activation': ['tanh'],
    'epochs': [5],
    'filters': [32],
    'Sdroprate': [0.1, 0.2, 0.3, 0.4, 0.5],
    'droprate': [0.1, 0.2, 0.3, 0.4, 0.5],
    'layers': [4],
}

# Each combination is scored by ROC AUC on the split(s) produced by `cv`.
validator = GridSearchCV(
    my_classifier,
    param_grid=param_grid,
    cv=cv,
    scoring='roc_auc',
    n_jobs=1,
)

In [32]:
# Run the grid search on the padded training sequences and their labels.
# The returned object is kept as grid_result and read below for
# best_score_, best_params_ and cv_results_.
grid_result = validator.fit(x_train, y_train)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [33]:
# Headline: the best validation score and the parameters that produced it.
print("Best score: %f with %s" % (grid_result.best_score_, grid_result.best_params_))

# One line per parameter combination: mean score, score std-dev, parameters.
cv_results = grid_result.cv_results_
means, stds, params = (
    cv_results[key] for key in ('mean_test_score', 'std_test_score', 'params')
)
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

# validator.best_estimator_.model returns the (unwrapped) keras model
best_model = validator.best_estimator_.model

Best score: 0.989065 with {'Sdroprate': 0.3, 'activation': 'tanh', 'droprate': 0.1, 'epochs': 5, 'filters': 32, 'layers': 4}
0.988599 (0.000000) with: {'Sdroprate': 0.1, 'activation': 'tanh', 'droprate': 0.1, 'epochs': 5, 'filters': 32, 'layers': 4}
0.988385 (0.000000) with: {'Sdroprate': 0.1, 'activation': 'tanh', 'droprate': 0.2, 'epochs': 5, 'filters': 32, 'layers': 4}
0.988555 (0.000000) with: {'Sdroprate': 0.1, 'activation': 'tanh', 'droprate': 0.3, 'epochs': 5, 'filters': 32, 'layers': 4}
0.988490 (0.000000) with: {'Sdroprate': 0.1, 'activation': 'tanh', 'droprate': 0.4, 'epochs': 5, 'filters': 32, 'layers': 4}
0.988127 (0.000000) with: {'Sdroprate': 0.1, 'activation': 'tanh', 'droprate': 0.5, 'epochs': 5, 'filters': 32, 'layers': 4}
0.988205 (0.000000) with: {'Sdroprate': 0.2, 'activation': 'tanh', 'droprate': 0.1, 'epochs': 5, 'filters': 32, 'layers': 4}
0.988847 (0.000000) with: {'Sdroprate': 0.2, 'activation': 'tanh', 'droprate': 0.2, 'epochs': 5, 'filters': 32, 'layers': 4}


In [34]:
# Score the padded test sequences with the best model found by the grid search.
y_pred = best_model.predict(x_test, batch_size=256)

In [35]:
# ROC AUC of the best model's predictions against the labelled test rows.
# roc_auc_score is already imported in the notebook's first cell, so the
# duplicate import that used to sit here has been dropped.
score = roc_auc_score(y_test, y_pred)
print("\n roc_auc score for keras model: %.6f \n" % (score))


 roc_auc score for keras model: 0.982126 



In [36]:
# Print the layer-by-layer architecture and parameter counts of the selected model.
best_model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_36 (InputLayer)           (None, 200)          0                                            
__________________________________________________________________________________________________
embedding_36 (Embedding)        (None, 200, 300)     30000000    input_36[0][0]                   
__________________________________________________________________________________________________
spatial_dropout1d_36 (SpatialDr (None, 200, 300)     0           embedding_36[0][0]               
__________________________________________________________________________________________________
reshape_36 (Reshape)            (None, 200, 300, 1)  0           spatial_dropout1d_36[0][0]       
__________________________________________________________________________________________________
conv2d_141