In [1]:
import numpy as np
import pandas as pd
np.random.seed(8)
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

from keras.models import Model
from keras.layers import Input, Embedding, Dense, Conv2D, MaxPool2D, Conv1D, MaxPooling1D, GlobalMaxPooling1D 
from keras.layers import Reshape, Flatten, Concatenate, Dropout, SpatialDropout1D
from keras.preprocessing import text, sequence
from keras.callbacks import Callback

import warnings
warnings.filterwarnings('ignore')

# Labelled training comments.
train = pd.read_csv('train.csv')

# Test comments and their labels live in separate files; join them on the shared `id` column.
test_cm = pd.read_csv('test.csv')
test_lb = pd.read_csv('test_labels.csv')
test_all = test_cm.merge(test_lb, on='id')

# Drop test rows whose `toxic` label is -1
# (presumably unscored placeholder rows -- confirm against the dataset description).
test = test_all.loc[test_all['toxic'] != -1]

Using TensorFlow backend.


In [2]:
# The six binary target columns, shared by the train and test frames.
label_cols = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

# Lower-cased raw comment text is the model input; labels become (n_samples, 6) arrays.
X_train = train["comment_text"].str.lower()
X_test = test["comment_text"].str.lower()
y_train = train[label_cols].values
y_test = test[label_cols].values

In [3]:
# Shared text-pipeline settings used by the tokenizer, padding, embedding matrix and model.
vocab = 100000      # passed as the tokenizer's num_words and used as the embedding table's row count
maxlen = 200        # every encoded comment is padded/truncated to this many tokens
embed_size = 300    # width of each embedding row; presumably matches the 300-d pre-trained vectors

In [4]:
# Build the word index over the train and test comments together.
# NOTE(review): test text therefore influences the vocabulary -- confirm this is intended.
t = text.Tokenizer(num_words=vocab)
corpus = [*X_train, *X_test]
t.fit_on_texts(corpus)

In [5]:
# Encode the comments as integer sequences under new names instead of overwriting
# X_train / X_test: the raw text stays available, so this cell (and the tokenizer
# cell before it) can be re-run without feeding already-encoded data back in.
train_seqs = t.texts_to_sequences(X_train)
test_seqs = t.texts_to_sequences(X_test)

# Pad/truncate every sequence to exactly `maxlen` tokens so they stack into 2-D arrays.
x_train = sequence.pad_sequences(train_seqs, maxlen=maxlen)
x_test = sequence.pad_sequences(test_seqs, maxlen=maxlen)

In [6]:
# Read the pre-trained word vectors into a {word: float32 vector} dictionary.
EMBEDDING_FILE = 'crawl-300d-2M.vec'
embeddings_index = dict()
# `with` closes the handle even if a line fails to parse; the explicit encoding keeps
# the read independent of the platform's default locale.
with open(EMBEDDING_FILE, encoding='utf-8') as f:
    for line in f:
        values = line.split()
        # Skip blank lines and the "<count> <dim>" header that .vec files start with.
        # Storing the header would create a bogus one-element vector that numpy would
        # silently broadcast across an entire embedding-matrix row.
        if len(values) <= 2:
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Loaded %s pre-trained words' % len(embeddings_index))


Loaded 2000000 pre-trained words


In [7]:
# Fill a (vocab, embed_size) weight matrix: row i holds the pre-trained vector of the
# word with tokenizer index i. Words beyond the vocabulary cap, or without a
# pre-trained vector, keep an all-zero row.
embedding_matrix = np.zeros((vocab, embed_size))
for word, i in t.word_index.items():
    if i < vocab:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

In [8]:
def make_model(k=[2,3,5,7], activation='tanh', filters=32, Sdroprate=0.4, droprate=0.0):
    """Build and compile a multi-kernel text CNN over frozen pre-trained embeddings.

    Reads the module-level `maxlen`, `vocab`, `embed_size` and `embedding_matrix`.

    Parameters
    ----------
    k : iterable of int
        Kernel heights (number of consecutive tokens each convolution spans).
        One Conv2D + MaxPool2D branch is built per distinct height.
    activation : str
        Activation used by every convolution branch.
    filters : int
        Number of filters per convolution branch.
    Sdroprate : float
        Rate of the SpatialDropout1D applied to the embedded sequence.
    droprate : float
        Rate of the Dropout applied to the concatenated pooled features.

    Returns
    -------
    Model
        Compiled Keras model with six sigmoid outputs, binary cross-entropy loss,
        the Adam optimizer and an accuracy metric.
    """
    tokens = Input(shape=(maxlen, ))
    embedded = Embedding(vocab, embed_size, weights=[embedding_matrix],
                         input_length=maxlen, trainable=False)(tokens)
    embedded = SpatialDropout1D(Sdroprate)(embedded)
    # Add a trailing channel axis so Conv2D can slide kernels spanning the full embedding width.
    grid = Reshape((maxlen, embed_size, 1))(embedded)

    pooled_by_height = {}
    for height in k:
        feature_map = Conv2D(filters, kernel_size=(height, embed_size), activation=activation)(grid)
        # The pool window covers every remaining position, i.e. a max over the whole sequence.
        pooled_by_height[height] = MaxPool2D(pool_size=(maxlen - height + 1, 1))(feature_map)

    merged = Concatenate(axis=1)(list(pooled_by_height.values()))
    merged = Flatten()(merged)
    merged = Dropout(droprate)(merged)

    probs = Dense(6, activation="sigmoid")(merged)

    model = Model(inputs=tokens, outputs=probs)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    return model
# Build the network with the default hyper-parameters and print its layer table.
model = make_model()
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 200)          0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 200, 300)     30000000    input_1[0][0]                    
__________________________________________________________________________________________________
spatial_dropout1d_1 (SpatialDro (None, 200, 300)     0           embedding_1[0][0]                
__________________________________________________________________________________________________
reshape_1 (Reshape)             (None, 200, 300, 1)  0           spatial_dropout1d_1[0][0]        
__________________________________________________________________________________________________
conv2d_1 (

In [9]:
class RocAuc(Callback):
    """Callback that prints the validation ROC AUC at the end of every epoch.

    Reads the inputs and targets from `self.validation_data[0]` and
    `self.validation_data[1]`.
    NOTE(review): this relies on the framework populating `self.validation_data`
    -- confirm the installed Keras version still does so.
    """

    def on_epoch_end(self, epoch, logs={}):
        """Score the current model on the validation data and print the result."""
        predictions = self.model.predict(self.validation_data[0], verbose=0)
        targets = self.validation_data[1]
        auc = roc_auc_score(targets, predictions)
        # `epoch` is zero-based; report it one-based.
        print("\n roc_auc - epoch: %d - score: %.6f \n" % (epoch+1, auc))

SyntaxError: EOF while scanning triple-quoted string literal (<ipython-input-9-f9389c3a6461>, line 6)

In [10]:
class RocAucEvaluation(Callback):
    """Callback that prints ROC AUC on a held-out set every `interval` epochs.

    Parameters
    ----------
    validation_data : tuple
        `(x_val, y_val)` pair to score against. It is unpacked into exactly two
        values, so the empty default cannot be used as-is and a pair must be passed.
    interval : int
        Evaluate whenever the zero-based epoch index is a multiple of this value.
    """

    def __init__(self, validation_data=(), interval=1):
        # Name this class, not Callback: super(Callback, self) starts the MRO lookup
        # *after* Callback and therefore never runs Callback.__init__.
        super(RocAucEvaluation, self).__init__()

        self.interval = interval
        self.x_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs=None):
        """Print the ROC AUC of the current model when `epoch` falls on the interval.

        `logs` is accepted for Keras compatibility and is not used; it defaults to
        None rather than a shared mutable dict.
        """
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.x_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            # `epoch` is zero-based; report it one-based.
            print("\n ROC-AUC - epoch: %d - score: %.6f \n" % (epoch+1, score))

In [11]:
import os

from keras.callbacks import ModelCheckpoint, EarlyStopping

batch_size = 256
epochs = 5

# The best weights are checkpointed under saved_models/. Create the directory up front
# so the first save cannot fail on a missing path after a full epoch of training.
file_path = 'saved_models/weights.best.hdf5'
os.makedirs(os.path.dirname(file_path), exist_ok=True)

# Keep only the best checkpoint, and stop once val_loss has not improved for 3 epochs.
check_point = ModelCheckpoint(filepath=file_path, verbose=1, save_best_only=True)
early_stop = EarlyStopping(monitor="val_loss", mode="min", patience=3)

# Hold out 10% of the training data for validation; the fixed seed keeps the split reproducible.
x_trainS, x_val, y_trainS, y_val = train_test_split(x_train, y_train, train_size=0.90, random_state=8)

In [14]:
# Report ROC AUC on the held-out validation split after every epoch (interval=1).
roc_auc = RocAucEvaluation(validation_data=(x_val, y_val), interval=1)

In [15]:
# Train on the 90% split and validate on the held-out 10%. The callbacks save the
# best checkpoint, apply early stopping, and print the validation ROC AUC.
# The returned History object is kept in `h`.
h = model.fit(x_trainS, y_trainS, batch_size=batch_size, epochs=epochs, validation_data=(x_val, y_val),
                 callbacks=[check_point, early_stop, roc_auc], verbose=2)

Train on 143613 samples, validate on 15958 samples
Epoch 1/5
 - 653s - loss: 0.0823 - acc: 0.9740 - val_loss: 0.0469 - val_acc: 0.9824

Epoch 00001: val_loss improved from inf to 0.04691, saving model to saved_models/weights.best.hdf5

 ROC-AUC - epoch: 1 - score: 0.980438 

Epoch 2/5
 - 667s - loss: 0.0489 - acc: 0.9818 - val_loss: 0.0429 - val_acc: 0.9833

Epoch 00002: val_loss improved from 0.04691 to 0.04288, saving model to saved_models/weights.best.hdf5

 ROC-AUC - epoch: 2 - score: 0.986126 

Epoch 3/5
 - 648s - loss: 0.0450 - acc: 0.9828 - val_loss: 0.0414 - val_acc: 0.9840

Epoch 00003: val_loss improved from 0.04288 to 0.04145, saving model to saved_models/weights.best.hdf5

 ROC-AUC - epoch: 3 - score: 0.987421 

Epoch 4/5
 - 668s - loss: 0.0427 - acc: 0.9836 - val_loss: 0.0408 - val_acc: 0.9844

Epoch 00004: val_loss improved from 0.04145 to 0.04082, saving model to saved_models/weights.best.hdf5

 ROC-AUC - epoch: 4 - score: 0.988260 

Epoch 5/5
 - 655s - loss: 0.0412 - ac

In [16]:
# Score the filtered test set with the in-memory model exactly as `fit` left it.
y_pred = model.predict(x_test, batch_size=256)

In [17]:
# ROC AUC of the in-memory model's predictions against the six test label columns.
score = roc_auc_score(y_test, y_pred)
print("\n roc_auc score: %.6f \n" % (score))


 roc_auc score: 0.982246 



In [18]:
from keras.models import load_model
# Reload the checkpoint written during training (saved with save_best_only=True),
# so it can be compared with the in-memory model.
model_saved = load_model(file_path)

In [19]:
# Print the reloaded model's layer table to check it against the one built earlier.
model_saved.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 200)          0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 200, 300)     30000000    input_1[0][0]                    
__________________________________________________________________________________________________
spatial_dropout1d_1 (SpatialDro (None, 200, 300)     0           embedding_1[0][0]                
__________________________________________________________________________________________________
reshape_1 (Reshape)             (None, 200, 300, 1)  0           spatial_dropout1d_1[0][0]        
__________________________________________________________________________________________________
conv2d_1 (

In [20]:
# Score the same test set with the reloaded checkpoint.
# Note: this overwrites the earlier `y_pred` from the in-memory model.
y_pred = model_saved.predict(x_test, batch_size=256)

In [21]:
# ROC AUC of the reloaded checkpoint's predictions against the six test label columns.
score = roc_auc_score(y_test, y_pred)
print("\n roc_auc score: %.6f \n" % (score))


 roc_auc score: 0.981639 

