In [1]:
import numpy as np
import pandas as pd
np.random.seed(8)
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

from keras.models import Model
from keras.layers import Input, Embedding, Dense, Conv2D, MaxPool2D, Conv1D, MaxPooling1D, GlobalMaxPooling1D 
from keras.layers import Reshape, Flatten, Concatenate, Dropout, SpatialDropout1D
from keras.preprocessing import text, sequence
from keras.callbacks import Callback

import warnings
warnings.filterwarnings('ignore')

#import os
#os.environ['OMP_NUM_THREADS'] = '4'

# Training data plus the two halves of the test set (comments and labels).
train = pd.read_csv('train.csv')
test_cm = pd.read_csv('test.csv')
test_lb = pd.read_csv('test_labels.csv')

# Attach the labels to the test comments through their shared 'id', then keep
# only the rows whose 'toxic' value is not -1.
# NOTE(review): -1 presumably marks rows that were never scored - confirm
# against the dataset description.
test_all = test_cm.merge(test_lb, on='id')
keep_rows = test_all['toxic'] != -1
test = test_all[keep_rows]

Using TensorFlow backend.


In [2]:
# Display the (rows, columns) shape of the training frame.
train.shape

(159571, 8)

In [3]:
# Print column names, non-null counts and dtypes of the training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159571 entries, 0 to 159570
Data columns (total 8 columns):
id               159571 non-null object
comment_text     159571 non-null object
toxic            159571 non-null int64
severe_toxic     159571 non-null int64
obscene          159571 non-null int64
threat           159571 non-null int64
insult           159571 non-null int64
identity_hate    159571 non-null int64
dtypes: int64(6), object(2)
memory usage: 9.7+ MB


In [4]:
# Count missing values per column of the training frame.
train.isnull().sum()

id               0
comment_text     0
toxic            0
severe_toxic     0
obscene          0
threat           0
insult           0
identity_hate    0
dtype: int64

In [5]:
# The six label columns, shared by the train and test frames.
label_columns = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

# Raw comment strings and the matching label matrices as NumPy arrays.
X_train, y_train = train["comment_text"].values, train[label_columns].values
X_test, y_test = test["comment_text"].values, test[label_columns].values

In [6]:
# Text-preprocessing / embedding hyper-parameters used by the cells below.
vocab = 100000  # tokenizer `num_words` and number of rows in the embedding matrix
maxlen = 200  # every comment is padded/truncated to this many token indices
embed_size = 300  # width of each embedding vector (columns of the embedding matrix)

In [7]:
# Build the word index over every comment we have, train and test alike.
# NOTE(review): including the test comments lets test-set vocabulary influence
# preprocessing; confirm this is intended for the evaluation reported later.
all_comments = list(X_train)
all_comments.extend(X_test)

t = text.Tokenizer(num_words=vocab)
t.fit_on_texts(all_comments)

In [8]:
# Convert each comment into a list of word indices with the fitted tokenizer,
# then pad/truncate every list to exactly `maxlen` entries.
#
# The raw-text arrays X_train / X_test are deliberately NOT overwritten: the
# sequences get their own names, so this cell can be re-run without feeding
# integer lists back into `texts_to_sequences`.
train_sequences = t.texts_to_sequences(X_train)
test_sequences = t.texts_to_sequences(X_test)
x_train = sequence.pad_sequences(train_sequences, maxlen=maxlen)
x_test = sequence.pad_sequences(test_sequences, maxlen=maxlen)

In [9]:
EMBEDDING_FILE = 'crawl-300d-2M.vec'

# Maps each word to its pre-trained vector (float32 NumPy array).
embeddings_index = dict()

# `with` guarantees the handle is closed even if a line fails to parse, and the
# explicit encoding keeps the read independent of the platform default.
with open(EMBEDDING_FILE, encoding='utf-8') as f:
    for line_no, line in enumerate(f):
        values = line.split()
        if not values:
            # Blank line: nothing to parse.
            continue
        if line_no == 0 and len(values) == 2:
            # fastText .vec files open with a "<word count> <dimension>" header.
            # Storing it would create a one-value "vector" that NumPy would
            # broadcast across a whole embedding-matrix row later on.
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Loaded %s pre-trained words' % len(embeddings_index))


Loaded 2000000 pre-trained words


In [10]:
# Weight matrix for the Embedding layer: row `index` holds the pre-trained
# vector of the word the tokenizer mapped to `index`. Words without a
# pre-trained vector, and indices at or beyond `vocab`, keep their all-zero row.
embedding_matrix = np.zeros((vocab, embed_size))
for token, index in t.word_index.items():
    if index < vocab:
        vector = embeddings_index.get(token)
        if vector is not None:
            embedding_matrix[index] = vector

In [11]:
# Multi-branch text CNN: the frozen pre-trained embeddings are treated as a
# (maxlen x embed_size) single-channel image, and one Conv2D branch per filter
# height slides a full-width kernel over `size` consecutive tokens.
num_filters = 32
filter_sizes = (2, 3, 4, 5, 6)  # kernel heights (tokens per window), one branch each

inp = Input(shape=(maxlen, ))
# Embedding weights come from `embedding_matrix` and are not trained.
x = Embedding(vocab, embed_size, weights=[embedding_matrix], input_length=maxlen, trainable=False)(inp)
x = SpatialDropout1D(0.45)(x)
# Add a trailing channel axis so Conv2D can be applied.
x = Reshape((maxlen, embed_size, 1))(x)

pooled_outputs = []
for size in filter_sizes:
    conv = Conv2D(num_filters, kernel_size=(size, embed_size), activation='relu')(x)
    # A kernel of height `size` leaves maxlen - size + 1 positions; pooling over
    # all of them keeps one maximum per filter.
    pooled = MaxPool2D(pool_size=(maxlen - size + 1, 1))(conv)
    pooled_outputs.append(pooled)

y = Concatenate(axis=1)(pooled_outputs)
y = Flatten()(y)
y = Dropout(0.1)(y)

# One sigmoid unit per label column.
outp = Dense(6, activation="sigmoid")(y)

model = Model(inputs=inp, outputs=outp)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_1 (InputLayer)             (None, 200)           0                                            
____________________________________________________________________________________________________
embedding_1 (Embedding)          (None, 200, 300)      30000000                                     
____________________________________________________________________________________________________
spatial_dropout1d_1 (SpatialDrop (None, 200, 300)      0                                            
____________________________________________________________________________________________________
reshape_1 (Reshape)              (None, 200, 300, 1)   0                                            
___________________________________________________________________________________________

In [12]:
# Keep 90% of the padded training data for fitting and hold the rest out for
# validation; the fixed random_state makes the split repeatable.
x_trainS, x_val, y_trainS, y_val = train_test_split(
    x_train,
    y_train,
    train_size=0.90,
    random_state=8,
)

In [13]:
class RocAuc(Callback):
    """Keras callback that prints the validation ROC AUC after every epoch.

    Parameters
    ----------
    validation_data : sequence, optional
        ``(x_val, y_val)`` to score on. When omitted, the callback falls back
        to the ``validation_data`` attribute that some Keras versions set on
        callbacks during ``fit``; if that is unavailable a ``ValueError`` is
        raised instead of an opaque ``TypeError``.
    """

    def __init__(self, validation_data=None):
        Callback.__init__(self)
        # Stored under a private name because Keras may overwrite the public
        # `validation_data` attribute on the callback.
        self._explicit_validation = validation_data

    def _validation_arrays(self):
        """Return the (inputs, targets) pair used for scoring."""
        if self._explicit_validation is not None:
            return self._explicit_validation[0], self._explicit_validation[1]
        data = getattr(self, 'validation_data', None)
        if data is None or len(data) < 2:
            raise ValueError(
                "RocAuc has no validation data: pass validation_data=(x_val, y_val) "
                "to RocAuc(...) or use a Keras version that sets it on callbacks."
            )
        return data[0], data[1]

    def on_epoch_end(self, epoch, logs=None):
        """Predict on the validation inputs and print the resulting ROC AUC.

        `epoch` is zero-based, so it is printed as ``epoch + 1``. `logs` is
        accepted for API compatibility and not used.
        """
        x_val, y_val = self._validation_arrays()
        y_pred = self.model.predict(x_val, verbose=0)
        score = roc_auc_score(y_val, y_pred)
        print("\n roc_auc - epoch: %d - score: %.6f \n" % (epoch+1, score))

In [14]:
import os

from keras.callbacks import ModelCheckpoint

# Training-loop settings.
batch_size = 512
epochs = 5

# Where the best weights (as judged by ModelCheckpoint with save_best_only)
# are written.
checkpoint_path = 'saved_models/weights.best.pre_trained.conv2.testscore.hdf5'
# Create the target directory up front; saving fails on Keras versions that do
# not create it themselves.
os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)
checkpointer = ModelCheckpoint(filepath=checkpoint_path, verbose=1, save_best_only=True)

# The fit cell refers to the callback instance by the name `RocAuc`, which
# shadows the class. Instantiate only while the name is still bound to the
# class, so re-running this cell does not try to call the instance.
if isinstance(RocAuc, type):
    RocAuc = RocAuc()

In [15]:
# Train on the 90% split, validating on the held-out part after each epoch.
# The checkpoint callback and the ROC AUC callback both run at epoch end;
# the value returned by fit is kept in `h`.
h = model.fit(
    x_trainS,
    y_trainS,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(x_val, y_val),
    callbacks=[checkpointer, RocAuc],
    verbose=2,
)

Train on 143613 samples, validate on 15958 samples
Epoch 1/5
Epoch 00000: val_loss improved from inf to 0.05188, saving model to saved_models/weights.best.pre_trained.conv2.testscore.hdf5

 roc_auc - epoch: 1 - score: 0.958207 

867s - loss: 0.0912 - acc: 0.9740 - val_loss: 0.0519 - val_acc: 0.9818
Epoch 2/5
Epoch 00001: val_loss improved from 0.05188 to 0.04634, saving model to saved_models/weights.best.pre_trained.conv2.testscore.hdf5

 roc_auc - epoch: 2 - score: 0.983637 

869s - loss: 0.0562 - acc: 0.9803 - val_loss: 0.0463 - val_acc: 0.9826
Epoch 3/5
Epoch 00002: val_loss improved from 0.04634 to 0.04412, saving model to saved_models/weights.best.pre_trained.conv2.testscore.hdf5

 roc_auc - epoch: 3 - score: 0.986364 

2516s - loss: 0.0513 - acc: 0.9813 - val_loss: 0.0441 - val_acc: 0.9835
Epoch 4/5
Epoch 00003: val_loss did not improve

 roc_auc - epoch: 4 - score: 0.987104 

872s - loss: 0.0490 - acc: 0.9818 - val_loss: 0.0449 - val_acc: 0.9834
Epoch 5/5
Epoch 00004: val_loss d

In [20]:
# Display the test label matrix (one row per kept test comment, six label columns).
y_test

array([[0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       ..., 
       [0, 0, 0, 0, 0, 0],
       [1, 0, 1, 0, 1, 0],
       [0, 0, 0, 0, 0, 0]])

In [16]:
# Predict the six sigmoid outputs for every padded test comment.
y_pred = model.predict(x_test, batch_size=256)

In [18]:
# Display the predicted scores for the test comments.
y_pred

array([[  1.42437266e-03,   4.58605267e-04,   1.13832753e-03,
          2.18832152e-04,   1.50326930e-03,   3.14336939e-04],
       [  3.60133201e-01,   1.11746425e-02,   5.68318292e-02,
          1.76060712e-03,   6.86836615e-02,   5.27967094e-03],
       [  3.51708800e-01,   1.69059541e-03,   3.22898962e-02,
          5.14577550e-04,   4.68423702e-02,   1.12910233e-02],
       ..., 
       [  9.09519255e-01,   3.39733437e-02,   1.29502222e-01,
          4.65495558e-03,   3.62436593e-01,   6.16120994e-01],
       [  9.98376727e-01,   2.54105687e-01,   9.86995935e-01,
          7.73123465e-03,   9.58500087e-01,   4.15390670e-01],
       [  9.64257773e-03,   7.30560452e-04,   1.79213297e-03,
          1.18136872e-04,   2.50023347e-03,   4.82432224e-04]], dtype=float32)

In [17]:
# ROC AUC of the test predictions against the test labels.
score = roc_auc_score(y_test, y_pred)
report = "\n roc_auc score: %.6f \n" % score
print(report)


 roc_auc score: 0.980166 



In [23]:
label_cols = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

submission = pd.read_csv('sample_submission.csv')

# `x_test` only contains the test rows whose 'toxic' label is not -1, so its
# predictions cannot be written positionally into the submission template.
# Score every comment in test.csv instead and align the rows by id.
# NOTE(review): assumes sample_submission.csv has an 'id' column whose values
# come from test.csv - confirm.
x_submit = sequence.pad_sequences(
    t.texts_to_sequences(test_cm["comment_text"].values), maxlen=maxlen
)
submission_pred = pd.DataFrame(
    model.predict(x_submit, batch_size=1024),
    index=test_cm["id"].values,
    columns=label_cols,
)
submission[label_cols] = submission_pred.reindex(submission["id"].values).values
submission.to_csv('submission.csv', index=False)

In [1]:
from keras.models import load_model

# Reload the model from the file the training checkpoint callback wrote to.
best_model_path = 'saved_models/weights.best.pre_trained.conv2.testscore.hdf5'
model_saved = load_model(best_model_path)

Using TensorFlow backend.


In [2]:
# Print the layer-by-layer summary of the reloaded model.
model_saved.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_1 (InputLayer)             (None, 200)           0                                            
____________________________________________________________________________________________________
embedding_1 (Embedding)          (None, 200, 300)      30000000                                     
____________________________________________________________________________________________________
spatial_dropout1d_1 (SpatialDrop (None, 200, 300)      0                                            
____________________________________________________________________________________________________
reshape_1 (Reshape)              (None, 200, 300, 1)   0                                            
___________________________________________________________________________________________