In [6]:
import numpy as np
import pandas as pd
np.random.seed(8)
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

from keras.models import Model
from keras.layers import Input, Embedding, Dense, Conv2D, MaxPool2D, Conv1D, MaxPooling1D, GlobalMaxPooling1D 
from keras.layers import Reshape, Flatten, Concatenate, Dropout, SpatialDropout1D
from keras.preprocessing import text, sequence
from keras.callbacks import Callback

import re

import warnings
warnings.filterwarnings('ignore')

train = pd.read_csv('train.csv')
test_cm = pd.read_csv('test.csv')
test_lb = pd.read_csv('test_labels.csv')
test_all = pd.merge(test_cm, test_lb, on='id')
test = test_all[test_all['toxic'] != -1]

In [7]:
X_train = train["comment_text"].str.lower()
y_train = train[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]].values
X_test = test["comment_text"].str.lower()
y_test = test[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]].values

In [8]:
vocab = 100000
maxlen = 200
embed_size = 300

In [9]:
# removing ip address
X_train = X_train.apply(lambda x: re.sub("(?:[0-9]{1,3}\.){3}[0-9]{1,3}","",x))
# removing url link
X_train = X_train.apply(lambda x: re.sub("http://.*com","",x))
# removing username
X_train = X_train.apply(lambda x: re.sub("\[\[.*\]","",x))

In [10]:
from nltk.corpus import stopwords

def filter_stop_words(sentences, stop_words):
    filtered = []
    for sentence in sentences:
        words = sentence.split()
        words_filtered = [word for word in words if word not in stop_words]
        filtered.append(" ".join(words_filtered))
    return filtered
 
stop_words = set(stopwords.words("english"))
 
# Comments in train
X_train_ = filter_stop_words(X_train, stop_words)
 
# Comments in test (excluding labels with -1 values)
X_test_ = filter_stop_words(X_test, stop_words)

In [11]:
t = text.Tokenizer(num_words=vocab)
t.fit_on_texts(list(X_train_) + list(X_test_))

In [12]:
X_train = t.texts_to_sequences(X_train)
X_test = t.texts_to_sequences(X_test)
x_train = sequence.pad_sequences(X_train, maxlen=maxlen)
x_test = sequence.pad_sequences(X_test, maxlen=maxlen)

In [13]:
EMBEDDING_FILE = 'crawl-300d-2M.vec'
embeddings_index = dict()
f = open(EMBEDDING_FILE)
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print('Loaded %s pre-trained words' % len(embeddings_index))


Loaded 2000000 pre-trained words


In [14]:
# create a weight matrix for words in training docs
embedding_matrix = np.zeros((vocab, embed_size))
for word, i in t.word_index.items():
    if i >= vocab: 
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [15]:
def make_model(k=[2,3,4,5], activation='tanh', filters=32, Sdroprate=0.4, droprate=0.1):

    inp = Input(shape=(maxlen, ))
    x = Embedding(vocab, embed_size, weights=[embedding_matrix], input_length=maxlen, trainable=False)(inp)
    x = SpatialDropout1D(Sdroprate)(x)
    x = Reshape((maxlen, embed_size, 1))(x)
    
    conv = dict()
    maxpool = dict()
    for h in k:
        conv[h] = Conv2D(filters, kernel_size=(h, embed_size), activation=activation)(x)
        maxpool[h] = MaxPool2D(pool_size=(maxlen - h + 1, 1))(conv[h])
        
    y = Concatenate(axis=1)([pool for key,pool in maxpool.items()])
    
    y = Flatten()(y)
    y = Dropout(droprate)(y)

    outp = Dense(6, activation="sigmoid")(y)

    model = Model(inputs=inp, outputs=outp)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    
    return model
model = make_model()
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 200)          0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 200, 300)     30000000    input_1[0][0]                    
__________________________________________________________________________________________________
spatial_dropout1d_1 (SpatialDro (None, 200, 300)     0           embedding_1[0][0]                
__________________________________________________________________________________________________
reshape_1 (Reshape)             (None, 200, 300, 1)  0           spatial_dropout1d_1[0][0]        
__________________________________________________________________________________________________
conv2d_1 (

In [17]:
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import GridSearchCV
from keras.wrappers.scikit_learn import KerasClassifier
my_classifier = KerasClassifier(make_model, verbose=1)
cv = ShuffleSplit(n_splits = 1, test_size = 0.1)
validator = GridSearchCV(my_classifier, param_grid={'k': [[3,6,7,8]],
                                                    'activation': ['tanh'],
                                                   'filters': [32],
                                                   'Sdroprate': [0.4],
                                                   'droprate': [0.05],
                                                   'epochs': [5],
                                                   'batch_size': [256]},
                                                    cv = cv, 
                                                    scoring = 'roc_auc', 
                                                    n_jobs=1, verbose=10)

In [18]:
grid_result = validator.fit(x_train, y_train)

Fitting 1 folds for each of 1 candidates, totalling 1 fits
[CV] Sdroprate=0.4, activation=tanh, batch_size=256, droprate=0.05, epochs=5, filters=32, k=[3, 6, 7, 8] 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
[CV]  Sdroprate=0.4, activation=tanh, batch_size=256, droprate=0.05, epochs=5, filters=32, k=[3, 6, 7, 8], score=0.9878520432061898, total=61.0min


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed: 64.8min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed: 64.8min finished


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [19]:
print("Best score: %f with %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))
# validator.best_estimator_.model returns the (unwrapped) keras model
best_model = validator.best_estimator_.model

Best score: 0.987852 with {'Sdroprate': 0.4, 'activation': 'tanh', 'batch_size': 256, 'droprate': 0.05, 'epochs': 5, 'filters': 32, 'k': [3, 6, 7, 8]}
0.987852 (0.000000) with: {'Sdroprate': 0.4, 'activation': 'tanh', 'batch_size': 256, 'droprate': 0.05, 'epochs': 5, 'filters': 32, 'k': [3, 6, 7, 8]}


In [20]:
y_pred = best_model.predict(x_test, batch_size=256)

In [21]:
from sklearn.metrics import roc_auc_score
score = roc_auc_score(y_test, y_pred)
print("\n roc_auc score for keras model: %.6f \n" % (score))


 roc_auc score for keras model: 0.981922 



In [22]:
best_model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            (None, 200)          0                                            
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, 200, 300)     30000000    input_3[0][0]                    
__________________________________________________________________________________________________
spatial_dropout1d_3 (SpatialDro (None, 200, 300)     0           embedding_3[0][0]                
__________________________________________________________________________________________________
reshape_3 (Reshape)             (None, 200, 300, 1)  0           spatial_dropout1d_3[0][0]        
__________________________________________________________________________________________________
conv2d_9 (