In [1]:
# Import required libraries
import numpy as np
import pandas as pd
np.random.seed(8)
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

from keras.models import Model
from keras.layers import Input, Embedding, Dense, Conv2D, MaxPool2D, Conv1D, MaxPooling1D, GlobalMaxPooling1D 
from keras.layers import Reshape, Flatten, Concatenate, Dropout, SpatialDropout1D
from keras.preprocessing import text, sequence
from keras.callbacks import Callback

import re
import warnings
warnings.filterwarnings('ignore')

# Read in train and test datasets from csv files
train = pd.read_csv('train.csv')
test_cm = pd.read_csv('test.csv')
test_lb = pd.read_csv('test_labels.csv')
# Merge test comments with test labels
test_all = pd.merge(test_cm, test_lb, on='id')
# Remove all test entries with labels equal to -1
test = test_all[test_all['toxic'] != -1]

Using TensorFlow backend.


In [2]:
X_train = train["comment_text"]
y_train = train[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]].values
X_test = test["comment_text"]
y_test = test[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]].values

In [3]:
# Define vocabulary size
vocab = 100000
# Define maximum length of a comment
maxlen = 200
# Define embedding size which should equal the embedding size of the pre-trained word vectors
embed_size = 300

In [4]:
# Removing ip address
X_train = X_train.apply(lambda x: re.sub("(?:[0-9]{1,3}\.){3}[0-9]{1,3}","",x))
X_test = X_test.apply(lambda x: re.sub("(?:[0-9]{1,3}\.){3}[0-9]{1,3}","",x))
# Removing url link
X_train = X_train.apply(lambda x: re.sub("http://.*com","",x))
X_test = X_test.apply(lambda x: re.sub("http://.*com","",x))
# Removing username
X_train = X_train.apply(lambda x: re.sub("\[\[.*\]","",x))
X_test = X_test.apply(lambda x: re.sub("\[\[.*\]","",x))

In [5]:
# Filter out stop words
from nltk.corpus import stopwords

def filter_stop_words(sentences, stop_words):
    filtered = []
    for sentence in sentences:
        words = sentence.split()
        words_filtered = [word for word in words if word not in stop_words]
        filtered.append(" ".join(words_filtered))
    return filtered
 
stop_words = set(stopwords.words("english"))
 
# Comments in train
X_train_ = filter_stop_words(X_train, stop_words)
 
# Comments in test (excluding labels with -1 values)
X_test_ = filter_stop_words(X_test, stop_words)

In [6]:
# Tokenize the train dataset
t = text.Tokenizer(num_words=vocab)
t.fit_on_texts(list(X_train_))

In [7]:
# Convert both train and test datasets into sequences
X_train = t.texts_to_sequences(X_train_)
X_test = t.texts_to_sequences(X_test_)
x_train = sequence.pad_sequences(X_train, maxlen=maxlen)
x_test = sequence.pad_sequences(X_test, maxlen=maxlen)

In [8]:
# Load pre-trained word vectors
EMBEDDING_FILE = 'crawl-300d-2M.vec'
embeddings_index = dict()
f = open(EMBEDDING_FILE)
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print('Loaded %s pre-trained words' % len(embeddings_index))

Loaded 2000000 pre-trained words


In [9]:
# Create a weight matrix for words in training docs
embedding_matrix = np.zeros((vocab, embed_size))
for word, i in t.word_index.items():
    if i >= vocab: 
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [10]:
# Define make_model function to create a CNN model
def make_model(k=[7], activation='relu', filters=100, Sdroprate=0.5, droprate=0.0):

    inp = Input(shape=(maxlen, ))
    x = Embedding(vocab, embed_size, weights=[embedding_matrix], input_length=maxlen, trainable=False)(inp)
    x = SpatialDropout1D(Sdroprate)(x)
    x = Reshape((maxlen, embed_size, 1))(x)
    
    conv = dict()
    maxpool = dict()
    for h in k:
        conv[h] = Conv2D(filters, kernel_size=(h, embed_size), activation=activation)(x)
        maxpool[h] = MaxPool2D(pool_size=(maxlen - h + 1, 1))(conv[h])
        
    if len(k) == 1:
        y = maxpool[h]
    else:
        y = Concatenate(axis=1)([pool for key,pool in maxpool.items()])
    
    y = Flatten()(y)
    y = Dropout(droprate)(y)

    outp = Dense(6, activation="sigmoid")(y)

    model = Model(inputs=inp, outputs=outp)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    
    return model
# Generate the model based on the default parameters
model = make_model()
# Print out the model summary
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 200)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 200, 300)          30000000  
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 200, 300)          0         
_________________________________________________________________
reshape_1 (Reshape)          (None, 200, 300, 1)       0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 194, 1, 100)       210100    
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 1, 1, 100)         0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 100)               0         
__________

In [11]:
# Define the cross validation split
from sklearn.model_selection import ShuffleSplit
cv = ShuffleSplit(n_splits = 2, test_size = 0.1)

In [12]:
# Wrap Keras model with KerasClassifier so that it can be used in Sklearn GridSearchCV
# Generate a GridSearchCV instance with the parameters to be tuned
# This is a starting model and we try to check which activation function performs better

from sklearn.model_selection import GridSearchCV
from keras.wrappers.scikit_learn import KerasClassifier
my_classifier = KerasClassifier(make_model, verbose=1)

validator = GridSearchCV(my_classifier, param_grid={'k': [[7]],
                                                    'activation': ['relu'],
                                                   'filters': [100],
                                                   'Sdroprate':[0.3,0.4,0.5],
                                                   'droprate': [0.00],
                                                   'epochs': [3],
                                                   'batch_size': [128]},
                                                    cv = cv, 
                                                    scoring = 'roc_auc', 
                                                    verbose=10)

In [13]:
# Run the Grid Search
grid_result = validator.fit(x_train, y_train)

Fitting 2 folds for each of 3 candidates, totalling 6 fits
[CV] Sdroprate=0.3, activation=relu, batch_size=128, droprate=0.0, epochs=3, filters=100, k=[7] 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Epoch 1/3
Epoch 2/3
Epoch 3/3
[CV]  Sdroprate=0.3, activation=relu, batch_size=128, droprate=0.0, epochs=3, filters=100, k=[7], score=0.9882000380246808, total=34.2min
[CV] Sdroprate=0.3, activation=relu, batch_size=128, droprate=0.0, epochs=3, filters=100, k=[7] 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed: 39.3min remaining:    0.0s


Epoch 1/3
Epoch 2/3
Epoch 3/3
[CV]  Sdroprate=0.3, activation=relu, batch_size=128, droprate=0.0, epochs=3, filters=100, k=[7], score=0.984340767209594, total=33.8min
[CV] Sdroprate=0.4, activation=relu, batch_size=128, droprate=0.0, epochs=3, filters=100, k=[7] 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed: 78.1min remaining:    0.0s


Epoch 1/3
Epoch 2/3
Epoch 3/3
[CV]  Sdroprate=0.4, activation=relu, batch_size=128, droprate=0.0, epochs=3, filters=100, k=[7], score=0.9877251870938318, total=33.7min
[CV] Sdroprate=0.4, activation=relu, batch_size=128, droprate=0.0, epochs=3, filters=100, k=[7] 


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed: 116.8min remaining:    0.0s


Epoch 1/3
Epoch 2/3
Epoch 3/3
[CV]  Sdroprate=0.4, activation=relu, batch_size=128, droprate=0.0, epochs=3, filters=100, k=[7], score=0.984133161808232, total=33.6min
[CV] Sdroprate=0.5, activation=relu, batch_size=128, droprate=0.0, epochs=3, filters=100, k=[7] 


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed: 155.5min remaining:    0.0s


Epoch 1/3
Epoch 2/3
Epoch 3/3
[CV]  Sdroprate=0.5, activation=relu, batch_size=128, droprate=0.0, epochs=3, filters=100, k=[7], score=0.9876272808634594, total=33.6min
[CV] Sdroprate=0.5, activation=relu, batch_size=128, droprate=0.0, epochs=3, filters=100, k=[7] 


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 194.2min remaining:    0.0s


Epoch 1/3
Epoch 2/3
Epoch 3/3
[CV]  Sdroprate=0.5, activation=relu, batch_size=128, droprate=0.0, epochs=3, filters=100, k=[7], score=0.9826468991886611, total=33.7min


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed: 232.9min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed: 232.9min finished


Epoch 1/3
Epoch 2/3
Epoch 3/3


In [14]:
print("Best score: %f with %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))
# Get the best model
best_model = validator.best_estimator_.model

Best score: 0.986270 with {'Sdroprate': 0.3, 'activation': 'relu', 'batch_size': 128, 'droprate': 0.0, 'epochs': 3, 'filters': 100, 'k': [7]}
0.986270 (0.001930) with: {'Sdroprate': 0.3, 'activation': 'relu', 'batch_size': 128, 'droprate': 0.0, 'epochs': 3, 'filters': 100, 'k': [7]}
0.985929 (0.001796) with: {'Sdroprate': 0.4, 'activation': 'relu', 'batch_size': 128, 'droprate': 0.0, 'epochs': 3, 'filters': 100, 'k': [7]}
0.985137 (0.002490) with: {'Sdroprate': 0.5, 'activation': 'relu', 'batch_size': 128, 'droprate': 0.0, 'epochs': 3, 'filters': 100, 'k': [7]}


In [15]:
# Perform prediction with unseen test dataset with the best model
y_pred = best_model.predict(x_test, batch_size=256)

In [16]:
# Calculate the roc_auc score
from sklearn.metrics import roc_auc_score
score = roc_auc_score(y_test, y_pred)
print("\n roc_auc score for keras model: %.6f \n" % (score))


 roc_auc score for keras model: 0.978442 



In [17]:
# Generate a summary report for the best model architecture
best_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_8 (InputLayer)         (None, 200)               0         
_________________________________________________________________
embedding_8 (Embedding)      (None, 200, 300)          30000000  
_________________________________________________________________
spatial_dropout1d_8 (Spatial (None, 200, 300)          0         
_________________________________________________________________
reshape_8 (Reshape)          (None, 200, 300, 1)       0         
_________________________________________________________________
conv2d_8 (Conv2D)            (None, 194, 1, 100)       210100    
_________________________________________________________________
max_pooling2d_8 (MaxPooling2 (None, 1, 1, 100)         0         
_________________________________________________________________
flatten_8 (Flatten)          (None, 100)               0         
__________