In [1]:
from keras.layers import Input, Dense, Embedding, Conv2D, MaxPool2D
from keras.layers import Reshape, Flatten, Dropout, Concatenate
from keras.callbacks import ModelCheckpoint
from keras.optimizers import Adam
from keras.models import Model
from sklearn.model_selection import train_test_split
import numpy as np
import re
import itertools
from collections import Counter
import pickle
import pandas as pd

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
def clean_str(string):
    """
    Tokenization/string cleaning for datasets.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py

    Every character other than A-Z, a-z, 0-9, parentheses, comma, '!', '?',
    apostrophe and backtick is replaced by a space. English contractions and
    punctuation marks are then separated into their own space-delimited
    tokens, runs of whitespace are collapsed, and the result is stripped and
    lower-cased.

    Args:
        string: the raw text to clean (must be a ``str``).

    Returns:
        The cleaned, lower-cased string with tokens separated by single spaces.
    """
    # (pattern, replacement) pairs, applied strictly in this order.
    #
    # The replacements for "(", ")" and "?" deliberately keep the backslash of
    # the original implementation: re.sub leaves an unknown escape such as \(
    # untouched in the replacement, so these tokens are emitted WITH a leading
    # backslash. They are written as raw strings so the value is identical to
    # the original literal without relying on invalid escape sequences in a
    # normal string literal (a warning on recent Python versions).
    # NOTE(review): dropping the backslash would change the tokens; any
    # vocabulary built with this function would then no longer match.
    substitutions = (
        (r"[^A-Za-z0-9(),!?\'\`]", " "),
        (r"\'s", " 's"),
        (r"\'ve", " 've"),
        (r"n\'t", " n't"),
        (r"\'re", " 're"),
        (r"\'d", " 'd"),
        (r"\'ll", " 'll"),
        (r",", " , "),
        (r"!", " ! "),
        (r"\(", r" \( "),
        (r"\)", r" \) "),
        (r"\?", r" \? "),
        (r"\s{2,}", " "),
    )
    for pattern, replacement in substitutions:
        string = re.sub(pattern, replacement, string)
    return string.strip().lower()

In [3]:
def pad_sentences(sentences, padding_word="<PAD/>"):
    """
    Pads all sentences to the same length. The length is defined by the longest sentence.

    Args:
        sentences: a sequence of token lists.
        padding_word: the token appended to sentences shorter than the longest one.

    Returns:
        A new list of new token lists, all of the same length. The input
        lists are not modified. An empty `sentences` yields an empty list
        (previously this raised ValueError from max()).
    """
    if len(sentences) == 0:
        return []
    sequence_length = max(len(sentence) for sentence in sentences)
    return [
        sentence + [padding_word] * (sequence_length - len(sentence))
        for sentence in sentences
    ]

In [4]:
def build_input_data(sentences, labels, vocabulary):
    """
    Encode tokenised sentences as index rows and wrap the labels in an array.

    Each token is looked up in `vocabulary` (word -> index); a token missing
    from the mapping raises KeyError. Returns the two-element list
    [x, y], where x is the array of index rows and y is np.array(labels).
    """
    index_rows = []
    for tokens in sentences:
        index_rows.append([vocabulary[token] for token in tokens])
    return [np.array(index_rows), np.array(labels)]

In [5]:
def build_vocab(sentences):
    """
    Derive the word <-> index lookup tables for a tokenised corpus.

    The distinct tokens found across `sentences` are sorted, and each token's
    index is its position in that sorted list.
    Returns the two-element list [vocabulary, vocabulary_inv]: a dict mapping
    word -> index, and the sorted list mapping index -> word.
    """
    # Index -> word: every distinct token, in sorted order.
    distinct_tokens = set(itertools.chain.from_iterable(sentences))
    vocabulary_inv = sorted(distinct_tokens)
    # Word -> index: the inverse of the list above.
    vocabulary = dict(zip(vocabulary_inv, range(len(vocabulary_inv))))
    return [vocabulary, vocabulary_inv]

In [6]:
# One-off preprocessing, kept commented out as a record of how the cached
# pickles under ../data/preprocessed/cnn/ were produced. Uncomment to rebuild
# them: it loads the raw pickles, cleans and tokenises the 'comment_text'
# column, pads every comment to the longest one, builds the vocabulary from
# the padded comments, encodes them as index arrays, and writes an 80/20
# split (random_state=42) plus the vocabulary to disk.
# NOTE(review): X_dev / y_dev are loaded below but never used; the hold-out
# split is cut from the training data instead — confirm that is intended.
#
# DATA_PATH = "../data/"
# X_train = pickle.load(open(DATA_PATH + "X_train.p", "rb"))
# X_dev = pickle.load(open(DATA_PATH + "X_dev.p", "rb"))
# y_train = pickle.load(open(DATA_PATH + "y_train.p", "rb"))
# y_dev = pickle.load(open(DATA_PATH + "y_dev.p", "rb"))
# X_train = X_train['comment_text'].values
# X_train = [clean_str(sent) for sent in X_train]
# X_train = [s.split(" ") for s in X_train]
# sentences_padded = pad_sentences(X_train)
# vocabulary, vocabulary_inv = build_vocab(sentences_padded)
# pickle.dump(vocabulary, open("../data/preprocessed/cnn/vocabulary.p", "wb"))
# pickle.dump(vocabulary_inv, open("../data/preprocessed/cnn/vocabulary_inv.p", "wb"))
# x, y = build_input_data(sentences_padded, y_train, vocabulary)
# X_train, X_test, y_train, y_test = train_test_split( x, y, test_size=0.2, random_state=42)
# pickle.dump(X_train, open("../data/preprocessed/cnn/X_train.p", "wb"))
# pickle.dump(X_test, open("../data/preprocessed/cnn/X_test.p", "wb"))
# pickle.dump(y_train, open("../data/preprocessed/cnn/y_train.p", "wb"))
# pickle.dump(y_test, open("../data/preprocessed/cnn/y_test.p", "wb"))

In [7]:
# Directory holding the cached arrays and vocabulary read by this cell.
PREPROCESSED_DIR = "../data/preprocessed/cnn/"


def _load_pickle(filename):
    """Return the object unpickled from PREPROCESSED_DIR + filename.

    The file is opened in a `with` block so the handle is closed as soon as
    loading finishes, even if unpickling raises.
    """
    # SECURITY: pickle.load can execute arbitrary code while loading. Only use
    # it on files you produced yourself, never on untrusted downloads.
    with open(PREPROCESSED_DIR + filename, "rb") as handle:
        return pickle.load(handle)


X_train = _load_pickle("X_train.p")
X_test = _load_pickle("X_test.p")
y_train = _load_pickle("y_train.p")
y_test = _load_pickle("y_test.p")
vocabulary = _load_pickle("vocabulary.p")
vocabulary_inv = _load_pickle("vocabulary_inv.p")

In [8]:
# Sanity check of the loaded training matrix; axis 1 is used further down as the model's sequence length.
X_train.shape

(89359, 4948)

In [9]:
# Base name shared by every artefact this run writes: the .h5 checkpoint,
# the .json architecture dump and the submission .csv.
model_name = "cnn_512filters"

In [10]:
# Model and training hyperparameters.
sequence_length = X_train.shape[1] # tokens per padded comment (axis 1 of X_train)
vocabulary_size = len(vocabulary_inv) # number of entries in the index -> word list
embedding_dim = 256 # width of each learned word vector
filter_sizes = [3,4,5] # convolution window heights, in tokens
num_filters = 512 # feature maps per window size
drop = 0.5 # dropout rate applied before the output layer

epochs = 20
batch_size = 10

In [11]:
print("Creating Model...")
inputs = Input(shape=(sequence_length,), dtype='int32')
embedding = Embedding(input_dim=vocabulary_size, output_dim=embedding_dim, input_length=sequence_length)(inputs)
# Conv2D needs a channel axis, so each comment is treated as a one-channel
# (sequence_length x embedding_dim) image.
reshape = Reshape((sequence_length, embedding_dim, 1))(embedding)

# One convolution + max-pool branch per entry of `filter_sizes`. Each kernel
# spans the full embedding width and `filter_size` consecutive tokens; the
# pool window covers the entire convolution output, so every branch is reduced
# to a single value per filter. Building the branches in a loop keeps the
# model in step with `filter_sizes` however many window sizes it lists.
pooled_outputs = []
for filter_size in filter_sizes:
    conv = Conv2D(
        num_filters,
        kernel_size=(filter_size, embedding_dim),
        padding='valid',
        kernel_initializer='normal',
        activation='relu',
    )(reshape)
    pooled = MaxPool2D(
        pool_size=(sequence_length - filter_size + 1, 1),
        strides=(1, 1),
        padding='valid',
    )(conv)
    pooled_outputs.append(pooled)

# Concatenate requires at least two inputs, so a single branch is used as is.
if len(pooled_outputs) > 1:
    concatenated_tensor = Concatenate(axis=1)(pooled_outputs)
else:
    concatenated_tensor = pooled_outputs[0]
flatten = Flatten()(concatenated_tensor)
dropout = Dropout(drop)(flatten)
# Six independent sigmoid outputs, one score per label.
output = Dense(units=6, activation='sigmoid')(dropout)

model = Model(inputs=inputs, outputs=output)

Creating Model...


In [12]:
# Write the model to ../models/<model_name>.h5, but only on epochs where the
# monitored validation accuracy improves.
checkpoint = ModelCheckpoint(
    "../models/" + model_name + ".h5",
    monitor='val_acc',
    verbose=1,
    save_best_only=True,
    mode='auto',
)
# Adam optimizer with a 1e-4 learning rate; the remaining settings are spelled
# out explicitly.
adam = Adam(
    lr=1e-4,
    beta_1=0.9,
    beta_2=0.999,
    epsilon=1e-08,
    decay=0.0,
)

In [None]:
# Binary cross-entropy over the sigmoid outputs: every label is scored as its own yes/no problem.
model.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# X_test / y_test double as the validation set whose accuracy the checkpoint callback monitors.
model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, verbose=1, callbacks=[checkpoint], validation_data=(X_test, y_test))  # starts training

Train on 89359 samples, validate on 22340 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7fc50a063d30>

In [None]:
# Reports the loss and the compiled 'accuracy' metric on the hold-out split.
# NOTE(review): this is the same split that was passed as validation_data during
# training, so the figures are not an independent estimate. Presumably `model`
# also still holds the last-epoch weights rather than the best checkpoint saved
# on disk — confirm.
model.evaluate(X_test, y_test)



[0.11420533155799904, 0.9797821752707021]

## Evaluation: predictions for the official test set

In [None]:
# Raw competition test file; the preview below shows the columns id and comment_text.
X_official_test = pd.read_csv("../data/raw/test.csv")

In [None]:
# Preview the first rows to confirm the file loaded with the expected columns.
X_official_test.head()

Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap..."
3,00017563c3f7919a,":If you have a look back at the source, the in..."
4,00017695ad8997eb,I don't anonymously edit articles at all.


In [None]:
# Encode the official test comments for the model: clean each comment, split it
# on single spaces, map tokens to vocabulary ids and force every row to exactly
# `sequence_length` ids. Tokens missing from the vocabulary are replaced by the
# padding id.
pad_id = vocabulary['<PAD/>']
official_comments = X_official_test['comment_text'].values
official_tokens = [clean_str(comment).split(" ") for comment in official_comments]

# Fill a matrix that starts out as all padding, one row per comment.
# Truncating before padding (instead of padding every comment to the longest
# test comment and slicing afterwards) guarantees the width the model expects
# even when the longest test comment is shorter than `sequence_length`, and
# avoids building a huge intermediate list of padded token lists.
# `X_official_test` is left untouched so this cell can be re-run.
# int32 matches the dtype declared on the model's Input layer.
input_official_test = np.full((len(official_tokens), sequence_length), pad_id, dtype='int32')
for row_index, tokens in enumerate(official_tokens):
    token_ids = [vocabulary.get(token, pad_id) for token in tokens[:sequence_length]]
    input_official_test[row_index, :len(token_ids)] = token_ids

In [None]:
# Score every encoded test comment; each row holds the model's six sigmoid outputs.
pred = model.predict(input_official_test)

In [None]:
# Expect (number of test comments, 6).
pred.shape

(153164, 6)

In [None]:
# Spot-check the scores of the first ten test comments.
pred[:10, :]

array([[1.0000000e+00, 7.6079491e-04, 9.9977845e-01, 4.5479645e-04,
        6.5788597e-01, 2.8980246e-03],
       [4.1717476e-07, 2.0273018e-10, 2.5410711e-07, 6.9606576e-10,
        9.4658432e-08, 1.0131702e-08],
       [2.4840146e-02, 3.7809627e-05, 5.5967597e-04, 2.2267770e-05,
        3.3064093e-04, 3.0477891e-05],
       [4.9160352e-08, 7.8039498e-08, 3.3403830e-06, 4.7232027e-07,
        2.6196167e-06, 7.4032030e-08],
       [1.7843114e-12, 1.6204430e-09, 6.9018929e-10, 9.2310645e-11,
        2.3779442e-10, 1.8485312e-11],
       [2.8221211e-06, 7.3520727e-07, 2.0043837e-05, 5.2830787e-06,
        3.8845370e-05, 7.1646940e-07],
       [2.9325328e-08, 1.7341399e-09, 1.1999602e-07, 1.0083566e-07,
        1.3109195e-08, 5.0725123e-07],
       [4.0071312e-01, 4.7665526e-07, 3.3666592e-04, 2.1422446e-07,
        1.8792100e-04, 6.1540923e-08],
       [4.7383113e-12, 5.0560233e-12, 9.1178835e-11, 1.2671983e-12,
        1.2710340e-10, 4.4644449e-12],
       [6.5379454e-18, 5.0153257e-13,

In [None]:
# Submission column names; the six prediction columns are written to them in this order.
# NOTE(review): assumes this matches the label column order of y_train — confirm against the preprocessing step.
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

In [None]:
# Store the JSON description returned by model.to_json() under the same base
# name as the .h5 checkpoint.
architecture_path = "../models/" + model_name + ".json"
model_json = model.to_json()
with open(architecture_path, "w") as json_file:
    json_file.write(model_json)
print("Saved model to disk")

Saved model to disk


In [None]:
# Use the sample submission as a template: keep its other columns and overwrite
# the six label columns with the predicted scores, then save it under the model's name.
# NOTE(review): relies on the template rows being in the same order as the rows of test.csv — confirm.
sample_submission = pd.read_csv('../submissions/sample_submission.csv')
sample_submission[list_classes] = pred
sample_submission.to_csv("../submissions/" + model_name + ".csv", index=False)