In [1]:
from keras.layers import Input, Dense, Embedding, Conv2D, MaxPool2D
from keras.layers import Reshape, Flatten, Dropout, Concatenate
from keras.callbacks import ModelCheckpoint
from keras.optimizers import Adam
from keras.models import Model
from sklearn.model_selection import train_test_split
import numpy as np
import re
import itertools
from collections import Counter
import pickle
import pandas as pd
from keras.preprocessing.text import Tokenizer

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
def clean_str(string):
    """
    Tokenization/string cleaning for datasets.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()

In [23]:
def pad_sentences(sentences, padding_word="<PAD/>"):
    """
    Pads all sentences to the same length. The length is defined by the longest sentence.
    Returns padded sentences.
    """
    sequence_length = max(len(x) for x in sentences)
    padded_sentences = []
    for i in range(len(sentences)):
        sentence = sentences[i]
        num_padding = sequence_length - len(sentence)
        new_sentence = sentence + [0] * num_padding
        padded_sentences.append(new_sentence)
    return padded_sentences

In [4]:
def build_input_data(sentences, labels, vocabulary):
    """
    Maps sentences and labels to vectors based on a vocabulary.
    """
    x = np.array([[vocabulary[word] if word in vocabulary.keys() else len(vocabulary) for word in sentence] for sentence in sentences])
    y = np.array(labels)
    return [x, y]

In [5]:
DATA_PATH = "../data/"
X_train = pickle.load(open(DATA_PATH + "X_train.p", "rb"))
X_dev = pickle.load(open(DATA_PATH + "X_dev.p", "rb"))
X_train = X_train['comment_text'].values
X_train = [clean_str(sent) for sent in X_train]

In [50]:

y_train = pickle.load(open(DATA_PATH + "y_train.p", "rb"))
y_dev = pickle.load(open(DATA_PATH + "y_dev.p", "rb"))

In [6]:
tokenizer = Tokenizer(nb_words=154653)
tokenizer.fit_on_texts(X_train)
sequences = tokenizer.texts_to_sequences(X_train)
vocabulary = tokenizer.word_index
print('Found %s unique tokens.' % len(vocabulary))



Found 154527 unique tokens.


In [19]:
vocabulary["<PAD/>"] = 0

In [24]:
sequences_padded = pad_sentences(sequences)

In [20]:
vocabulary_inv = {v:k for k, v in vocabulary.items()}

In [14]:
embeddings_index = {}
f = open('../../../embeddings/glove.6B.100d.txt')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [25]:
EMBEDDING_DIM = 100
embedding_matrix = np.zeros((len(vocabulary) + 1, EMBEDDING_DIM))
embedding_matrix[-1] = np.random.rand(EMBEDDING_DIM) # oov-vector
for word, i in vocabulary.items():
    embedding_vector = embeddings_index.get(word)
    if word != "<PAD/>":
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    else:
        embedding_matrix[i] = np.zeros((1, EMBEDDING_DIM))

In [47]:
np.array(sequences_padded).shape

(111699, 1403)

In [52]:
X_train, X_test, y_train, y_test = train_test_split( np.array(sequences_padded), np.array(y_train), test_size=0.2, random_state=42)

In [30]:
model_name = "cnn_512filters-oov_embeddings"

In [53]:
X_train.shape[1]

1403

In [54]:
len(vocabulary_inv)

154528

In [55]:
sequence_length = X_train.shape[1] # 56
vocabulary_size = len(vocabulary_inv) # 18765
embedding_dim = EMBEDDING_DIM
filter_sizes = [3,4,5]
num_filters = 512
drop = 0.5

epochs = 20
batch_size = 10

In [56]:
print("Creating Model...")
inputs = Input(shape=(sequence_length,), dtype='int32')
embedding = Embedding(input_dim=vocabulary_size + 1, output_dim=embedding_dim, input_length=sequence_length,
                            weights=[embedding_matrix])(inputs)
reshape = Reshape((sequence_length,embedding_dim,1))(embedding)

conv_0 = Conv2D(num_filters, kernel_size=(filter_sizes[0], embedding_dim), padding='valid', kernel_initializer='normal', activation='relu')(reshape)
conv_1 = Conv2D(num_filters, kernel_size=(filter_sizes[1], embedding_dim), padding='valid', kernel_initializer='normal', activation='relu')(reshape)
conv_2 = Conv2D(num_filters, kernel_size=(filter_sizes[2], embedding_dim), padding='valid', kernel_initializer='normal', activation='relu')(reshape)

maxpool_0 = MaxPool2D(pool_size=(sequence_length - filter_sizes[0] + 1, 1), strides=(1,1), padding='valid')(conv_0)
maxpool_1 = MaxPool2D(pool_size=(sequence_length - filter_sizes[1] + 1, 1), strides=(1,1), padding='valid')(conv_1)
maxpool_2 = MaxPool2D(pool_size=(sequence_length - filter_sizes[2] + 1, 1), strides=(1,1), padding='valid')(conv_2)

concatenated_tensor = Concatenate(axis=1)([maxpool_0, maxpool_1, maxpool_2])
flatten = Flatten()(concatenated_tensor)
dropout = Dropout(drop)(flatten)
output = Dense(units=6, activation='sigmoid')(dropout)

model = Model(inputs=inputs, outputs=output)

Creating Model...


In [57]:
checkpoint = ModelCheckpoint("../models/" + model_name + ".h5", monitor='val_acc', verbose=1, save_best_only=True, mode='auto')
adam = Adam(lr=1e-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)

In [None]:
model.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, verbose=1, callbacks=[checkpoint], validation_data=(X_test, y_test))  # starts training

Train on 89359 samples, validate on 22340 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f0bb1dd3898>

## Evaluation

In [None]:
X_official_test = pd.read_csv("../data/raw/test.csv")

In [None]:
X_official_test.head()

Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap..."
3,00017563c3f7919a,":If you have a look back at the source, the in..."
4,00017695ad8997eb,I don't anonymously edit articles at all.


In [None]:
X_official_test = X_official_test['comment_text'].values
X_official_test = [clean_str(sent) for sent in X_official_test]
sequences = tokenizer.texts_to_sequences(X_official_test)

In [None]:
sequence_length =  X_train.shape[1]
padding_word="<PAD/>"
padded_sentences = []
for i in range(len(X_official_test)):
    sentence = sequences[i]
    if len(sentence) < sequence_length:
        num_padding = sequence_length - len(sentence)
    else:
        num_padding = 0
        sentence = sentence[:sequence_length]
    new_sentence = sentence + [padding_word] * num_padding
    padded_sentences.append(new_sentence)

In [None]:
input_official_test = np.array([[vocabulary[word] if word in vocabulary.keys() else len(vocabulary) - 1 for word in sentence] for sentence in padded_sentences])

In [None]:
input_official_test

array([[154527, 154527, 154527, ...,      0,      0,      0],
       [154527, 154527, 154527, ...,      0,      0,      0],
       [154527, 154527, 154527, ...,      0,      0,      0],
       ...,
       [154527, 154527, 154527, ...,      0,      0,      0],
       [154527, 154527, 154527, ...,      0,      0,      0],
       [154527, 154527, 154527, ...,      0,      0,      0]])

In [None]:
input_official_test.shape

(153164, 1403)

In [None]:
pred = model.predict(input_official_test)

In [None]:
pred.shape

(153164, 6)

In [None]:
pred[:10, :]

array([[0.1892244 , 0.04954368, 0.09346229, 0.02513144, 0.07175802,
        0.03028274],
       [0.1892244 , 0.04954368, 0.09346229, 0.02513144, 0.07175802,
        0.03028274],
       [0.19951826, 0.05145558, 0.09838846, 0.02668684, 0.07560417,
        0.03265261],
       [0.1892244 , 0.04954368, 0.09346229, 0.02513144, 0.07175802,
        0.03028274],
       [0.1892244 , 0.04954368, 0.09346229, 0.02513144, 0.07175802,
        0.03028274],
       [0.1892244 , 0.04954368, 0.09346229, 0.02513144, 0.07175802,
        0.03028274],
       [0.1892244 , 0.04954368, 0.09346229, 0.02513144, 0.07175802,
        0.03028274],
       [0.1892244 , 0.04954368, 0.09346229, 0.02513144, 0.07175802,
        0.03028274],
       [0.1892244 , 0.04954368, 0.09346229, 0.02513144, 0.07175802,
        0.03028274],
       [0.1892244 , 0.04954368, 0.09346229, 0.02513144, 0.07175802,
        0.03028274]], dtype=float32)

In [None]:
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

In [None]:
model_json = model.to_json()
with open("../models/" + model_name + ".json", "w") as json_file:
    json_file.write(model_json)
print("Saved model to disk")

Saved model to disk


In [None]:
sample_submission = pd.read_csv('../submissions/sample_submission.csv')
sample_submission[list_classes] = pred
sample_submission.to_csv("../submissions/" + model_name + ".csv", index=False)