### Libraries

In [1]:
import os
import numpy as np
import random
import tensorflow as tf
import keras
from keras.preprocessing import sequence
import matplotlib.pyplot as plt
import datetime
import pandas as pd
import re
import collections
import string
import math
from sklearn.metrics import roc_auc_score
%matplotlib inline

### Read data

In [2]:
# Training split, loaded from the character-level preprocessed CSV.
filename = './data/prepr_train_character.csv'
train_data_ = pd.read_csv(filepath_or_buffer=filename)
print ('Data size:', train_data_.shape[0])  # number of rows
train_data_.head(n=5)

Data size: 3947


Unnamed: 0,Insult,Date,Comment
0,1,20120618192155Z,"""You fuck your dad."""
1,0,20120528192215Z,"""i really don't understand your point.\xa0 It ..."
2,0,,"""A\\xc2\\xa0majority of Canadians can and has ..."
3,0,,"""listen if you dont wanna get married to a man..."
4,0,20120619094753Z,"""C\xe1c b\u1ea1n xu\u1ed1ng \u0111\u01b0\u1edd..."


In [3]:
# Validation split, loaded from the character-level preprocessed CSV.
filename = './data/prepr_valid_character.csv'
valid_data_ = pd.read_csv(filepath_or_buffer=filename)
print ('Data size:', valid_data_.shape[0])  # number of rows
valid_data_.head(n=5)

Data size: 2647


Unnamed: 0,Insult,Date,Comment,Usage
0,0,,"""THE DRUDGE REPORT\\n\\n\\n\\nYou won't see th...",PublicTest
1,0,20120618222256Z,"""@ian21\xa0""Roger Clemens is the fucking man, ...",PublicTest
2,1,20120618213617Z,"""Agree with Alan you are an extremest idiot. ...",PublicTest
3,0,,"""Really?\\n\\nI see Marc Lamont Hill on variou...",PrivateTest
4,0,20120620003825Z,"""Really suck isn't the word, when many of our ...",PrivateTest


In [4]:
# Test split, loaded from the character-level preprocessed CSV.
filename = './data/prepr_test_character.csv'
test_data_ = pd.read_csv(filepath_or_buffer=filename)
print ('Data size:', test_data_.shape[0])  # number of rows
test_data_.head(n=5)

Data size: 2235


Unnamed: 0,id,Date,Comment
0,1,20120603163526Z,"""like this if you are a tribe fan"""
1,2,20120531215447Z,"""you're idiot......................."""
2,3,20120823164228Z,"""I am a woman Babs, and the only ""war on women..."
3,4,20120826010752Z,"""WOW & YOU BENEFITTED SO MANY WINS THIS YEAR F..."
4,5,20120602223825Z,"""haha green me red you now loser whos winning ..."


In [None]:
# Working aliases of the loaded frames (same objects, no copy is made).
train_data, valid_data, test_data = train_data_, valid_data_, test_data_

# Row counts of each split; reused later as generator batch sizes and reshape dimensions.
train_size, valid_size, test_size = len(train_data), len(valid_data), len(test_data)

# Features are the raw comment strings, targets are the 0/1 'Insult' flags.
X_train = train_data['Comment']
y_train = train_data['Insult']

X_valid = valid_data['Comment']
y_valid = valid_data['Insult']

X_test = test_data['Comment']
# NOTE(review): the preview of the test frame printed above lists only
# 'Unnamed: 0', 'id', 'Date', 'Comment' -- if the file really has no 'Insult'
# column this lookup raises KeyError. Confirm which test file is expected here.
y_test = test_data['Insult']

### Character indexing

In [7]:
# Character ids used below: 0 = space (also the fallback id), 1..26 = 'a'..'z',
# 27 = END marker, 28 = PAD marker.
first_letter = ord('a')  # code point that id 1 is anchored to
vocabulary_size = len(string.ascii_lowercase) + 3  # [a-z] + ' ' + END + PAD

def char2id(char):
    """Map one character to its integer id in the character vocabulary.

    'a'..'z' map to 1..26, ' ' maps to 0, and the marker letters 'E' (END)
    and 'P' (PAD) map to 27 and 28. Anything else is reported on stdout and
    mapped to 0.

    Args:
        char: a string; expected to hold exactly one character.

    Returns:
        The integer id of the character.
    """
    # `x in string.ascii_lowercase` is a substring test, so '' and runs such as
    # 'ab' would pass it and then blow up in ord(); require a single character.
    if len(char) == 1 and char in string.ascii_lowercase:
        return ord(char) - first_letter + 1
    if char == ' ':
        return 0
    if char == 'E':
        return 27
    if char == 'P':
        return 28
    print('Unexpected character: %s' % char)
    return 0
    
def id2char(dictid):
    """Translate a vocabulary id back into its character.

    27 and 28 give the marker letters 'E' and 'P', 1..26 give 'a'..'z',
    and every other id gives a space.
    """
    if dictid == 27:
        return 'E'
    if dictid == 28:
        return 'P'
    if 0 < dictid < 27:  # id(z) = 26
        return chr(first_letter + dictid - 1)
    return ' '

def comment2vec(comment_, comment_size):
    """Encode a comment as a fixed-length vector of character ids.

    The text is cut to at most ``comment_size - 1`` characters, followed by a
    single END marker and then PAD markers up to ``comment_size`` entries.

    Args:
        comment_: the comment text (a string).
        comment_size: length of the returned vector.

    Returns:
        A 1-D integer numpy array of character ids.
    """
    end_id = char2id('E')
    pad_id = char2id('P')

    ids = []
    for char in comment_[:comment_size - 1]:
        if char == 'E' or char == 'P':
            # char2id reserves these two letters for the END / PAD markers.
            # A literal 'E' or 'P' in the text must not be encoded as a marker
            # in the middle of the sequence, so handle it the way char2id
            # handles any other out-of-vocabulary character.
            print('Unexpected character: %s' % char)
            ids.append(0)
        else:
            ids.append(char2id(char))

    ids.append(end_id)
    ids.extend([pad_id] * (comment_size - len(ids)))
    return np.array(ids, dtype='int')
    
# Quick check of both mappings; 'ï' is outside the vocabulary on purpose.
print(*[char2id(char) for char in ('a', 'z', ' ', 'ï')])
print(*[id2char(dictid) for dictid in (1, 26, 0)])

Unexpected character: ï
1 26 0 0
a z  


### Generation of batches

In [8]:
class LabeledBatchGenerator(object):
    """Serve labelled comments as a sequence of per-character batches.

    ``batch_size`` cursors are spread evenly over the comments. Each call to
    ``next()`` encodes the comments under the cursors with ``comment2vec`` and
    returns them one character position at a time, then moves every cursor to
    the following comment (wrapping around at the end).

    Args:
        comments: indexable collection of comment strings.
        batch_size: number of comments served per ``next()`` call.
        comment_size: fixed encoded length of every comment.
        labels: indexable collection of labels, aligned with ``comments``.
    """

    def __init__(self, comments, batch_size, comment_size, labels):
        self._comments = comments
        self._num_comments = len(comments)
        self._batch_size = batch_size
        self._comment_size = comment_size
        self._labels = labels
        # Spread the cursors evenly over the data set.
        segment = self._num_comments // batch_size
        self._cursor = [offset * segment for offset in range(batch_size)]
        # Encoded comments under the current cursors; rebuilt whenever the cursors move.
        self._encoded = None

    def _encode_window(self):
        """Encode the comments under the current cursors, one row per cursor."""
        return np.array(
            [comment2vec(self._comments[pos], self._comment_size) for pos in self._cursor],
            dtype=int)

    def _next_batch(self, step):
        """Return the ids at character position ``step`` as a (batch_size, 1) array.

        Serving the last position (``comment_size - 1``) advances every cursor.
        """
        # Encode each comment once per window instead of once per step: the
        # cursors only move after the last step, so the cache stays valid.
        if self._encoded is None:
            self._encoded = self._encode_window()
        # Indexing with a list copies the column and keeps the (batch_size, 1) shape.
        batch = self._encoded[:, [step]]
        if step == self._comment_size - 1:
            self._cursor = [(pos + 1) % self._num_comments for pos in self._cursor]
            self._encoded = None
        return batch

    def next(self):
        """Return ``(batches, labels)`` for the comments under the cursors.

        ``batches`` is a list of ``comment_size`` arrays of shape
        (batch_size, 1), one per character position; ``labels`` is the list of
        labels of those comments, read before the cursors advance.
        """
        batches_labels = [self._labels[pos] for pos in self._cursor]
        batches = [self._next_batch(step) for step in range(self._comment_size)]
        return batches, batches_labels

In [119]:
comment_size = 100  # fixed encoded length of every comment (characters + END + PAD)
batch_size = 64     # mini-batch size used when fitting / evaluating the Keras model

# Each generator gets the whole split as its batch size, so a single next() call
# encodes every comment of that split.
# `.values` replaces Series.as_matrix(), which was deprecated and later removed from pandas.
train_batches, train_labels = LabeledBatchGenerator(X_train.values, train_size, comment_size, y_train.values).next()
valid_batches, valid_labels = LabeledBatchGenerator(X_valid.values, valid_size, comment_size, y_valid.values).next()
test_batches, test_labels = LabeledBatchGenerator(X_test.values, test_size, comment_size, y_test.values).next()

In [None]:
# The generator output is a list of comment_size column batches; stack them and
# transpose so that every row holds one encoded comment. Labels become flat 1-D arrays.
X_train = np.transpose(np.reshape(np.asarray(train_batches), (comment_size, train_size)))
y_train = np.reshape(np.asarray(train_labels), -1)

X_valid = np.transpose(np.reshape(np.asarray(valid_batches), (comment_size, valid_size)))
y_valid = np.reshape(np.asarray(valid_labels), -1)

X_test = np.transpose(np.reshape(np.asarray(test_batches), (comment_size, test_size)))
y_test = np.reshape(np.asarray(test_labels), -1)

In [120]:
def plot_results(losses_, mean_losses_, train_aucs_, valid_aucs_, frequency):
    """Draw three figures: raw losses, mean losses and train/valid AUC-ROC.

    ``losses_`` and ``train_aucs_`` are plotted against their index.
    ``mean_losses_`` and ``valid_aucs_`` are placed at x positions
    ``frequency, 2 * frequency, ...``.
    """
    def sampled_steps(values):
        # x positions for a series with one entry per `frequency` steps
        return frequency * np.arange(1, len(values) + 1)

    # Figure 1: loss at every step
    plt.plot(losses_)
    plt.title('Losses')
    plt.xlabel('Step')
    plt.show()

    # Figure 2: averaged loss
    mean_loss_steps = sampled_steps(mean_losses_)
    plt.plot(mean_loss_steps, mean_losses_)
    plt.title('MeanLosses')
    plt.xlabel('Step')
    plt.show()

    # Figure 3: both AUC curves on shared axes
    valid_auc_steps = sampled_steps(valid_aucs_)
    plt.plot(train_aucs_, label='Train AUC-ROC')
    plt.plot(valid_auc_steps, valid_aucs_, label='Valid AUC-ROC')
    plt.xlabel('Step')
    plt.title('AUC-ROC')
    plt.legend()
    plt.show()

# KERAS model

In [None]:
embedding_size = 128   # dimension of each character embedding
# NOTE(review): all three kernel sizes are equal, so the three convolution branches
# below have the same architecture -- confirm whether different widths were intended.
patch_size = [3,3,3]   # kernel size of each convolution branch
num_filters = 128      # filters per convolution branch

# Layers are taken from the public `keras.layers` namespace; the submodule paths
# (`keras.layers.embeddings`, `.convolutional`, `.pooling`) are not a stable API.
inputs = keras.layers.Input(shape=(comment_size,))
embed_inputs = keras.layers.Embedding(vocabulary_size, embedding_size)(inputs)

# Three parallel Conv1D branches over the same embedded sequence, each reduced
# to one vector by global max pooling.
conv_layer_0 = keras.layers.Conv1D(num_filters, kernel_size=patch_size[0])(embed_inputs)
pool_layer_0 = keras.layers.GlobalMaxPooling1D()(conv_layer_0)

conv_layer_1 = keras.layers.Conv1D(num_filters, kernel_size=patch_size[1])(embed_inputs)
pool_layer_1 = keras.layers.GlobalMaxPooling1D()(conv_layer_1)

conv_layer_2 = keras.layers.Conv1D(num_filters, kernel_size=patch_size[2])(embed_inputs)
pool_layer_2 = keras.layers.GlobalMaxPooling1D()(conv_layer_2)

concat_layers = keras.layers.concatenate([pool_layer_0, pool_layer_1, pool_layer_2])

# Single linear unit followed by a sigmoid: one probability per comment.
output = keras.layers.Dense(1, activation=None)(concat_layers)
act_output = keras.layers.Activation('sigmoid')(output)

model = keras.models.Model(inputs=inputs, outputs=act_output)

# NOTE(review): in the Keras 2 optimizers `decay` appears to be a per-update
# learning-rate decay (lr / (1 + decay * iterations)); with 0.9 the step size
# would shrink roughly tenfold within the first ten batches. Confirm this value
# is intended and was not meant as a momentum-style coefficient.
optimizer = keras.optimizers.Adam(lr=0.001, decay=0.9)

model.compile(loss='binary_crossentropy',
              optimizer=optimizer,
              metrics=['accuracy'])

# summary() prints the table itself and returns None, so wrapping it in print()
# only adds a stray "None" line to the output.
model.summary()
print (model.output_shape)

# Make sure the checkpoint directory exists BEFORE the long training run, so a
# missing folder cannot make save_weights fail after all epochs have finished.
checkpoint_path = './checkpoint_dir/characterCNNkeras'
os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)

print('Train...')
model.fit(X_train, y_train,
          batch_size=batch_size,
          epochs=100,
          validation_data=(X_valid, y_valid))

# With metrics=['accuracy'] evaluate() returns [loss, accuracy].
score, acc = model.evaluate(X_valid, y_valid,
                            batch_size=batch_size)

model.save_weights(filepath=checkpoint_path)

print('Valid score:', score)
print('Valid accuracy:', acc)

In [None]:
# ROC AUC measures how well the scores rank positives above negatives, so it must
# be computed on the raw sigmoid outputs. Rounding first collapses every score to
# 0 or 1 and reduces the curve to a single threshold.
# y_*_pred still hold the rounded class predictions, as before.
train_scores = model.predict(X_train)
y_train_pred = np.round(train_scores)
print ('Train AUC-ROC:',roc_auc_score(y_train, train_scores.ravel()))

valid_scores = model.predict(X_valid)
y_valid_pred = np.round(valid_scores)
print ('Valid AUC-ROC:',roc_auc_score(y_valid, valid_scores.ravel()))

test_scores = model.predict(X_test)
y_test_pred = np.round(test_scores)
print ('Test AUC-ROC:',roc_auc_score(y_test, test_scores.ravel()))

### Tensorflow graph