### Libraries

In [1]:
import os, io
import numpy as np
import random
import tensorflow as tf
from tensorflow.contrib import learn
import keras
from keras.preprocessing import sequence
import matplotlib.pyplot as plt
import datetime
import pandas as pd
import re
import collections
import string
import math
from sklearn.metrics import roc_auc_score
#%matplotlib inline
import keras.backend.tensorflow_backend as K

Using TensorFlow backend.


### Read data

In [7]:
filename = './data/prepr_full_train.csv'
train_data_ = pd.read_csv(filename)
print ('Data size:', len(train_data_))
train_data_.head()

Data size: 119811


Unnamed: 0,Insult,Date,Comment
0,1,20120618192155Z,you fuck your dad
1,0,20120528192215Z,i really do not understand your point xa it se...
2,0,,a xc xa majority of canadians can and has been...
3,0,,listen if you dont wanna get married to a man ...
4,0,20120619094753Z,c xe c b u ea n xu u ed ng u u b u eddng bi u ...


In [8]:
filename = './data/prepr_valid.csv'
valid_data_ = pd.read_csv(filename)
print ('Data size:', len(valid_data_))
valid_data_.head()

Data size: 2647


Unnamed: 0,Insult,Date,Comment,Usage
0,0,,the drudge report n n n nyou wo not see this s...,PublicTest
1,0,20120618222256Z,ian xa roger clemens is the fucking man and ne...,PublicTest
2,1,20120618213617Z,agree with alan you are an extremest idiot you...,PublicTest
3,0,,really <SIGNS> n ni see marc lamont hill on va...,PrivateTest
4,0,20120620003825Z,really suck is not the word when many of our n...,PrivateTest


In [9]:
filename = './data/prepr_test.csv'
test_data_ = pd.read_csv(filename)
print ('Data size:', len(test_data_))
test_data_.head()

Data size: 2235


Unnamed: 0,Insult,Date,Comment
0,0,20120603163526Z,like this if you are a tribe fan
1,1,20120531215447Z,you idiot
2,1,20120823164228Z,i am a woman babs and the only war on women i ...
3,1,20120826010752Z,wow you benefitted so many wins this year from...
4,1,20120602223825Z,haha green me red you now loser whos winning n...


In [15]:
train_data = train_data_.dropna(axis=0, how='any')
valid_data = valid_data_
test_data = test_data_

valid_size = len(valid_data)
train_size = len(train_data)
test_size = len(test_data)

X_train = train_data['Comment']
y_train = train_data['Insult']
X_valid = valid_data['Comment']
y_valid = valid_data['Insult']
X_test = test_data['Comment']
y_test = test_data['Insult']

In [16]:
filename = './data/embedding_09.vec'

vocabulary = []
embeddings = []

with open(filename, 'r') as f:
    f.readline()
    lines = f.readlines()
    for line in lines:
        line_to_list = line.split(' ')
        vocabulary.append(line_to_list[0])
        embeddings.append(np.array(line_to_list[1:-1]).astype(np.float))
        
embeddings = np.asarray(embeddings)

In [17]:
vocabulary_size = len(vocabulary)
embedding_size = embeddings.shape[1]
print (vocabulary_size)
print (embedding_size)

157689
100


In [18]:
class LabeledBatchGenerator(object):
    def __init__(self, comments, batch_size, comment_size, labels):
        self._comments = comments
        self._num_comments = len(comments)
        self._batch_size = batch_size
        self._comment_size = comment_size
        self._labels = labels
        segment = self._num_comments // batch_size
        self._cursor = [offset * segment for offset in range(batch_size)]
        
    def _next_batch(self, step):
        batch = np.zeros(shape=(self._batch_size,1), dtype=np.int)
        for b in range(self._batch_size):
            batch[b,0] = self._comments[self._cursor[b]][step]
            if step == self._comment_size - 1:
                self._cursor[b] = (self._cursor[b] + 1) % self._num_comments
        return batch
    
    def next(self):
        batches = []
        batches_labels = [self._labels[self._cursor[b]] for b in range(self._batch_size)]
        for step in range(0, self._comment_size):
            batches.append(self._next_batch(step))
        return batches, batches_labels


In [19]:
comment_size = 100
batch_size = 64

vocab_processor = learn.preprocessing.VocabularyProcessor(comment_size)
pretrain = vocab_processor.fit(vocabulary)

X_train_ = np.array(list(vocab_processor.transform(X_train.as_matrix())))
X_valid_ = np.array(list(vocab_processor.transform(X_valid.as_matrix())))
X_test_ = np.array(list(vocab_processor.transform(X_test.as_matrix())))

train_batches, train_labels = LabeledBatchGenerator(X_train_, train_size, comment_size, y_train.as_matrix()).next()
valid_batches, valid_labels = LabeledBatchGenerator(X_valid_, valid_size, comment_size, y_valid.as_matrix()).next()
test_batches, test_labels = LabeledBatchGenerator(X_test_, test_size, comment_size, y_test.as_matrix()).next()

In [20]:
X_train = np.asarray(train_batches).reshape(comment_size, train_size).T
y_train = np.asarray(train_labels).reshape(-1)
X_valid = np.asarray(valid_batches).reshape(comment_size, valid_size).T
y_valid = np.asarray(valid_labels).reshape(-1)
X_test = np.asarray(test_batches).reshape(comment_size, test_size).T
y_test = np.asarray(test_labels).reshape(-1)

In [21]:
print (X_train.shape)
print (X_valid.shape)
print (X_test.shape)

(119063, 100)
(2647, 100)
(2235, 100)


### Keras model

In [22]:
from keras.models import Model
from keras.layers import Dense, Input, concatenate, Activation
from keras.layers.embeddings import Embedding
from keras.layers.pooling import MaxPooling1D, GlobalMaxPooling1D
from keras.layers.convolutional import Conv1D
from keras.optimizers import Adam, SGD
from keras.layers.core import Dropout, Reshape
from keras.regularizers import l2

In [32]:
patch_size = [3,3,3]
num_filters = 20
regul_coef = 0.01

with K.tf.device('/gpu:6'):
    K.set_session(K.tf.Session(config=K.tf.ConfigProto(allow_soft_placement=True, log_device_placement=True)))

    inputs = Input(shape=(comment_size,))
    embed_inputs = Embedding(vocabulary_size, embedding_size, weights=[embeddings], trainable=True)(inputs)

    output_0 = Conv1D(num_filters, kernel_size=patch_size[0], activation='relu', 
                          kernel_regularizer=l2(regul_coef), padding='same')(embed_inputs)
    output_0 = MaxPooling1D(pool_size=2, strides=1, padding='same')(output_0)

    output_1 = Conv1D(num_filters, kernel_size=patch_size[1], activation='relu', 
                          kernel_regularizer=l2(regul_coef), padding='same')(embed_inputs)
    output_1 = MaxPooling1D(pool_size=2, strides=1, padding='same')(output_1)

    output_2 = Conv1D(num_filters, kernel_size=patch_size[2], activation='relu', 
                          kernel_regularizer=l2(regul_coef), padding='same')(embed_inputs)
    output_2 = MaxPooling1D(pool_size=2, strides=1, padding='same')(output_2)

    output = concatenate([output_0, output_1, output_2], axis=1)
    print('Concatenated:', output.shape)

    output = Reshape(((comment_size * len(patch_size) ) * num_filters,))(output)
    print ('Reshaped:', output.shape)

    output = Dropout(rate=0.5)(output)
    output = Dense(100, activation='relu', kernel_regularizer=l2(regul_coef))(output) 

    output = Dropout(rate=0.5)(output)
    output = Dense(1, activation=None, kernel_regularizer=l2(regul_coef))(output)

    act_output = Activation('sigmoid')(output)
    print ('Final:',act_output.shape)
    model = Model(inputs=inputs, outputs=act_output)

    optimizer = Adam(lr=0.01, decay=0.1)

    model.compile(loss='binary_crossentropy',
                  optimizer=optimizer,
                  metrics=['binary_accuracy'])

    print (model.summary())
    print (model.output_shape)

    print('Train...')
    model.fit(X_train, y_train.reshape(-1,1),
              batch_size=batch_size,
              epochs=10,
              validation_data=(X_valid, y_valid.reshape(-1,1)),
              verbose=1)
    score, acc = model.evaluate(X_valid, y_valid.reshape(-1,1),
                                batch_size=batch_size)

    model.save_weights(filepath='./checkpoint_dir/wordCNNkeras_reddit')

    print('Valid score:', score)
    print('Valid accuracy:', acc)
    
    y_train_pred = np.round(model.predict(X_train))
    print ('Train AUC-ROC:',roc_auc_score(y_train, y_train_pred.reshape(-1)))

    y_valid_pred = np.round(model.predict(X_valid))
    print ('Valid AUC-ROC:',roc_auc_score(y_valid, y_valid_pred.reshape(-1)))

    y_test_pred = np.round(model.predict(X_test))
    print ('Test AUC-ROC:',roc_auc_score(y_test, y_test_pred.reshape(-1)))

Concatenated: (?, 300, 20)
Reshaped: (?, 6000)
Final: (?, 1)
____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_10 (InputLayer)            (None, 100)           0                                            
____________________________________________________________________________________________________
embedding_10 (Embedding)         (None, 100, 100)      15768900    input_10[0][0]                   
____________________________________________________________________________________________________
conv1d_28 (Conv1D)               (None, 100, 20)       6020        embedding_10[0][0]               
____________________________________________________________________________________________________
conv1d_29 (Conv1D)               (None, 100, 20)       6020        embedding_10[0][0]               
______________________________

KeyboardInterrupt: 