### Libraries

In [1]:
import os
import numpy as np
import random
import tensorflow as tf
import keras
from keras.preprocessing import sequence
import matplotlib.pyplot as plt
import datetime
import pandas as pd
import re
import collections
import string
import math
from sklearn.metrics import roc_auc_score
#%matplotlib inline
import keras.backend.tensorflow_backend as K

Using TensorFlow backend.


### Read data

In [2]:
filename = './data/prepr_full_train.csv'
train_data_ = pd.read_csv(filename)
print ('Data size:', len(train_data_))
train_data_.head()

Data size: 119811


Unnamed: 0,Insult,Date,Comment
0,1,20120618192155Z,you fuck your dad
1,0,20120528192215Z,i really do not understand your point xa it se...
2,0,,a xc xa majority of canadians can and has been...
3,0,,listen if you dont wanna get married to a man ...
4,0,20120619094753Z,c xe c b u ea n xu u ed ng u u b u eddng bi u ...


In [3]:
filename = './data/prepr_valid.csv'
valid_data_ = pd.read_csv(filename)
print ('Data size:', len(valid_data_))
valid_data_.head()

Data size: 2647


Unnamed: 0,Insult,Date,Comment,Usage
0,0,,the drudge report n n n nyou wo not see this s...,PublicTest
1,0,20120618222256Z,ian xa roger clemens is the fucking man and ne...,PublicTest
2,1,20120618213617Z,agree with alan you are an extremest idiot you...,PublicTest
3,0,,really <SIGNS> n ni see marc lamont hill on va...,PrivateTest
4,0,20120620003825Z,really suck is not the word when many of our n...,PrivateTest


In [4]:
filename = './data/prepr_test.csv'
test_data_ = pd.read_csv(filename)
print ('Data size:', len(test_data_))
test_data_.head()

Data size: 2235


Unnamed: 0,Insult,Date,Comment
0,0,20120603163526Z,like this if you are a tribe fan
1,1,20120531215447Z,you idiot
2,1,20120823164228Z,i am a woman babs and the only war on women i ...
3,1,20120826010752Z,wow you benefitted so many wins this year from...
4,1,20120602223825Z,haha green me red you now loser whos winning n...


In [5]:
train_data = train_data_.dropna(axis=0, how='any')
valid_data = valid_data_
test_data = test_data_

valid_size = len(valid_data)
train_size = len(train_data)
test_size = len(test_data)

X_train = train_data['Comment']
y_train = train_data['Insult']
X_valid = valid_data['Comment']
y_valid = valid_data['Insult']
X_test = test_data['Comment']
y_test = test_data['Insult']

### Compose vocabulary

In [6]:
vocabulary_size = 50000

def build_vocabulary(words):
    counter_words = collections.Counter(words)
    count = [['UNK', -3], ['<EOS>', 0], ['<PAD>', 0], ['<SIGNS>', counter_words['<SIGNS>']]]
    count.extend(counter_words.most_common(vocabulary_size - 4))
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)
    data = list()
    unk_count = 0
    for word in words:
        if word in dictionary:
            index = dictionary[word]
        else:
            index = 0
            unk_count = unk_count + 1
        data.append(index)
    count[0][1] = unk_count
    reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys())) 
    return data, count, dictionary, reverse_dictionary

words = ' '.join(X_train)
print ('Length of train data in words:', len(words))

data, count, dictionary, reverse_dictionary = build_vocabulary(words.split(' '))
print('Most common words (+UNK)', count[:5])
print('Sample data', data[:10])

print ('Dict:', dictionary['UNK'])
print ('Reverse dict:', reverse_dictionary[0])
print ('Counter:', count[dictionary['UNK']])
print ('Data:', data[0])
print ('Words:', words[:10])
del words

Length of train data in words: 45475008
Most common words (+UNK) [['UNK', 115253], ['<EOS>', 0], ['<PAD>', 0], ['<SIGNS>', 56993], ('the', 364016)]
Sample data [7, 73, 24, 3278, 6, 130, 23, 13, 243, 24]
Dict: 0
Reverse dict: UNK
Counter: ['UNK', 115253]
Data: 7
Words: you fuck y


### Word indexing

In [7]:
def word2id(word):
    if word in dictionary:
        return dictionary[word]
    else:
        return 0 # UNK

def id2word(id_):
    return reverse_dictionary[id_]


def comment2vec(comment):
    global comment_size
    split = comment.split(' ')
    res = np.array([word2id(word) for word in split], dtype='int')
    return res

def vec2comment(vec):
    global comment_size
    return ' '.join([id2word(id_) for id_ in vec])


### Generation of batches

In [8]:
class LabeledBatchGenerator(object):
    def __init__(self, comments, batch_size, comment_size, labels):
        self._comments = comments
        self._num_comments = len(comments)
        self._batch_size = batch_size
        self._comment_size = comment_size
        self._labels = labels
        segment = self._num_comments // batch_size
        self._cursor = [offset * segment for offset in range(batch_size)]
        
    def _next_batch(self, step):
        batch = np.zeros(shape=(self._batch_size,1), dtype=np.int)
        for b in range(self._batch_size):
            comment = comment2vec(self._comments[self._cursor[b]])
            N = len(comment)
            if step < N:
                batch[b,0] = comment[step]
            elif step == N:
                batch[b,0] = word2id('<EOS>')
            elif step > N:
                batch[b,0] = word2id('<PAD>')
            if step == self._comment_size - 1:
                if N > self._comment_size - 1:
                    batch[b,0] = word2id('<EOS>')
                self._cursor[b] = (self._cursor[b] + 1) % self._num_comments
        return batch
    
    def next(self):
        batches = []
        batches_labels = [self._labels[self._cursor[b]] for b in range(self._batch_size)]
        for step in range(0, self._comment_size):
            batches.append(self._next_batch(step))
        return batches, batches_labels

In [9]:
comment_size = 100
batch_size = 64

train_batches, train_labels = LabeledBatchGenerator(X_train.as_matrix(), train_size, comment_size, y_train.as_matrix()).next()
valid_batches, valid_labels = LabeledBatchGenerator(X_valid.as_matrix(), valid_size, comment_size, y_valid.as_matrix()).next()
test_batches, test_labels = LabeledBatchGenerator(X_test.as_matrix(), test_size, comment_size, y_test.as_matrix()).next()

In [10]:
X_train = np.asarray(train_batches).reshape(comment_size, train_size).T
y_train = np.asarray(train_labels).reshape(-1)
X_valid = np.asarray(valid_batches).reshape(comment_size, valid_size).T
y_valid = np.asarray(valid_labels).reshape(-1)
X_test = np.asarray(test_batches).reshape(comment_size, test_size).T
y_test = np.asarray(test_labels).reshape(-1)

### KERAS model

In [11]:
from keras.models import Sequential, Model
from keras.layers import LSTM, Dense, Input, concatenate, Activation, Bidirectional
from keras.layers.embeddings import Embedding
from keras.layers.pooling import MaxPooling1D, GlobalMaxPooling1D
from keras.optimizers import Adam, SGD
from keras.layers.core import Dropout, Reshape
from keras.regularizers import l2

In [12]:
embedding_size = 128
num_nodes = 64

with K.tf.device('/gpu:5'):
    K.set_session(K.tf.Session(config=K.tf.ConfigProto(allow_soft_placement=True, log_device_placement=True)))

    inputs = Input(shape=(comment_size,))
    embed_inputs = Embedding(vocabulary_size, embedding_size,
                             trainable=True)(inputs)

    lstm_0 = Bidirectional(LSTM(num_nodes, kernel_regularizer=l2(0.001), dropout=0.1))(embed_inputs)
    print ('Bidirectional LSTM:', lstm_0.shape)
    #lstm_1 = Bidirectional(LSTM(num_nodes, kernel_regularizer=l2(0.01), dropout=0.1))(embed_inputs)

    #lstm_2 = Bidirectional(LSTM(num_nodes, kernel_regularizer=l2(0.01), dropout=0.1))(embed_inputs)

    #concat_layers = concatenate([lstm_0, lstm_1, lstm_2], axis=1)
    concat_layers = lstm_0
    print('Concatenated:', concat_layers.shape)

    concat_layers = Dropout(rate=0.1)(concat_layers)
    output = Dense(1, activation=None, kernel_regularizer=l2(0.001))(concat_layers) 
    act_output = Activation('sigmoid')(output)
    print ('Final:',act_output.shape)
    model = Model(inputs=inputs, outputs=act_output)

    optimizer = Adam(lr=0.01, decay=0.1)

    model.compile(loss='binary_crossentropy',
                  optimizer=optimizer,
                  metrics=['accuracy'])

    print (model.summary())
    print (model.output_shape)

    print('Train...')
    model.fit(X_train, y_train.reshape(-1,1),
              batch_size=batch_size,
              epochs=10,
              validation_data=(X_valid, y_valid.reshape(-1,1)))
    score, acc = model.evaluate(X_valid, y_valid.reshape(-1,1),
                                batch_size=batch_size)

    model.save_weights(filepath='./checkpoint_dir/wordLSTMkeras')

    print('Valid score:', score)
    print('Valid accuracy:', acc)

    y_train_pred = np.round(model.predict(X_train))
    print ('Train AUC-ROC:',roc_auc_score(y_train, y_train_pred))

    y_valid_pred = np.round(model.predict(X_valid))
    print ('Valid AUC-ROC:',roc_auc_score(y_valid, y_valid_pred))

    y_test_pred = np.round(model.predict(X_test))
    print ('Test AUC-ROC:',roc_auc_score(y_test, y_test_pred))

Bidirectional LSTM: (?, 128)
Concatenated: (?, 128)
Final: (?, 1)
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 100)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 100, 128)          6400000   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 128)               98816     
_________________________________________________________________
dropout_1 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 129       
_________________________________________________________________
activation_1 (Activation)    (None, 1)                 0         
Total params: 6,498,945
Trainable params: 6,498,945
Non-trainable params: 0


KeyboardInterrupt: 