In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf

  from ._conv import register_converters as _register_converters


In [2]:
import gensim
W2V_PATH = 'word2vec/GoogleNews-vectors-negative300.bin'
w2v = gensim.models.KeyedVectors.load_word2vec_format(W2V_PATH, binary=True)

Using TensorFlow backend.


In [3]:
tf.test.is_gpu_available()

True

In [4]:
temp_data = pd.read_csv('data/train.csv')
split_num = int(len(temp_data)*0.8)
test_data = temp_data.iloc[split_num:]
train_data = temp_data.iloc[:split_num]
print(len(train_data))
print(len(test_data))

train_data

127656
31915


Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
5,00025465d4725e87,"""\n\nCongratulations from me as well, use the ...",0,0,0,0,0,0
6,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0
7,00031b1e95af7921,Your vandalism to the Matt Shirvington article...,0,0,0,0,0,0
8,00037261f536c51d,Sorry if the word 'nonsense' was offensive to ...,0,0,0,0,0,0
9,00040093b2687caa,alignment on this subject and which are contra...,0,0,0,0,0,0


In [5]:
import collections
import re

def clean_punc(input_string):
    proc_string = input_string.replace('<',' <less ')
    proc_string = proc_string.replace('>',' <greater> ')
    proc_string = re.sub("https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)",' <url> ',proc_string)
    proc_string = proc_string.replace(' <less ',' <less> ')
    proc_string = proc_string.replace('?',' <question> ')
    proc_string = proc_string.replace('...',' <suspension> ')
    proc_string = proc_string.replace('. ',' <period> ')
    proc_string = proc_string if not proc_string.endswith('.') else proc_string[:-1]
    proc_string = proc_string.replace('/',' <slash> ')
    proc_string = proc_string.replace('\\',' <backslash> ')
    proc_string = proc_string.replace('; ',' <semicolon> ')
    proc_string = proc_string.replace(': ',' <colon> ')
    proc_string = proc_string.replace(', ',' <comma> ')
    proc_string = proc_string.replace('!',' <exclame> ')
    proc_string = proc_string.replace('\n',' <newline> ')
    proc_string = proc_string.replace(' - ',' <dash> ')
    proc_string = proc_string.replace('""',' <quote> ')
    proc_string = proc_string.replace('"',' <quote> ')
    proc_string = proc_string.replace('(',' <openbracket> ')
    proc_string = proc_string.replace(')',' <closebracket> ')
    return proc_string

def clean_word(input_word):
    out_word = input_word.lower()
    if ( out_word.startswith("'") and out_word.endswith("'")):
        out_word = out_word[1:-1]
    
    if len(out_word)>0:
        out_word = out_word if not out_word[-1] in ['.',':',';',','] else out_word[:-1]
    
    return out_word
    

In [6]:
comments = [clean_punc(comment) for comment in train_data.comment_text]
comment_words = []
for comment in comments:
    comment_words.append ([word for word in comment.split()])
flatten = lambda l: [item for sublist in l for item in sublist]

flat_comments = flatten(comment_words)

word_counts = collections.Counter()
for word in flat_comments:
    word_counts[word]+=1

In [7]:
test_comments = [clean_punc(comment) for comment in test_data.comment_text]
test_comment_words = []
for comment in test_comments:
    test_comment_words.append ([word for word in comment.split()])

In [8]:
labels_train = train_data.as_matrix(columns=['toxic','severe_toxic','obscene','threat','insult','identity_hate'])
labels_test = test_data.as_matrix(columns=['toxic','severe_toxic','obscene','threat','insult','identity_hate'])

In [9]:
print(len(word_counts))

very_common = [word for word,_ in word_counts.most_common(100)]

very_common[:20]

294429


['<comma>',
 'the',
 '<period>',
 '<newline>',
 'to',
 '<quote>',
 'of',
 'and',
 'a',
 'I',
 'you',
 'is',
 'that',
 'in',
 'it',
 '<exclame>',
 'for',
 '<closebracket>',
 'not',
 'on']

In [10]:
filtered_words = [word for word in word_counts.keys() if word_counts[word]>4]


In [11]:
comment_lens = [len(comment) for comment in comment_words]
print("Average: {}".format(sum(comment_lens)/float(len(comment_lens))))
for perc in range(5,101,5):
    print("{0} Percentile : {1}".format(perc,np.percentile(comment_lens,perc)))

Average: 79.68248260951307
5 Percentile : 7.0
10 Percentile : 10.0
15 Percentile : 13.0
20 Percentile : 17.0
25 Percentile : 20.0
30 Percentile : 24.0
35 Percentile : 28.0
40 Percentile : 33.0
45 Percentile : 37.0
50 Percentile : 43.0
55 Percentile : 49.0
60 Percentile : 56.0
65 Percentile : 64.0
70 Percentile : 74.0
75 Percentile : 88.0
80 Percentile : 106.0
85 Percentile : 134.0
90 Percentile : 178.0
95 Percentile : 269.0
100 Percentile : 4950.0


In [12]:
'wikipedia' in w2v.vocab

True

In [13]:
def map_word(in_word):
    out_vector = np.zeros(307)
    if in_word.isupper():
        out_vector[300] = 1 #Flag shouting
    if in_word.islower():
        out_vector[301] = 1 #Flag normal text
    work_word = in_word.lower()
    if work_word in very_common:
        out_vector[302] = 1 #Flag 100 most common words
    
    if work_word[0] == '<':
        out_vector[303] = 1 #Flag punctuation we replaced and return
        return out_vector
    
    if work_word in w2v.vocab and work_word in filtered_words:
        out_vector[:300] = w2v[work_word]
        return out_vector
    
    if work_word[0] == work_word[-1] and work_word[0] in ['_','*',"'"]:
        out_vector[304] = 1 #Flag words with emphasis
        work_word = work_word[1:-1]
    
    if len(work_word)>0:
        work_word = work_word if not work_word[-1] in ['.',':',';',',',"'"] else work_word[:-1]

    if work_word in w2v.vocab and work_word in filtered_words:
        out_vector[:300] = w2v[work_word]
        return out_vector
    
    out_vector[305] = 1 #Flag unknown words
    return out_vector

In [14]:
print(map_word('*WIKIPEDIA*'))

[ 0.21875    -0.12207031 -0.00296021  0.02429199  0.08300781 -0.01977539
  0.00396729 -0.09570312  0.11035156 -0.37109375  0.12451172 -0.54296875
 -0.09912109  0.08544922 -0.16894531 -0.10205078  0.22753906 -0.07421875
 -0.03015137 -0.35742188 -0.11523438 -0.01171875  0.27148438 -0.01049805
 -0.22070312 -0.17578125 -0.18847656  0.18554688 -0.08007812 -0.05615234
 -0.05151367 -0.11132812 -0.24609375 -0.09912109 -0.14550781  0.08447266
 -0.12792969  0.29882812  0.24609375  0.10449219  0.12402344 -0.07324219
  0.15625     0.59765625  0.28125     0.00970459 -0.171875   -0.25585938
 -0.24511719 -0.171875   -0.24121094 -0.10302734 -0.17578125 -0.05834961
  0.18945312 -0.08349609  0.11279297  0.07470703 -0.27148438 -0.3203125
  0.12158203 -0.04052734  0.13378906 -0.18457031  0.01904297 -0.19433594
 -0.203125   -0.24414062  0.16113281  0.02490234 -0.11035156  0.16015625
 -0.23632812 -0.19628906 -0.14550781  0.10546875  0.07177734 -0.14257812
 -0.03857422  0.20703125  0.30078125  0.06591797  0.

In [15]:
def process_comment(input_comment):
    result_matrix = np.zeros((250,307))
    if (len(input_comment) == 0):
        return result_matrix
    
    input_comment = input_comment[:250]
    temp_matrix = [map_word(word) for word in input_comment]
    result_matrix[-len(input_comment):,:] = temp_matrix
    return result_matrix

In [16]:
#Test the process comment function
np.array(process_comment(comment_words[21])).shape

(250, 307)

# Approach with pre-encoded Features

Doesn't work unfortunately - not nearly enough memory for that. So disabled.

# Approach using live generation

In [28]:
from random import shuffle

def get_batches(comment_words,labels,batch_size):
    
    num_inputs = len(comment_words)
    num_batches = num_inputs//batch_size
    
    labels_true = []
    offset = []
    for i in range(0,6):
        labels_true.append([])
        offset.append(0)

    for i,label in enumerate(labels):
        for ii in range(0,6):
            if label[ii] == 1:
                labels_true[ii].append(i)
    
    no_labels = [i for i,label in enumerate(labels) if sum(label)==0]
    shuffle(no_labels)
    
    offset.append(0)
    group_size = batch_size // 8
    
    for ii in range(0,num_batches):
        indicies = set()
        for i in range(0,6):
            for iii in range(0,group_size):
                indicies.add(labels_true[i][offset[i]])
                offset[i]+=1
                if (offset[i]==len(labels_true[i])):
                    offset[i]=0
                    shuffle(labels_true[i])
                
        num_remaining = batch_size - len(indicies)
        for iii in range(0,num_remaining):
            indicies.add(no_labels[offset[6]])
            offset[6]+=1
            if offset[6] == len(no_labels):
                offset[6]=0
                shuffle(no_labels)
            
        features = []
        for iii in indicies:
            features.append(process_comment(comment_words[iii]))
        
        return_labels = [[max(labels[i]) for i in indicies]]
        yield features, return_labels

def get_test_batches(comment_words,labels,batch_size):
    num_inputs = len(comment_words)
    num_batches = num_inputs//batch_size

    for ii in range(0,num_batches):
        end = ii * batch_size + batch_size if ii * batch_size + batch_size <= num_inputs else num_inputs - 1
        indicies = range(ii * batch_size,end)
        features = []
        for iii in indicies:
            features.append(process_comment(comment_words[iii]))
        
        return_labels = [[max(labels[i]) for i in indicies]]
        yield features, return_labels
    

In [18]:
labels_true = []
for i in range(0,6):
    labels_true.append([])

for i,label in enumerate(labels_train):
    for ii in range(0,6):
        if label[ii] == 1:
            labels_true[ii].append(i)
    

In [19]:
[len(the_list) for the_list in labels_true]

[12257, 1284, 6780, 386, 6295, 1100]

In [85]:
# Hyper-paramters
layer_size = 512
layer_count = 1
keep_prob_training = 0.6
learning_rate = 0.001
epochs = 20
batch_size=128
comment_length = 250
embed_size = 307
n_labels = 1
checkpoint_path = 'cpa2n9'

In [86]:
# Building a graph and placeholders
graph = tf.Graph()

with graph.as_default():
    inputs_ = tf.placeholder(tf.float32,[batch_size,comment_length,embed_size],name='inputs')
    labels_ = tf.placeholder(tf.float32,[None,None],name='outputs')
    keep_prob_ = tf.placeholder(tf.float32,name='keep_prob')

In [87]:
# Build the network
with graph.as_default():
    lstm_cell = tf.contrib.rnn.BasicLSTMCell(num_units=layer_size,activation=tf.tanh)
    drop = tf.contrib.rnn.DropoutWrapper(cell=lstm_cell,input_keep_prob=keep_prob_)
    network = drop
    for _ in range(layer_count):
        network = tf.contrib.rnn.MultiRNNCell([network])

    initial_state = network.zero_state(batch_size,tf.float32)

In [88]:
# Forward pass
with graph.as_default():
    outputs, final_state = tf.nn.dynamic_rnn(network,inputs_,initial_state=initial_state)

In [89]:
# Get outputs
with graph.as_default():
    predictions = tf.contrib.layers.flatten(outputs)
    predictions = tf.contrib.layers.fully_connected(predictions, n_labels, activation_fn=tf.sigmoid)
    cost = tf.losses.log_loss(labels_, predictions)
    
    optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost)

In [90]:
# Determine accuracy on test set
with graph.as_default():
    validation_metrics_var_scope = "validation_metrics"
    binary_pred = tf.cast(tf.round(predictions), tf.bool)
    binary_labels = tf.cast(labels_, tf.bool)
    correct_pos = tf.reduce_sum(tf.cast(tf.logical_and(binary_pred,binary_labels),tf.int32))
    false_pos = tf.reduce_sum(tf.cast(tf.logical_and(binary_pred,tf.logical_not(binary_labels)),tf.int32))
    false_neg = tf.reduce_sum(tf.cast(tf.logical_and(tf.logical_not(binary_pred),binary_labels),tf.int32))
    correct_neg = tf.reduce_sum(tf.cast(tf.logical_and(tf.logical_not(binary_pred),tf.logical_not(binary_labels)),tf.int32))
    auc = tf.metrics.auc(labels=labels_,predictions=predictions,name=validation_metrics_var_scope)
    accuracy = tf.metrics.accuracy(labels=labels_,predictions=predictions,name=validation_metrics_var_scope)

In [91]:
#Training
with graph.as_default():
    saver = tf.train.Saver()
    
n_batches = len(comment_words)//batch_size

val_acc = []
false_pos_list = []

last_checkpoint = tf.train.latest_checkpoint(checkpoint_path)

print("Starting...")

iteration = 0
with tf.Session(graph=graph) as sess:
    sess.run(tf.global_variables_initializer())
    
    validation_metrics_vars = tf.get_collection(tf.GraphKeys.LOCAL_VARIABLES, scope=validation_metrics_var_scope)
    validation_metrics_init_op = tf.variables_initializer(var_list=validation_metrics_vars, name='validation_metrics_init')
    sess.run(validation_metrics_init_op)
    
    if last_checkpoint != None:
        saver.restore(sess,last_checkpoint)
        print("Restored checkpoint from {}.".format(last_checkpoint))    
    
    for e in range(epochs):
        state = sess.run(initial_state)
            
        for ii,(x,y) in enumerate(get_batches(comment_words,labels_train,batch_size),1):
            feed = {inputs_:x,
                    labels_:y,
                    keep_prob_:keep_prob_training,
                    initial_state:state}
            
            loss, state, _ = sess.run([cost,final_state,optimizer],feed_dict=feed)
            iteration += 1
            
            if iteration%10==0:
                print("\rEpoch: {}/{}".format(e, epochs),
                      "Iteration: {}/{}".format(iteration, n_batches*epochs),
                      "Train loss: {:.3f}".format(loss),end='')

            if iteration%200==0:
                val_acc.clear()
                false_pos_list.clear()
                total_correct_pos = 0
                total_false_pos = 0
                total_correct_neg = 0
                total_false_neg = 0
                
                val_state = sess.run(initial_state)
                sess.run(validation_metrics_init_op)
                if iteration%2000==0:
                    test_subset_x,test_subset_y = test_comment_words,labels_test
                else:
                    test_subset_x,test_subset_y = test_comment_words[:1024],labels_test[:1024]
                
                for x, y in get_test_batches(test_subset_x, test_subset_y, batch_size):
                    feed = {inputs_: x,
                            labels_: y,
                            keep_prob_: 1,
                            initial_state: val_state}
                    
                    auc_val, batch_acc, n_correct_pos, n_correct_neg, n_false_pos, n_false_neg, val_state = sess.run([auc, accuracy, correct_pos, correct_neg, false_pos, false_neg, final_state], feed_dict=feed)
                    
                    val_acc.append(batch_acc)
                    auc_value = auc_val[1]
                    total_correct_pos += n_correct_pos
                    total_false_pos += n_false_pos
                    total_correct_neg += n_correct_neg
                    total_false_neg += n_false_neg
                print("\nDuring epoch {}".format(e))
                print("  Val acc      : {}".format(np.mean(val_acc)))
                print("  AuC          : {}".format(auc_value))
                #print("  Correct pos  : {}".format('  '.join(['{:5}'.format(x) for x in total_correct_pos])))
                #print("  False neg    : {}".format('  '.join(['{:5}'.format(x) for x in total_false_neg])))
                #print("  Correct neg  : {}".format('  '.join(['{:5}'.format(x) for x in total_correct_neg])))
                #print("  False pos    : {}\n".format('  '.join(['{:5}'.format(x) for x in total_false_pos])))
                print("  Correct pos  : {:5}".format(total_correct_pos))
                print("  False neg    : {:5}".format(total_false_neg))
                print("  Correct neg  : {:5}".format(total_correct_neg))
                print("  False pos    : {:5}\n".format(total_false_pos))
                
                
                saver.save(sess, "{}/epoch{}iter{}.ckpt".format(checkpoint_path,e,iteration))

Starting...
Epoch: 0/20 Iteration: 200/19940 Train loss: 0.564
During epoch 0
  Val acc      : 0.0
  AuC          : 0.47018536925315857
  Correct pos  : 14208
  False neg    :     0
  Correct neg  :     0
  False pos    : 116864

Epoch: 0/20 Iteration: 320/19940 Train loss: 0.562

KeyboardInterrupt: 

In [None]:
sum(labels_train)

In [None]:
len([labels for labels in labels_train if sum(labels)>0])

In [None]:
sum(labels_test[:1024])

In [None]:
labels_test

In [None]:
test_data

In [None]:
comment_words