In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf

  from ._conv import register_converters as _register_converters


In [63]:
import gensim
W2V_PATH = 'word2vec/glove.6B.100d.w2vformat.txt'
w2v = gensim.models.KeyedVectors.load_word2vec_format(W2V_PATH, binary=False)

KeyboardInterrupt: 

In [3]:
tf.test.is_gpu_available()

True

In [4]:
temp_data = pd.read_csv('data/train.csv')
split_num = int(len(temp_data)*0.8)
test_data = temp_data.iloc[split_num:]
train_data = temp_data.iloc[:split_num]
print(len(train_data))
print(len(test_data))

train_data

127656
31915


Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
5,00025465d4725e87,"""\n\nCongratulations from me as well, use the ...",0,0,0,0,0,0
6,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0
7,00031b1e95af7921,Your vandalism to the Matt Shirvington article...,0,0,0,0,0,0
8,00037261f536c51d,Sorry if the word 'nonsense' was offensive to ...,0,0,0,0,0,0
9,00040093b2687caa,alignment on this subject and which are contra...,0,0,0,0,0,0


In [5]:
labels_train = train_data.as_matrix(columns=['toxic','severe_toxic','obscene','threat','insult','identity_hate'])
labels_test = test_data.as_matrix(columns=['toxic','severe_toxic','obscene','threat','insult','identity_hate'])

In [6]:
import collections
import re

def clean_punc(input_string):
    proc_string = input_string.replace('<',' <less ')
    proc_string = proc_string.replace('>',' <greater> ')
    proc_string = re.sub("https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)",' <url> ',proc_string)
    proc_string = proc_string.replace(' <less ',' <less> ')
    proc_string = proc_string.replace('?',' <question> ')
    proc_string = proc_string.replace('...',' <suspension> ')
    proc_string = proc_string.replace('. ',' <period> ')
    proc_string = proc_string if not proc_string.endswith('.') else proc_string[:-1]
    proc_string = proc_string.replace('/',' <slash> ')
    proc_string = proc_string.replace('\\',' <backslash> ')
    proc_string = proc_string.replace('; ',' <semicolon> ')
    proc_string = proc_string.replace(': ',' <colon> ')
    proc_string = proc_string.replace(', ',' <comma> ')
    proc_string = proc_string.replace('!',' <exclame> ')
    proc_string = proc_string.replace('\n',' <newline> ')
    proc_string = proc_string.replace(' - ',' <dash> ')
    proc_string = proc_string.replace('""',' <quote> ')
    proc_string = proc_string.replace('"',' <quote> ')
    proc_string = proc_string.replace('(',' <openbracket> ')
    proc_string = proc_string.replace(')',' <closebracket> ')
    return proc_string

def clean_word(input_word):
    out_word = input_word.lower()
    if ( out_word.startswith("'") and out_word.endswith("'")):
        out_word = out_word[1:-1]
    
    if len(out_word)>0:
        out_word = out_word if not out_word[-1] in ['.',':',';',','] else out_word[:-1]
    
    return out_word
    

In [7]:
comments = [clean_punc(comment) for comment in train_data.comment_text]
comment_words = []
for comment in comments:
    comment_words.append ([clean_word(word) for word in comment.split()])
flatten = lambda l: [item for sublist in l for item in sublist]

flat_comments = flatten(comment_words)

word_counts = collections.Counter()
for word in flat_comments:
    word_counts[word]+=1

In [8]:
test_comments = [clean_punc(comment) for comment in test_data.comment_text]
test_comment_words = []
for comment in test_comments:
    test_comment_words.append ([clean_word(word) for word in comment.split()])
    
flat_comments = flatten(test_comment_words)

for word in flat_comments:
    word_counts[word]+=1

In [9]:
print("Total words: {}".format(len(word_counts)))

very_common = [word for word,_ in word_counts.most_common(100)]

Total words: 277174


# Enough playing, let's build an embedding

In [10]:
# Embedding Hyper-paramters
comment_length = 100
embed_size = 300
n_labels = 6

In [11]:
filtered_words = set([word for num,word in enumerate(word_counts.keys()) if word_counts[word]>5 and word not in very_common])
len_embedding = len(filtered_words)
len_embedding

40434

In [12]:
word_to_int = {word:num for num,word in enumerate(filtered_words)}
int_to_word = {num:word for num,word in enumerate(filtered_words)}

embeddings = np.zeros([len_embedding,embed_size])

for word in filtered_words:
    if word in w2v.vocab:
        embeddings[word_to_int[word],:] = w2v[word]
    else:
        embeddings[word_to_int[word],:] = np.random.uniform(size=[1,embed_size])

In [13]:
def map_word(in_word):
    work_word = clean_word(in_word)
    return word_to_int[work_word]
    

In [14]:
def process_comment(input_comment):
    result_matrix = np.zeros((comment_length))
    temp_matrix = [word_to_int[word] for word in input_comment if word in filtered_words]
    if (len(temp_matrix) == 0):
        return result_matrix

    temp_matrix = temp_matrix[-comment_length:]
    
    result_matrix[-len(temp_matrix):] = temp_matrix
    return result_matrix

In [15]:
import time

start = time.perf_counter()
#Pre-build integer arrays
print("Training comments:")
comment_ints =[]
for i in range(0,len(comment_words)):
    comment_ints.append(process_comment(comment_words[i]))
    if (i%100==0 and i >0):
        elapsed = time.perf_counter() - start
        print("\rProcessed {}/{} in {}.  ETA {}.".format(i,len(comment_words),elapsed,(len(comment_words)-i)*elapsed/i),end='')

start = time.perf_counter()
print("\nTest comments:")
test_comment_ints =[]
for i in range(0,len(test_comment_words)):
    test_comment_ints.append(process_comment(test_comment_words[i]))
    if (i%100==0 and i >0):
        elapsed = time.perf_counter() - start
        print("\rProcessed {}/{} in {}.  ETA {}.".format(i,len(comment_words),elapsed,(len(comment_words)-i)*elapsed/i),end='')

np.array(comment_ints).shape

Training comments:


Processed 100/127656 in 0.0013179390007280745.  ETA 1.6811102717687028.Processed 200/127656 in 0.0023943670003063744.  ETA 1.5258822019552463.Processed 300/127656 in 0.003274384000178543.  ETA 1.390041495755795.Processed 400/127656 in 0.004089916001248639.  ETA 1.301165876637242.Processed 500/127656 in 0.005091352000818006.  ETA 1.2947919100320286.Processed 600/127656 in 0.0062113110016071005.  ETA 1.3153072177003196.Processed 700/127656 in 0.007293491000382346.  ETA 1.322789204920773.Processed 800/127656 in 0.008365901001525344.  ETA 1.3265809218118738.Processed 900/127656 in 0.009263911000743974.  ETA 1.3047292253447813.Processed 1000/127656 in 0.010259685001074104.  ETA 1.2994506634960417.Processed 1100/127656 in 0.011107031001301948.  ETA 1.2778740140006994.Processed 1200/127656 in 0.012056131001372705.  ETA 1.2704750849246558.Processed 1300/127656 in 0.012959032001162996.  ETA 1.2595780365684242.Processed 1400/127656 in 0.013923636000981787.  ETA 1.2556732763856833.

(127656, 100)

## That took too long - better save the results...
And provide for reloading them

In [59]:
# Hyper-paramters
layer_size = 1024
layer_count = 3
hidden_fc_layers = [100]
keep_prob_training = 0.5
learning_rate = 0.00001
epochs = 100
batch_size=128

checkpoint_path = 'a4cp7'

# Approach using improved embeddings

In [52]:
from random import shuffle

# A function to get the lists of inputs with each label
def get_label_lists(labels):
    labels_true = []
    offset = []
    for i in range(0,n_labels):
        labels_true.append([])
        offset.append(0)

    for i,label in enumerate(labels):
        for ii in range(0,n_labels):
            if label[ii] == 1:
                labels_true[ii].append(i)
    
    labels_true.append([i for i,label in enumerate(labels) if sum(label)==0])
    offset.append(0)
    
    for i in range(0,n_labels+1):
        shuffle(labels_true[i])
    
    return labels_true,offset
    
def get_batches(input_ints,labels,batch_size):
    
    num_inputs = len(input_ints)
    num_batches = num_inputs//batch_size
    
    labels_true,offset = get_label_lists(labels)
    group_size = batch_size // 7
    list_length = [len(labels_list) for labels_list in labels_true]
    
    for ii in range(0,num_batches):
        indicies = set()
        for i in range(0,n_labels+1):
            indicies.update([labels_true[i][ii % list_length[i]] for ii in range(offset[i],offset[i]+group_size) ])
            offset[i]+=group_size
            if offset[i]>=list_length[i]:
                offset[i]=0
                shuffle(labels_true[i])
            #for iii in range(0,group_size):
            #    indicies.add(labels_true[i][offset[i]])
            #    offset[i]+=1
            #    if (offset[i]==len(labels_true[i])):
            #        offset[i]=0
            #        shuffle(labels_true[i])
             
        while len(indicies) < batch_size:
            indicies.add(labels_true[n_labels][offset[n_labels] % list_length[n_labels]])
            offset[n_labels]+=1
            if offset[n_labels] == list_length[n_labels]:
                offset[n_labels]=0
                shuffle(labels_true[n_labels])
            
        features = np.array([input_ints[i] for i in indicies])
        return_labels = np.array([labels[i] for i in indicies])
        yield features, return_labels

def get_test_batches(input_ints,labels,batch_size):
    num_inputs = len(input_ints)
    num_batches = num_inputs//batch_size
    if (num_inputs > num_batches * batch_size):
        num_batches += 1
        
    for ii in range(0,num_batches):
        end = ii * batch_size + batch_size if ii * batch_size + batch_size <= num_inputs else num_inputs - 1
        indicies = [0] * batch_size
        indicies[:end-ii*batch_size] = range(ii * batch_size,end)
        
        features = np.array([input_ints[i] for i in indicies])
        return_labels = np.array([labels[i] for i in indicies])
        yield features, return_labels
    

# Alright - enough with the pre-processing, let's build a network
Firstly, define placeholders and the embedding (only variable that is being explicitly defined).

Embedding is initialised by copying it from the list built earlier - only way I can find to do partial transfer learning and partial random.

In [53]:
# Building a graph and placeholders
graph = tf.Graph()

with graph.as_default():
    inputs_ = tf.placeholder(tf.int32,[None,comment_length],name='inputs')
    labels_ = tf.placeholder(tf.float32,[None,None],name='outputs')
    keep_prob_ = tf.placeholder(tf.float32,name='keep_prob')
    initial_embedding_ = tf.placeholder(tf.float32,[len_embedding,embed_size],name='embed')
    embedding_var = tf.Variable(tf.constant(0.0, shape=[len_embedding,embed_size]),
                trainable=True, name='embed_var')
    
    embedding_init = embedding_var.assign(initial_embedding_)

## Build the LSTM network
Just based on hyper-paratmeters defined earlier.

In [54]:
# Build the LSTM network
with graph.as_default():
    lstm_cell = tf.contrib.rnn.BasicLSTMCell(num_units=layer_size)
    drop = tf.contrib.rnn.DropoutWrapper(cell=lstm_cell,input_keep_prob=keep_prob_)
    network = drop
    for _ in range(layer_count):
        network = tf.contrib.rnn.MultiRNNCell([network])

    initial_state = network.zero_state(batch_size,tf.float32)

In [55]:
# Forward pass
with graph.as_default():
    embed = tf.nn.embedding_lookup(embedding_var, inputs_)
    outputs, final_state = tf.nn.dynamic_rnn(network,embed,initial_state=initial_state)

In [56]:
# Get outputs
with graph.as_default():
    predictions = tf.contrib.layers.flatten(outputs)
    for size in hidden_fc_layers:
        predictions = tf.contrib.layers.fully_connected(predictions, size, activation_fn=tf.tanh)
        #predictions = tf.nn.leaky_relu(predictions,alpha=0.2)
        predictions = tf.nn.dropout(predictions,keep_prob_)
    predictions = tf.contrib.layers.fully_connected(predictions, n_labels, activation_fn=tf.sigmoid)
    cost = tf.losses.sigmoid_cross_entropy(labels_, predictions)
    
    optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost)

In [57]:
# Determine accuracy on test set
with graph.as_default():
    validation_metrics_var_scope = "validation_metrics"
    binary_pred = tf.cast(tf.round(predictions), tf.bool)
    binary_labels = tf.cast(labels_, tf.bool)
    accuracy = tf.reduce_sum(tf.cast(tf.equal(binary_pred,binary_labels),tf.int32))
    correct_pos = tf.reduce_sum(tf.cast(tf.logical_and(binary_pred,binary_labels),tf.int32),axis=0)
    false_pos = tf.reduce_sum(tf.cast(tf.logical_and(binary_pred,tf.logical_not(binary_labels)),tf.int32),axis=0)
    false_neg = tf.reduce_sum(tf.cast(tf.logical_and(tf.logical_not(binary_pred),binary_labels),tf.int32),axis=0)
    correct_neg = tf.reduce_sum(tf.cast(tf.logical_and(tf.logical_not(binary_pred),tf.logical_not(binary_labels)),tf.int32),axis=0)
    auc = tf.metrics.auc(labels=labels_,predictions=predictions,name=validation_metrics_var_scope)

In [60]:
#Training
with graph.as_default():
    saver = tf.train.Saver()
    
n_batches = len(comment_words)//batch_size

val_acc = []
false_pos_list = []

last_checkpoint = tf.train.latest_checkpoint(checkpoint_path)

print("Starting...")

iteration = 0
with tf.Session(graph=graph) as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(embedding_init,feed_dict={initial_embedding_:embeddings})
    
    validation_metrics_vars = tf.get_collection(tf.GraphKeys.LOCAL_VARIABLES, scope=validation_metrics_var_scope)
    validation_metrics_init_op = tf.variables_initializer(var_list=validation_metrics_vars, name='validation_metrics_init')
    sess.run(validation_metrics_init_op)
    
    if last_checkpoint != None:
        saver.restore(sess,last_checkpoint)
        print("Restored checkpoint from {}.".format(last_checkpoint))    
    
    for e in range(epochs):
        state = sess.run(initial_state)
            
        for ii,(x,y) in enumerate(get_batches(comment_ints,labels_train,batch_size),1):
            feed = {inputs_:x,
                    labels_:y,
                    keep_prob_:keep_prob_training,
                    initial_state:state}
            
            loss, state, _ = sess.run([cost,final_state,optimizer],feed_dict=feed)
            iteration += 1
            
            if iteration%100==0:
                print("\rEpoch: {}/{}".format(e, epochs),
                      "Iteration: {}/{}".format(iteration, n_batches*epochs),
                      "Train loss: {:.3f}".format(loss),end='')

            if iteration%2000==0:
                val_acc.clear()
                false_pos_list.clear()
                total_correct_pos = 0
                total_false_pos = 0
                total_correct_neg = 0
                total_false_neg = 0
                
                val_state = sess.run(initial_state)
                sess.run(validation_metrics_init_op)
                if iteration%2000==0:
                    test_subset_x,test_subset_y = test_comment_ints,labels_test
                else:
                    test_subset_x,test_subset_y = test_comment_ints[:4096],labels_test[:4096]
                
                for x, y in get_test_batches(test_subset_x, test_subset_y, batch_size):
                    feed = {inputs_: x,
                            labels_: y,
                            keep_prob_: 1,
                            initial_state: val_state}
                    
                    auc_val, n_correct_pos, n_correct_neg, n_false_pos, n_false_neg, val_state, batch_acc = sess.run([auc, correct_pos, correct_neg, false_pos, false_neg, final_state,accuracy], feed_dict=feed)
                    #print(predictions)
                    val_acc.append(batch_acc/len(test_subset_y))
                    auc_value = auc_val[1]
                    total_correct_pos += n_correct_pos
                    total_false_pos += n_false_pos
                    total_correct_neg += n_correct_neg
                    total_false_neg += n_false_neg
                print("During epoch {}".format(e))
                print("  Val acc      : {}".format(np.mean(val_acc)))
                print("  AuC          : {}".format(auc_value))
                print("  Correct pos  : {}".format('  '.join(['{:5}'.format(x) for x in total_correct_pos])))
                print("  False neg    : {}".format('  '.join(['{:5}'.format(x) for x in total_false_neg])))
                print("  Correct neg  : {}".format('  '.join(['{:5}'.format(x) for x in total_correct_neg])))
                print("  False pos    : {}\n".format('  '.join(['{:5}'.format(x) for x in total_false_pos])))
                
                
                saver.save(sess, "{}/epoch{}iter{}.ckpt".format(checkpoint_path,e,iteration))

Starting...
Epoch: 2/100 Iteration: 2000/99700 Train loss: 0.584During epoch 2
  Val acc      : 0.023210903963653457
  AuC          : 0.9154759049415588
  Correct pos  :  2341     50   1294      0   1162      0
  False neg    :   696    261    375     92    420    305
  Correct neg  : 25847  31650  29690  31908  29557  31695
  False pos    :  3116     39    641      0    861      0

Epoch: 4/100 Iteration: 4000/99700 Train loss: 0.583During epoch 4
  Val acc      : 0.02330289832367225
  AuC          : 0.9223653078079224
  Correct pos  :  2572     95   1312      3   1179      0
  False neg    :   465    216    357     89    403    305
  Correct neg  : 25936  31605  29910  31896  29726  31694
  False pos    :  3027     84    421     12    692      1

Epoch: 6/100 Iteration: 6000/99700 Train loss: 0.575During epoch 6
  Val acc      : 0.023508444305185647
  AuC          : 0.9132415056228638
  Correct pos  :  2434     92   1213     10   1067      9
  False neg    :   603    219    456     8

KeyboardInterrupt: 

In [61]:
submit_data = pd.read_csv('data/test.csv')

submit_comments = [clean_punc(comment) for comment in submit_data.comment_text]
submit_comment_ints = []
#submit_comments = submit_comments[:204]
for comment in submit_comments:
    words = [word for word in comment.split()]
    submit_comment_ints.append (process_comment(words))
    
label_placeholder = np.zeros([len(submit_comments),n_labels])
results = []

with tf.Session(graph=graph) as sess:
    sess.run(tf.global_variables_initializer())
    
    last_checkpoint = tf.train.latest_checkpoint(checkpoint_path)
    saver.restore(sess,last_checkpoint)

    for x, y in get_test_batches(submit_comment_ints, label_placeholder, batch_size):
        feed = {inputs_: x,
            keep_prob_: 1,
            initial_state: val_state}
        #print (x)
        pred, val_state = sess.run([predictions, final_state], feed_dict=feed)

        for the_pred in pred:
            results.append(the_pred)
        
        print("\rDone: {}/{}".format(len(results), len(label_placeholder)),end='')

results = results[:len(submit_comment_ints)]

submission = pd.concat([submit_data['id'],pd.DataFrame(results,columns=['toxic','severe_toxic','obscene','threat','insult','identity_hate'])],axis=1)

submission.to_csv('submission.csv',index=False, float_format='%.4f')

INFO:tensorflow:Restoring parameters from a4cp7/epoch38iter38000.ckpt
Done: 153216/153164

In [None]:
sum(labels_test[:1024])

In [None]:
word_counts['explanation']

In [None]:
[ len([word for word in comment if word in filtered_words])/len(comment) for comment in comment_words[0:10]]

In [None]:
'norman' in word_counts.keys()