In [1]:
import pandas as pd
import numpy as np

In [2]:
import gensim
W2V_PATH = 'word2vec\GoogleNews-vectors-negative300.bin'
w2v = gensim.models.KeyedVectors.load_word2vec_format(W2V_PATH, binary=True)

Using TensorFlow backend.


In [3]:
train_data = pd.read_csv('data/train.csv')
test_data = pd.read_csv('data/test.csv')
train_data

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
5,00025465d4725e87,"""\n\nCongratulations from me as well, use the ...",0,0,0,0,0,0
6,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0
7,00031b1e95af7921,Your vandalism to the Matt Shirvington article...,0,0,0,0,0,0
8,00037261f536c51d,Sorry if the word 'nonsense' was offensive to ...,0,0,0,0,0,0
9,00040093b2687caa,alignment on this subject and which are contra...,0,0,0,0,0,0


In [4]:
import collections
import re

def clean_punc(input_string):
    proc_string = input_string.replace('<',' <less ')
    proc_string = proc_string.replace('>',' <greater> ')
    proc_string = re.sub("https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)",' <url> ',proc_string)
    proc_string = proc_string.replace(' <less ',' <less> ')
    proc_string = proc_string.replace('?',' <question> ')
    proc_string = proc_string.replace('...',' <suspension> ')
    proc_string = proc_string.replace('. ',' <period> ')
    proc_string = proc_string if not proc_string.endswith('.') else proc_string[:-1]
    proc_string = proc_string.replace('/',' <slash> ')
    proc_string = proc_string.replace('\\',' <backslash> ')
    proc_string = proc_string.replace('; ',' <semicolon> ')
    proc_string = proc_string.replace(': ',' <colon> ')
    proc_string = proc_string.replace(', ',' <comma> ')
    proc_string = proc_string.replace('!',' <exclame> ')
    proc_string = proc_string.replace('\n',' <newline> ')
    proc_string = proc_string.replace(' - ',' <dash> ')
    proc_string = proc_string.replace('""',' <quote> ')
    proc_string = proc_string.replace('"',' <quote> ')
    proc_string = proc_string.replace('(',' <openbracket> ')
    proc_string = proc_string.replace(')',' <closebracket> ')
    return proc_string

def clean_word(input_word):
    out_word = input_word.lower()
    if ( out_word.startswith("'") and out_word.endswith("'")):
        out_word = out_word[1:-1]
    
    if len(out_word)>0:
        out_word = out_word if not out_word[-1] in ['.',':',';'] else out_word[:-1]
    
    return out_word
    

In [7]:
comments = [clean_punc(comment) for comment in train_data.comment_text]
comment_words = []
for comment in comments:
    comment_words.append ([word for word in comment.split()])
flatten = lambda l: [item for sublist in l for item in sublist]

flat_comments = flatten(comment_words)

word_counts = collections.Counter()
for word in flat_comments:
    word_counts[word]+=1

In [8]:
test_comments = [clean_punc(comment) for comment in test_data.comment_text]
test_comment_words = []
for comment in test_comments:
    test_comment_words.append ([word for word in comment.split()])

In [9]:
labels_train = train_data.as_matrix(columns=['toxic','severe_toxic','obscene','threat','insult','identity_hate'])
labels_test = test_data.as_matrix(columns=['toxic','severe_toxic','obscene','threat','insult','identity_hate'])

In [10]:
print(len(word_counts))

very_common = [word for word,_ in word_counts.most_common(100)]

very_common[:20]

342237


['<comma>',
 'the',
 '<period>',
 '<newline>',
 'to',
 '<quote>',
 'of',
 'and',
 'a',
 'I',
 'you',
 'is',
 'that',
 'in',
 'it',
 '<exclame>',
 'for',
 '<closebracket>',
 'not',
 'on']

In [23]:
filtered_words = [word for word in word_counts.keys() if word_counts[word]>4]


In [14]:
comment_lens = [len(comment) for comment in comment_words]
print("Average: {}".format(sum(comment_lens)/float(len(comment_lens))))
for perc in range(5,101,5):
    print("{0} Percentile : {1}".format(perc,np.percentile(comment_lens,perc)))

Average: 79.67606895989873
5 Percentile : 7.0
10 Percentile : 10.0
15 Percentile : 13.0
20 Percentile : 17.0
25 Percentile : 20.0
30 Percentile : 24.0
35 Percentile : 28.0
40 Percentile : 33.0
45 Percentile : 37.0
50 Percentile : 43.0
55 Percentile : 49.0
60 Percentile : 56.0
65 Percentile : 64.0
70 Percentile : 74.0
75 Percentile : 88.0
80 Percentile : 106.0
85 Percentile : 133.0
90 Percentile : 178.0
95 Percentile : 269.0
100 Percentile : 4950.0


In [30]:
'wikipedia' in w2v.vocab

True

In [32]:
def map_word(in_word):
    out_vector = np.zeros(307)
    if in_word.isupper():
        out_vector[300] = 1 #Flag shouting
    if in_word.islower():
        out_vector[301] = 1 #Flag normal text
    work_word = in_word.lower()
    if work_word in very_common:
        out_vector[302] = 1 #Flag 100 most common words
    
    if work_word[0] == '<':
        out_vector[303] = 1 #Flag punctuation we replaced and return
        return out_vector
    
    if work_word in w2v.vocab and work_word in filtered_words:
        out_vector[:300] = w2v[work_word]
        return out_vector
    
    if work_word[0] == work_word[-1] and work_word[0] in ['_','*',"'"]:
        out_vector[304] = 1 #Flag words with emphasis
        work_word = work_word[1:-1]
    
    if len(work_word)>0:
        work_word = work_word if not work_word[-1] in ['.',':',';',',',"'"] else work_word[:-1]

    if work_word in w2v.vocab and work_word in filtered_words:
        out_vector[:300] = w2v[work_word]
        return out_vector
    
    out_vector[305] = 1 #Flag unknown words
    return out_vector

In [34]:
print(map_word('*WIKIPEDIA*'))

[ 0.21875    -0.12207031 -0.00296021  0.02429199  0.08300781 -0.01977539
  0.00396729 -0.09570312  0.11035156 -0.37109375  0.12451172 -0.54296875
 -0.09912109  0.08544922 -0.16894531 -0.10205078  0.22753906 -0.07421875
 -0.03015137 -0.35742188 -0.11523438 -0.01171875  0.27148438 -0.01049805
 -0.22070312 -0.17578125 -0.18847656  0.18554688 -0.08007812 -0.05615234
 -0.05151367 -0.11132812 -0.24609375 -0.09912109 -0.14550781  0.08447266
 -0.12792969  0.29882812  0.24609375  0.10449219  0.12402344 -0.07324219
  0.15625     0.59765625  0.28125     0.00970459 -0.171875   -0.25585938
 -0.24511719 -0.171875   -0.24121094 -0.10302734 -0.17578125 -0.05834961
  0.18945312 -0.08349609  0.11279297  0.07470703 -0.27148438 -0.3203125
  0.12158203 -0.04052734  0.13378906 -0.18457031  0.01904297 -0.19433594
 -0.203125   -0.24414062  0.16113281  0.02490234 -0.11035156  0.16015625
 -0.23632812 -0.19628906 -0.14550781  0.10546875  0.07177734 -0.14257812
 -0.03857422  0.20703125  0.30078125  0.06591797  0.

In [35]:
def process_comment(input_comment):
    result_matrix = np.zeros((250,307))
    if (len(input_comment) == 0):
        return result_matrix
    
    input_comment = input_comment[:250]
    temp_matrix = [map_word(word) for word in input_comment]
    result_matrix[-len(input_comment):,:] = temp_matrix
    return result_matrix

In [36]:
#Test the process comment function
np.array(process_comment(comment_words[21])).shape

(250, 307)

In [None]:
train_features = []
processed = 0
for one_comment in comment_words:
    train_features.append(process_comment(one_comment))
    processed += 1
    if ( processed % 100 ) == 0:
        print('Processed {0} of train features:'.format(processed))

train_features = np.array(train_features)

Processed 100 of train features:
Processed 200 of train features:
Processed 300 of train features:
Processed 400 of train features:
Processed 500 of train features:
Processed 600 of train features:
Processed 700 of train features:
Processed 800 of train features:
Processed 900 of train features:
Processed 1000 of train features:
Processed 1100 of train features:
Processed 1200 of train features:
Processed 1300 of train features:
Processed 1400 of train features:
Processed 1500 of train features:
Processed 1600 of train features:
Processed 1700 of train features:
Processed 1800 of train features:
Processed 1900 of train features:
Processed 2000 of train features:
Processed 2100 of train features:
Processed 2200 of train features:
Processed 2300 of train features:
Processed 2400 of train features:
Processed 2500 of train features:
Processed 2600 of train features:
Processed 2700 of train features:
Processed 2800 of train features:
Processed 2900 of train features:
Processed 3000 of train

In [None]:
test_features = []
processed = 0
for one_comment in test_comment_words:
    test_features.append(process_comment(one_comment))
    processed += 1
    if ( processed % 100 ) == 0:
        print('Processed {0} of {1} test features:'.format(processed,len(test_features)))

test_features = np.array(test_features)

In [None]:
np.savez_compressed('train_features',features=train_features,labels=train_labels)
np.savez_compressed('test_features',features=test_features,labels=test_labels)