In [89]:
import pandas as pd
import numpy as np

In [97]:
import gensim
W2V_PATH = 'word2vec\GoogleNews-vectors-negative300.bin'
model = gensim.models.KeyedVectors.load_word2vec_format(W2V_PATH, binary=True)

In [1]:
train_data = pd.read_csv('data/train.csv')

train_data

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
5,00025465d4725e87,"""\n\nCongratulations from me as well, use the ...",0,0,0,0,0,0
6,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0
7,00031b1e95af7921,Your vandalism to the Matt Shirvington article...,0,0,0,0,0,0
8,00037261f536c51d,Sorry if the word 'nonsense' was offensive to ...,0,0,0,0,0,0
9,00040093b2687caa,alignment on this subject and which are contra...,0,0,0,0,0,0


In [147]:
import collections
import re

def clean_punc(input_string):
    proc_string = input_string.replace('<',' <less ')
    proc_string = proc_string.replace('>',' <greater> ')
    proc_string = re.sub("https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)",' <url> ',proc_string)
    proc_string = proc_string.replace(' <less ',' <less> ')
    proc_string = proc_string.replace('?',' <question> ')
    proc_string = proc_string.replace('...',' <suspension> ')
    proc_string = proc_string.replace('. ',' <period> ')
    proc_string = proc_string if not proc_string.endswith('.') else proc_string[:-1]
    proc_string = proc_string.replace('/',' <slash> ')
    proc_string = proc_string.replace('\\',' <backslash> ')
    proc_string = proc_string.replace('; ',' <semicolon> ')
    proc_string = proc_string.replace(': ',' <colon> ')
    proc_string = proc_string.replace(', ',' <comma> ')
    proc_string = proc_string.replace('!',' <exclame> ')
    proc_string = proc_string.replace('\n',' <newline> ')
    proc_string = proc_string.replace(' - ',' <dash> ')
    proc_string = proc_string.replace('""',' <quote> ')
    proc_string = proc_string.replace('"',' <quote> ')
    proc_string = proc_string.replace('(',' <openbracket> ')
    proc_string = proc_string.replace(')',' <closebracket> ')
    return proc_string

def clean_word(input_word):
    out_word = input_word.lower()
    if ( out_word.startswith("'") and out_word.endswith("'")):
        out_word = out_word[1:-1]
    
    if len(out_word)>0:
        out_word = out_word if not out_word[-1] in ['.',':',';'] else out_word[:-1]
    
    return out_word
    
comments = [clean_punc(comment) for comment in train_data.comment_text]
comment_words = []
for comment in comments:
    comment_words.append ([word for word in comment.split()])
flatten = lambda l: [item for sublist in l for item in sublist]

flat_comments = flatten(comment_words)

word_counts = collections.Counter()
for word in flat_comments:
    word_counts[word]+=1

In [156]:
labels = train_data.as_matrix(columns=['toxic','severe_toxic','obscene','threat','insult','identity_hate'])


In [136]:
print(len(word_counts))

very_common = [word for word,_ in word_counts.most_common(100)]

very_common

279447


['the',
 '<comma>',
 '<period>',
 '<newline>',
 'to',
 '<quote>',
 'of',
 'and',
 'a',
 'you',
 'i',
 'is',
 'that',
 'in',
 'it',
 '<exclame>',
 'for',
 'this',
 'not',
 '<closebracket>',
 'on',
 '<openbracket>',
 'be',
 'as',
 'have',
 'are',
 '<question>',
 'your',
 'with',
 '<slash>',
 'if',
 'article',
 'was',
 'or',
 'but',
 'page',
 'my',
 'an',
 'from',
 'by',
 'do',
 'at',
 'about',
 'me',
 'wikipedia',
 'so',
 'can',
 'what',
 '<colon>',
 'there',
 'all',
 'talk',
 'has',
 'will',
 'please',
 'would',
 'no',
 'one',
 'just',
 'like',
 'they',
 'he',
 'which',
 'any',
 'been',
 'should',
 'more',
 'we',
 "don't",
 'some',
 'other',
 '<dash>',
 'who',
 'see',
 '<suspension>',
 'here',
 'also',
 'his',
 'think',
 'know',
 'because',
 'how',
 'am',
 "i'm",
 'people',
 'why',
 'edit',
 "it's",
 'only',
 'out',
 'up',
 'when',
 'use',
 'then',
 '<semicolon>',
 'articles',
 'may',
 'were',
 'time',
 'did']

In [120]:
filtered_words = [word for id,word in int_to_word.items() if word_counts[word]>4]
int_to_word = {key: word for key,word in enumerate(filtered_words)}
word_to_int = {word: key for key,word in enumerate(filtered_words)}


In [121]:
comment_ints = []
for comment in comment_words:
    comment_ints.append([word_to_int[word] for word in comment if word in word_to_int])
comment_lens = [len(comment) for comment in comment_ints]

In [122]:
print("Average: {}".format(sum(comment_lens)/float(len(comment_lens))))
for perc in range(5,101,5):
    print("{0} Percentile : {1}".format(perc,np.percentile(comment_lens,perc)))

Average: 76.66832945836023
5 Percentile : 6.0
10 Percentile : 9.0
15 Percentile : 13.0
20 Percentile : 16.0
25 Percentile : 19.0
30 Percentile : 23.0
35 Percentile : 27.0
40 Percentile : 31.0
45 Percentile : 36.0
50 Percentile : 41.0
55 Percentile : 47.0
60 Percentile : 54.0
65 Percentile : 62.0
70 Percentile : 72.0
75 Percentile : 85.0
80 Percentile : 103.0
85 Percentile : 129.0
90 Percentile : 172.0
95 Percentile : 260.0
100 Percentile : 4950.0


In [126]:
unmapped = [word for word in filtered_words if word not in model.vocab]

In [137]:
def map_word(in_word):
    out_vector = np.zeros(306)
    if in_word.isupper():
        out_vector[300] = 1 #Flag shouting
    if in_word.islower():
        out_vector[301] = 1 #Flag normal text
    work_word = in_word.lower()
    if work_word in very_common:
        out_vector[302] = 1 #Flag 100 most common words
    if work_word[0] == '<':
        out_vector[303] = 1 #Flag punctuation we replaced and return
        return out_vector
    
    if work_word in model.vocab:
        out_vector[:300] = model[work_word]
        return out_vector
    
    if work_word[0] == work_word[-1] and work_word[0] in ['_','*',"'"]:
        out_vector[304] = 1 #Flag words with emphasis
        work_word = work_word[1:-1]
    
    if len(work_word)>0:
        work_word = work_word if not work_word[-1] in ['.',':',';',',',"'"] else work_word[:-1]

    if work_word in model.vocab:
        out_vector[:300] = model[work_word]
        return out_vector
    
    out_vector[305] = 1 #Flag unknown words
    return out_vector

In [145]:
print(map_word('<comma>'))

[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0

In [148]:
def process_comment(input_comment):
    result_matrix = np.zeros((250,306))
    if (len(input_comment) == 0):
        return result_matrix
    
    input_comment = input_comment[:250]
    temp_matrix = [map_word(word) for word in input_comment]
    result_matrix[-len(input_comment):,:] = temp_matrix
    return result_matrix

In [153]:
#Test the process comment function
np.array(process_comment(comment_words[21])).shape

(250, 306)

In [150]:
features = []
for one_comment in comment_words:
    features.append(process_comment(one_comment))

features = np.array(features)

['You',
 'are',
 'gay',
 'or',
 'antisemmitian',
 '<question>',
 '<newline>',
 '<newline>',
 'Archangel',
 'WHite',
 'Tiger',
 '<newline>',
 '<newline>',
 'Meow',
 '<exclame>',
 'Greetingshhh',
 '<exclame>',
 '<newline>',
 '<newline>',
 'Uh',
 '<comma>',
 'there',
 'are',
 'two',
 'ways',
 '<comma>',
 'why',
 'you',
 'do',
 'erased',
 'my',
 'comment',
 'about',
 'WW2',
 '<comma>',
 'that',
 'holocaust',
 'was',
 'brutally',
 'slaying',
 'of',
 'Jews',
 'and',
 'not',
 'gays',
 '<slash>',
 'Gypsys',
 '<slash>',
 'Slavs',
 '<slash>',
 'anyone',
 '<suspension>',
 '<newline>',
 '<newline>',
 '1',
 '<dash>',
 'If',
 'you',
 'are',
 'anti-semitian',
 '<comma>',
 'than',
 'shave',
 'your',
 'head',
 'bald',
 'and',
 'go',
 'to',
 'the',
 'skinhead',
 'meetings',
 '<exclame>',
 '<newline>',
 '<newline>',
 '2',
 '<dash>',
 'If',
 'you',
 'doubt',
 'words',
 'of',
 'the',
 'Bible',
 '<comma>',
 'that',
 'homosexuality',
 'is',
 'a',
 'deadly',
 'sin',
 '<comma>',
 'make',
 'a',
 'pentagram',
 '