In [1]:
import pandas as pd
import numpy as np
import tqdm
import time
import re
import string
import gc
from tqdm._tqdm_notebook import tqdm_notebook

In [2]:
# Contraction / abbreviation rewrite rules as (regex, replacement) pairs of bytes.
#
# Raw bytes literals (rb"...") are used so that regex escapes such as \w and the
# \g<1> group reference reach the `re` module verbatim; written in a non-raw literal
# they are invalid string escapes, which Python flags with a warning.
#
# Order matters: the specific forms ("won't", "can't", "ain't") must come before the
# generic "(\w+)n't" rule, otherwise "won't" would be rewritten to "wo not".
#
# NOTE(review): clean_text lower-cases its input before applying these rules, so the
# upper-case "US" / "IT" rules (and the upper-case alternatives such as "(W|w)")
# cannot match there -- confirm whether they were meant to run before lower-casing.
cont_patterns = [
    (rb"US", rb"United States"),
    (rb"IT", rb"Information Technology"),
    (rb"(W|w)on't", rb"will not"),
    (rb"(C|c)an't", rb"can not"),
    (rb"(I|i)'m", rb"i am"),
    (rb"(A|a)in't", rb"is not"),
    (rb"(\w+)'ll", rb"\g<1> will"),
    (rb"(\w+)n't", rb"\g<1> not"),
    (rb"(\w+)'ve", rb"\g<1> have"),
    (rb"(\w+)'s", rb"\g<1> is"),
    (rb"(\w+)'re", rb"\g<1> are"),
    (rb"(\w+)'d", rb"\g<1> would"),
]
# Pre-compile every rule once so that clean_text does not recompile them per comment.
patterns = [(re.compile(regex), repl) for (regex, repl) in cont_patterns]

# Regexes used by clean_text, compiled once instead of on every call.
# Raw bytes literals keep \d and \s as regex escapes rather than (invalid) string escapes.
_PUNCTUATION_RE = re.compile(b"[%s]" % re.escape(string.punctuation.encode("utf-8")))
_DIGITS_RE = re.compile(rb"\d+")
_WHITESPACE_RE = re.compile(rb"\s+")


def clean_text(text):
    """Simple text clean-up process.

    Steps, in order: lower-case the text, turn newline / tab / backspace /
    carriage-return characters into spaces, expand English contractions using the
    module-level ``patterns`` rules, delete ASCII punctuation, replace digit runs
    with a space, and finally collapse whitespace runs and trim both ends.

    Args:
        text: the comment to clean; must be a ``str`` (``.lower()`` is called on it).

    Returns:
        The cleaned comment as a ``str`` with words separated by single spaces and
        no leading or trailing whitespace.
    """
    # 1. Go to lower case (only good for English).
    # The work is done on UTF-8 bytes, matching the bytes regexes in ``patterns``.
    clean = text.lower().encode("utf-8")
    # 2. Replace \n, \t, \b (backspace) and \r with a space.
    for control_char in (b"\n", b"\t", b"\b", b"\r"):
        clean = clean.replace(control_char, b" ")
    # 3. Replace English contractions.
    for (pattern, repl) in patterns:
        clean = re.sub(pattern, repl, clean)
    # 4. Drop punctuation (string.punctuation, i.e. ASCII punctuation only).
    clean = _PUNCTUATION_RE.sub(b"", clean)
    # 5. Drop numbers; a space is substituted so neighbouring words are not glued together.
    clean = _DIGITS_RE.sub(b" ", clean)
    # 6. Remove extra spaces: the previous steps multiplied space occurrences.
    # strip() trims BOTH ends; trimming only the tail left a leading space whenever
    # the comment started with digits or a punctuation-only token.
    clean = _WHITESPACE_RE.sub(b" ", clean).strip()
    # (Disabled) step 7 would wrap every word in '#' signs,
    # e.g. "my name is bond" -> "#my# #name# #is# #bond#".

    return clean.decode("utf-8")

In [3]:
def preprocess(df):
    """Add a ``clean_comment`` column to ``df`` in place.

    Every value of ``df["comment_text"]`` is passed through ``clean_text`` and the
    results are stored under ``df["clean_comment"]``. Nothing is returned.
    """
    raw_comments = df["comment_text"]
    df["clean_comment"] = raw_comments.apply(clean_text)

In [4]:
# Toggle for quick iteration: when True only the first 100 rows of each frame are kept.
SMALL_DATA = True
print("--- Loading Files")
start_time = time.time()
train = pd.read_hdf('../input/train.h5')
test = pd.read_hdf('../input/test.h5')
print("--- Finished Loading %s" % (time.time() - start_time))

if SMALL_DATA:
    print("Using small data")
    # .copy() detaches the subsets from the full frames, so that preprocess() adding
    # a column below does not write into a slice of the original data.
    train = train[:100].copy()
    test = test[:100].copy()

print("--- Preprocess")
start_time = time.time()
# preprocess() adds the "clean_comment" column to each frame in place.
preprocess(train)
preprocess(test)
print("--- Finished preprocess %s" % (time.time() - start_time))

--- Finished Loading 2.366823434829712
Using small data
--- Finished preprocess 0.05950498580932617


In [5]:
# Columns retained in train: the cleaned text plus the toxicity label columns.
keep_f = ('target', 'clean_comment', 'severe_toxicity', 'obscene',
          'identity_attack', 'insult', 'threat')
# Everything else is dropped from train in place.
drop_f = [col for col in train.columns if col not in keep_f]
train.drop(columns=drop_f, inplace=True)
gc.collect()
# Last expression of the cell: shows the first rows of the trimmed frame.
train.head()

Unnamed: 0,target,severe_toxicity,obscene,identity_attack,insult,threat,clean_comment
0,0.0,0.0,0.0,0.0,0.0,0.0,this is so cool it is like would you want your...
1,0.0,0.0,0.0,0.0,0.0,0.0,thank you this would make my life a lot less a...
2,0.0,0.0,0.0,0.0,0.0,0.0,this is such an urgent design problem kudos to...
3,0.0,0.0,0.0,0.0,0.0,0.0,is this something i will be able to install on...
4,0.893617,0.021277,0.0,0.021277,0.87234,0.0,haha you guys are a bunch of losers


In [6]:
# import embeddings
