In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load in

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import re
import io
from contextlib import redirect_stdout

import nltk
from nltk.tokenize import word_tokenize

from tqdm import tqdm
import hunspell
import editdistance #Levenshtein distance

from joblib import Parallel, delayed
import multiprocessing

In [2]:
# Use the GloVe vocabulary as our list of valid words.
# Each line is a token followed by 300 vector coefficients; the coefficients
# are discarded and only the token is kept. The token is everything before the
# last 300 whitespace-separated fields, re-joined with single spaces in case
# the token itself contains whitespace.
# The file is opened in a `with` block so the handle is closed even if reading
# fails, and it is decoded explicitly as UTF-8 rather than with the platform
# default encoding.
with open('./glove.840B.300d.txt', encoding='utf-8') as glove_file:
    valid_words = [' '.join(line.split()[:-300]) for line in glove_file]

In [3]:
# Number of CPU cores reported by the OS; used as the joblib worker count.
num_cores = multiprocessing.cpu_count()

# Hunspell spell checker built from the local dictionary (.dic) and affix (.aff) files.
spellchecker = hunspell.HunSpell('./index.dic', './index.aff')

# One bad word per line. A dictionary mapping variant spellings of the same
# bad word to a canonical form might be better than a flat list.
# `with` guarantees the file is closed even if reading fails.
with open("badwords.txt", "r") as badwords_file:
    badwords = badwords_file.read().splitlines()

# A set gives much faster membership tests than a list. The bad words are
# added as well so they are treated as known words. Updating the set in place
# avoids building a temporary concatenated copy of the whole vocabulary list.
word_set = set(valid_words)
word_set.update(badwords)
del valid_words  # clean up: the list is no longer needed once the set exists
# Pre-compiled patterns used by the sentence cleaner. Every match is replaced
# with the empty string, so each pattern describes text to be deleted.

# http:// or https:// followed by a run of URL characters / %XX escapes.
regex_url = re.compile(r'''http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+''')

# E-mail addresses. Only lowercase letters are listed in the character classes,
# so the pattern expects text that has already been lowercased.
regex_email = re.compile(r'''(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])''')

# Bracketed "[..._link: ...]" markers, matched case-insensitively.
# NOTE(review): both `.*` parts are greedy, so on a single line the match runs
# from the first "[" to the last "]" -- confirm this is intended.
regex_underscorelink = re.compile(r'\[(.*)_link:.*\]',re.IGNORECASE)

# "[[category...]]", "[[wikipedia...]]" and "[[user...]]" tags (greedy as well).
regex_tags = re.compile(r'\[\[(category|wikipedia|user).*\]\]')

# Apostrophes that are preceded by a non-letter, or not followed by a letter.
# Apostrophes inside words (e.g. "don't") are left alone.
regex_strayapos = re.compile(r'''((?<=[^a-zA-Z])'|'(?![a-zA-Z]))''')

# Runs of digits.
regex_nums = re.compile(r'\d+')

# Any single character outside the 7-bit ASCII range.
regex_notascii = re.compile(r'[^\x00-\x7f]')

# Punctuation characters to strip; the apostrophe is deliberately absent.
# The backslashes are written as explicit "\\" escapes: the previous spelling
# relied on the invalid escape sequence "\(", which Python flags with a
# warning. The resulting string value is unchanged.
punct = '''!"#$%&\\()*+,-./:;<=>?@[\\]^_`{|}~'''

def _correct_word(word, max_edit_distance):
    """Return the spell-corrected form of an unknown `word`, or `word` itself.

    The first hunspell suggestion is used. The word is returned unchanged when
    hunspell offers no suggestion, or when the suggestion is more than
    `max_edit_distance` Levenshtein edits away from the original.
    """
    suggestions = spellchecker.suggest(word)
    if not suggestions:
        return word  # No suggestions found, keep it as-is

    candidate = suggestions[0]  # Take the first suggestion
    if editdistance.eval(word, candidate) > max_edit_distance:
        return word  # Suggestion looks too different from word, leave it alone
    return candidate


def clean_sentence(dirty_sentence, max_edit_distance=3):
    """Normalise one comment string and spell-correct its unknown words.

    Steps, in order: lowercase; delete "[..._link: ...]" markers, wiki-style
    "[[...]]" tags, URLs, e-mail addresses, digits, non-ASCII characters and
    stray apostrophes; delete every character listed in `punct`; tokenize with
    `word_tokenize`; keep tokens found in `word_set` and pass the rest through
    the spell checker.

    Parameters
    ----------
    dirty_sentence : str
        Raw text to clean.
    max_edit_distance : int, optional
        Largest Levenshtein distance at which a spelling suggestion is still
        accepted. Defaults to 3, the previously hard-coded threshold.

    Returns
    -------
    str
        The resulting tokens joined by single spaces.
    """
    text = dirty_sentence.lower()

    # Order matters: lowercasing comes first because the e-mail pattern only
    # lists lowercase letters, and the apostrophe clean-up runs after the other
    # deletions so it sees the text they leave behind.
    removal_patterns = (
        regex_underscorelink,  # everything inside [wiki_link: ...] etc
        regex_tags,            # some extra tags
        regex_url,             # URLs
        regex_email,           # e-mails
        regex_nums,            # numbers
        regex_notascii,        # non-ASCII (hunspell has some difficulties otherwise)
        regex_strayapos,       # stray apostrophes
    )
    for pattern in removal_patterns:
        text = pattern.sub('', text)

    # Delete punctuation (characters are removed, not replaced by spaces);
    # `punct` does not contain the apostrophe, so it survives.
    text = text.translate(str.maketrans('', '', punct))

    return " ".join(
        token if token in word_set else _correct_word(token, max_edit_distance)
        for token in word_tokenize(text)
    )

In [5]:
# Training split: read the CSV, then pull out the comment column as an array,
# substituting the placeholder "CVxTz" for missing comments.
train = pd.read_csv("./train.csv")
list_sentences_train = train["comment_text"].fillna("CVxTz").values

# Test split: same treatment.
test = pd.read_csv("./test.csv")
list_sentences_test = test["comment_text"].fillna("CVxTz").values

In [6]:
# Standard output produced while cleaning is redirected into log.txt.
# To monitor progress from a shell: tail -f log.txt
print("Cleaning {:d} sentences".format(len(list_sentences_train)))

# A single `with` owns both the log file and the stdout redirection, so the
# file is closed and stdout is restored even if a worker raises.
with open("log.txt", "w") as log_file, redirect_stdout(log_file):
    clean_train = Parallel(n_jobs=num_cores, verbose=50, batch_size=2)(
        delayed(clean_sentence)(sentence) for sentence in list_sentences_train
    )

# Overwrite the raw comments with the cleaned text and save the result.
train["comment_text"] = clean_train
train.to_csv("train_cleaned.csv", index=False)

Cleaning 95851 sentences
done cleaning
starting to save


In [8]:
# Standard output produced while cleaning is redirected into log.txt
# (opened in "w" mode, so any previous log content is overwritten).
print("Cleaning {:d} sentences".format(len(list_sentences_test)))

# A single `with` owns both the log file and the stdout redirection, so the
# file is closed and stdout is restored even if a worker raises.
with open("log.txt", "w") as log_file, redirect_stdout(log_file):
    clean_test = Parallel(n_jobs=num_cores, verbose=50, batch_size=2)(
        delayed(clean_sentence)(sentence) for sentence in list_sentences_test
    )

# Overwrite the raw comments with the cleaned text and save the result.
test["comment_text"] = clean_test
test.to_csv("test_cleaned.csv", index=False)

Cleaning 226998 sentences
