<h1 id="tocheading">Data Preprocessing</h1>
<div id="toc"></div>

In [22]:
%%javascript
$.getScript('../ipython_notebook_toc_nonumbers.js')

<IPython.core.display.Javascript object>

In [2]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from scipy.sparse import hstack
from scipy.special import logit, expit
import string
import re    #for regex
import nltk
from nltk.corpus import stopwords
import spacy
from nltk import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer 
from nltk.tokenize import word_tokenize, TweetTokenizer   
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import collections
import re
import sys
import pickle
import spacy
import enchant
import gensim
import operator



In [3]:
DATA_PATH = "../data/"

# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')
d = enchant.Dict("en_US")
nlp = spacy.load('en')
eng_stopwords = set(stopwords.words("english"))

In [4]:
train = pd.read_csv(DATA_PATH + 'raw/train.csv').fillna(' ')
test = pd.read_csv(DATA_PATH + 'raw/test.csv').fillna(' ')

In [5]:
train_text = train['comment_text']
test_text = test['comment_text']
all_text = pd.concat([train_text, test_text])

In [6]:
# Separate toxic comments from clean ones:
train_toxic = train[(train.toxic == 1) | (train.obscene == 1) | (train.threat == 1) | (train.insult == 1) | (train.identity_hate == 1)]
train_toxic.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
6,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0
12,0005c987bdfc9d4b,Hey... what is it..\n@ | talk .\nWhat is it......,1,0,0,0,0,0
16,0007e25b2121310b,"Bye! \n\nDon't look, come or think of comming ...",1,0,0,0,0,0
42,001810bf8c45bf5f,You are gay or antisemmitian? \n\nArchangel WH...,1,0,1,0,1,1
43,00190820581d90ce,"FUCK YOUR FILTHY MOTHER IN THE ASS, DRY!",1,0,1,0,1,0


In [7]:
with open("../ling_src/obscene_words.txt", "r") as f:
    content = f.readlines()
swear_words = set([x.strip() for x in content])
print('Number of swear words in vocabulary: ', len(swear_words))

Number of swear words in vocabulary:  1145


## TF-IDF

In [8]:
word_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 1),
    max_features=10000)
word_vectorizer.fit(all_text)
train_word_features = word_vectorizer.transform(train_text)
test_word_features = word_vectorizer.transform(test_text)

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


In [9]:
char_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='char',
    ngram_range=(1, 5),
    max_features=25000)
char_vectorizer.fit(all_text)
train_char_features = char_vectorizer.transform(train_text)
test_char_features = char_vectorizer.transform(test_text)

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


In [10]:
train_features_tfidf = hstack([train_char_features, train_word_features])
test_features_tfidf = hstack([test_char_features, test_word_features])

In [12]:
pickle.dump(train_features_tfidf, open(DATA_PATH + "train_features_tfidf.p", "wb"))
pickle.dump(test_features_tfidf, open(DATA_PATH + "test_features_tfidf.p", "wb"))

## Simple features

In [13]:
def add_simple_features(dataset_original):
    dataset = dataset_original.copy(deep = True)
    dataset['count_sent']=dataset["comment_text"].apply(lambda x: len(re.findall("\n",str(x)))+1)
    #Word count in each comment:
    dataset['count_word']=dataset["comment_text"].apply(lambda x: len(str(x).split()))
    #Unique word count
    dataset['count_unique_word']=dataset["comment_text"].apply(lambda x: len(set(str(x).split())))
    #Letter count
    dataset['count_letters']=dataset["comment_text"].apply(lambda x: len(str(x)))
    #punctuation count
    dataset["count_punctuations"] =dataset["comment_text"].apply(lambda x: len([c for c in str(x) if c in string.punctuation]))
    #upper case words count
    dataset["count_words_upper"] = dataset["comment_text"].apply(lambda x: len([w for w in str(x).split() if w.isupper()]))
    #title case words count
    dataset["count_words_title"] = dataset["comment_text"].apply(lambda x: len([w for w in str(x).split() if w.istitle()]))
    #Number of stopwords
    dataset["count_stopwords"] = dataset["comment_text"].apply(lambda x: len([w for w in str(x).lower().split() if w in eng_stopwords]))
    #Average length of the words
    dataset["mean_word_len"] = dataset["comment_text"].apply(lambda x: np.mean([len(w) for w in str(x).split()]))
    #derived features
    #Word count percent in each comment:
    dataset['word_unique_percent']=dataset['count_unique_word']*100/dataset['count_word']
    #derived features
    #Punct percent in each comment:
    dataset['punct_percent']=dataset['count_punctuations']*100/dataset['count_word']
    dataset["count_swear_words"] = dataset["comment_text"].apply(lambda x: len([w for w in str(x).split() if w.lower() in swear_words]))
    return dataset

In [14]:
train = add_simple_features(train)
test = add_simple_features(test) 

  out=out, **kwargs)


In [16]:
train.to_csv(DATA_PATH + 'preprocessed/train.csv')
test.to_csv(DATA_PATH + 'preprocessed/test.csv')

In [None]:
### code for finding potential obscene words that were not in the original list

# toxic_vocab = []
# for idx, row in train_toxic.iterrows():
#     text = str(row['comment_text']).lower()
#     text = re.sub('([.,!?()])', r' \1 ', text)
#     text = re.sub('\s{2,}', ' ', text)
#     for w in text.split():
#         if w not in eng_stopwords:
#             toxic_vocab.append(w)

# c = collections.Counter(toxic_vocab)

# sid = SentimentIntensityAnalyzer()

# for x in set(toxic_vocab):
#     if x not in swear_words:
#         score = sid.polarity_scores(x)
#         if score['compound'] < 0:
#             print(x)

# for x in c.most_common(50):
#     if x[0] not in swear_words:
#         print(x[0])


## Cleaning

### emoticons

Replacing emoticons with their word meaning.

In [17]:
emoticons = {":-)": "happy", ":)": "happy", ":-]"":]": "happy",":-3": "happy",":3": "happy",":->": "happy",":>": "happy", \
                 "8-)": "happy","8)": "happy",":-}": "happy",":}": "happy",":o)": "happy",":c)": "happy",":^)": "happy", \
                 "=]": "happy","=)": "happy",":-D": "happy",":D": "laugh", "8-D": "laugh","8D": "laugh", "x-D": "laugh", \
                 "xD": "laugh", "X-D": "laugh","XD": "laugh", "=D": "laugh","=3": "happy", "B^D": "laugh",":-(": "sad", \
                 ":(": "sad",":-c": "sad",":c": "sad",":-<": "sad",":<": "sad",":-[": "sad",":[": "sad",":-||": "sad", \
                 ">:[": "angry",":{": "sad",":@": "sad",">:(": "angry",";-)": "wink",";)": "wink","*-)": "wink", \
                 "*)": "wink",";-]": "wink",";]": "wink",";^)": "wink",":-,": "wink",";D": "laugh", \
                 ":-/": "scepticism",":/": "scepticism",":-.": "scepticism",">:\\": "angry",">:/": "angry", \
                 ":\\": "scepticism","=/": "scepticism","=\\": "scepticism",":L": "scepticism","=L": "scepticism", \
                 ":S": "scepticism"}
emoticons_re = {}
for key, val in emoticons.items():
    new_key = key
    for c in new_key:
        if c in ['[','\\','^','$','.','|','?','*','+','(',')']:
            new_key = new_key.replace(c, "\\" + c)
        new_key = new_key.replace("\\\|", "\\|")
    regex = re.compile(new_key + "+")
    emoticons_re[regex] = val

In [18]:
def replace_emoticons(text, tag = 0):
    transformed_text = text
    for emoticon in emoticons_re.keys():
        if emoticon.search(text):
            for m in emoticon.finditer(text):
                if tag:
                    placeholder = " [EMOTICON:" + emoticons_re[emoticon] + "] "
                else:
                    placeholder = " " + emoticons_re[emoticon] + " "
                transformed_text = transformed_text.replace(m.group(), placeholder)
    return transformed_text


### lowercase + tokens + lemmata + spellcheck

Converting to lowercase, removing multiple spaces, usernames, ip, users. Tokenization, spellchecking, lemmatization.

In [19]:
def clean(comment):
    
    comment = comment.lower() #Convert to lower case
    comment = re.sub("\\n"," ",comment)
    comment = comment.strip() #remove \n
    comment = re.sub(' +',' ',comment)
    comment = re.sub("\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}","",comment)  # remove leaky elements like ip,user
    comment = re.sub("\[\[.*\]","",comment)     # removing usernames
    comment = replace_emoticons(comment)
    
    tokens = [word.text for word in nlp(comment)]
    original_tokens = tokens
    tokens = [token for token in tokens if not token in eng_stopwords]
    words_spellchecked = [d.suggest(token)[0] if not d.check(token) and len(d.suggest(token)) > 0 and not token in string.punctuation else token for token in tokens ]
    lemmata = [word.lemma_ for word in nlp(comment)]
    
    clean_comment = " ".join(tokens)
    spellchecked_comment = " ".join(words_spellchecked)
    return clean_comment, spellchecked_comment, lemmata, tokens, original_tokens

In [20]:
example_text = train_toxic['comment_text'].values[1]
print(example_text)

clean_comment, spellchecked_comment, lemmata, tokens, original_tokens = clean(train_toxic['comment_text'].values[1])

print(clean_comment)

print(tokens)

print(original_tokens)

print(lemmata)

Hey... what is it..
@ | talk .
What is it... an exclusive group of some WP TALIBANS...who are good at destroying, self-appointed purist who GANG UP any one who asks them questions abt their ANTI-SOCIAL and DESTRUCTIVE (non)-contribution at WP?



In [21]:
# Spellchecking doesn't make a lot of sense:
print(spellchecked_comment)



In [22]:
def add_linguistic_features(dataset_original):
    dataset = dataset_original.copy(deep = True)
    dataset.loc[:, 'lemmata'] = ''
    dataset.loc[:, 'lemmata'] = dataset.loc[:, 'lemmata'].astype(object)
    dataset.loc[:, 'tokens'] = ''
    dataset.loc[:, 'tokens'] = dataset.loc[:, 'tokens'].astype(object)
    dataset.loc[:, 'original_tokens'] = ''
    dataset.loc[:, 'original_tokens'] = dataset.loc[:, 'original_tokens'].astype(object)
    for idx, row in dataset.iterrows():
        clean_comment, spellchecked_comment, lemmata, tokens, original_tokens = clean(row['comment_text'])
        dataset.set_value(idx, 'clean_comment', clean_comment)
        dataset.set_value(idx, 'spellchecked_comment', spellchecked_comment)
        dataset.set_value(idx, 'lemmata', lemmata)
        dataset.set_value(idx, 'tokens', tokens)
        dataset.set_value(idx, 'original_tokens', original_tokens)
    dataset["sentiment"] = dataset["comment_text"].apply(lambda x: sid.polarity_scores(x)['compound'])
    dataset["mean_sentiment"] = dataset["comment_text"].apply(lambda x: np.mean([sid.polarity_scores(w)['compound'] for w in x.split()])) 
    return dataset

In [None]:
train = add_linguistic_features(train)
test = add_linguistic_features(test)

train.to_csv(DATA_PATH + 'preprocessed/train.csv')
test.to_csv(DATA_PATH + 'preprocessed/test.csv')

## Embeddings & Out-of-vocabulary

These files are too big for git, can be acquired here:
* Glove https://nlp.stanford.edu/projects/glove/
* word2vec https://code.google.com/archive/p/word2vec/


In [12]:
EMBEDDING_FOLDER = "../../../embeddings/"
GLOVE_PATH = EMBEDDING_FOLDER + "glove.6B.300d.txt"
WORD2VEC_PATH = EMBEDDING_FOLDER + "GoogleNews-vectors-negative300.bin"

In [7]:
def get_coefs(word,*arr):
    return word, np.asarray(arr, dtype='float32')
    
def load_pretrained(word_embedding_type):
    if word_embedding_type == 'Glove':
        embeddings_index = dict(get_coefs(*o.strip().split()) for o in open(GLOVE_PATH))
        embedding_model = np.stack(embeddings_index.values())
        vocab_embedding = set(embeddings_index.keys())
    elif word_embedding_type == 'word2vec':
        embedding_model = gensim.models.KeyedVectors.load_word2vec_format(WORD2VEC_PATH, binary = True, unicode_errors = 'ignore')
        vocab_embedding = embedding_model.vocab
    return embedding_model, vocab_embedding

def check_word_vocab(word, vocab):
    if word in vocab:
        return True, [word], 'none'
    elif word.lower() in vocab:
        return True, [word.lower()], 'lower'
    elif word.capitalize() in vocab:
        return True, [word.capitalize()], 'capitalize'
    elif word.lower().capitalize() in vocab:
        return True, [word.lower().capitalize()], 'lower+capitalize'
    else:
        return False, None, None

def generate_word_vocab(datasets):
    vocab = set()
    cntr = 0
    for dataset in datasets:
        for idx, row in dataset.iterrows():
            print("Progress: %0.0f %%" % (100*cntr/len(dataset)), end = "\r")
            cntr += 1
            sys.stdout.flush()
            vocab.update([token.text for token in nlp(row['comment_text'])])
    sys.stdout.flush()
    vocab = list(vocab)
    vocab.sort()
    print("vocab size: ", len(vocab))
    vocab_encoded = list2encoded_vocab(vocab)
    return vocab_encoded

def list2encoded_vocab(lst):
    return {word: idx for idx, word in enumerate(lst)}

def filter_term(term, case_replacements, mode, spellcheck_replacements = {}):
    if term in spellcheck_replacements.keys():
        term = spellcheck_replacements[term]
    if term in case_replacements.keys():
        if case_replacements[term] == 'none':
            return term
        elif case_replacements[term] == 'lower':
            return term.lower()
        elif case_replacements[term] == 'capitalize':
            return term.capitalize()
        else:
            return term.lower().capitalize()
    else:
        if term in string.punctuation:
            return ''
        else:
            if mode == "replace":
                return '#OOV#'
            elif mode == "remove":
                return ''
            
            
def find_oov(vocab_encoded, vocab_embedding, spell_check_on = False):
    
    '''
    Separates vocabulary into the words that are present in word embedding model and that are not.
    Input:
        vocab_encoded - existing vocabulary in a form {'cat': 0, 'hog': 1, ... }
        vocab_embedding - the vocabulary of word embeddings models
        spell_check_on - whether to try to find a correction so that the word will belong to vocabulary (optional)
    Output:
        vocab_clean - vocabulary with the words present in word embedding models in a form {'cat': 0, 'hog': 1, ... }
        oov - list of unique out-of-vocabulary (in respect to word embedding model) words
    '''
    
    print(spell_check_on)
    
    vocab_clean = set()
    case_replacements = {}
    if spell_check_on:
        spellcheck_replacements = {}
    oov = []
    vocab_sorted = sorted(vocab_encoded.items(), key=operator.itemgetter(1))
    for i, elem in enumerate(vocab_sorted):
        print("Progress: %0.0f %%" % (100*i/len(vocab_sorted)), end="\r")
        sys.stdout.flush()
        res = check_word_vocab(elem[0], vocab_embedding)
        if res[0]:
            vocab_clean.update(res[1])
            case_replacements[elem[0]] = res[2]
        else:
            if spell_check_on:
                is_continue = True
                if len(elem[0]) > 3:
                    for x in elem[0]:
                        if x in string.punctuation:
                            oov.append(elem[0])
                            is_continue = False
                    if is_continue:
                        spell_check = d.suggest(elem[0])
                        if len(spell_check) > 0:
                            res = check_word_vocab(spell_check[0], vocab_embedding)
                            if res[0]:
                                vocab_clean.update(res[1])
                                case_replacements[elem[0]] = res[2]
                                spellcheck_replacements[elem[0]] = spell_check[0]
                            else:
                                oov.append(elem[0])
                else:
                    oov.append(elem[0])
            else:
                oov.append(elem[0])
    vocab_clean = list2encoded_vocab(list(vocab_clean))
    oov = list(set(oov))
    if spell_check_on:
         return vocab_clean, oov, case_replacements, spellcheck_replacements
    return vocab_clean, oov, case_replacements, {}

def pprint_oov(oov):
    for a, b, c in zip(oov[200:290][::3], oov[200:290][1::3], oov[200:290][2::3]):
        print('{:<30}{:<30}{:<}'.format(a, b, c))

In [9]:
train_clean = pd.read_csv(DATA_PATH + 'preprocessed/train_ling.csv')
test_clean = pd.read_csv(DATA_PATH + 'preprocessed/test_ling.csv')

In [13]:
embedding_model, vocab_embedding = load_pretrained("word2vec")

In [14]:
len(vocab_embedding)

3000000

In [15]:
vocab = generate_word_vocab([train_clean])

vocab size:  265198


In [16]:
pickle.dump(vocab, open("../data/preprocessed/vocab_word2vec.p", "wb"))

In [18]:
vocab_clean, oov, case_replacements, spellcheck_replacements = find_oov(vocab, vocab_embedding)

False
Progress: 100 %rogress: 24 %Progress: 25 %Progress: 27 %Progress: 27 %Progress: 28 %Progress: 28 %Progress: 36 %Progress: 37 %Progress: 38 %Progress: 39 %Progress: 40 %Progress: 42 %Progress: 43 %Progress: 45 %Progress: 47 %Progress: 48 %Progress: 49 %Progress: 50 %Progress: 51 %Progress: 52 %Progress: 56 %Progress: 58 %Progress: 59 %Progress: 59 %Progress: 60 %Progress: 61 %Progress: 62 %Progress: 63 %Progress: 65 %Progress: 67 %Progress: 70 %Progress: 71 %Progress: 72 %Progress: 73 %Progress: 74 %Progress: 76 %Progress: 76 %Progress: 78 %Progress: 80 %Progress: 81 %Progress: 81 %Progress: 82 %Progress: 82 %Progress: 85 %Progress: 88 %Progress: 90 %Progress: 92 %Progress: 92 %Progress: 93 %Progress: 95 %Progress: 96 %Progress: 99 %Progress: 100 %

In [19]:
pprint_oov(oov)

Hawkens1993                   1477                          A2-AB)/(A2-AB
Ichiki                        Kiapiz                        CO/4302/2002TABDate
Radio(Between                 autheticated                  http://www.stopwar.org.uk/index.php/usa-war-on-terror/2158-michael-moore-why-i-dont-support-the-troops-america-and-neither-do-you
Cculber007                    -Wiggalama                    I.K.Gill
CP30777                       94.4.32.46                    Stevertigo
url}.                         eybaná                        theireven
Wikivoyage                    http://books.google.co.in/books?ei=5RiUTYfiJMrWrQfh3JnzCw&ct;=result&id;=41MIAAAAQAAJ&dq;=abhira+afghanistan&q;=abhirasnewbiiee
recreator                     5.6.5                         BlazikenMaster
http://www.thekillersmusic.com/story/news/artwork_for_happy_birthday_guadalupeVGMusic                       pleeeeeeeeeeeeeaaaaaaaaaaaaaaaaaaaaaassssssssssssssssseeeeeeeeeeeee!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!1111111

In [20]:
pickle.dump(vocab_clean, open("../data/preprocessed/vocab_clean_word2vec.p", "wb"))
pickle.dump(oov, open("../data/preprocessed/oov_word2vec.p", "wb"))
pickle.dump(case_replacements, open("../data/preprocessed/case_replacements_word2vec.p", "wb"))
pickle.dump(spellcheck_replacements, open("../data/preprocessed/spellcheck_replacements_word2vec.p", "wb"))

## contractions

In [24]:
contractions = pickle.load(open("../ling_src/contractions.p", "rb"))

In [31]:
def expand_contractions(text):
    for key, val in contractions.items():
        text = text.replace(key, val)
    return text

In [32]:
expand_contractions('I didn\'t see him')

'I did not see him'

In [33]:
train['comment_text'] = train['comment_text'].apply(expand_contractions)

In [34]:
test['comment_text'] = test['comment_text'].apply(expand_contractions)

In [36]:
train.to_csv(DATA_PATH + 'preprocessed/train_expanded_contractions.csv')
test.to_csv(DATA_PATH + 'preprocessed/test_expanded_contractions.csv')