<h1 id="tocheading">Data Preprocessing</h1>
<div id="toc"></div>

In [1]:
%%javascript
$.getScript('../ipython_notebook_toc_nonumbers.js')

<IPython.core.display.Javascript object>

In [2]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from scipy.sparse import hstack
from scipy.special import logit, expit
import string
import re    #for regex
import nltk
from nltk.corpus import stopwords
import spacy
from nltk import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer 
from nltk.tokenize import word_tokenize, TweetTokenizer   
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import collections
import re
import sys
import pickle
import spacy
import enchant
import gensim
import operator
import sys, datetime



In [3]:
DATA_PATH = "../data/"
sid = SentimentIntensityAnalyzer()
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')
d = enchant.Dict("en_US")
nlp = spacy.load('en')
eng_stopwords = set(stopwords.words("english"))

In [4]:
train = pd.read_csv(DATA_PATH + 'raw/train.csv').fillna(' ')
test = pd.read_csv(DATA_PATH + 'raw/test.csv').fillna(' ')

In [5]:
train_text = train['comment_text']
test_text = test['comment_text']
all_text = pd.concat([train_text, test_text])

In [6]:
# Separate toxic comments from clean ones:
train_toxic = train[(train.toxic == 1) | (train.obscene == 1) | (train.threat == 1) | (train.insult == 1) | (train.identity_hate == 1)]
train_toxic.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
6,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0
12,0005c987bdfc9d4b,Hey... what is it..\n@ | talk .\nWhat is it......,1,0,0,0,0,0
16,0007e25b2121310b,"Bye! \n\nDon't look, come or think of comming ...",1,0,0,0,0,0
42,001810bf8c45bf5f,You are gay or antisemmitian? \n\nArchangel WH...,1,0,1,0,1,1
43,00190820581d90ce,"FUCK YOUR FILTHY MOTHER IN THE ASS, DRY!",1,0,1,0,1,0


In [7]:
with open("../ling_src/obscene_words.txt", "r") as f:
    content = f.readlines()
swear_words = set([x.strip() for x in content])
print('Number of swear words in vocabulary: ', len(swear_words))

Number of swear words in vocabulary:  1166


## remove rare & common words

In [7]:
train_toxic['tokens'] = train_toxic['comment_text'].apply(nltk.tokenize.word_tokenize)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [10]:
freq_dist = nltk.FreqDist([x for y in train_toxic['tokens'] for x in y])

In [12]:
len(freq_dist)

52026

In [36]:
rare_words = list(freq_dist.most_common())[-1000:]

In [37]:
common_words = list(freq_dist.most_common())[:1000]

In [39]:
def remove_rare_word(tokens):
    return [word for word in tokens if word not in rare_words]
def remove_common_words(tokens):
    return [word for word in tokens if word not in common_words]

In [40]:
train_toxic['no_rare_tokens'] = train_toxic['tokens'].apply(remove_rare_word)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [41]:
train_toxic['no_common_tokens'] = train_toxic['tokens'].apply(remove_common_words)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


## spellchecker

In [42]:
from textblob import TextBlob

In [43]:
def convert_textblob(text:str):
    return TextBlob(text)

In [44]:
train_toxic['textblob'] = train_toxic['comment_text'].apply(convert_textblob)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [46]:
def textblob_spellcheck(textblob):
    return textblob.correct()

In [None]:
train_toxic['textblob_spellcheck'] = train_toxic['textblob'].apply(textblob_spellcheck)

## hatebase speech

In [None]:
words_ethnicity = pickle.load(open("../ling_src/words_ethnicity.p", 'rb'))

## language

In [7]:
from polyglot.detect import Detector

In [10]:
from alphabet_detector import AlphabetDetector
ad = AlphabetDetector()

In [29]:
def determine_language(x):
    try:
        return Detector(x).language.code
    except Exception as e:
        return 'unknown'
def determine_alphabet(x):
    try:
        return ' '.join(ad.detect_alphabet(x))
    except Exception as e:
        return 'unknown'

In [30]:
train_toxic['alphabet'] = train_toxic['comment_text'].apply(determine_alphabet)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [20]:
train_toxic['lang'] = train_toxic['comment_text'].apply(determine_language)

Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to dete

Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to dete

Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to dete

In [31]:
import collections
collections.Counter(train_toxic['alphabet'].values) 

Counter({'ARABIC LATIN': 4,
         'CJK LATIN': 5,
         'CUNEIFORM LATIN': 1,
         'FULLWIDTH LATIN': 1,
         'GREEK LATIN': 16,
         'GREEK LATIN CYRILLIC': 2,
         'HEBREW ARABIC LATIN': 1,
         'HEBREW LATIN': 2,
         'KATAKANA LATIN': 2,
         'LATIN': 16174,
         'LATIN CYRILLIC': 14,
         'RUNIC LATIN': 1,
         'SCRIPT LATIN': 1,
         'SINHALA LATIN': 1})

In [35]:
train['alphabet'] = train['comment_text'].apply(determine_alphabet)

In [36]:
collections.Counter(train['alphabet'].values) 

Counter({'': 7,
         'ARABIC LATIN': 54,
         'ARABIC LATIN CYRILLIC': 2,
         'BENGALI LATIN': 13,
         'BOPOMOFO LATIN': 1,
         'CJK ARABIC LATIN': 1,
         'CJK ARABIC LATIN CYRILLIC': 1,
         'CJK HIRAGANA LATIN CYRILLIC': 2,
         'CJK LATIN': 258,
         'CJK LATIN HIRAGANA': 47,
         'CUNEIFORM LATIN': 37,
         'CUNEIFORM RUNIC LATIN': 1,
         'DEVANAGARI BENGALI LATIN': 1,
         'DEVANAGARI LATIN': 45,
         'DOUBLE-STRUCK LATIN': 1,
         'ETHIOPIC LATIN': 19,
         'FULLWIDTH LATIN': 1,
         'GOTHIC MODIFIER GREEK LATIN': 1,
         'GREEK ARABIC LATIN CYRILLIC': 1,
         'GREEK CYRILLIC LATIN HIRAGANA': 1,
         'GREEK ETHIOPIC LATIN': 1,
         'GREEK HEBREW LATIN': 2,
         'GREEK LATIN': 416,
         'GREEK LATIN CYRILLIC': 27,
         'GREEK MASCULINE LATIN': 1,
         'GREEK MODIFIER LATIN CYRILLIC': 2,
         'GURMUKHI LATIN': 4,
         'GURMUKHI MODIFIER DEVANAGARI ARABIC LATIN': 1,
     

## TF-IDF

In [8]:
word_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 1),
    max_features=10000)
word_vectorizer.fit(all_text)
train_word_features = word_vectorizer.transform(train_text)
test_word_features = word_vectorizer.transform(test_text)

In [9]:
char_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='char',
    ngram_range=(1, 5),
    max_features=25000)
char_vectorizer.fit(all_text)
train_char_features = char_vectorizer.transform(train_text)
test_char_features = char_vectorizer.transform(test_text)

In [10]:
train_features_tfidf = hstack([train_char_features, train_word_features])
test_features_tfidf = hstack([test_char_features, test_word_features])

In [11]:
pickle.dump(train_features_tfidf, open(DATA_PATH + "train_features_tfidf.p", "wb"))
pickle.dump(test_features_tfidf, open(DATA_PATH + "test_features_tfidf.p", "wb"))

## Simple features

In [12]:
def add_simple_features(dataset_original):
    dataset = dataset_original.copy(deep = True)
    dataset['count_sent']=dataset["comment_text"].apply(lambda x: len(re.findall("\n",str(x)))+1)
    #Word count in each comment:
    dataset['count_word']=dataset["comment_text"].apply(lambda x: len(str(x).split()))
    #Unique word count
    dataset['count_unique_word']=dataset["comment_text"].apply(lambda x: len(set(str(x).split())))
    #Letter count
    dataset['count_letters']=dataset["comment_text"].apply(lambda x: len(str(x)))
    #punctuation count
    dataset["count_punctuations"] =dataset["comment_text"].apply(lambda x: len([c for c in str(x) if c in string.punctuation]))
    #upper case words count
    dataset["count_words_upper"] = dataset["comment_text"].apply(lambda x: len([w for w in str(x).split() if w.isupper()]))
    #title case words count
    dataset["count_words_title"] = dataset["comment_text"].apply(lambda x: len([w for w in str(x).split() if w.istitle()]))
    #Number of stopwords
    dataset["count_stopwords"] = dataset["comment_text"].apply(lambda x: len([w for w in str(x).lower().split() if w in eng_stopwords]))
    #Average length of the words
    dataset["mean_word_len"] = dataset["comment_text"].apply(lambda x: np.mean([len(w) for w in str(x).split()]))
    #derived features
    #Word count percent in each comment:
    dataset['word_unique_percent']=dataset['count_unique_word']*100/dataset['count_word']
    #derived features
    #Punct percent in each comment:
    dataset['punct_percent']=dataset['count_punctuations']*100/dataset['count_word']
    dataset["count_swear_words"] = dataset["comment_text"].apply(lambda x: len([w for w in str(x).split() if w.lower() in swear_words]))
    return dataset

In [13]:
train = add_simple_features(train)
test = add_simple_features(test) 

  out=out, **kwargs)


In [14]:
train.to_csv(DATA_PATH + 'preprocessed/train.csv')
test.to_csv(DATA_PATH + 'preprocessed/test.csv')

In [15]:
### code for finding potential obscene words that were not in the original list

# toxic_vocab = []
# for idx, row in train_toxic.iterrows():
#     text = str(row['comment_text']).lower()
#     text = re.sub('([.,!?()])', r' \1 ', text)
#     text = re.sub('\s{2,}', ' ', text)
#     for w in text.split():
#         if w not in eng_stopwords:
#             toxic_vocab.append(w)

# c = collections.Counter(toxic_vocab)

# sid = SentimentIntensityAnalyzer()

# for x in set(toxic_vocab):
#     if x not in swear_words:
#         score = sid.polarity_scores(x)
#         if score['compound'] < 0:
#             print(x)

# for x in c.most_common(50):
#     if x[0] not in swear_words:
#         print(x[0])


## Cleaning

### emoticons

Replacing emoticons with their word meaning.

In [16]:
emoticons = {":-)": "happy", ":)": "happy", ":-]"":]": "happy",":-3": "happy",":3": "happy",":->": "happy",":>": "happy", \
                 "8-)": "happy","8)": "happy",":-}": "happy",":}": "happy",":o)": "happy",":c)": "happy",":^)": "happy", \
                 "=]": "happy","=)": "happy",":-D": "happy",":D": "laugh", "8-D": "laugh","8D": "laugh", "x-D": "laugh", \
                 "xD": "laugh", "X-D": "laugh","XD": "laugh", "=D": "laugh","=3": "happy", "B^D": "laugh",":-(": "sad", \
                 ":(": "sad",":-c": "sad",":c": "sad",":-<": "sad",":<": "sad",":-[": "sad",":[": "sad",":-||": "sad", \
                 ">:[": "angry",":{": "sad",":@": "sad",">:(": "angry",";-)": "wink",";)": "wink","*-)": "wink", \
                 "*)": "wink",";-]": "wink",";]": "wink",";^)": "wink",":-,": "wink",";D": "laugh", \
                 ":-/": "scepticism",":/": "scepticism",":-.": "scepticism",">:\\": "angry",">:/": "angry", \
                 ":\\": "scepticism","=/": "scepticism","=\\": "scepticism",":L": "scepticism","=L": "scepticism", \
                 ":S": "scepticism"}
emoticons_re = {}
for key, val in emoticons.items():
    new_key = key
    for c in new_key:
        if c in ['[','\\','^','$','.','|','?','*','+','(',')']:
            new_key = new_key.replace(c, "\\" + c)
        new_key = new_key.replace("\\\|", "\\|")
    regex = re.compile(new_key + "+")
    emoticons_re[regex] = val

def replace_emoticons(text, tag = 0):
    transformed_text = text
    for emoticon in emoticons_re.keys():
        if emoticon.search(text):
            for m in emoticon.finditer(text):
                if tag:
                    placeholder = " [EMOTICON:" + emoticons_re[emoticon] + "] "
                else:
                    placeholder = " " + emoticons_re[emoticon] + " "
                transformed_text = transformed_text.replace(m.group(), placeholder)
    return transformed_text

### lowercase + tokens + lemmata + spellcheck

Converting to lowercase, removing multiple spaces, usernames, ip, users. Tokenization, spellchecking, lemmatization.

In [17]:
def clean(comment):
    comment = comment.lower() #Convert to lower case
    comment = re.sub("\\n"," ",comment)
    comment = comment.strip() #remove \n
    comment = re.sub(' +',' ',comment)
    comment = re.sub("\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}","",comment)  # remove leaky elements like ip,user
    comment = re.sub("\[\[.*\]","",comment)     # removing usernamed
#     comment = replace_emoticons(comment)
    return comment

In [36]:
def heavy_clean(comment):
    comment = clean(comment)
    for key, val in contractions_lowercase.items():
        comment = comment.replace(key, val)
    comment = re.sub('([!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~])', r' \1 ', comment) # padding punctuation
    comment = re.sub('\s{2,}', ' ', comment) 
#     comment = ' '.join([x for x in comment.split() if x not in eng_stopwords])
#     comment = replace_emoticons(comment)
    return comment

In [24]:
contractions_lowercase = {k.lower():v.lower() for k, v in contractions.items()}

In [32]:
train['heavy_clean'] = train['comment_text'].apply(heavy_clean)
train.to_csv(DATA_PATH + 'preprocessed/train_heavy_clean_no-stopwords.csv')
test['heavy_clean'] = test['comment_text'].apply(heavy_clean)
test.to_csv(DATA_PATH + 'preprocessed/test_heavy_clean_no-stopwords.csv')

In [37]:
train['heavy_clean'] = train['comment_text'].apply(heavy_clean)
train.to_csv(DATA_PATH + 'preprocessed/train_heavy_clean_stopwords.csv')
test['heavy_clean'] = test['comment_text'].apply(heavy_clean)
test.to_csv(DATA_PATH + 'preprocessed/test_heavy_clean_stopwords.csv')

In [7]:
def generate_nlp(text):
    return nlp(text)
def tokenize(text):
    return [word.text for word in text]
def remove_stopwords(tokens):
    return [token for token in tokens if not token in eng_stopwords]
def spellcheck(tokens):
    words_spellchecked = [d.suggest(token)[0] if not d.check(token) and len(d.suggest(token)) > 0 and not token in string.punctuation else token for token in tokens]
    return " ".join(words_spellchecked)
def lemmatize(text):
    return [word.lemma_ for word in text]

In [10]:
def add_linguistic_features(dataset_original):
    dataset = dataset_original.copy(deep = True)
    print('Cleaning texts.')
    begin = datetime.datetime.now()
    dataset['clean_text'] = dataset['comment_text'].apply(clean)
    print("Finished cleaning. Time elapsed:",  datetime.datetime.now() - begin)
    dataset['nlp'] = dataset['comment_text'].apply(generate_nlp)
    print("Finished NLP. Time elapsed:",  datetime.datetime.now() - begin)
    dataset['tokens'] = dataset['nlp'].apply(tokenize)
    print("Finished tokenizing. Time elapsed:",  datetime.datetime.now() - begin)
    dataset['lemmata'] = dataset['nlp'].apply(lemmatize)
    print("Finished lemmatizing. Time elapsed:",  datetime.datetime.now() - begin)
    dataset['no_stopwords'] = dataset['tokens'].apply(remove_stopwords)
    print("Finished removing stopwords. Time elapsed:",  datetime.datetime.now() - begin)
    return dataset

In [11]:
def add_spellcheck(dataset_original):
    dataset = dataset_original.copy(deep = True)
    dataset['spellcheck'] = dataset['tokens'].apply(spellcheck)
    print("Finished spellchecking. Time elapsed:",  datetime.datetime.now() - begin)
    return dataset

In [12]:
def add_sentiment(dataset_original, level = 'sentence'):
    print('Calculating sentiment.')
    begin = datetime.datetime.now()
    dataset = dataset_original.copy(deep = True)
    if level == 'sentence':
        dataset["sentiment"] = dataset["comment_text"].apply(lambda x: sid.polarity_scores(x)['compound'])
    elif level == 'word':
        dataset["mean_sentiment"] = dataset["comment_text"].apply(lambda x: np.mean([sid.polarity_scores(w)['compound'] for w in x.split()])) 
    print('Sentiment calculated. Time elapsed: %d',  (datetime.datetime.now() - begin).seconds)
    return dataset

In [17]:
train_preprocessed = add_linguistic_features(train)
train_preprocessed.to_csv(DATA_PATH + 'preprocessed/train.csv')

Cleaning texts.
Finished cleaning. Time elapsed: 0:00:09.800715
Finished NLP. Time elapsed: 1:12:31.142858
Finished tokenizing. Time elapsed: 1:12:50.982745
Finished lemmatizing. Time elapsed: 1:13:09.474122
Finished removing stopwords. Time elapsed: 1:13:12.586340


In [19]:
test_preprocessed = add_linguistic_features(test)
test_preprocessed.to_csv(DATA_PATH + 'preprocessed/test.csv')

Cleaning texts.
Finished cleaning. Time elapsed: 0:00:08.707683
Finished NLP. Time elapsed: 1:33:20.827369
Finished tokenizing. Time elapsed: 1:33:37.481318
Finished lemmatizing. Time elapsed: 1:33:52.894425
Finished removing stopwords. Time elapsed: 1:33:56.486631


In [None]:
train_preprocessed = add_sentiment(train_preprocessed)
train_preprocessed.to_csv(DATA_PATH + 'preprocessed/train_sentiment.csv')
test_preprocessed = add_sentiment(test_preprocessed)
test_preprocessed.to_csv(DATA_PATH + 'preprocessed/test_sentiment.csv')

Calculating sentiment.
Sentiment calculated. Time elapsed: %d 216
Calculating sentiment.
Sentiment calculated. Time elapsed: %d 0


In [None]:
train_preprocessed = add_spellcheck(train_preprocessed)
train_preprocessed.to_csv(DATA_PATH + 'preprocessed/train_spellcheck.csv')
test_preprocessed = add_spellcheck(test_preprocessed, 'test')
test_preprocessed.to_csv(DATA_PATH + 'preprocessed/test_spellcheck.csv')

## Embeddings & Out-of-vocabulary

These files are too big for git, can be acquired here:
* Glove https://nlp.stanford.edu/projects/glove/
* word2vec https://code.google.com/archive/p/word2vec/


In [None]:
EMBEDDING_FOLDER = "../../../embeddings/"
GLOVE_PATH = EMBEDDING_FOLDER + "glove.6B.300d.txt"
WORD2VEC_PATH = EMBEDDING_FOLDER + "GoogleNews-vectors-negative300.bin"

In [None]:
def get_coefs(word,*arr):
    return word, np.asarray(arr, dtype='float32')
    
def load_pretrained(word_embedding_type):
    if word_embedding_type == 'Glove':
        embeddings_index = dict(get_coefs(*o.strip().split()) for o in open(GLOVE_PATH))
        embedding_model = np.stack(embeddings_index.values())
        vocab_embedding = set(embeddings_index.keys())
    elif word_embedding_type == 'word2vec':
        embedding_model = gensim.models.KeyedVectors.load_word2vec_format(WORD2VEC_PATH, binary = True, unicode_errors = 'ignore')
        vocab_embedding = embedding_model.vocab
    return embedding_model, vocab_embedding

def check_word_vocab(word, vocab):
    if word in vocab:
        return True, [word], 'none'
    elif word.lower() in vocab:
        return True, [word.lower()], 'lower'
    elif word.capitalize() in vocab:
        return True, [word.capitalize()], 'capitalize'
    elif word.lower().capitalize() in vocab:
        return True, [word.lower().capitalize()], 'lower+capitalize'
    else:
        return False, None, None

def generate_word_vocab(datasets):
    vocab = set()
    cntr = 0
    for dataset in datasets:
        for idx, row in dataset.iterrows():
            print("Progress: %0.0f %%" % (100*cntr/len(dataset)), end = "\r")
            cntr += 1
            sys.stdout.flush()
            vocab.update([token.text for token in nlp(row['comment_text'])])
    sys.stdout.flush()
    vocab = list(vocab)
    vocab.sort()
    print("vocab size: ", len(vocab))
    vocab_encoded = list2encoded_vocab(vocab)
    return vocab_encoded

def list2encoded_vocab(lst):
    return {word: idx for idx, word in enumerate(lst)}

def filter_term(term, case_replacements, mode, spellcheck_replacements = {}):
    if term in spellcheck_replacements.keys():
        term = spellcheck_replacements[term]
    if term in case_replacements.keys():
        if case_replacements[term] == 'none':
            return term
        elif case_replacements[term] == 'lower':
            return term.lower()
        elif case_replacements[term] == 'capitalize':
            return term.capitalize()
        else:
            return term.lower().capitalize()
    else:
        if term in string.punctuation:
            return ''
        else:
            if mode == "replace":
                return '#OOV#'
            elif mode == "remove":
                return ''
            
            
def find_oov(vocab_encoded, vocab_embedding, spell_check_on = False):
    
    '''
    Separates vocabulary into the words that are present in word embedding model and that are not.
    Input:
        vocab_encoded - existing vocabulary in a form {'cat': 0, 'hog': 1, ... }
        vocab_embedding - the vocabulary of word embeddings models
        spell_check_on - whether to try to find a correction so that the word will belong to vocabulary (optional)
    Output:
        vocab_clean - vocabulary with the words present in word embedding models in a form {'cat': 0, 'hog': 1, ... }
        oov - list of unique out-of-vocabulary (in respect to word embedding model) words
    '''
    
    print(spell_check_on)
    
    vocab_clean = set()
    case_replacements = {}
    if spell_check_on:
        spellcheck_replacements = {}
    oov = []
    vocab_sorted = sorted(vocab_encoded.items(), key=operator.itemgetter(1))
    for i, elem in enumerate(vocab_sorted):
        print("Progress: %0.0f %%" % (100*i/len(vocab_sorted)), end="\r")
        sys.stdout.flush()
        res = check_word_vocab(elem[0], vocab_embedding)
        if res[0]:
            vocab_clean.update(res[1])
            case_replacements[elem[0]] = res[2]
        else:
            if spell_check_on:
                is_continue = True
                if len(elem[0]) > 3:
                    for x in elem[0]:
                        if x in string.punctuation:
                            oov.append(elem[0])
                            is_continue = False
                    if is_continue:
                        spell_check = d.suggest(elem[0])
                        if len(spell_check) > 0:
                            res = check_word_vocab(spell_check[0], vocab_embedding)
                            if res[0]:
                                vocab_clean.update(res[1])
                                case_replacements[elem[0]] = res[2]
                                spellcheck_replacements[elem[0]] = spell_check[0]
                            else:
                                oov.append(elem[0])
                else:
                    oov.append(elem[0])
            else:
                oov.append(elem[0])
    vocab_clean = list2encoded_vocab(list(vocab_clean))
    oov = list(set(oov))
    if spell_check_on:
         return vocab_clean, oov, case_replacements, spellcheck_replacements
    return vocab_clean, oov, case_replacements, {}

def pprint_oov(oov):
    for a, b, c in zip(oov[200:290][::3], oov[200:290][1::3], oov[200:290][2::3]):
        print('{:<30}{:<30}{:<}'.format(a, b, c))

In [None]:
train_clean = pd.read_csv(DATA_PATH + 'preprocessed/train_ling.csv')
test_clean = pd.read_csv(DATA_PATH + 'preprocessed/test_ling.csv')

In [None]:
embedding_model, vocab_embedding = load_pretrained("word2vec")

In [None]:
len(vocab_embedding)

In [None]:
vocab = generate_word_vocab([train_clean])

In [None]:
pickle.dump(vocab, open("../data/preprocessed/vocab_word2vec.p", "wb"))

In [None]:
vocab_clean, oov, case_replacements, spellcheck_replacements = find_oov(vocab, vocab_embedding)

In [None]:
pprint_oov(oov)

In [None]:
pickle.dump(vocab_clean, open("../data/preprocessed/vocab_clean_word2vec.p", "wb"))
pickle.dump(oov, open("../data/preprocessed/oov_word2vec.p", "wb"))
pickle.dump(case_replacements, open("../data/preprocessed/case_replacements_word2vec.p", "wb"))
pickle.dump(spellcheck_replacements, open("../data/preprocessed/spellcheck_replacements_word2vec.p", "wb"))

## contractions

In [2]:
contractions = pickle.load(open("../ling_src/contractions.p", "rb"))

In [5]:
contractions

{"'tis": 'it is',
 "'twas": 'it was',
 "I'd": 'I had',
 "I'll": 'I shall',
 "I'm": 'I am',
 "I'm'a": 'I am going to',
 "I've": 'I have',
 "ain't": 'am not',
 "amn't": 'am not',
 "aren't": 'are not',
 "can't ": 'cannot',
 "could've": 'could have',
 "couldn't": 'could not',
 "daren't": 'dare not',
 "daresn't": 'dare not',
 "dasn't": 'dare not',
 "didn't": 'did not',
 "doesn't": 'does not',
 "don't": 'do not',
 "e'er": 'ever',
 'gonna': 'going to',
 'gotta': 'got to',
 "hadn't": 'had not',
 "hasn't": 'has not',
 "haven't": 'have not',
 "he'd": 'he had',
 "he'll": 'he shall',
 "he's": 'he has',
 "how'd": 'how did',
 "how'll": 'how will',
 "how's": 'how has',
 "isn't": 'is not',
 "it'd": 'it would',
 "it'll": 'it shall',
 "it's": 'it has',
 "let's": 'let us',
 "ma'am": 'madam',
 "may've": 'may have',
 "mayn't": 'may not',
 "might've": 'might have',
 "mightn't": 'might not',
 "must've": 'must have',
 "mustn't": 'must not',
 "ne'er": 'never',
 "needn't": 'need not',
 "noun're": 'noun are ',
 

In [3]:
def expand_contractions(text):
    for key, val in contractions.items():
        text = text.replace(key, val)
    return text

In [4]:
expand_contractions('I didn\'t see him')

'I did not see him'

In [None]:
train['comment_text'] = train['comment_text'].apply(expand_contractions)

In [None]:
test['comment_text'] = test['comment_text'].apply(expand_contractions)

In [None]:
train.to_csv(DATA_PATH + 'preprocessed/train_expanded_contractions.csv')
test.to_csv(DATA_PATH + 'preprocessed/test_expanded_contractions.csv')