# Preprocessing experiments

Inspired by https://www.kaggle.com/christofhenkel/how-to-preprocessing-when-using-embeddings , check what the situation is with the current approach: Keras Tokenizer + Embedding constants (e.g. MAX_WORDS).

In [1]:
import keras
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import seaborn as sns
from sklearn import metrics

from collections import defaultdict
import operator
import re
from tqdm import tqdm


# Show what is available in the ../input data directory.
print(os.listdir("../input"))

Using TensorFlow backend.


['embeddings', 'train.csv', 'sample_submission.csv', 'test.csv']


In [2]:
# Not referenced in the visible cells; presumably the padded token length per
# question used by a downstream model -- TODO confirm.
MAX_SEQUENCE_LENGTH = 60
# Passed to the Keras Tokenizer below as `num_words`.
MAX_WORDS = 95000
# Not referenced in the visible cells; presumably the dimensionality of the
# pretrained vectors (the embedding file names contain "300") -- TODO confirm.
EMBEDDINGS_LOADED_DIMENSIONS = 300

In [3]:
# Read the train and test sets from the ../input directory.
df_train = pd.read_csv("../input/train.csv")
df_test = pd.read_csv("../input/test.csv")

In [4]:
# Not referenced in the visible cells; presumably a training batch size -- TODO confirm.
BATCH_SIZE = 256
# Fraction of the training rows to work on (1 = all rows).
Q_FRACTION = 1
# NOTE(review): sample() is called without random_state, so the row order
# (and, for Q_FRACTION < 1, the selected subset) differs between runs.
questions = df_train.sample(frac=Q_FRACTION)
question_texts = questions["question_text"].values
question_targets = questions["target"].values
# NOTE(review): missing test questions are replaced by "_na_", while the train
# texts above are used as-is -- confirm the train set has no missing questions.
test_texts = df_test["question_text"].fillna("_na_").values

print(f"Working on {len(questions)} questions")

Working on 1306122 questions


In [5]:
def load_embeddings(file):
    """
    Load word vectors from a text embedding file into a dictionary.

    Every line is expected to hold a token followed by its vector components,
    separated by single spaces. Lines are split on a single space (not on
    arbitrary whitespace), so tokens containing other whitespace characters
    stay intact.

    A leading "<count> <dimensions>" header line, as written by the
    word2vec/fastText text format, is skipped so that it does not end up in
    the result as a bogus one-dimensional "word". Trailing whitespace is
    stripped from every line and blank lines are ignored.

    :param file: path of the embedding file; read as UTF-8, undecodable bytes
        are ignored.
    :return: dict mapping each token to a float32 numpy array.
    """
    def get_coefs(word, *arr):
        return word, np.asarray(arr, dtype='float32')

    def is_header(fields):
        # "<vocabulary size> <vector size>": exactly two integer fields.
        return len(fields) == 2 and fields[0].isdigit() and fields[1].isdigit()

    embeddings = {}
    with open(file, encoding="utf8", errors='ignore') as f:
        for line_number, line in enumerate(f):
            stripped = line.rstrip()
            if not stripped:
                continue
            fields = stripped.split(" ")
            # Only the very first line can be a header.
            if line_number == 0 and is_header(fields):
                continue
            word, coefs = get_coefs(*fields)
            embeddings[word] = coefs

    print('Found %s word vectors.' % len(embeddings))
    return embeddings

In [6]:
from keras.preprocessing.text import Tokenizer

# Fit a Keras tokenizer on every training question (the full df_train, not the
# sampled `questions`), with its vocabulary limit set via num_words=MAX_WORDS.
tokenizer = Tokenizer(num_words=MAX_WORDS)
tokenizer.fit_on_texts(list(df_train["question_text"].values))

# Preprocessing

Define a Preprocessor class based on https://www.kaggle.com/christofhenkel/how-to-preprocessing-when-using-embeddings .
Its future goal is to be plugged into a known model in order to explore any improvement in the final score.

In [9]:
class Preprocessor:
    """
    Text-cleaning steps plus a report of how much of a corpus is covered by an
    embedding vocabulary.

    Each cleaning step takes one text (a ``str``) and returns the cleaned text;
    ``preprocess_for_embeddings_coverage`` chains them over a whole corpus and
    prints the embedding coverage after every step.
    """

    # Replacements applied by clean_misspelling. Keys are matched as plain
    # substrings (case-sensitive, no word boundaries).
    _MISSPELLINGS = {'colour':'color',
                     'centre':'center',
                     'didnt':'did not',
                     'doesnt':'does not',
                     'isnt':'is not',
                     'shouldnt':'should not',
                     'favourite':'favorite',
                     'travelling':'traveling',
                     'counselling':'counseling',
                     'theatre':'theater',
                     'cancelled':'canceled',
                     'labour':'labor',
                     'organisation':'organization',
                     'wwii':'world war 2',
                     'citicise':'criticize',
                     'instagram': 'social medium',
                     'whatsapp': 'social medium',
                     'snapchat': 'social medium'}
    # Built once for the class instead of once per cleaned text.
    _MISSPELLINGS_RE = re.compile('(%s)' % '|'.join(_MISSPELLINGS))

    # Single-pass translation table for clean_punctuation: first register every
    # character that is dropped, then override the ones that become spaces
    # ("/", "-", "'") and pad "&" with spaces.
    _PUNCTUATION_TABLE = str.maketrans('', '', '?!.,"#$%\'()*+-/:;<=>@[\\]^_`{|}~' + '“”’')
    _PUNCTUATION_TABLE.update(str.maketrans({'/': ' ', '-': ' ', "'": ' ', '&': ' & '}))

    # Applied in order, longest runs first, so shorter patterns only see what
    # the longer ones left behind. Single digits are left untouched.
    _DIGIT_RULES = (
        (re.compile('[0-9]{5,}'), '#####'),
        (re.compile('[0-9]{4}'), '####'),
        (re.compile('[0-9]{3}'), '###'),
        (re.compile('[0-9]{2}'), '##'),
    )

    def __init__(self, embeddings):
        """
        :param embeddings: the embedding vocabulary; only membership tests
            (``word in embeddings``) are performed on it, so a dict or any
            other container supporting ``in`` works.
        """
        self.embeddings = embeddings

    def build_tf_dict(self, sentences: list):
        """
        Build a simple TF (term frequency) dictionary for all words in the provided sentences.

        :param sentences: iterable of sentences, each an iterable of words.
        :return: defaultdict(int) mapping every word to its number of occurrences.
        """
        tf_dict = defaultdict(int)
        for sentence in sentences:
            for word in sentence:
                tf_dict[word] += 1
        return tf_dict

    def check_coverage(self, tf_dictionary: dict):
        """
        Print how much of the vocabulary and of the running text is embedded and
        return the words that are not. Can be used down the stream to preprocess
        them to something known.

        An empty dictionary is reported as 0% coverage instead of raising.

        :param tf_dictionary: mapping of word to its frequency in the corpus.
        :return: list of (word, frequency) pairs of the words missing from the
            embeddings, most frequent first.
        """
        out_of_vocabulary = {}
        embedded_words = 0
        in_count = 0
        out_count = 0

        for word in tqdm(tf_dictionary):
            frequency = tf_dictionary[word]
            if word in self.embeddings:
                # Only the number of embedded words is reported, so the
                # vectors themselves are not copied anywhere.
                embedded_words += 1
                in_count += frequency
            else:
                out_of_vocabulary[word] = frequency
                out_count += frequency

        total_count = in_count + out_count
        percent_tf = embedded_words / len(tf_dictionary) if tf_dictionary else 0.0
        percent_all = in_count / total_count if total_count else 0.0
        print('Found embeddings for {:.2%} of vocabulary and {:.2%} of all text'.format(percent_tf, percent_all))

        return sorted(out_of_vocabulary.items(), key=operator.itemgetter(1))[::-1]

    def clean_punctuation(self, text: str):
        """
        Replace "/", "-" and "'" by a space, surround "&" with spaces and drop
        the remaining punctuation characters (including curly quotes).

        :param text: a single text.
        :return: the cleaned text.
        """
        return text.translate(self._PUNCTUATION_TABLE)

    def clean_digits(self, text: str):
        """
        Mask runs of digits with "#": five or more digits become "#####", runs
        of four, three and two digits become the same number of "#".

        :param text: a single text.
        :return: the text with digit runs masked.
        """
        result = text
        for pattern, mask in self._DIGIT_RULES:
            result = pattern.sub(mask, result)
        return result

    def clean_misspelling(self, text: str):
        """
        Replace every occurrence of a known misspelling or spelling variant by
        its replacement from the class-level table.

        :param text: a single text.
        :return: the text with the replacements applied.
        """
        return self._MISSPELLINGS_RE.sub(lambda match: self._MISSPELLINGS[match.group(0)], text)

    def _coverage(self, texts):
        # Split on whitespace, count the words and report embedding coverage.
        sentences = [text.split() for text in texts]
        return self.check_coverage(self.build_tf_dict(sentences))

    def apply_cleaning_function(self, fn, texts: list, description = ""):
        """
        Apply one cleaning step to every text, then print the embedding
        coverage and the ten most frequent uncovered words.

        :param fn: callable taking one text and returning the cleaned text.
        :param texts: iterable of texts.
        :param description: label shown on the progress bar of this step.
        :return: list of cleaned texts.
        """
        result = [fn(text) for text in tqdm(texts, desc=description or None)]
        oov = self._coverage(result)
        print(oov[:10])

        return result

    def preprocess_for_embeddings_coverage(self, texts: list):
        """
        Report the embedding coverage of the raw texts, then clean punctuation,
        digits and misspellings in that order, reporting coverage after each.

        :param texts: iterable of texts.
        :return: list of fully cleaned texts.
        """
        # Baseline coverage before any cleaning; only the printed summary is needed.
        self._coverage(texts)

        result = self.apply_cleaning_function(self.clean_punctuation, texts, "Cleaning punctuation...")
        result = self.apply_cleaning_function(self.clean_digits, result, "Cleaning numbers...")
        result = self.apply_cleaning_function(self.clean_misspelling, result, "Cleaning misspelled words...")

        return result

In [None]:
from gensim.models import KeyedVectors

# Pretrained embedding sets to check coverage against.
embedding_files = [
    "../input/embeddings/GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin",
    "../input/embeddings/glove.840B.300d/glove.840B.300d.txt",
    "../input/embeddings/wiki-news-300d-1M/wiki-news-300d-1M.vec",
    "../input/embeddings/paragram_300_sl999/paragram_300_sl999.txt"]

# One zero-argument loader per entry of embedding_files, in the same order.
# The first file is loaded through gensim as a binary word2vec file; the
# others are read line by line by load_embeddings.
load_embedding_functions = [
    lambda: KeyedVectors.load_word2vec_format(embedding_files[0], binary=True),
    lambda: load_embeddings(embedding_files[1]),
    lambda: load_embeddings(embedding_files[2]),
    lambda: load_embeddings(embedding_files[3])]

for index, load_embeddings_fn in enumerate(load_embedding_functions):
    print(f"Training with {embedding_files[index]}")
    print("==============================================================================================================")
    # Drop the references to the previously loaded vectors before loading the
    # next set, so that two embedding sets are never held in memory at once.
    embeddings = preprocessor = None
    embeddings = load_embeddings_fn()
    preprocessor = Preprocessor(embeddings)
    preprocessor.preprocess_for_embeddings_coverage(question_texts)

Training with ../input/embeddings/GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin
