In [74]:
import numpy as np
import pandas as pd
import os, time, gc, pickle, random
from tqdm._tqdm_notebook import tqdm_notebook as tqdm
from keras.preprocessing import text, sequence
import torch
from torch import nn
from torch.utils import data
from torch.nn import functional as F
from pytorch_pretrained_bert import BertTokenizer, BertModel
from bert_embedding import BertEmbedding
import apex # used for 16 bit
import re
import string
import matplotlib.pyplot as plt
import seaborn as sns
import mxnet as mx # used for GPU
#for getting num good and bad words
from wordcloud import STOPWORDS
from collections import defaultdict
import operator
import swifter # speed up feature gen - multiple cores

# Logging for BERT
import logging
logging.basicConfig(level=logging.INFO)


In [75]:
# helper functions
# Characters accepted as "English" by the helpers below: every printable ASCII
# character, plus a space and a set of accented Spanish/French letters.
ascii_chars = string.printable + " áéíóúàèìòùâêîôûäëïöüñõç"

# Flags text that contains characters outside `ascii_chars` (printable ASCII
# plus the accented Spanish/French letters appended to it).
def contains_non_english(text):
    """Return 1 if any character of `text` is not in `ascii_chars`, else 0."""
    for ch in text:
        if ch not in ascii_chars:
            return 1
    return 0
    
# Strips every character that is not listed in `ascii_chars`.
def remove_non_english(text):
    """Return `text` with all characters outside `ascii_chars` dropped."""
    return ''.join(ch for ch in text if ch in ascii_chars)


def get_first_word(word):
    """Return the text before the first space of `word`.

    Non-string input (for example a float NaN from a missing comment) yields
    the sentinel string "-1".

    The previous check compared ``type(word)`` to the *string* "float", which
    is always unequal, so floats fell through to ``.split`` and raised
    AttributeError instead of returning "-1".
    """
    if isinstance(word, str):
        return word.split(" ")[0]
    return "-1"

def get_cap_vs_length(df):
    """Add `caps_vs_length` = num_caps / num_chars to `df` in place.

    Non-finite ratios (division by zero, NaN) are stored as 0.
    """
    ratio = df['num_caps'] / df['num_chars']
    df['caps_vs_length'] = ratio.where(np.isfinite(ratio), 0)
    
"""    mask = (df['num_chars'] != 0)
    df_valid = df[mask]
    
    df.loc[mask, 'caps_vs_length'] = df_valid['num_caps'] / df_valid['num_chars']"""

def get_unique_word_over_num_words(df):
    """Add `unique_word_over_num_words` = num_unique_words / num_words in place.

    Non-finite ratios (division by zero, NaN) are stored as 0.
    """
    ratio = df['num_unique_words'] / df['num_words']
    df['unique_word_over_num_words'] = ratio.where(np.isfinite(ratio), 0)
    
def get_avg_word_len(df):
    """Add `avg_word_len` = total_word_length / num_words to `df` in place.

    Non-finite ratios (division by zero, NaN) are stored as 0.
    """
    ratio = df['total_word_length'] / df['num_words']
    df['avg_word_len'] = ratio.where(np.isfinite(ratio), 0)

def get_avg_unique_word_len(df):
    """Add `avg_unique_word_len` = total_unique_word_length / num_unique_words in place.

    Non-finite ratios (division by zero, NaN) are stored as 0.
    """
    ratio = df['total_unique_word_length'] / df['num_unique_words']
    df['avg_unique_word_len'] = ratio.where(np.isfinite(ratio), 0)
    
def calc_max_word_len(sentence):
    """Return the length of the longest item in `sentence` (0 if it is empty)."""
    return max((len(token) for token in sentence), default=0)

def calc_min_word_len(sentence):
    """Return the length of the shortest item in `sentence`.

    An empty `sentence` yields 0, matching `calc_max_word_len`. The previous
    implementation leaked its internal starting value (999999) for empty
    input and silently capped results at that value.
    """
    return min((len(word) for word in sentence), default=0)

def calc_total_word_len(sentence):
    """Return the summed length of every item in `sentence`."""
    return sum(len(token) for token in sentence)

def calc_total_unique_word_len(sentence):
    """Return the summed length of the distinct items in `sentence`."""
    # Duplicates are collapsed by the set before the lengths are added up.
    return calc_total_word_len(set(sentence))

# Drops one-character tokens, except "I" and "a" in either case.
def remove_singles(text):
    """Return `text` re-joined with single spaces, minus stray 1-char tokens."""
    kept = []
    for token in text.split():
        if len(token) > 1 or token.lower() in ("i", "a"):
            kept.append(token)
    return ' '.join(kept)
    
# Removes punctuation/symbol characters and collapses runs of whitespace.
def clean_text(x):
    """Return `x` as a string with listed symbols removed and whitespace squeezed.

    `x` is converted with ``str()`` first, so non-string input is accepted.
    Every character in the punctuation list below is deleted, then each run
    of whitespace becomes one space and the ends are stripped.

    The previous version computed this value but never returned it, so
    callers always received None.
    """
    # Same character set as before; the backslash is written as an explicit
    # escape ('\\×') instead of relying on the invalid escape sequence '\×'.
    removable = "&/-'?!.,#$%\'()*+-/:;<=>@[\\]^_`{|}~`" + '""“”’' + '∞θ÷α•à−β∅³π‘₹´°£€\\×™√²—–&'
    # All entries are single characters, so one translate() pass is
    # equivalent to replacing them one at a time.
    x = str(x).translate(str.maketrans('', '', removable))
    return re.sub(r'\s+', ' ', x).strip()
    
    
# Text cleaning
def pad_chars(text,punct):
    """Insert a space before each character of `punct` that directly follows a word character.

    Parameters:
        text:  string to process.
        punct: string (or other iterable joined into a string) of the
               characters to pad. An empty `punct` returns `text` unchanged.

    The previous version ignored the contents of `punct`: it looped once per
    character but always applied the same hard-coded ``[!?,]`` pattern. The
    characters are now taken from `punct` and applied in a single regex pass,
    which gives the same result as before when `punct` is "!?,".
    """
    punct = ''.join(punct)
    if not punct:
        return text
    # re.escape keeps characters such as ']' '^' '-' '\\' literal inside the class.
    char_class = '[' + re.escape(punct) + ']'
    return re.sub(r'(?<=\w)(' + char_class + ')', r' \1', text)
    
# Symbols that split_off_symbols_iv() surrounds with spaces so each one ends up
# as its own whitespace-delimited token. The string is iterated character by
# character, in this order.
# NOTE(review): "iv" presumably means "in vocabulary" of the embedding/tokenizer
# — confirm against where this list came from.
symbols_iv = """?,./-()"$=…*&+′[ɾ̃]%:^\xa0\\{}–“”;!<`®ạ°#²|~√_α→>—£，。´×@π÷？ʿ€の↑∞ʻ℅в•−а年！∈∩⊆§℃θ±≤͡⁴™си≠∂³ி½△¿¼∆≥⇒¬∨∫▾Ω＾γµº♭ー̂ɔ∑εντσ日Γ∪φβ¹∘¨″⅓ɑː✅✓（）∠«»்ுλ∧∀،＝ɨʋδɒ¸☹μΔʃɸηΣ₅₆◦·ВΦ☺❤♨✌≡ʌʊா≈⁰‛：ﬁ„¾ρ⟨⟩˂⅔≅－＞¢⁸ʒは⬇♀؟¡⋅ɪ₁₂ɤ◌ʱ、▒ْ；☉＄∴✏ωɹ̅।ـ☝♏̉̄♡₄∼́̀⁶⁵¦¶ƒˆ‰©¥∅・ﾟ⊥ª†ℕ│ɡ∝♣／☁✔❓∗➡ℝ位⎛⎝¯⎞⎠↓ɐ∇⋯˚⁻ˈ₃⊂˜̸̵̶̷̴̡̲̳̱̪̗̣̖̎̿͂̓̑̐̌̾̊̕\x92"""        

def split_off_symbols_iv(x):
    """Return `x` with every character of `symbols_iv` wrapped in single spaces.

    Replacements are applied one symbol at a time, in `symbols_iv` order.
    """
    padded = x
    for symbol in symbols_iv:
        padded = padded.replace(symbol, ' ' + symbol + ' ')
    return padded
    
def neutrailize_bad_words(train,test):
    """Add `num_bad_words` / `num_good_words` count columns to `train` and `test`.

    "Bad" words are the 1000 most frequent non-stopword tokens in training
    comments whose `target` equals 1; "good" words are the same for `target`
    equal to 0. Rows with any other `target` value do not contribute to
    either list. Each comment is then scored by how many of its tokens fall
    in each list. The two lists are built independently and may overlap.

    Parameters:
        train: DataFrame with `comment_text` and `target` columns.
        test:  DataFrame with a `comment_text` column.

    Returns:
        (train, test) — the same objects, modified in place.

    The previous version called `num_bad_words` / `num_good_words` and
    returned `x`, none of which were defined, so it raised NameError; the
    word lists it computed were never used.
    """
    def tokenize(text):
        # Lower-case, split on single spaces, drop empty tokens and stopwords.
        return [tok for tok in text.lower().split(" ") if tok != "" and tok not in STOPWORDS]

    def top_words(comments, limit=1000):
        # The `limit` most frequent tokens across `comments`, as a set for fast lookup.
        counts = defaultdict(int)
        for sent in comments:
            for word in tokenize(sent):
                counts[word] += 1
        return set(sorted(counts, key=counts.get, reverse=True)[:limit])

    bad_words = top_words(train.loc[train["target"] == 1, "comment_text"])
    good_words = top_words(train.loc[train["target"] == 0, "comment_text"])

    def num_bad_words(text):
        # Number of tokens in `text` that are in the "bad" list.
        return sum(1 for tok in tokenize(text) if tok in bad_words)

    def num_good_words(text):
        # Number of tokens in `text` that are in the "good" list.
        return sum(1 for tok in tokenize(text) if tok in good_words)

    print("--- Generating num_bad_words")
    train["num_bad_words"] = train["comment_text"].map(num_bad_words)
    test["num_bad_words"] = test["comment_text"].map(num_bad_words)

    print("--- Generating num_good_words")
    train["num_good_words"] = train["comment_text"].map(num_good_words)
    test["num_good_words"] = test["comment_text"].map(num_good_words)

    return train, test

In [76]:
def preprocess(data):
    '''
    Add hand-crafted text features to `data` and space-pad symbols in its text.

    Credit goes to https://www.kaggle.com/gpreda/jigsaw-fast-compact-solution

    `data` must have a string `comment_text` column. It is modified in place:
    feature columns are added first (computed from the raw text), then
    `comment_text` is rewritten by `split_off_symbols_iv`. The same DataFrame
    object is returned.

    Fix: the min-word-length feature used to be written into `max_word_len`,
    overwriting the max feature; it is now stored in `min_word_len`.
    '''

    def words_without_punct(comment):
        # Strip everything that is neither a word character nor whitespace,
        # then split on single spaces. Consecutive spaces therefore produce
        # empty-string "words", and newlines/tabs do not split.
        return re.sub(r'[^\w\s]','',comment).split(" ")

    # Feature generation

    def gen_feats(df):
        start_time = time.time()

        print("--- Generating non_eng")
        df["non_eng"] = df["comment_text"].swifter.apply(contains_non_english)

        print("--- Generating first_word")
        df["first_word"] = df["comment_text"].swifter.apply(get_first_word)

        print("--- Generating num_chars (num chars)")
        df['num_chars'] = df['comment_text'].swifter.apply(len)

        print("--- Generating num_caps")
        df['num_caps'] = df['comment_text'].swifter.apply(lambda comment: sum(1 for c in comment if c.isupper()))

        print("--- Generating caps_vs_length")
        get_cap_vs_length(df)

        #print("--- Generating num_exclamation_marks")
        #df['num_exclamation_marks'] = df['comment_text'].apply(lambda comment: comment.count('!'))

        print("--- Generating num_question_marks")
        df['num_question_marks'] = df['comment_text'].apply(lambda comment: comment.count('?'))

        print("--- Generating num_punctuation")
        df['num_punctuation'] = df['comment_text'].apply(lambda comment: sum(comment.count(w) for w in '.,;:'))

        #print("--- Generating num_symbols")
        #df['num_symbols'] = df['comment_text'].apply(lambda comment: sum(comment.count(w) for w in '*&$%'))

        # NOTE(review): num_words splits the punctuation-stripped text on
        # single spaces, while num_unique_words splits the raw text on any
        # whitespace, so the two counts are not measured on the same tokens.
        print("--- Generating num_words")
        df['num_words'] = df['comment_text'].swifter.apply(lambda comment: len(words_without_punct(comment)))

        print("--- Generating num_unique_words")
        df['num_unique_words'] = df['comment_text'].swifter.apply(lambda comment: len(set(comment.split())))

        print("--- Generating unique_word_over_num_words")
        get_unique_word_over_num_words(df)

        #print("--- Generating num_smilies")
        #df['num_smilies'] = df['comment_text'].apply(lambda comment: sum(comment.count(w) for w in (':-)', ':)', ';-)', ';)')))

        print("--- Generating num_sentences")
        df['num_sentences'] = df['comment_text'].swifter.apply(lambda comment: len(re.split(r'[.!?]+', comment)))

        print("--- Generating max_word_len")
        df['max_word_len'] = df['comment_text'].swifter.apply(lambda comment: calc_max_word_len(words_without_punct(comment)))

        print("--- Generating min_word_len")
        # Stored under its own column; this used to overwrite 'max_word_len'.
        df['min_word_len'] = df['comment_text'].swifter.apply(lambda comment: calc_min_word_len(words_without_punct(comment)))

        print("--- Generating total_word_length (num of chars in words)")
        df['total_word_length'] = df['comment_text'].swifter.apply(lambda comment: calc_total_word_len(words_without_punct(comment)))

        print("--- Generating avg_word_len")
        get_avg_word_len(df)

        print("--- Generating total_unique_word_length (num of chars in words)")
        df['total_unique_word_length'] = df['comment_text'].swifter.apply(lambda comment: calc_total_unique_word_len(words_without_punct(comment)))

        print("--- Generating avg_unique_word_len")
        get_avg_unique_word_len(df)

        print("--- Finished Gen Feats. Took %s seconds ---" % (time.time() - start_time))


    def cleanText(df):
        start_time = time.time()
        df['comment_text'] = df['comment_text'].swifter.apply(split_off_symbols_iv) #increase score
        # Disabled cleaning steps (previously kept here as a string literal):
        # clean_text, remove_singles, clean_numbers, replace_typical_misspell,
        # remove_non_english, and a final fillna("") on comment_text.
        print("--- Finished cleaning text. Took %s seconds ---" % (time.time() - start_time))


    # Features are computed on the raw text, before symbols are padded.
    gen_feats(data)
    #data["comment_text"] = data["comment_text"].astype(str).apply(lambda x: pad_chars(x, punct))
    cleanText(data)
    # print("--- Neutralizing bad words")
    # neutrailize_bad_words(train,test)

    return data

# Preprocessing

In [77]:
# Load the train/test frames from HDF5.
train = pd.read_hdf('../input/train.h5')
test = pd.read_hdf('../input/test.h5')

# Set to True to work on only the first 100 rows of each frame.
SMALL_DATA = False
if SMALL_DATA:
    train, test = train[:100], test[:100]

# preprocess() modifies its argument in place and returns it, so x_train and
# x_test refer to the same DataFrames as train and test.
print("Preprocessing train data ...")
x_train = preprocess(train)
print("Preprocessing test data ...")
x_test = preprocess(test)

Preprocessing train data ...
--- Generating non_eng


HBox(children=(IntProgress(value=0, description='Pandas Apply', max=1804874, style=ProgressStyle(description_w…

--- Generating first_word


HBox(children=(IntProgress(value=0, description='Pandas Apply', max=1804874, style=ProgressStyle(description_w…

--- Generating num_chars (num chars)


HBox(children=(IntProgress(value=0, description='Pandas Apply', max=1804874, style=ProgressStyle(description_w…

--- Generating num_caps


HBox(children=(IntProgress(value=0, description='Pandas Apply', max=1804874, style=ProgressStyle(description_w…

--- Generating caps_vs_length
--- Generating num_question_marks
--- Generating num_punctuation
--- Generating num_words


HBox(children=(IntProgress(value=0, description='Pandas Apply', max=1804874, style=ProgressStyle(description_w…

--- Generating num_unique_words


HBox(children=(IntProgress(value=0, description='Pandas Apply', max=1804874, style=ProgressStyle(description_w…

--- Generating unique_word_over_num_words
--- Generating num_sentences


HBox(children=(IntProgress(value=0, description='Pandas Apply', max=1804874, style=ProgressStyle(description_w…

--- Generating max_word_len


HBox(children=(IntProgress(value=0, description='Pandas Apply', max=1804874, style=ProgressStyle(description_w…

--- Generating min_word_len


HBox(children=(IntProgress(value=0, description='Pandas Apply', max=1804874, style=ProgressStyle(description_w…

--- Generating total_word_length (num of chars in words)


HBox(children=(IntProgress(value=0, description='Pandas Apply', max=1804874, style=ProgressStyle(description_w…

--- Generating avg_word_len
--- Generating total_unique_word_length (num of chars in words)


HBox(children=(IntProgress(value=0, description='Pandas Apply', max=1804874, style=ProgressStyle(description_w…

--- Generating avg_unique_word_len
--- Finished Gen Feats
--- 179.87786626815796 seconds ---


HBox(children=(IntProgress(value=0, description='Pandas Apply', max=1804874, style=ProgressStyle(description_w…

--- 74.13771867752075 seconds ---
Preprocessing test data ...
--- Generating non_eng


HBox(children=(IntProgress(value=0, description='Pandas Apply', max=97320, style=ProgressStyle(description_wid…

--- Generating first_word


HBox(children=(IntProgress(value=0, description='Pandas Apply', max=97320, style=ProgressStyle(description_wid…

--- Generating num_chars (num chars)


HBox(children=(IntProgress(value=0, description='Pandas Apply', max=97320, style=ProgressStyle(description_wid…

--- Generating num_caps


HBox(children=(IntProgress(value=0, description='Pandas Apply', max=97320, style=ProgressStyle(description_wid…

--- Generating caps_vs_length
--- Generating num_question_marks
--- Generating num_punctuation
--- Generating num_words


HBox(children=(IntProgress(value=0, description='Pandas Apply', max=97320, style=ProgressStyle(description_wid…

--- Generating num_unique_words


HBox(children=(IntProgress(value=0, description='Pandas Apply', max=97320, style=ProgressStyle(description_wid…

--- Generating unique_word_over_num_words
--- Generating num_sentences


HBox(children=(IntProgress(value=0, description='Pandas Apply', max=97320, style=ProgressStyle(description_wid…

--- Generating max_word_len


HBox(children=(IntProgress(value=0, description='Pandas Apply', max=97320, style=ProgressStyle(description_wid…

--- Generating min_word_len


HBox(children=(IntProgress(value=0, description='Pandas Apply', max=97320, style=ProgressStyle(description_wid…

--- Generating total_word_length (num of chars in words)


HBox(children=(IntProgress(value=0, description='Pandas Apply', max=97320, style=ProgressStyle(description_wid…

--- Generating avg_word_len
--- Generating total_unique_word_length (num of chars in words)


HBox(children=(IntProgress(value=0, description='Pandas Apply', max=97320, style=ProgressStyle(description_wid…

--- Generating avg_unique_word_len
--- Finished Gen Feats
--- 10.059473037719727 seconds ---


HBox(children=(IntProgress(value=0, description='Pandas Apply', max=97320, style=ProgressStyle(description_wid…

--- 4.183730363845825 seconds ---
