# Setup

In [1]:
import time
import random
import pandas as pd
import numpy as np
import gc
import re
import torch
from torchtext import data
import spacy
from tqdm import tqdm_notebook, tnrange

from tqdm.auto import tqdm

tqdm.pandas(desc='Progress')
from collections import Counter
from textblob import TextBlob
from nltk import word_tokenize

import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.autograd import Variable
from torchtext.data import Example
from sklearn.metrics import f1_score
import torchtext
import os 

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# cross validation and metrics
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from torch.optim.optimizer import Optimizer
from unidecode import unidecode
from sklearn.metrics import classification_report

Using TensorFlow backend.


### Basic Parameters

In [2]:
embed_size = 300 # how big is each word vector
max_features = 120000 # how many unique words to use (i.e num rows in embedding vector)
maxlen = 70 # max number of words in a question to use
batch_size = 512 # how many samples to process at once
n_epochs = 5 # how many times to iterate over all samples
# n_splits = 5 # Number of K-fold Splits

SEED = 1029

### Ensure determinism in the results

The is for the model built in Pytorch. Determinism is one of the advantages of Pytorch in this competition.

In [3]:
def seed_everything(seed=1029):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
seed_everything()

### Code for Loading Embeddings

Functions taken from the kernel:https://www.kaggle.com/gmhost/gru-capsule

In [4]:
def load_glove(word_index):
    EMBEDDING_FILE = '../input/embeddings/glove.840B.300d/glove.840B.300d.txt'
    def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')[:300]
    embeddings_index = dict(get_coefs(*o.split(" ")) for o in open(EMBEDDING_FILE))
    
    all_embs = np.stack(embeddings_index.values())
    emb_mean,emb_std = -0.005838499,0.48782197
    embed_size = all_embs.shape[1]
    # word_index = tokenizer.word_index
    nb_words = min(max_features, len(word_index))
    # Why random embedding for OOV? what if use mean?
    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
    #embedding_matrix = np.random.normal(emb_mean, 0, (nb_words, embed_size)) # std 0
    for word, i in word_index.items():
        if i >= max_features: continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None: embedding_matrix[i] = embedding_vector
            
    return embedding_matrix 
    
def load_fasttext(word_index):    
    EMBEDDING_FILE = '../input/embeddings/wiki-news-300d-1M/wiki-news-300d-1M.vec'
    def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')
    embeddings_index = dict(get_coefs(*o.split(" ")) for o in open(EMBEDDING_FILE) if len(o)>100)

    all_embs = np.stack(embeddings_index.values())
    emb_mean,emb_std = all_embs.mean(), all_embs.std()
    embed_size = all_embs.shape[1]

    # word_index = tokenizer.word_index
    nb_words = min(max_features, len(word_index))
    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
    #embedding_matrix = np.random.normal(emb_mean, 0, (nb_words, embed_size))
    for word, i in word_index.items():
        if i >= max_features: continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None: embedding_matrix[i] = embedding_vector

    return embedding_matrix

def load_para(word_index):
    EMBEDDING_FILE = '../input/embeddings/paragram_300_sl999/paragram_300_sl999.txt'
    def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')
    embeddings_index = dict(get_coefs(*o.split(" ")) for o in open(EMBEDDING_FILE, encoding="utf8", errors='ignore') if len(o)>100)

    all_embs = np.stack(embeddings_index.values())
    emb_mean,emb_std = -0.0053247833,0.49346462
    embed_size = all_embs.shape[1]

    # word_index = tokenizer.word_index
    nb_words = min(max_features, len(word_index))
    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
    #embedding_matrix = np.random.normal(emb_mean, 0, (nb_words, embed_size))
    for word, i in word_index.items():
        if i >= max_features: continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None: embedding_matrix[i] = embedding_vector
    
    return embedding_matrix

# Load Data

In [5]:
df_train = pd.read_csv("../input/train.csv")
df_test = pd.read_csv("../input/test.csv")
df = pd.concat([df_train ,df_test],sort=True)

In [6]:
def build_vocab(texts):
    sentences = texts.apply(lambda x: x.split()).values
    vocab = {}
    for sentence in sentences:
        for word in sentence:
            try:
                vocab[word] += 1
            except KeyError:
                vocab[word] = 1
    return vocab
vocab = build_vocab(df['question_text'])

In [7]:
sin = len(df_train[df_train["target"]==0])
insin = len(df_train[df_train["target"]==1])
persin = (sin/(sin+insin))*100
perinsin = (insin/(sin+insin))*100            
print("# Sincere questions: {:,}({:.2f}%) and # Insincere questions: {:,}({:.2f}%)".format(sin,persin,insin,perinsin))
# print("Sinsere:{}% Insincere: {}%".format(round(persin,2),round(perinsin,2)))
print("# Test samples: {:,}({:.3f}% of train samples)".format(len(df_test),len(df_test)/len(df_train)))

# Sincere questions: 1,225,312(93.81%) and # Insincere questions: 80,810(6.19%)
# Test samples: 375,806(0.288% of train samples)


# Preprocess

Part of codes borrowed from
* Improve your Score with some Text Preprocessing https://www.kaggle.com/theoviel/improve-your-score-with-some-text-preprocessing

In [8]:
def build_vocab(texts):
    sentences = texts.apply(lambda x: x.split()).values
    vocab = {}
    for sentence in sentences:
        for word in sentence:
            try:
                vocab[word] += 1
            except KeyError:
                vocab[word] = 1
    return vocab

def known_contractions(embed):
    known = []
    for contract in contraction_mapping:
        if contract in embed:
            known.append(contract)
    return known

def clean_contractions(text, mapping):
    specials = ["’", "‘", "´", "`"]
    for s in specials:
        text = text.replace(s, "'")
    text = ' '.join([mapping[t] if t in mapping else t for t in text.split(" ")])
    return text

def correct_spelling(x, dic):
    for word in dic.keys():
        x = x.replace(word, dic[word])
    return x

def unknown_punct(embed, punct):
    unknown = ''
    for p in punct:
        if p not in embed:
            unknown += p
            unknown += ' '
    return unknown

def clean_numbers(x):
    x = re.sub('[0-9]{5,}', '#####', x)
    x = re.sub('[0-9]{4}', '####', x)
    x = re.sub('[0-9]{3}', '###', x)
    x = re.sub('[0-9]{2}', '##', x)
    return x

def clean_special_chars(text, punct, mapping):
    for p in mapping:
        text = text.replace(p, mapping[p])
    
    for p in punct:
        text = text.replace(p, f' {p} ')
    
    specials = {'\u200b': ' ', '…': ' ... ', '\ufeff': '', 'करना': '', 'है': ''}  # Other special characters that I have to deal with in last
    for s in specials:
        text = text.replace(s, specials[s])
    
    return text

def add_lower(embedding, vocab):
    count = 0
    for word in vocab:
        if word in embedding and word.lower() not in embedding:  
            embedding[word.lower()] = embedding[word]
            count += 1
    print(f"Added {count} words to embedding")    

In [9]:
puncts = [',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';', "'", '$', '&', '/', '[', ']', '>', '%', '=', '#', '*', '+', '\\', '•',  '~', '@', '£', 
 '·', '_', '{', '}', '©', '^', '®', '`',  '<', '→', '°', '€', '™', '›',  '♥', '←', '×', '§', '″', '′', 'Â', '█', '½', 'à', '…', 
 '“', '★', '”', '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾', '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─', 
 '▒', '：', '¼', '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲', 'è', '¸', '¾', 'Ã', '⋅', '‘', '∞', 
 '∙', '）', '↓', '、', '│', '（', '»', '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø', '¹', '≤', '‡', '√', ]

def clean_text(x):
    x = str(x)
    for punct in puncts:
        x = x.replace(punct, f' {punct} ')
    return x

def clean_numbers(x):
    x = re.sub('[0-9]{5,}', '#####', x)
    x = re.sub('[0-9]{4}', '####', x)
    x = re.sub('[0-9]{3}', '###', x)
    x = re.sub('[0-9]{2}', '##', x)
    return x

mispell_dict = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not", "didn't": "did not",  "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not", "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",  "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would", "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would", "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam", "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is", "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as", "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would", "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have", "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have", "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are", "we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",  "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is", "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have", "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have", "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have","you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have", "you're": "you are", "you've": "you have", 'colour': 'color', 'centre': 'center', 'favourite': 'favorite', 'travelling': 'traveling', 'counselling': 'counseling', 'theatre': 'theater', 'cancelled': 'canceled', 'labour': 'labor', 'organisation': 'organization', 'wwii': 'world war 2', 'citicise': 'criticize', 'youtu ': 'youtube ', 'Qoura': 'Quora', 'sallary': 'salary', 'Whta': 'What', 'narcisist': 'narcissist', 'howdo': 'how do', 'whatare': 'what are', 'howcan': 'how can', 'howmuch': 'how much', 'howmany': 'how many', 'whydo': 'why do', 'doI': 'do I', 'theBest': 'the best', 'howdoes': 'how does', 'mastrubation': 'masturbation', 'mastrubate': 'masturbate', "mastrubating": 'masturbating', 'pennis': 'penis', 'Etherium': 'Ethereum', 'narcissit': 'narcissist', 'bigdata': 'big data', '2k17': '2017', '2k18': '2018', 'qouta': 'quota', 'exboyfriend': 'ex boyfriend', 'airhostess': 'air hostess', "whst": 'what', 'watsapp': 'whatsapp', 'demonitisation': 'demonetization', 'demonitization': 'demonetization', 'demonetisation': 'demonetization'}

def _get_mispell(mispell_dict):
    mispell_re = re.compile('(%s)' % '|'.join(mispell_dict.keys()))
    return mispell_dict, mispell_re

mispellings, mispellings_re = _get_mispell(mispell_dict)
def replace_typical_misspell(text):
    def replace(match):
        return mispellings[match.group(0)]
    return mispellings_re.sub(replace, text)

The following toxic words are extracted from insincere questions, 1384 words in total. Codes can be found at the bottom. (Appendix)

In [10]:
real_toxic = ['soetoro', 'yall', 'islamaphob', 'usur', 'wanker', 'thole', 'cocksuck', 'twat', 'remoan', 'hindian', 'khazari', 
              'terroristan', 'bootlick', 'boglin', 'lathi', 'wmaf', 'auvela', 'simpleton', 'kike', 'hongkonges', 'jewism', 
              'chutzpah', 'jewdar', 'zoophil', 'bozo', 'zarat', 'dirtbag', 'witchhunt', 'choicer', 'wagga', 'zoophilia', 
              'peoplekind', 'moeslim', 'eroupian', 'cishet', 'perki', 'bomp', 'wolyn', 'brimston', 'overjoy', 'metzitzah', 
              'andhbhakt', 'myeshia', 'fack', 'manoo', 'oompa', 'loompa', 'demcorat', 'quranist', 'outbr', 'kagan', 'hylton', 
              'repub', 'hissi', 'teehe', 'sugarcoat', 'srkian', 'blowhard', 'trumpanze', 'caucas', 'kamagra', 'derrier', 'hahahah', 
              'largess', 'overproud', 'sugarbab', 'islamaphobia', 'armalit', 'neckbeard', 'trollbot', 'bonehead', 'insest', 'yanukovych', 
              'hubbel', 'lgbti', 'lgbqt', 'biid', 'fruitcak', 'brassier', 'clemenc', 'awlaki', 'perv', 'khanz', 'idolatr', 'turkifi', 
              'baiter', 'hinduphobia', 'jeran', 'witold', 'henpeck', 'hypocraci', 'ezor', 'hiraba', 'cuckserv', 'zoosad', 'iqer', 
              'mangalorean', 'bhapa', 'rapaci', 'radh', 'islaam', 'homoerot', 'phedophil', 'apeshit', 'incl', 'mugger', 'snivel', 
              'massler', 'undiplomat', 'dirtnap', 'turkist', 'castat', 'abyssinian', 'espinanzo', 'skkim', 'copout', 'japheth', 
              'natzi', 'gullabl', 'fauxcahonta', 'gobbi', 'putrid', 'nonflatt', 'ramgarhia', 'overset', 'underset', 'golddigg', 
              'pugb', 'womyn', 'shaniqua', 'homeboy', 'sotomayor', 'unemancip', 'tinni', 'torward', 'grandstand', 'vori', 'anglophob', 
              'coolaid', 'mafioso', 'facetard', 'fucki', 'puneet', 'gloryhol', 'lbgtq', 'bogan', 'soveriegn', 'smartia', 'dumbassistan', 
              'condoleezza', 'headstrong', 'predilect', 'motherfuckin', 'pinko', 'sincerest', 'allreadi', 'vartheta', 'asshat', 'sodomit', 'bremoan', 'reproach', 'razan', 'vrindavani', 'marinov', 'ghaati', 'gangaj', 'bukura', 'houseboy', 'discust', 'hoser', 'qouran', 'counterexampl', 'bordello', 'sengar', 'weeni', 'pussifi', 'dumbfuck', 'mungu', 'dravidanadu', 'rapistan', 'reappropri', 'senselessli', 'pjak', 'elephantin', 'shithead', 'clawn', 'bikya', 'masr', 'senousa', 'kahala', 'corduta', 'griot', 'womankind', 'lgbtqai', 'bhakth', 'refferandum', 'hahahaha', 'manifort', 'dhimmi', 'bludger', 'sephardim', 'remini', 'minutia', 'nauseum', 'effigi', 'shimon', 'pere', 'fillipino', 'majoosi', 'releasethememo', 'mahtob', 'niggl', 'helpag', 'disproport', 'honnavar', 'supermoon', 'mailmen', 'wahhabist', 'bukkari', 'apostat', 'fingur', 'crabbi', 'rohingnya', 'ruffian', 'madridiot', 'rastrakuta', 'revealingli', 'houri', 'clusterfuck', 'ruinat', 'niggeriah', 'nonchristian', 'wussi', 'vishnuist', 'pooper', 'tattl', 'premanand', 'biphobia', 'iraqui', 'heartlessli', 'discrim', 'sycophant', 'suduc', 'wacko', 'babchenko', 'scrimgeour', 'simper', 'choot', 'generalissimo', 'lgtbq', 'onerror', 'jamil', 'demonrat', 'uygur', 'courteous', 'ignoramu', 'lavon', 'banyak', 'hypercet', 'bhramin', 'dilit', 'whorish', 'girasa', 'genoves', 'uniron', 'libidin', 'awaya', 'demontis', 'stephanopoulo', 'nidal', 'jewconomi', 'respectful', 'kashmirian', 'electri', 'kangz', 'muslims', 'jayda', 'fransen', 'otim', 'mussi', 'gayish', 'hispano', 'frederica', 'rabidli', 'yigal', 'goodlatt', 'witless', 'unashamedli', 'donmeh', 'shimanski', 'idnani', 'devar', 'irfaan', 'ringlead', 'gayism', 'stelter', 'salonika', 'lockstep', 'sexili', 'bancoop', 'uttp', 'gleefulli', 'rapefuge', 'neic', 'magoo', 'katyn', 'mischaracter', 'humdrum', 'bukth', 'japhet', 'podunk', 'circlejerk', 'deicid', 'arkansaw', 'charedi', 'jewplic', 'barracoon', 'shirki', 'dacoiti', 'anglosaxon', 'statian', 'doklan', 'amrendra', 'thang', 'ovadia', 'souad', 'sheesh', 'eever', 'thuggish', 'troglodyt', 'docto', 'dickhol', 'mangina', 'goru', 'actually', 'hinduphob', 'blazingli', 'tooter', 'disengenu', 'boshniak', 'middleclass', 'penish', 'libturd', 'bridgett', 'muslm', 'trumpton', 'chodu', 'wonen', 'chup', 'pakistaini', 'mesta', 'browner', 'overgeneralis', 'wellkom', 'tbey', 'illicitli', 'pedog', 'trumptard', 'bronycon', 'jerkoff', 'unaesthet', 'demosthen', 'waggott', 'murda', 'kaci', 'strozk', 'scandanavian', 'sleepin', 'shiff', 'yobbo', 'liberachi', 'saveg', 'kardasian', 'anglocentr', 'lumberyard', 'esoterica', 'narikoravar', 'loudmouth', 'vagena', 'hilata', 'faantasi', 'schlong', 'maliyali', 'alari', 'braincel', 'eyerollingli', 'ophir', 'muta', 'radhswaomi', 'liberal', 'hypocrisy', 'shemen', 'hemen', 'thatwhi', 'bitingli', 'islamophilia', 'ayodya', 'ilegali', 'farrakan', 'anzakistan', 'knifer', 'scudus', 'sychopant', 'chaibala', 'achh', 'bisaya', 'trigglypuff', 'bakrichod', 'wellassa', 'fichteschen', 'schellingschen', 'shitslam', 'pakhandi', 'ejewc', 'islamif', 'djihaddist', 'freakout', 'suckhol', 'immutur', 'khatriya', 'rituel', 'lakhta', 'ociopath', 'dieudonn', 'discremin', 'atlst', 'tamilzan', 'busload', 'reasorc', 'shitheel', 'nchausen', 'pampuia', 'fiuta', 'ebba', 'putang', 'douchebagi', 'odonnel', 'climateg', 'matherfuck', 'dysfuntin', 'keylor', 'infantino', 'nopenmur', 'oftheir', 'evangilitac', 'bussca', 'moplah', 'whinehous', 'prejust', 'illiteratendra', 'feminim', 'infedel', 'barmaid', 'adharsh', 'betamal', 'swatt', 'walding', 'mudbon', 'spentbtim', 'butterfac', 'liberalaphobia', 'shara', 'turanist', 'laner', 'harrem', 'gonen', 'segev', 'andressen', 'crocdil', 'maharahtra', 'weener', 'lesin', 'pridnestrovi', 'lensmak', 'sccount', 'thereproblem', 'whiet', 'spanke', 'leakendra', 'zuraida', 'kamaruddin', 'dipshit', 'scummer', 'screqe', 'hezbolla', 'extortion', 'cuckholdri', 'heyberi', 'secund', 'mahapow', 'bsbe', 'punjaban', 'playth', 'transgener', 'whitelash', 'usele', 'liliettedo', 'matteer', 'raceism', 'plonker', 'khaleeji', 'deerfield', 'klansmen', 'anymore', 'ladan', 'milionair', 'minsit', 'externalist', 'feto', 'singlepay', 'adoles', 'titfuck', 'tennesseu', 'khurmatu', 'grouper', 'rammayana', 'tkink', 'gurdaa', 'insaan', 'carasso', 'jabotinski', 'parvu', 'roddenberri', 'ashyayyyyy', 'clownish', 'modichod', 'ahilya', 'sherif', 'greedo', 'genetilia', 'mengrelian', 'bashirhat', 'causian', 'otherw', 'soyboy', 'unspeci', 'catestroph', 'burek', 'vagimir', 'poontang', 'rafidhi', 'stinkin', 'motti', 'stoooopid', 'enjiy', 'heim', 'paskitan', 'lieng', 'maffia', 'absentminded', 'naredra', 'retitl', 'saygin', 'yalcin', 'patita', 'exonor', 'yogiji', 'buttsor', 'territorist', 'pedophillo', 'prosucut', 'egsist', 'springstein', 'mifsud', 'donate', 'painti', 'cruellest', 'ugenasist', 'ketevan', 'lgbtqqiaap', 'pangend', 'ginorm', 'biddi', 'incestophil', 'gandl', 'reinfectu', 'zakia', 'belkhiri', 'kulak', 'whypakistan', 'sleazebal', 'shanaya', 'wringli', 'unicivlis', 'perforc', 'ivanca', 'clop', 'phylosaphi', 'coocoo', 'leecher', 'neaderthal', 'tastelessli', 'libera', 'hackwork', 'karandlaj', 'manlyhood', 'chaft', 'zooland', 'bechet', 'omnibook', 'projectmakom', 'practiv', 'thammana', 'serapi', 'christu', 'junkook', 'inshirian', 'kejru', 'puddingpop', 'fyromian', 'unputin', 'aligatt', 'neonazi', 'domex', 'bachchabazi', 'hunker', 'yoshiro', 'multicur', 'venganc', 'steni', 'pepplr', 'julkar', 'khate', 'pasttim', 'middleschool', 'handsmaid', 'tembl', 'dimbl', 'mapilla', 'estabilish', 'baluchi', 'gleeful', 'gawp', 'sinia', 'chinnes', 'peadofilia', 'alaria', 'velayat', 'faqih', 'bastardi', 'abney', 'satti', 'fatherboard', 'parentboard', 'duetert', 'penatra', 'fantiz', 'urbanit', 'cunning', 'cprf', 'meer', 'ancest', 'bafoon', 'thaman', 'zeme', 'dicksuck', 'tiannanmen', 'shaub', 'abhineet', 'iryna', 'bilyk', 'practc', 'reprocuss', 'hardick', 'canuk', 'vulgur', 'obsessed', 'nympho', 'dyfunt', 'badmoth', 'flippin', 'smeel', 'langkian', 'paquin', 'single', 'jewlet', 'palestanian', 'beyter', 'dustruct', 'websight', 'whola', 'baffoon', 'extrimisit', 'hjmni', 'bgettfbrfvsbtrfvnhtgfvtnh', 'rbghnrjthfsgrnhtf', 'bjhgfdv', 'yjvd', 'bbmmjijikjki', 'kkiuipuyffgfff', 'yuyhjun', 'mkijthfnymufynhgb', 'indissolubl', 'flurish', 'athieth', 'inferiort', 'briitsh', 'comonwealth', 'fakestinian', 'cacusian', 'privalg', 'extratestri', 'mooslim', 'trumpin', 'rathar', 'ladhak', 'watchabl', 'gaudiya', 'madresa', 'mahashivratri', 'discreetstor', 'aknowledg', 'loor', 'sphani', 'causs', 'hahahahahahah', 'starin', 'boehner', 'fullstop', 'spose', 'yufeng', 'trumpito', 'gamerg', 'ayman', 'mohyeldin', 'propalestinian', 'pointedli', 'ferozepur', 'dtermin', 'chhatrapathi', 'europen', 'mcgucken', 'mapabl', 'cybertrol', 'braggi', 'ccaus', 'jacker', 'dominionist', 'religeon', 'plumpi', 'crudup', 'magazine', 'paigambar', 'twaught', 'baneerje', 'iolanda', 'blee', 'hardhead', 'mudsling', 'whatabouteri', 'karaiyan', 'straightfoward', 'honseijin', 'regino', 'snowflakey', 'becoming', 'ilik', 'diphteria', 'apostatis', 'jewocraci', 'managementskil', 'seacoast', 'pederast', 'lefist', 'teluguwala', 'chinkistan', 'electiom', 'askhenazi', 'hartal', 'oppres', 'haro', 'preas', 'antyth', 'twirk', 'offa', 'indevu', 'inproportion', 'pthan', 'whabi', 'cilit', 'helsiinki', 'shyt', 'navaj', 'besmirch', 'holocuast', 'nometri', 'hynd', 'shopian', 'sleazebag', 'edglelord', 'lingchi', 'virginit', 'demacrat', 'sigint', 'priiz', 'freakier', 'causinng', 'falangist', 'sexuality', 'leeland', 'shrimpboy', 'tepperberg', 'untermensch', 'dikh', 'pappulog', 'ducj', 'audiancecy', 'ifcso', 'drawl', 'outfought', 'coldheart', 'soreceri', 'superunif', 'expatriot', 'othet', 'demcrat', 'monard', 'longmont', 'eyehol', 'kapolei', 'greasier', 'declassif', 'smush', 'illeagl', 'bretheren', 'izaq', 'rendia', 'kangladesh', 'necermind', 'parda', 'trudea', 'dalyellup', 'cambat', 'congoid', 'asiasoid', 'chandrashekhar', 'lesbiam', 'abeshia', 'terorrist', 'presbyt', 'narandar', 'precariat', 'antihindu', 'antijew', 'gursikh', 'nonpract', 'wirshipp', 'juar', 'zucc', 'azerbajani', 'bullycid', 'jait', 'deepti', 'terrror', 'lezbien', 'rentomojo', 'ifugao', 'narzil', 'afriforum', 'gladden', 'breathian', 'kenedi', 'kallstrom', 'coonserv', 'rajmandir', 'prorogu', 'womais', 'stickup', 'feadup', 'cakemak', 'saffronis', 'sportiv', 'murican', 'pakodaman', 'insterburg', 'memel', 'allenstein', 'pillau', 'helisberg', 'gumbinnen', 'nordenburg', 'kolberg', 'sabhi', 'gyaan', 'baatn', 'emptier', 'shagger', 'coroplast', 'worshop', 'hinsu', 'dumbasseri', 'machedeo', 'doofus', 'beauto', 'wohn', 'kurk', 'donno', 'deletequora', 'caucho', 'gook', 'saipul', 'turnendra', 'ahich', 'tinoo', 'foolest', 'informatio', 'fedoff', 'atttack', 'overcompens', 'politicain', 'miquel', 'sirvent', 'frego', 'talaat', 'bettina', 'arndt', 'ivorian', 'shouldjudiciari', 'preodenti', 'smoochi', 'inshalol', 'deptart', 'jailtim', 'condenscend', 'scroung', 'karsevak', 'laxit', 'theirselev', 'uzbekistani', 'blubd', 'rosneft', 'mindfuckin', 'thyey', 'hijo', 'puta', 'faced', 'kenpeitai', 'thaanai', 'thalaivar', 'untrol', 'janitch', 'iinspit', 'coprophagia', 'witchunt', 'mangoliod', 'bmlm', 'burret', 'anthraci', 'unballanc', 'jewsplain', 'hindni', 'qaum', 'opset', 'tormoil', 'overprivileg', 'jeisu', 'winded', 'disagreemen', 'paskistani', 'gynocentr', 'sittin', 'antifascist', 'beuati', 'parashar', 'annihal', 'mcjew', 'jewovitch', 'jewski', 'ninesensical', 'americocentr', 'rezeki', 'agaw', 'squanch', 'barreto', 'haffiz', 'syeed', 'mizoz', 'godsak', 'dhimmitud', 'mynmar', 'superho','thappad', 'tamacha', 'chooseday', 'fauti', 'doctuh', 'christineti', 'brexitosi', 'behenchod', 'kopelman', 'pusdi', 'binyamin', 'reshapen', 'cockhead', 'baldlos', 'antilgbtq', 'untalk', 'mcanal', 'hugenont', 'reguge', 'mouthof', 'rubbishi', 'nthe', 'pissyleak', 'engllish', 'carnist', 'awhit', 'malkin', 'receb', 'amillion', 'brazillan', 'khandhan', 'unequiv', 'niga', 'titsfuck', 'femwhin', 'varpi', 'maranzano', 'profaci', 'gagliano', 'dickstein', 'bareback', 'dahej', 'drund', 'schlussel', 'sceced', 'khortha', 'howmoth', 'unabashedli', 'demetrio', 'sonscritona', 'sikhim', 'pcoptimum', 'monther', 'cockoroach', 'hissyfit', 'pooran', 'fugli', 'halakha', 'gurl', 'newsclip', 'mahahatma', 'islamic', 'hostpit', 'choota', 'haterd', 'resourceless', 'narcissistici', 'errog', 'jalikatu', 'zzzzzz', 'sapless', 'organix', 'namessak', 'sisterfuck', 'demigaug', 'misshap', 'blubberi', 'frienemi', 'anuwind', 'prognath', 'slimebal', 'ditheist', 'mmel', 'dought', 'mongolianz', 'blurr', 'racewar', 'peadopbil', 'musevi', 'assassinated', 'woebegon', 'bonespur', 'migl', 'chilenean', 'khadem', 'oblongata', 'japhetit', 'formosan', 'waali', 'baai', 'arundati', 'suliman', 'masterbur', 'dumbocrat', 'bungler', 'chakka', 'eunech', 'adrianna', 'barbeau', 'striaght', 'uwhi', 'cointelpro', 'witr', 'pinay', 'skipral', 'adjudg', 'expositor', 'whiteboy', 'boorstein', 'radhik', 'toastsexu', 'evononsens', 'rafidha', 'affan', 'intermesh', 'sunhy', 'raciou', 'fairytail', 'grettel', 'chiptun', 'critics', 'mainstem', 'commons', 'vikern', 'coudnt', 'alphagob', 'codeword', 'toiletpap', 'freemasin', 'kashmi', 'papalist', 'yechuri', 'distabilis', 'royspearsblog', 'mughrabi', 'fvcking', 'arund', 'chineses', 'doodoo', 'profetacid', 'uhmmm', 'abcdefghijklmnopqrstuvwxyz', 'badruddin', 'ajmal', 'buysoma', 'shexian', 'gosch', 'assett', 'promuslim', 'femisint', 'reaccept', 'interjector', 'midianit', 'ickiest', 'msot', 'puntuc', 'intactivist', 'agirl', 'pantyless', 'eneyon', 'bullyboy', 'nomi', 'muslimophobia', 'fleed', 'postbellum', 'transgenu', 'purgegam', 'votr', 'gudi', 'padwa', 'bloodfart', 'muton', 'ugggggggllli', 'trumpid', 'dixieland', 'safegenericpharmaci', 'comradeship', 'avalur', 'frigin', 'khalq', 'grandli', 'seduic', 'countrywomen', 'testicles', 'erop', 'convanc', 'littledyck', 'ricist', 'flurazepam', 'unjoin', 'nunesmemo', 'govshutdown', 'trumpli', 'lonley', 'electrogravit', 'antireligion', 'fornicar', 'inexcus', 'brahui', 'sepratist', 'guiltier', 'footbind', 'senomyx', 'dommi', 'gaurang', 'oslash', 'whatshi', 'ticrapo', 'unwsnt', 'afrucan', 'nutter', 'ncrypto', 'janakpur', 'wheee', 'demecrat', 'videogamedunki', 'dunki', 'kanhaya', 'crufic', 'unclassi', 'poojari', 'fervenc', 'paelo', 'fujianes', 'shoney', 'buereaucrat', 'antiamerican', 'squirmi', 'snakelik', 'hona', 'geftaman', 'milliband', 'indish', 'resound', 'bagdadi', 'unpeac', 'spitter', 'hiten', 'mannish', 'relevantli', 'reinvestigst', 'shrinidhi', 'douchey', 'ocasion', 'eassili', 'ameri', 'mcmuffin', 'practicli', 'nuuk', 'castoff', 'oveckin', 'aremenian', 'communalist', 'whereupon', 'biaexit', 'alquada', 'comfot', 'monomaniac', 'funcaptcha', 'dignidad', 'demascul', 'unbelief', 'pedowood', 'antigunn', 'vryant', 'equalistar', 'ovedu', 'blakc', 'lebian', 'bluewat', 'musilim', 'qstn', 'gordita', 'wowowin', 'masturbatori', 'fuckn', 'muahmmad', 'armostrax', 'pregegn', 'uchranian', 'bhuttani', 'intercors', 'sympathes', 'altai', 'lickiti', 'tamirlarg', 'tamirlnadu', 'kalshnikov', 'dumpsit', 'murtadeen', 'seehof', 'progressivist', 'guliani', 'hizra', 'copulatori', 'husoon', 'vegana', 'homosoci', 'defecat', 'diaoyudao', 'populum', 'selll', 'dervish', 'fashist', 'fader', 'tukd', 'tudk', 'baffoonish', 'sutpid', 'junagarh', 'xingp', 'shaitan', 'coatian', 'patmo', 'jaichand', 'ferrog', 'bholeje', 'mataje', 'hsun', 'rewrot', 'whyall', 'sanyasa', 'khoja', 'ahmadia', 'farcism', 'beefburg', 'clothphemi', 'repsect', 'sarvjeet', 'bumiputera', 'gether', 'scamgress', 'genda', 'huehuehuehuehu', 'cnnblackmail', 'feliformia', 'cadsoft', 'hypocritic', 'barzilai', 'shivdharma', 'guesswork', 'atlanticist', 'chitragong', 'fliud', 'croch', 'haastrup', 'mccray', 'taurus', 'undrip', 'subbotonik', 'sharadevi', 'christianphob', 'lives', 'dusfunt', 'acrit', 'asswhol', 'defiantli', 'diahhrea', 'bitransgend', 'dhesiyam', 'malcont', 'ausi', 'profiel', 'daco', 'flippant', 'agitprop', 'jockin', 'gunderson', 'sportswomen', 'trangend', 'obamaand', 'eeewww', 'mmmmm', 'sodomis', 'bitc', 'piehol', 'kanglu', 'conmen', 'manichaeist', 'honorkil', 'amongua', 'argentian', 'kalapani', 'gruffer', 'bhaichung', 'bhutia', 'transfung', 'eveb', 'comf', 'reaganist', 'napali', 'masshol', 'anthropophagolagnia', 'dharmik']

### Feature Engineering

In [11]:
from sklearn.preprocessing import StandardScaler

def add_features(df):
    
    df['question_text'] = df['question_text'].progress_apply(lambda x:str(x))
    df['total_length'] = df['question_text'].progress_apply(len)
    df['capitals'] = df['question_text'].progress_apply(lambda comment: sum(1 for c in comment if c.isupper()))
    df['caps_vs_length'] = df.progress_apply(lambda row: float(row['capitals'])/float(row['total_length']),
                                axis=1)
    df['num_words'] = df.question_text.str.count('\S+')
    df['num_unique_words'] = df['question_text'].progress_apply(lambda comment: len(set(w for w in comment.split())))
    df['words_vs_unique'] = df['num_unique_words'] / df['num_words']
    
    def toxic_word_ct(txt):
        ct = 0
        toxic_word = ['liber','democra','jew','muslim','white','gay','hindu','trump','women','black','christian','hate']
        for word in toxic_word:
            if word in txt:
                ct=1
                break
        return ct
    
    def unique_toxic(txt):
        ct = 0
        toxic_word = real_toxic
        for word in toxic_word:
            if word in txt:
                ct=1
                break
        return ct
            
    df['sensetive'] = df['question_text'].progress_apply(toxic_word_ct)
    df['unique_tx'] = df['question_text'].progress_apply(unique_toxic)

    return df

def load_and_prec():
    train_df = pd.read_csv("../input/train.csv")
    test_df = pd.read_csv("../input/test.csv")
    print("Train shape : ",train_df.shape)
    print("Test shape : ",test_df.shape)
    
    # Lower
    train_df["question_text"] = train_df["question_text"].apply(lambda x: x.lower())
    test_df["question_text"] = test_df["question_text"].apply(lambda x: x.lower())

    # Clean the text
    train_df["question_text"] = train_df["question_text"].progress_apply(lambda x: clean_text(x))
    test_df["question_text"] = test_df["question_text"].progress_apply(lambda x: clean_text(x))
    
    # Clean numbers
    train_df["question_text"] = train_df["question_text"].progress_apply(lambda x: clean_numbers(x))
    test_df["question_text"] = test_df["question_text"].progress_apply(lambda x: clean_numbers(x))
    
    # Clean speelings
    train_df["question_text"] = train_df["question_text"].progress_apply(lambda x: replace_typical_misspell(x))
    test_df["question_text"] = test_df["question_text"].apply(lambda x: replace_typical_misspell(x))
    
    # Fill up the missing values
    train_X = train_df["question_text"].fillna("_##_").values
    test_X = test_df["question_text"].fillna("_##_").values

    
    # Add features
    train = add_features(train_df)
    test = add_features(test_df)

    features = train[['caps_vs_length', 'words_vs_unique','sensetive','unique_tx','total_length','capitals','num_words','num_unique_words']].fillna(0)
    test_features = test[['caps_vs_length', 'words_vs_unique','sensetive','unique_tx','total_length','capitals','num_words','num_unique_words']].fillna(0)
    
    # Standardize those features
    ss = StandardScaler()
    ss.fit(np.vstack((features, test_features)))
    features = ss.transform(features)
    test_features = ss.transform(test_features)

    # Tokenize the sentences
    tokenizer = Tokenizer(num_words=max_features)
    tokenizer.fit_on_texts(list(train_X))
    train_X = tokenizer.texts_to_sequences(train_X)
    test_X = tokenizer.texts_to_sequences(test_X)

    # Pad the sentences 
    train_X = pad_sequences(train_X, maxlen=maxlen)
    test_X = pad_sequences(test_X, maxlen=maxlen)

    # Get the target values
    train_y = train_df['target'].values
    
    # Shuffling the data
    np.random.seed(SEED)
    trn_idx = np.random.permutation(len(train_X))

    train_X = train_X[trn_idx]
    train_y = train_y[trn_idx]
    features = features[trn_idx]
    
    return train_X, test_X, train_y, features, test_features, tokenizer.word_index

In [12]:
x_train, x_test, y_train, features, test_features, word_index = load_and_prec() 

Train shape :  (1306122, 3)
Test shape :  (375806, 2)


HBox(children=(IntProgress(value=0, description='Progress', max=1306122, style=ProgressStyle(description_width…




HBox(children=(IntProgress(value=0, description='Progress', max=375806, style=ProgressStyle(description_width=…




HBox(children=(IntProgress(value=0, description='Progress', max=1306122, style=ProgressStyle(description_width…




HBox(children=(IntProgress(value=0, description='Progress', max=375806, style=ProgressStyle(description_width=…




HBox(children=(IntProgress(value=0, description='Progress', max=1306122, style=ProgressStyle(description_width…




HBox(children=(IntProgress(value=0, description='Progress', max=1306122, style=ProgressStyle(description_width…




HBox(children=(IntProgress(value=0, description='Progress', max=1306122, style=ProgressStyle(description_width…




HBox(children=(IntProgress(value=0, description='Progress', max=1306122, style=ProgressStyle(description_width…




HBox(children=(IntProgress(value=0, description='Progress', max=1306122, style=ProgressStyle(description_width…




HBox(children=(IntProgress(value=0, description='Progress', max=1306122, style=ProgressStyle(description_width…




HBox(children=(IntProgress(value=0, description='Progress', max=1306122, style=ProgressStyle(description_width…




HBox(children=(IntProgress(value=0, description='Progress', max=375806, style=ProgressStyle(description_width=…




HBox(children=(IntProgress(value=0, description='Progress', max=375806, style=ProgressStyle(description_width=…




HBox(children=(IntProgress(value=0, description='Progress', max=375806, style=ProgressStyle(description_width=…




HBox(children=(IntProgress(value=0, description='Progress', max=375806, style=ProgressStyle(description_width=…

### Load Embeddings

Two embedding matrices have been used. Glove, and paragram. The mean of the two is used as the final embedding matrix

In [13]:
# missing entries in the embedding are set using np.random.normal so we have to seed here too
seed_everything()

glove_embeddings = load_glove(word_index)
paragram_embeddings = load_para(word_index)
# fasttext_embeddings = load_fasttext(word_index)

embedding_matrix = np.mean([glove_embeddings, paragram_embeddings], axis=0)
embedding_matrix_cat = np.concatenate((glove_embeddings, paragram_embeddings), axis=1)
# vocab = build_vocab(df['question_text'])
# add_lower(embedding_matrix, vocab)
# del glove_embeddings, paragram_embeddings, fasttext_embeddings
del glove_embeddings, paragram_embeddings
gc.collect()

np.shape(embedding_matrix)

  


(120000, 300)

# Modeling

For model diversity, five various architectures are used. Also, since the setting in pytorch and keras are different, it would be interesting to take language into account when considering diversity.
- Pytorch:
    - LSTM+GRU+CapsNet
- Keras:
    - GRU+2Poolings 
    - LSTM+Attention+2Poolings
    - CNN with filter_size = 3,4,5,10
    - GRU+CapsNet

## Pytorch Model
### Cyclic CLR
Code taken from https://www.kaggle.com/dannykliu/lstm-with-attention-clr-in-pytorch

In [14]:
# code inspired from: https://github.com/anandsaha/pytorch.cyclic.learning.rate/blob/master/cls.py
class CyclicLR(object):
    def __init__(self, optimizer, base_lr=1e-3, max_lr=6e-3,
                 step_size=2000, mode='triangular', gamma=1.,
                 scale_fn=None, scale_mode='cycle', last_batch_iteration=-1):

        if not isinstance(optimizer, Optimizer):
            raise TypeError('{} is not an Optimizer'.format(
                type(optimizer).__name__))
        self.optimizer = optimizer

        if isinstance(base_lr, list) or isinstance(base_lr, tuple):
            if len(base_lr) != len(optimizer.param_groups):
                raise ValueError("expected {} base_lr, got {}".format(
                    len(optimizer.param_groups), len(base_lr)))
            self.base_lrs = list(base_lr)
        else:
            self.base_lrs = [base_lr] * len(optimizer.param_groups)

        if isinstance(max_lr, list) or isinstance(max_lr, tuple):
            if len(max_lr) != len(optimizer.param_groups):
                raise ValueError("expected {} max_lr, got {}".format(
                    len(optimizer.param_groups), len(max_lr)))
            self.max_lrs = list(max_lr)
        else:
            self.max_lrs = [max_lr] * len(optimizer.param_groups)

        self.step_size = step_size

        if mode not in ['triangular', 'triangular2', 'exp_range'] \
                and scale_fn is None:
            raise ValueError('mode is invalid and scale_fn is None')

        self.mode = mode
        self.gamma = gamma

        if scale_fn is None:
            if self.mode == 'triangular':
                self.scale_fn = self._triangular_scale_fn
                self.scale_mode = 'cycle'
            elif self.mode == 'triangular2':
                self.scale_fn = self._triangular2_scale_fn
                self.scale_mode = 'cycle'
            elif self.mode == 'exp_range':
                self.scale_fn = self._exp_range_scale_fn
                self.scale_mode = 'iterations'
        else:
            self.scale_fn = scale_fn
            self.scale_mode = scale_mode

        self.batch_step(last_batch_iteration + 1)
        self.last_batch_iteration = last_batch_iteration

    def batch_step(self, batch_iteration=None):
        if batch_iteration is None:
            batch_iteration = self.last_batch_iteration + 1
        self.last_batch_iteration = batch_iteration
        for param_group, lr in zip(self.optimizer.param_groups, self.get_lr()):
            param_group['lr'] = lr

    def _triangular_scale_fn(self, x):
        return 1.

    def _triangular2_scale_fn(self, x):
        return 1 / (2. ** (x - 1))

    def _exp_range_scale_fn(self, x):
        return self.gamma**(x)

    def get_lr(self):
        step_size = float(self.step_size)
        cycle = np.floor(1 + self.last_batch_iteration / (2 * step_size))
        x = np.abs(self.last_batch_iteration / step_size - 2 * cycle + 1)

        lrs = []
        param_lrs = zip(self.optimizer.param_groups, self.base_lrs, self.max_lrs)
        for param_group, base_lr, max_lr in param_lrs:
            base_height = (max_lr - base_lr) * np.maximum(0, (1 - x))
            if self.scale_mode == 'cycle':
                lr = base_lr + base_height * self.scale_fn(cycle)
            else:
                lr = base_lr + base_height * self.scale_fn(self.last_batch_iteration)
            lrs.append(lr)
        return lrs


### Model Architecture

Binary LSTM with an attention layer and an additional fully connected layer. Also added extra features taken from a winning kernel of the toxic comments competition. Also using CLR and a capsule Layer. Blended together in concatentation.

Initial idea borrowed from: https://www.kaggle.com/ziliwang/baseline-pytorch-bilstm

In [15]:
import torch as t
import torch.nn as nn
import torch.nn.functional as F

embedding_dim = 300
embedding_path = '../save/embedding_matrix.npy'  # or False, not use pre-trained-matrix
use_pretrained_embedding = True

hidden_size = 64
gru_len = hidden_size

Routings = 4 #5
Num_capsule = 5
Dim_capsule = 5#16
dropout_p = 0.25
rate_drop_dense = 0.28
LR = 0.001
T_epsilon = 1e-7
num_classes = 30


class Embed_Layer(nn.Module):
    def __init__(self, embedding_matrix=None, vocab_size=None, embedding_dim=300):
        super(Embed_Layer, self).__init__()
        self.encoder = nn.Embedding(vocab_size + 1, embedding_dim)
        if use_pretrained_embedding:
            # self.encoder.weight.data.copy_(t.from_numpy(np.load(embedding_path))) # 方法一，加载np.save的npy文件
            self.encoder.weight.data.copy_(t.from_numpy(embedding_matrix))  # 方法二

    def forward(self, x, dropout_p=0.25):
        return nn.Dropout(p=dropout_p)(self.encoder(x))


class GRU_Layer(nn.Module):
    def __init__(self):
        super(GRU_Layer, self).__init__()
        self.gru = nn.GRU(input_size=300,
                          hidden_size=gru_len,
                          bidirectional=True)

    def init_weights(self):
        ih = (param.data for name, param in self.named_parameters() if 'weight_ih' in name)
        hh = (param.data for name, param in self.named_parameters() if 'weight_hh' in name)
        b = (param.data for name, param in self.named_parameters() if 'bias' in name)
        for k in ih:
            nn.init.xavier_uniform_(k)
        for k in hh:
            nn.init.orthogonal_(k)
        for k in b:
            nn.init.constant_(k, 0)

    def forward(self, x):
        return self.gru(x)


# core caps_layer with squash func
class Caps_Layer(nn.Module):
    def __init__(self, input_dim_capsule=gru_len * 2, num_capsule=Num_capsule, dim_capsule=Dim_capsule, \
                 routings=Routings, kernel_size=(9, 1), share_weights=True,
                 activation='default', **kwargs):
        super(Caps_Layer, self).__init__(**kwargs)

        self.num_capsule = num_capsule
        self.dim_capsule = dim_capsule
        self.routings = routings
        self.kernel_size = kernel_size  # 暂时没用到
        self.share_weights = share_weights
        if activation == 'default':
            self.activation = self.squash
        else:
            self.activation = nn.ReLU(inplace=True)

        if self.share_weights:
            self.W = nn.Parameter(
                nn.init.xavier_normal_(t.empty(1, input_dim_capsule, self.num_capsule * self.dim_capsule)))
        else:
            self.W = nn.Parameter(
                t.randn(BATCH_SIZE, input_dim_capsule, self.num_capsule * self.dim_capsule))  # 64即batch_size

    def forward(self, x):

        if self.share_weights:
            u_hat_vecs = t.matmul(x, self.W)
        else:
            print('add later')

        batch_size = x.size(0)
        input_num_capsule = x.size(1)
        u_hat_vecs = u_hat_vecs.view((batch_size, input_num_capsule,
                                      self.num_capsule, self.dim_capsule))
        u_hat_vecs = u_hat_vecs.permute(0, 2, 1, 3)  # change into(batch_size,num_capsule,input_num_capsule,dim_capsule)
        b = t.zeros_like(u_hat_vecs[:, :, :, 0])  # (batch_size,num_capsule,input_num_capsule)

        for i in range(self.routings):
            b = b.permute(0, 2, 1)
            c = F.softmax(b, dim=2)
            c = c.permute(0, 2, 1)
            b = b.permute(0, 2, 1)
            outputs = self.activation(t.einsum('bij,bijk->bik', (c, u_hat_vecs)))  # batch matrix multiplication
            # outputs shape (batch_size, num_capsule, dim_capsule)
            if i < self.routings - 1:
                b = t.einsum('bik,bijk->bij', (outputs, u_hat_vecs))  # batch matrix multiplication
        return outputs  # (batch_size, num_capsule, dim_capsule)

    # text version of squash, slight different from original one
    def squash(self, x, axis=-1):
        s_squared_norm = (x ** 2).sum(axis, keepdim=True)
        scale = t.sqrt(s_squared_norm + T_epsilon)
        return x / scale
    
class Capsule_Main(nn.Module):
    def __init__(self, embedding_matrix=None, vocab_size=None):
        super(Capsule_Main, self).__init__()
        self.embed_layer = Embed_Layer(embedding_matrix, vocab_size)
        self.gru_layer = GRU_Layer()
        # important (initalization)
        self.gru_layer.init_weights()
        self.caps_layer = Caps_Layer()
        self.dense_layer = Dense_Layer()

    def forward(self, content):
        content1 = self.embed_layer(content)
        content2, _ = self.gru_layer(content1)  # output(seq_len, batch_size, num_directions * hidden_size)，and hn
        content3 = self.caps_layer(content2)
        output = self.dense_layer(content3)
        return output
    


In [16]:
class Attention(nn.Module):
    def __init__(self, feature_dim, step_dim, bias=True, **kwargs):
        super(Attention, self).__init__(**kwargs)
        
        self.supports_masking = True

        self.bias = bias
        self.feature_dim = feature_dim
        self.step_dim = step_dim
        self.features_dim = 0
        
        weight = torch.zeros(feature_dim, 1)
        nn.init.xavier_uniform_(weight)
        self.weight = nn.Parameter(weight)
        
        if bias:
            self.b = nn.Parameter(torch.zeros(step_dim))
        
    def forward(self, x, mask=None):
        feature_dim = self.feature_dim
        step_dim = self.step_dim

        eij = torch.mm(
            x.contiguous().view(-1, feature_dim), 
            self.weight
        ).view(-1, step_dim)
        
        if self.bias:
            eij = eij + self.b
            
        eij = torch.tanh(eij)
        a = torch.exp(eij)
        
        if mask is not None:
            a = a * mask

        a = a / torch.sum(a, 1, keepdim=True) + 1e-10

        weighted_input = x * torch.unsqueeze(a, -1)
        return torch.sum(weighted_input, 1)
    
class NeuralNet(nn.Module):
    def __init__(self):
        super(NeuralNet, self).__init__()
        
        fc_layer = 16
        fc_layer1 = 16

        self.embedding = nn.Embedding(max_features, embed_size)
        self.embedding.weight = nn.Parameter(torch.tensor(embedding_matrix, dtype=torch.float32))
        self.embedding.weight.requires_grad = False
        
        self.embedding_dropout = nn.Dropout2d(0.1)
        self.lstm = nn.LSTM(embed_size, hidden_size, bidirectional=True, batch_first=True)
        self.gru = nn.GRU(hidden_size * 2, hidden_size, bidirectional=True, batch_first=True)

        self.lstm2 = nn.LSTM(hidden_size * 2, hidden_size, bidirectional=True, batch_first=True)

        self.lstm_attention = Attention(hidden_size * 2, maxlen)
        self.gru_attention = Attention(hidden_size * 2, maxlen)
        self.bn = nn.BatchNorm1d(16, momentum=0.5)
        self.linear = nn.Linear(hidden_size*8+3, fc_layer1) #643:80 - 483:60 - 323:40
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.1)
        self.fc = nn.Linear(fc_layer**2,fc_layer)
        self.out = nn.Linear(fc_layer, 1)
        self.lincaps = nn.Linear(Num_capsule * Dim_capsule, 1)
        self.caps_layer = Caps_Layer()
    
    def forward(self, x):

        h_embedding = self.embedding(x[0])
        h_embedding = torch.squeeze(
            self.embedding_dropout(torch.unsqueeze(h_embedding, 0)))
        
        h_lstm, _ = self.lstm(h_embedding)
        h_gru, _ = self.gru(h_lstm)

        ##Capsule Layer        
        content3 = self.caps_layer(h_gru)
        content3 = self.dropout(content3)
        batch_size = content3.size(0)
        content3 = content3.view(batch_size, -1)
        content3 = self.relu(self.lincaps(content3))

        ##Attention Layer
        h_lstm_atten = self.lstm_attention(h_lstm)
        h_gru_atten = self.gru_attention(h_gru)
        
        # global average pooling
        avg_pool = torch.mean(h_gru, 1)
        # global max pooling
        max_pool, _ = torch.max(h_gru, 1)
        
        f = torch.tensor(x[1], dtype=torch.float).cuda()

                #[512,160]
        conc = torch.cat((h_lstm_atten, h_gru_atten,content3, avg_pool, max_pool,f), 1)
        conc = self.relu(self.linear(conc))
        conc = self.bn(conc)
        conc = self.dropout(conc)

        out = self.out(conc)
        
        return out

### Training

In [17]:
class MyDataset(Dataset):
    def __init__(self,dataset):
        self.dataset = dataset

    def __getitem__(self, index):
        data, target = self.dataset[index]

        return data, target, index
    def __len__(self):
        return len(self.dataset)

In [18]:
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

# matrix for the out-of-fold predictions
train_preds = np.zeros((len(x_train)))
# matrix for the predictions on the test set
test_preds = np.zeros((len(df_test)))

# always call this before training for deterministic results
seed_everything()

x_test_cuda = torch.tensor(x_test, dtype=torch.long).cuda()
test = torch.utils.data.TensorDataset(x_test_cuda)
test_loader = torch.utils.data.DataLoader(test, batch_size=batch_size, shuffle=False)

avg_losses_f = []
avg_val_losses_f = []

In [19]:
# split data in train / validation according to the KFold indeces
# also, convert them to a torch tensor and store them on the GPU (done with .cuda())

x_train_fold = torch.tensor(x_train, dtype=torch.long).cuda()
y_train_fold = torch.tensor(y_train[:, np.newaxis], dtype=torch.float32).cuda()

kfold_X_features = features[:,[0,1]]
kfold_X_valid_features = features[:,[0,1]]
x_val_fold = torch.tensor(x_train, dtype=torch.long).cuda()
y_val_fold = torch.tensor(y_train[:, np.newaxis], dtype=torch.float32).cuda()

model = NeuralNet()

# make sure everything in the model is running on the GPU
model.cuda()

# define binary cross entropy loss
# note that the model returns logit to take advantage of the log-sum-exp trick 
# for numerical stability in the loss
loss_fn = torch.nn.BCEWithLogitsLoss(reduction='sum')

step_size = 300
base_lr, max_lr = 0.001, 0.003   
optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), 
                         lr=max_lr)

################################################################################################
scheduler = CyclicLR(optimizer, base_lr=base_lr, max_lr=max_lr,
           step_size=step_size, mode='exp_range',
           gamma=0.99994)
###############################################################################################

train = torch.utils.data.TensorDataset(x_train_fold, y_train_fold)
valid = torch.utils.data.TensorDataset(x_val_fold, y_val_fold)

train = MyDataset(train)
valid = MyDataset(valid)

##No need to shuffle the data again here. Shuffling happens when splitting for kfolds.
train_loader = torch.utils.data.DataLoader(train, batch_size=batch_size, shuffle=True)

valid_loader = torch.utils.data.DataLoader(valid, batch_size=batch_size, shuffle=False)

# print(f'Fold {i + 1}')
for epoch in range(n_epochs):
    # set train mode of the model. This enables operations which are only applied during training like dropout
    start_time = time.time()
    model.train()

    avg_loss = 0.  
    for i, (x_batch, y_batch, index) in enumerate(train_loader):
        # Forward pass: compute predicted y by passing x to the model.
        ################################################################################################            
        f = kfold_X_features[index]
        y_pred = model([x_batch,f])
        ################################################################################################

        ################################################################################################

        if scheduler:
            scheduler.batch_step()
        ################################################################################################

        # Compute and print loss.
        loss = loss_fn(y_pred, y_batch)

        # Before the backward pass, use the optimizer object to zero all of the
        # gradients for the Tensors it will update (which are the learnable weights
        # of the model)
        optimizer.zero_grad()

        # Backward pass: compute gradient of the loss with respect to model parameters
        loss.backward()

        # Calling the step function on an Optimizer makes an update to its parameters
        optimizer.step()
        avg_loss += loss.item() / len(train_loader)

    # set evaluation mode of the model. This disabled operations which are only applied during training like dropout
    model.eval()

    # predict all the samples in y_val_fold batch per batch
    valid_preds_fold = np.zeros((x_val_fold.size(0)))
    test_preds_fold = np.zeros((len(df_test)))

    avg_val_loss = 0.
    print('checkpoint1')
    for i, (x_batch, y_batch, index) in enumerate(valid_loader):
        f = kfold_X_valid_features[index]
        y_pred = model([x_batch,f]).detach()

        avg_val_loss += loss_fn(y_pred, y_batch).item() / len(valid_loader)
        valid_preds_fold[i * batch_size:(i+1) * batch_size] = sigmoid(y_pred.cpu().numpy())[:, 0]
    print('checkpoint2')
    elapsed_time = time.time() - start_time 
    print('Epoch {}/{} \t loss={:.4f} \t val_loss={:.4f} \t time={:.2f}s'.format(
        epoch + 1, n_epochs, avg_loss, avg_val_loss, elapsed_time))
    
avg_losses_f.append(avg_loss)
avg_val_losses_f.append(avg_val_loss) 
# predict all samples in the test set batch per batch
for i, (x_batch,) in enumerate(test_loader):
    f = test_features[i * batch_size:(i+1) * batch_size][:,[0,1]]
    y_pred = model([x_batch,f]).detach()

    test_preds_fold[i * batch_size:(i+1) * batch_size] = sigmoid(y_pred.cpu().numpy())[:, 0]

#     train_preds[valid_idx] = valid_preds_fold
train_preds = valid_preds_fold
# test_preds += test_preds_fold / len(splits)

print('All \t loss={:.4f} \t val_loss={:.4f} \t '.format(np.average(avg_losses_f),np.average(avg_val_losses_f)))

checkpoint1


  


checkpoint2
Epoch 1/5 	 loss=72.8270 	 val_loss=56.9911 	 time=368.02s
checkpoint1
checkpoint2
Epoch 2/5 	 loss=56.9855 	 val_loss=59.4770 	 time=369.26s
checkpoint1
checkpoint2
Epoch 3/5 	 loss=53.6386 	 val_loss=44.1465 	 time=368.94s
checkpoint1
checkpoint2
Epoch 4/5 	 loss=50.9223 	 val_loss=38.0433 	 time=368.41s
checkpoint1
checkpoint2
Epoch 5/5 	 loss=48.0601 	 val_loss=38.1754 	 time=369.48s
All 	 loss=48.0601 	 val_loss=38.1754 	 


## Keras Model

### Setup

In [None]:
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.initializers import *
import keras.backend as K
from keras.callbacks import *
import tensorflow as tf
from keras.layers import *
from keras.optimizers import Adam
from keras.models import Model
from keras import backend as K
from keras.engine.topology import Layer
from keras import initializers, regularizers, constraints, optimizers, layers

In [21]:
def threshold_search(y_true, y_proba):
    best_threshold = 0
    best_score = 0
    for threshold in [i * 0.01 for i in range(100)]:
        score = f1_score(y_true=y_true, y_pred=y_proba > threshold)
        if score > best_score:
            best_threshold = threshold
            best_score = score
    search_result = {'threshold': best_threshold, 'f1': best_score}
    return search_result
th_f1 = threshold_search(y_train,train_preds)
print(th_f1)

{'threshold': 0.24, 'f1': 0.7890822681670845}


  'precision', 'predicted', average, warn_for)


In [22]:
trainX = x_train; trainy = y_train; test_X = x_test

In [23]:
def f1(y_true, y_pred):
    '''
    metric from here 
    https://stackoverflow.com/questions/43547402/how-to-calculate-f1-macro-in-keras
    '''
    def recall(y_true, y_pred):
        """Recall metric.

        Only computes a batch-wise average of recall.

        Computes the recall, a metric for multi-label classification of
        how many relevant items are selected.
        """
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
        recall = true_positives / (possible_positives + K.epsilon())
        return recall

    def precision(y_true, y_pred):
        """Precision metric.

        Only computes a batch-wise average of precision.

        Computes the precision, a metric for multi-label classification of
        how many selected items are relevant.
        """
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
        precision = true_positives / (predicted_positives + K.epsilon())
        return precision
    precision = precision(y_true, y_pred)
    recall = recall(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

def train_pred(model, epochs=2,callback=None,fea=False):
    for e in range(epochs):
        if fea==False:
            model.fit(train_X, train_y, batch_size=512, epochs=1, validation_data=(val_X, val_y),callbacks=callback,verbose=1)
            pred_val_y = model.predict([val_X], batch_size=1024, verbose=0)
        else:
            model.fit([train_X,train_f], train_y, batch_size=512, epochs=1, validation_data=([val_X,val_f], val_y),callbacks=callback,verbose=1)
            pred_val_y = model.predict([val_X,val_f], batch_size=1024, verbose=0)
        
        best_thresh = 0.5
        best_score = 0.0
        
        for thresh in np.arange(0.1, 0.501, 0.01):
            thresh = np.round(thresh, 2)
            score = f1_score(val_y, (pred_val_y > thresh).astype(int))
            if score > best_score:
                best_thresh = thresh
                best_score = score
        print("Epoch: ", e, "-    best Val F1 Score: {:.4f}, best threshold: {:.4f}".format(best_score,best_thresh))
    report = classification_report(val_y, (pred_val_y > thresh).astype(int))
    print('classification report:\n',report)
#     model.fit(trainX,trainy,batch_size=1024,epochs=1,verbose=1)
    if fea==False:
        pred_test_y = model.predict([test_X],batch_size=1024, verbose=0)
    else:
        pred_test_y = model.predict([test_X,test_features],batch_size=1024, verbose=0)
    print('=' * 60)
    return pred_val_y, pred_test_y, best_thresh,report

In [24]:
def model_gru_2pool(embedding_matrix):
    inp = Input(shape=(maxlen,))
    fea = Input(shape=(8,), name='Features')
    x = Embedding(max_features, embed_size, weights=[embedding_matrix])(inp)
    x = Bidirectional(CuDNNGRU(64, return_sequences=True))(x)
    avg_pool = GlobalAveragePooling1D()(x)
    max_pool = GlobalMaxPooling1D()(x)
    conc = concatenate([avg_pool, max_pool])
    conc = concatenate([conc,fea])
    conc = Dense(64, activation="relu")(conc)
#     conc = Dropout(0.1)(conc)
    outp = Dense(1, activation="sigmoid")(conc)
    
    model = Model(inputs=[inp,fea], outputs=outp)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=[f1])
    return model

In [25]:
def model_lstm_atten2(embedding_matrix):
    
    inp = Input(shape=(maxlen,))
    
    x = Embedding(max_features, embed_size, weights=[embedding_matrix], trainable=False)(inp)
    x = SpatialDropout1D(0.1)(x)
    x = Bidirectional(CuDNNLSTM(64, return_sequences=True))(x)
    y = Bidirectional(CuDNNGRU(64, return_sequences=True))(x)
    
    atten_1 = Attention(maxlen)(x) # skip connect
    atten_2 = Attention(maxlen)(y)
    avg_pool = GlobalAveragePooling1D()(y)
    max_pool = GlobalMaxPooling1D()(y)
    
    conc = concatenate([atten_1, atten_2, avg_pool, max_pool])
    conc = Dropout(0.1)(conc)
    outp = Dense(1, activation="sigmoid")(conc)

    model = Model(inputs=inp, outputs=outp)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=[f1])
    
    return model

In [26]:
def model_lstm_atten(embedding_matrix):
    inp = Input(shape=(maxlen,))
    fea = Input(shape=(8,), name='Features')
    x = Embedding(max_features, embed_size, weights=[embedding_matrix], trainable=False)(inp)
    x = Bidirectional(CuDNNLSTM(128, return_sequences=True))(x)
    x = Bidirectional(CuDNNLSTM(64, return_sequences=True))(x)
    x = Attention(maxlen)(x)
    x = concatenate([x,fea])
    x = Dense(64, activation="relu")(x)
    x = Dense(1, activation="sigmoid")(x)
    model = Model(inputs=[inp,fea], outputs=x)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=[f1])
    
    return model

In [27]:
def cnn(embedding_matrix):
    inp1 = Input(shape=(maxlen,))
    fea = Input(shape=(6,), name='Features')
    emb = Embedding(max_features, embed_size, weights=[embedding_matrix], trainable=False)(inp1)
    filters = []
    for f in [3,4,5,10]:
        conv = Conv1D(64,f, activation='elu')(emb)
        pool = GlobalMaxPooling1D()(conv)
        filters.append(pool)
    
    x = concatenate(filters)
    x = concatenate([x,fea])
    #classification dense net
    x = Dense(maxlen, activation="relu")(x)
    x = Dense(1, activation='sigmoid')(x)

    model = Model(inputs=[inp1,fea], outputs=x)
    model.compile(loss='binary_crossentropy', optimizer=optimizers.Adam(lr=1e-3), metrics=[f1])
#     print(model.summary())
    return model

In [29]:
class Attention(Layer):
    def __init__(self, step_dim,
                 W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True, **kwargs):
        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        self.step_dim = step_dim
        self.features_dim = 0
        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        assert len(input_shape) == 3

        self.W = self.add_weight((input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        self.features_dim = input_shape[-1]

        if self.bias:
            self.b = self.add_weight((input_shape[1],),
                                     initializer='zero',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None

        self.built = True

    def compute_mask(self, input, input_mask=None):
        return None

    def call(self, x, mask=None):
        features_dim = self.features_dim
        step_dim = self.step_dim

        eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)),
                        K.reshape(self.W, (features_dim, 1))), (-1, step_dim))

        if self.bias:
            eij += self.b

        eij = K.tanh(eij)

        a = K.exp(eij)

        if mask is not None:
            a *= K.cast(mask, K.floatx())

        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        a = K.expand_dims(a)
        weighted_input = x * a
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        return input_shape[0],  self.features_dim

In [30]:
def squash(x, axis=-1):
    s_squared_norm = K.sum(K.square(x), axis, keepdims=True)
    scale = K.sqrt(s_squared_norm + K.epsilon())
    return x / scale

# A Capsule Implement with Pure Keras
class Capsule(Layer):
    def __init__(self, num_capsule, dim_capsule, routings=3, kernel_size=(9, 1), share_weights=True,
                 activation='default', **kwargs):
        super(Capsule, self).__init__(**kwargs)
        self.num_capsule = num_capsule
        self.dim_capsule = dim_capsule
        self.routings = routings
        self.kernel_size = kernel_size
        self.share_weights = share_weights
        if activation == 'default':
            self.activation = squash
        else:
            self.activation = Activation(activation)

    def build(self, input_shape):
        super(Capsule, self).build(input_shape)
        input_dim_capsule = input_shape[-1]
        if self.share_weights:
            self.W = self.add_weight(name='capsule_kernel',
                                     shape=(1, input_dim_capsule,
                                            self.num_capsule * self.dim_capsule),
                                     # shape=self.kernel_size,
                                     initializer='glorot_uniform',
                                     trainable=True)
        else:
            input_num_capsule = input_shape[-2]
            self.W = self.add_weight(name='capsule_kernel',
                                     shape=(input_num_capsule,
                                            input_dim_capsule,
                                            self.num_capsule * self.dim_capsule),
                                     initializer='glorot_uniform',
                                     trainable=True)

    def call(self, u_vecs):
        if self.share_weights:
            u_hat_vecs = K.conv1d(u_vecs, self.W)
        else:
            u_hat_vecs = K.local_conv1d(u_vecs, self.W, [1], [1])

        batch_size = K.shape(u_vecs)[0]
        input_num_capsule = K.shape(u_vecs)[1]
        u_hat_vecs = K.reshape(u_hat_vecs, (batch_size, input_num_capsule,
                                            self.num_capsule, self.dim_capsule))
        u_hat_vecs = K.permute_dimensions(u_hat_vecs, (0, 2, 1, 3))
        # final u_hat_vecs.shape = [None, num_capsule, input_num_capsule, dim_capsule]

        b = K.zeros_like(u_hat_vecs[:, :, :, 0])  # shape = [None, num_capsule, input_num_capsule]
        for i in range(self.routings):
            b = K.permute_dimensions(b, (0, 2, 1))  # shape = [None, input_num_capsule, num_capsule]
            c = K.softmax(b)
            c = K.permute_dimensions(c, (0, 2, 1))
            b = K.permute_dimensions(b, (0, 2, 1))
            outputs = self.activation(tf.keras.backend.batch_dot(c, u_hat_vecs, [2, 2]))
            if i < self.routings - 1:
                b = tf.keras.backend.batch_dot(outputs, u_hat_vecs, [2, 3])

        return outputs

    def compute_output_shape(self, input_shape):
        return (None, self.num_capsule, self.dim_capsule)

In [31]:
def capsule(embedding_matrix):
    K.clear_session()       
    inp = Input(shape=(maxlen,))
    fea = Input(shape=(8,), name='Features')
    x = Embedding(max_features, embed_size, weights=[embedding_matrix], trainable=False)(inp)
    x = SpatialDropout1D(rate=0.2)(x)
    x = Bidirectional(CuDNNGRU(100, return_sequences=True, 
                                kernel_initializer=glorot_normal(seed=12300), recurrent_initializer=orthogonal(gain=1.0, seed=10000)))(x)

    x = Capsule(num_capsule=10, dim_capsule=10, routings=4, share_weights=True)(x)
    x = Flatten()(x)
    x = concatenate([x,fea])
    x = Dense(100, activation="relu", kernel_initializer=glorot_normal(seed=12300))(x)
    x = Dropout(0.12)(x)
    x = BatchNormalization()(x)

    x = Dense(1, activation="sigmoid")(x)
    model = Model(inputs=[inp,fea], outputs=x)
    model.compile(loss='binary_crossentropy', optimizer=Adam(),)
    return model

### Training

In [32]:
outputs=[]
outputs.append([train_preds, test_preds_fold, th_f1['threshold'], 'Boss LSTM'])

In [33]:
train_X, val_X, train_y, val_y,train_f,val_f = train_test_split(trainX, trainy, features, test_size=0.001,random_state=618)
trn_idx = np.random.permutation(len(train_X))
val_idx = np.random.permutation(len(val_X))
train_X = train_X[trn_idx]
val_X = val_X[val_idx]
train_y = train_y[trn_idx]
val_y = val_y[val_idx]
train_f = train_f[trn_idx]
val_f = val_f[val_idx]

In [34]:
pred_val_y, pred_test_y, best_thresh,report= train_pred(model_gru_2pool(embedding_matrix), epochs = 3,fea=True)
outputs.append([pred_val_y, pred_test_y, best_thresh, 'Gru 2pools'])


Train on 1304815 samples, validate on 1307 samples
Epoch 1/1
Epoch:  0 -    best Val F1 Score: 0.7143, best threshold: 0.3600
Train on 1304815 samples, validate on 1307 samples
Epoch 1/1
Epoch:  1 -    best Val F1 Score: 0.7353, best threshold: 0.3600
Train on 1304815 samples, validate on 1307 samples
Epoch 1/1
Epoch:  2 -    best Val F1 Score: 0.7193, best threshold: 0.3200
classification report:
               precision    recall  f1-score   support

           0       0.96      0.99      0.97      1197
           1       0.80      0.60      0.68       110

   micro avg       0.95      0.95      0.95      1307
   macro avg       0.88      0.79      0.83      1307
weighted avg       0.95      0.95      0.95      1307



In [36]:
train_X, val_X, train_y, val_y,train_f,val_f = train_test_split(trainX, trainy, features, test_size=0.001,random_state=618)
trn_idx = np.random.permutation(len(train_X))
val_idx = np.random.permutation(len(val_X))
# #test
# trn_idx = np.arange(100)
# val_idx = np.arange(100)
train_X = train_X[trn_idx]
val_X = val_X[val_idx]
train_y = train_y[trn_idx]
val_y = val_y[val_idx]
train_f = train_f[trn_idx]
val_f = val_f[val_idx]

In [37]:
pred_val_y, pred_test_y, best_thresh,report = train_pred(model_lstm_atten(embedding_matrix), epochs = 5, fea=True)
outputs.append([pred_val_y, pred_test_y, best_thresh, '2 LSTM w/ attention'])


Train on 1304815 samples, validate on 1307 samples
Epoch 1/1
Epoch:  0 -    best Val F1 Score: 0.7347, best threshold: 0.4500
Train on 1304815 samples, validate on 1307 samples
Epoch 1/1
Epoch:  1 -    best Val F1 Score: 0.7421, best threshold: 0.2700
Train on 1304815 samples, validate on 1307 samples
Epoch 1/1
Epoch:  2 -    best Val F1 Score: 0.7463, best threshold: 0.4500
Train on 1304815 samples, validate on 1307 samples
Epoch 1/1
Epoch:  3 -    best Val F1 Score: 0.7830, best threshold: 0.4100
Train on 1304815 samples, validate on 1307 samples
Epoch 1/1

In [38]:
print(report)

              precision    recall  f1-score   support

           0       0.97      0.99      0.98      1197
           1       0.81      0.67      0.74       110

   micro avg       0.96      0.96      0.96      1307
   macro avg       0.89      0.83      0.86      1307
weighted avg       0.96      0.96      0.96      1307



In [39]:
# train_X, val_X, train_y, val_y,train_f,val_f = train_test_split(trainX, trainy, features, test_size=0.001,random_state=618)
# trn_idx = np.random.permutation(len(train_X))
# val_idx = np.random.permutation(len(val_X))
# train_X = train_X[trn_idx]
# val_X = val_X[val_idx]
# train_y = train_y[trn_idx]
# val_y = val_y[val_idx]
# train_f = train_f[trn_idx]
# val_f = val_f[val_idx]

In [40]:
# pred_val_y, pred_test_y, best_thresh, report = train_pred(model_lstm_atten2(embedding_matrix), epochs = 3)
# outputs.append([pred_val_y, pred_test_y, best_thresh, '2 LSTM+GRU+atten'])

In [41]:
# print(report)

In [42]:
train_X, val_X, train_y, val_y,train_f,val_f = train_test_split(trainX, trainy, features, test_size=0.001,random_state=527)
trn_idx = np.random.permutation(len(train_X))
val_idx = np.random.permutation(len(val_X))
train_X = train_X[trn_idx]
val_X = val_X[val_idx]
train_y = train_y[trn_idx]
val_y = val_y[val_idx]
train_f = train_f[trn_idx]
val_f = val_f[val_idx]

In [43]:
pred_val_y, pred_test_y, best_thresh, report = train_pred(capsule(embedding_matrix), epochs=4,fea=True)
outputs.append([pred_val_y, pred_test_y, best_thresh, 'GRU Capsule concat ep4'])
# pred_val_y, pred_test_y, best_thresh,report= train_pred(cnn(embedding_matrix), epochs = 10,fea=True)
# outputs.append([pred_val_y, pred_test_y, best_thresh, 'Conv'])

Train on 1304815 samples, validate on 1307 samples
Epoch 1/1
Epoch:  0 -    best Val F1 Score: 0.7345, best threshold: 0.4800
Train on 1304815 samples, validate on 1307 samples
Epoch 1/1
Epoch:  1 -    best Val F1 Score: 0.7551, best threshold: 0.2300
Train on 1304815 samples, validate on 1307 samples
Epoch 1/1
Epoch:  2 -    best Val F1 Score: 0.7692, best threshold: 0.2200
Train on 1304815 samples, validate on 1307 samples
Epoch 1/1
Epoch:  3 -    best Val F1 Score: 0.7941, best threshold: 0.3400
classification report:
               precision    recall  f1-score   support

           0       0.97      0.99      0.98      1213
           1       0.78      0.66      0.71        94

   micro avg       0.96      0.96      0.96      1307
   macro avg       0.87      0.82      0.85      1307
weighted avg       0.96      0.96      0.96      1307



In [45]:
print([(outputs[i][2],outputs[i][3]) for i in range(len(outputs))])

[(0.24, 'Boss LSTM'), (0.32, 'Gru 2pools'), (0.41, '2 LSTM w/ attention'), (0.34, 'GRU Capsule concat ep4')]


In [46]:
# coefs =[0.227,0.1900,0.2024,0.1907,0.20]
coefs =[0.277,0.2400,0.2424,0.25]
ths = np.sum([outputs[i][2]*coefs[i] for i in range(len(coefs))], axis = 0)
pred_test_y = np.sum([outputs[i][1].reshape(-1,1)*coefs[i] for i in range(len(coefs))], axis = 0)

pred_test_y = (pred_test_y > ths).astype(int)
test_df = pd.read_csv("../input/test.csv", usecols=["qid"])
out_df = pd.DataFrame({"qid":test_df["qid"].values})
out_df['prediction'] = pred_test_y
out_df.to_csv("submission.csv", index=False)

In [47]:
 np.sum([outputs[i][2]*coefs[i] for i in range(len(coefs))], axis = 0)

0.327664

# Appendix
The part is the local analysis for feature engineering.

In [4]:
# Analysis
from nltk.stem.porter import *
from nltk.corpus import stopwords

train_df = pd.read_csv("../input/train.csv")
toxic = train_df[train_df['target']==1][['qid','question_text']]
nontoxic = train_df[train_df['target']==0][['qid','question_text']]

def printwordc(li):
    w_score = []
    for l in li:
        score = toxd[l]/(nontoxd[l]*10)
        w_score.append((l,score))
    return w_score

def post_preprocess(df):
    i = 0
    post_list = []
    length = len(df)
    stemmer = PorterStemmer()
    print('Processing... Be patient')

    for row in df.iterrows():
        # Progress bar
        i += 1
        if (i % 500 == 0 or i == length):
            print(f"Progress bar：{round(i/length*100)}%")
        # clean the posts
        posts = row[1].question_text
        posts = re.sub(r'\|\|\|', ' ', posts)
        posts = re.sub(r'http[\S]*', '', posts).lower()
        posts = re.sub("[^a-z\s]", ' ', posts)
        posts = ' '.join([stemmer.stem(w) for w in posts.split(
            ' ') if w not in stopwords.words('english')])

        post_list.append(posts)

    return np.array(post_list)

## Toxic and Non-toxic Analysis
process_toxic = post_preprocess(toxic)
process_nontoxic = post_preprocess(nontoxic)
nontoxic_sam = nontoxic.sample(frac=1/10,replace=False)
import collections
frequency = collections.defaultdict(int)
for i in tqdm(range(len(process_toxic))):
    text = process_toxic[i]
    for word in text.split():
        frequency[word] += 1
     
nt_frequency = collections.defaultdict(int)
for i in tqdm(range(len(process_nontoxic))):
    text = process_nontoxic[i]
    for word in text.split():
        nt_frequency[word] += 1
       
freq_sort = sorted(frequency.items(),key=lambda x:-x[1])
nt_freq_sort = sorted(nt_frequency.items(),key=lambda x:-x[1])
nontoxd = dict(nt_freq_sort)
toxd = dict(freq_sort)

top50 = []
for n in freq_sort[:50]:
    top50.append(n[0])
w_score = sorted(printwordc(top50),key=lambda x:x[1],reverse=True)

real_toxic_word = []
for key in toxd:
    if key not in nontoxd:
        real_toxic_word.append(key)
        
real_toxic_dict=[]
for r in real_toxic_word:
    real_toxic_dict.append((r,toxd[r]))
real_toxic_dict = sorted(real_toxic_dict,key=lambda x:x[1],reverse=True)

real_toxic_top500 = []
for tp in real_toxic_dict[:500]:
    real_toxic_top500.append(tp[0])  