In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import re
import gc

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

#from subprocess import check_output
#print(check_output(["ls", "../input"]).decode("utf8"))

# Any results you write to the current directory are saved as output.

from keras.models import Model
from keras.layers import Dense, Embedding, Input
from keras.layers import (GRU, LSTM, Bidirectional, GlobalMaxPool1D, Dropout, TimeDistributed, Flatten, Activation, 
                          Concatenate, Multiply, RepeatVector, Permute, Lambda, BatchNormalization, Add, CuDNNGRU, CuDNNLSTM, Conv1D, MaxPooling1D)
from keras.preprocessing import text, sequence
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau, Callback
from keras.optimizers import Adam
from keras.utils import plot_model
from keras import backend as K
from keras import regularizers
from Attention import Attention
import tensorflow as tf

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import EnglishStemmer
from nltk.corpus import words, stopwords

from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.metrics import log_loss
import seq2seq

from tqdm import tqdm
import hunspell
import editdistance #Levenshtein distance

from joblib import Parallel, delayed
import multiprocessing

In [2]:
def metric(y_true, y_pred):
    """Return the mean column-wise log loss of multi-label predictions.

    Parameters
    ----------
    y_true : 2-D array of ground-truth labels, one column per class
        (expected to hold 0/1 values).
    y_pred : 2-D array of predicted probabilities with the same shape.

    Returns
    -------
    The unweighted mean, over columns, of sklearn's ``log_loss``.

    Raises
    ------
    AssertionError
        If ``y_true`` and ``y_pred`` do not have the same shape.
    """
    assert y_true.shape == y_pred.shape
    # Passing labels explicitly keeps log_loss defined when a column of
    # y_true contains a single class (e.g. a validation slice with no
    # positives for a rare label); without it sklearn raises ValueError.
    column_losses = [
        log_loss(y_true[:, i], y_pred[:, i], labels=[0, 1])
        for i in range(y_true.shape[1])
    ]
    return np.array(column_losses).mean()

In [3]:
# Vocabulary cap handed to the Keras tokenizer, and the fixed length of every padded sequence.
max_words = 100000
maxlen = 150

# Train/test comment tables, read from the working directory.
train = pd.read_csv("./train_cleaned.csv")
test = pd.read_csv("./test_cleaned.csv")

# Missing comments are replaced by the placeholder string "CVxTz" before tokenising.
list_sentences_train = train["comment_text"].fillna("CVxTz").values
# Target columns; y is the 2-D array of those columns (one column per class).
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
y = train[list_classes].values
list_sentences_test = test["comment_text"].fillna("CVxTz").values

# The tokenizer is fitted on the training text only; the test text is encoded with the same vocabulary.
tokenizer = text.Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(list(list_sentences_train))
word_index = tokenizer.word_index
list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)
# Bring every integer sequence to exactly maxlen entries.
X_t = sequence.pad_sequences(list_tokenized_train, maxlen=maxlen)
X_te = sequence.pad_sequences(list_tokenized_test, maxlen=maxlen)

In [4]:
# Load the pre-trained GloVe vectors into a dict mapping token -> float32 vector.
embeddings_index = {}
embed_size = 300

valid_words = []  # never filled in this cell; kept because the name was defined here originally
# `with` closes the handle even if parsing fails part-way through the file, and
# an explicit UTF-8 encoding avoids locale-dependent decode errors.
with open('./glove.840B.300d.txt', encoding='utf-8') as f: #Use this as our list of valid words
    for line in f:
        values = line.split()
        # The vector is the last embed_size fields; everything before it is the
        # token, which may itself span several whitespace-separated fields.
        word = ' '.join(values[:-embed_size])
        coefs = np.asarray(values[-embed_size:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

Found 2195895 word vectors.


In [5]:
# Build the weight matrix for the Embedding layer: row i holds the GloVe vector
# of the word whose tokenizer index is i.
#
# Keras word indices start at 1 (index 0 is used for padding), so covering the
# whole vocabulary needs len(word_index) + 1 rows. Sizing the matrix with
# len(word_index) alone makes the last word's index fall outside the matrix
# whenever the vocabulary is smaller than max_words.
num_words = min(max_words, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, embed_size))

for word, i in word_index.items():
    # Skip words beyond the vocabulary cap; they have no row in the matrix.
    if i >= num_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

In [6]:
def get_model():
    """Build and compile the two-direction LSTM + attention classifier.

    Reads the notebook globals ``maxlen``, ``embed_size`` and
    ``embedding_matrix``.

    Architecture, in order: frozen pre-trained embedding -> dropout ->
    a forward CuDNNLSTM and a separate backward CuDNNLSTM
    (``go_backwards=True``), each followed by its own ``Attention(maxlen)``
    layer -> concatenation -> dropout -> Dense(256, relu) -> dropout ->
    batch normalisation -> Dense(6, sigmoid).

    Returns
    -------
    A compiled ``keras.models.Model`` (binary cross-entropy loss, rmsprop
    optimizer, accuracy metric).
    """
    inp = Input(shape=(maxlen, ))
    # input_dim is taken from the weight matrix itself so the layer and its
    # pre-trained weights always agree, even when the vocabulary is smaller
    # than max_words.
    vocab_rows = embedding_matrix.shape[0]
    x = Embedding(vocab_rows, embed_size, weights=[embedding_matrix], trainable=False)(inp)
    x = Dropout(0.25)(x)
    #x = Bidirectional(CuDNNLSTM(300, return_sequences=True))(x)

    # Layers are created in the same order as before so Keras assigns the same
    # auto-generated layer names and saved weights keep loading.
    # NOTE(review): Attention is the project-local layer imported from the
    # Attention module; presumably it reduces the time axis - confirm.
    forward = CuDNNLSTM(300, return_sequences=True)(x)
    forward = Attention(maxlen)(forward)
    backward = CuDNNLSTM(300, return_sequences=True, go_backwards=True)(x)
    backward = Attention(maxlen)(backward)

    x = Concatenate()([forward, backward])
    x = Dropout(0.25)(x)

    #x = GlobalMaxPool1D()(x)

    x = Dense(256, activation="relu")(x)
    x = Dropout(0.25)(x)
    x = BatchNormalization()(x)
    x = Dense(6, activation="sigmoid")(x)

    model = Model(inputs=inp, outputs=x)
    model.compile(loss='binary_crossentropy',
                  optimizer='rmsprop',
                  metrics=['accuracy'])

    return model


# Build one model up front to inspect the architecture: a text summary plus a diagram written to model.png.
model = get_model()
# Snapshot of the freshly initialised weights.
init_weights = model.get_weights()
model.summary()
plot_model(model, to_file='model.png',show_shapes=True)

# Checkpoint callback that saves to file_path only when the monitored validation loss improves.
file_path="weights_base.best.hdf5"
checkpoint = ModelCheckpoint(file_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min')

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 150)          0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 150, 300)     30000000    input_1[0][0]                    
__________________________________________________________________________________________________
dropout_1 (Dropout)             (None, 150, 300)     0           embedding_1[0][0]                
__________________________________________________________________________________________________
cu_dnnlstm_1 (CuDNNLSTM)        (None, 150, 300)     722400      dropout_1[0][0]                  
__________________________________________________________________________________________________
cu_dnnlstm

In [7]:
#Stratify KFold by creating unique labels for ground truth values, toss things with less than num_folds into an 'others' label
num_folds = 10

# A single np.unique call yields the distinct label rows (sorted), the index of
# each row's pattern within them, and how often every pattern occurs. This
# replaces a per-pattern counting pass and a nested rows x patterns comparison loop.
truth_labels, pattern_index, truth_counts = np.unique(
    y, axis=0, return_inverse=True, return_counts=True)
# Flatten defensively: some NumPy releases return a 2-D inverse when axis is given.
pattern_index = np.ravel(pattern_index)
truth_counts = truth_counts.astype(int)

other_label = truth_labels.shape[0]+1 #create a new label out of the index range for 'others'
# A pattern keeps its own index only if it occurs at least num_folds times;
# rarer patterns are pooled under other_label. Converted to a list of Python
# ints, one entry per row of y.
truth_encoded = np.where(
    truth_counts[pattern_index] >= num_folds, pattern_index, other_label
).tolist()

In [8]:
# Stratified K-fold training: one fresh model per fold, out-of-fold predictions
# for the training set and fold-averaged predictions for the test set.
batch_size = 512
epochs = 50
kf = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=42)

# Accumulators: summed test predictions and out-of-fold validation predictions.
y_test = np.zeros((len(X_te), 6))
valpred = np.zeros((len(X_t), 6))

for j, (train_index, val_index) in enumerate(kf.split(X_t, truth_encoded)):
    print("Training on fold {:d}".format(j))

    # Start every fold from a clean backend session and a newly built model.
    K.clear_session()
    model = get_model()

    # Per-fold weight file; only the epoch with the lowest val_loss is kept.
    fold_weights = "fold_" + str(j) + ".hdf5"
    checkpoint = ModelCheckpoint(fold_weights, monitor='val_loss', verbose=1,
                                 save_best_only=True, mode='min')
    early = EarlyStopping(monitor="val_loss", patience=10)
    learning_rate_reduction = ReduceLROnPlateau(monitor='val_loss', patience=3, verbose=1,
                                                factor=0.5, min_lr=0.0001)

    history = model.fit(
        X_t[train_index],
        y[train_index],
        batch_size=batch_size,
        epochs=epochs,
        shuffle=True,
        validation_data=(X_t[val_index], y[val_index]),
        callbacks=[checkpoint, early, learning_rate_reduction],
    )

    # Restore the checkpointed weights before predicting.
    model.load_weights(fold_weights)

    valpred[val_index, :] = model.predict(X_t[val_index], verbose=1, batch_size=batch_size)
    logloss = metric(y[val_index], valpred[val_index, :])
    print("Loss on fold {:d} is {:f}".format(j, logloss))

    y_test += model.predict(X_te, verbose=1, batch_size=batch_size)

# Average the summed test predictions over the folds.
y_test = y_test / num_folds

# Overall cross-validation score computed on the out-of-fold predictions.
logloss = metric(y, valpred)
print("CV loss is {:f}".format(logloss))

# Write the averaged test predictions into the sample submission layout.
sample_submission = pd.read_csv("./sample_submission.csv")
sample_submission[list_classes] = y_test
sample_submission.to_csv("submission.csv", index=False)

# Save the out-of-fold predictions alongside the training rows.
# NOTE(review): assumes ./train.csv has the same rows, in the same order, as
# ./train_cleaned.csv - confirm.
temp = pd.read_csv("./train.csv")
temp[list_classes] = valpred
temp.to_csv("validation_predictions.csv", index=False)

Training on fold 0
Train on 86253 samples, validate on 9598 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50

Epoch 00013: reducing learning rate to 0.0005000000237487257.
Epoch 14/50
Epoch 15/50
Epoch 16/50

Epoch 00016: reducing learning rate to 0.0002500000118743628.
Epoch 17/50
Epoch 18/50
Epoch 19/50

Epoch 00019: reducing learning rate to 0.0001250000059371814.
Loss on fold 0 is 0.042120
Training on fold 1
Train on 86258 samples, validate on 9593 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50


Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50

Epoch 00016: reducing learning rate to 0.0005000000237487257.
Epoch 17/50
Epoch 18/50
Epoch 19/50

Epoch 00019: reducing learning rate to 0.0002500000118743628.
Epoch 20/50
Epoch 21/50
Epoch 22/50

Epoch 00022: reducing learning rate to 0.0001250000059371814.
Loss on fold 1 is 0.043389
Training on fold 2
Train on 86261 samples, validate on 9590 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50

Epoch 00010: reducing learning rate to 0.0005000000237487257.
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50

Epoch 00016: reducing learning rate to 0.0002500000118743628.
Epoch 17/50
Epoch 18/50
Epoch 19/50

Epoch 00019: reducing learning rate to 0.0001250000059371814.
Epoch 20/50
Epoch 21/50
Epoch 22/50

Epoch 00022: reducing learning rate to 0.0001.
Loss on fold 2 is 0.042503
Training on fold 3
Train on 86263 samples

Loss on fold 3 is 0.041691
Training on fold 4
Train on 86264 samples, validate on 9587 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50

Epoch 00016: reducing learning rate to 0.0005000000237487257.
Epoch 17/50
Epoch 18/50
Epoch 19/50

Epoch 00019: reducing learning rate to 0.0002500000118743628.
Epoch 20/50
Epoch 21/50
Epoch 22/50

Epoch 00022: reducing learning rate to 0.0001250000059371814.
Loss on fold 4 is 0.042693
Training on fold 5
Train on 86268 samples, validate on 9583 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50


Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50

Epoch 00015: reducing learning rate to 0.0005000000237487257.
Epoch 16/50
Epoch 17/50
Epoch 18/50

Epoch 00018: reducing learning rate to 0.0002500000118743628.
Epoch 19/50
Epoch 20/50
Epoch 21/50

Epoch 00021: reducing learning rate to 0.0001250000059371814.
Loss on fold 5 is 0.041454
Training on fold 6
Train on 86269 samples, validate on 9582 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50

Epoch 00013: reducing learning rate to 0.0005000000237487257.
Epoch 14/50
Epoch 15/50
Epoch 16/50

Epoch 00016: reducing learning rate to 0.0002500000118743628.
Epoch 17/50
Epoch 18/50
Epoch 19/50

Epoch 00019: reducing learning rate to 0.0001250000059371814.
Loss on fold 6 is 0.042893
Training on fold 7
Train on 86271 samples, validate on 9580 samples
Epoch 1/50
Epoch 2/50
Epo


Epoch 00022: reducing learning rate to 0.0001250000059371814.
Loss on fold 7 is 0.040522
Training on fold 8
Train on 86275 samples, validate on 9576 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50

Epoch 00012: reducing learning rate to 0.0005000000237487257.
Epoch 13/50
Epoch 14/50
Epoch 15/50

Epoch 00015: reducing learning rate to 0.0002500000118743628.
Epoch 16/50
Epoch 17/50
Epoch 18/50

Epoch 00018: reducing learning rate to 0.0001250000059371814.
Loss on fold 8 is 0.044684
Training on fold 9
Train on 86277 samples, validate on 9574 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50


Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50

Epoch 00017: reducing learning rate to 0.0005000000237487257.
Epoch 18/50
Epoch 19/50
Epoch 20/50

Epoch 00020: reducing learning rate to 0.0002500000118743628.
Epoch 21/50
Epoch 22/50
Epoch 23/50

Epoch 00023: reducing learning rate to 0.0001250000059371814.
Loss on fold 9 is 0.044367
CV loss is 0.042631
