In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import re

from keras.models import Model
from keras.layers import Dense, Embedding, Input
from keras.layers import (GRU, LSTM, Bidirectional, GlobalMaxPool1D, Dropout, TimeDistributed, Flatten, Activation, 
                          Concatenate, Multiply, RepeatVector, Permute, Lambda, BatchNormalization, Add, CuDNNGRU, CuDNNLSTM, Conv1D, MaxPooling1D)
from keras.preprocessing import text, sequence
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau, Callback
from keras.optimizers import Adam
from keras.utils import plot_model
from keras import backend as K
from keras import regularizers

import tensorflow as tf

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import EnglishStemmer
from nltk.corpus import words, stopwords

from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import log_loss
import seq2seq
from recurrentshop import LSTMCell, RecurrentSequential
from seq2seq.cells import LSTMDecoderCell, AttentionDecoderCell

from tqdm import tqdm
import hunspell
import editdistance #Levenshtein distance

from joblib import Parallel, delayed
import multiprocessing

Using TensorFlow backend.


In [2]:
# Snowball stemmer for English; no other code visible in this notebook references it.
stemmer=EnglishStemmer()
# English stop-word set from NLTK (presumably requires the 'stopwords' corpus to be
# downloaded locally - confirm). Its only visible use is the commented-out filter
# inside clean_sentence.
stop_words = set(stopwords.words('english'))

def metric(y_true, y_pred):
    """Mean column-wise log loss between two 2-D arrays of identical shape.

    Each column is scored independently with sklearn's ``log_loss`` and the
    per-column losses are averaged.

    Parameters
    ----------
    y_true : 2-D array of 0/1 labels, one column per class.
    y_pred : 2-D array of predicted probabilities, same shape as ``y_true``.

    Returns
    -------
    float
        Average of the per-column log losses.

    Raises
    ------
    AssertionError
        If the two arrays do not have the same shape.
    """
    assert y_true.shape == y_pred.shape
    # labels=[0, 1] keeps log_loss from raising when a column of y_true happens
    # to contain a single class (possible for rare labels inside one CV fold);
    # when both classes are present the result is unchanged.
    column_losses = [
        log_loss(y_true[:, i], y_pred[:, i], labels=[0, 1])
        for i in range(y_true.shape[1])
    ]
    return np.array(column_losses).mean()

# CPU count reported by the OS; passed as n_jobs to the joblib Parallel calls in later cells.
num_cores = multiprocessing.cpu_count()
def clean_sentence(dirty_sentence):
    """Tokenize a raw comment and keep only its purely alphabetic tokens.

    The text is split with NLTK's ``word_tokenize``; every token for which
    ``str.isalpha()`` is false (numbers, punctuation, mixed tokens) is dropped
    and the survivors are joined back with single spaces.
    """
    kept_tokens = []
    for token in word_tokenize(dirty_sentence):
        # Removing stopwords didn't seem to help.. check this again after modification?
        # (the alternative condition was: token.isalpha() and token not in stop_words)
        if token.isalpha():
            kept_tokens.append(token)
    return " ".join(kept_tokens)

#Right now we only autocorrect the words from the tokenizer's vocabulary that have no embedding, just before filling the embedding matrix.
#I still need to preprocess the entire train/test sets this way, but that is computationally intensive -- it's on my TODO list (hoping it'll help too).
def autocorrect(potential):
    """Try to recover an embedding for an out-of-vocabulary word via spell-checking.

    ``potential`` is a ``(word, index)`` pair. The word is stripped of non-ASCII
    characters and handed to the module-level ``spellchecker``; only the first
    suggestion is considered, and it is rejected when its edit distance to the
    original word exceeds 3. The accepted suggestion is then looked up in the
    module-level ``embeddings_index``.

    Returns ``(vector, index)`` on success and ``(None, index)`` otherwise, so
    the caller can always route the result back to the right matrix row.
    """
    token, row = potential[0], potential[1]
    miss = (None, row)

    ascii_token = re.sub(r'[^\x00-\x7f]',r'', token)
    suggestions = spellchecker.suggest(ascii_token)
    if len(suggestions) == 0:
        return miss

    # Only the top-ranked suggestion is trusted, and only if it stays close to
    # what was actually typed.
    best_guess = suggestions[0]
    if editdistance.eval(token, best_guess) > 3:
        return miss

    vector = embeddings_index.get(best_guess)
    return (vector, row) if vector is not None else miss


In [3]:
# Vocabulary size kept by the tokenizer and fixed length every comment is padded/truncated to.
max_words = 30000
maxlen = 150

train = pd.read_csv("./train.csv")
test = pd.read_csv("./test.csv")
# Shuffle the training rows once, with a fixed seed so that the row order (and
# therefore every downstream fold) is the same on each Restart & Run All.
# The original row labels are kept in the index on purpose.
train = train.sample(frac=1, random_state=42)

# Missing comments are replaced by a placeholder token before cleaning.
list_sentences_train = train["comment_text"].fillna("CVxTz").values
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
y = train[list_classes].values
list_sentences_test = test["comment_text"].fillna("CVxTz").values

# Clean every comment in parallel (one joblib worker per core).
clean_train = Parallel(n_jobs=num_cores)(delayed(clean_sentence)(i) for i in tqdm(list_sentences_train))
clean_test = Parallel(n_jobs=num_cores)(delayed(clean_sentence)(i) for i in tqdm(list_sentences_test))

list_sentences_train = clean_train
list_sentences_test = clean_test
del(clean_train,clean_test)


# The vocabulary is fitted on the training comments only, then applied to both sets.
tokenizer = text.Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(list(list_sentences_train))
word_index = tokenizer.word_index
list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)
X_t = sequence.pad_sequences(list_tokenized_train, maxlen=maxlen)
X_te = sequence.pad_sequences(list_tokenized_test, maxlen=maxlen)

100%|██████████| 95851/95851 [00:11<00:00, 8331.12it/s]
100%|██████████| 226998/226998 [00:34<00:00, 6536.26it/s]


In [4]:
# word -> float32 vector, filled from the pre-trained GloVe text file.
embeddings_index = {}
embed_size = 100
# Each line holds a token followed by its vector components, whitespace separated.
# The file contains non-ASCII tokens, so the encoding is stated explicitly instead
# of relying on the platform default; `with` closes the handle even on a parse error.
with open('./glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [5]:
# Tokenizer indices start at 1, so a vocabulary of N words needs N + 1 rows.
# When the vocabulary is larger than max_words this is still max_words, as before.
num_words = min(max_words, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, embed_size))

# Spell checker used by autocorrect() for words that have no pre-trained vector.
spellchecker = hunspell.HunSpell('./index.dic', './index.aff')
notfound = []
for word, i in word_index.items():
    # Words ranked beyond the matrix size never reach the model; skip them.
    if i >= num_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
    else:
        # Row stays all-zeros for now; remembered for the autocorrect pass below.
        notfound.append((word, i))

# Second chance: look up the spell-corrected form of every missing word in parallel.
extrafound=0
newpairs = Parallel(n_jobs=num_cores)(delayed(autocorrect)(i) for i in tqdm(notfound))
for pair in newpairs:
    if pair[0] is not None:
        extrafound+=1
        embedding_matrix[pair[1]] = pair[0]


print("There were {:d} words not found.".format(len(notfound)))
print("We found {:d} of these words using autocorrect.".format(extrafound))

100%|██████████| 2190/2190 [00:07<00:00, 279.31it/s]


There were 2190 words not found.
We found 936 of these words using autocorrect.


In [10]:
def get_model():
    """Build and compile a fresh bidirectional-LSTM classifier.

    Architecture: embedding (initialised from the module-level
    ``embedding_matrix`` and left trainable) -> dropout -> bidirectional
    CuDNNLSTM returning the full sequence -> global max pooling -> dropout ->
    dense ReLU -> dropout -> 6 sigmoid outputs.

    Returns
    -------
    keras.models.Model
        Compiled with binary cross-entropy loss, the 'adam' optimizer and
        accuracy as the reported metric.
    """
    units = 100
    # Size the embedding layer from the weights it is initialised with, so the
    # layer and the matrix can never disagree on shape.
    vocab_rows, embed_dim = embedding_matrix.shape

    inp = Input(shape=(maxlen, ))
    x = Embedding(vocab_rows, embed_dim, weights=[embedding_matrix], trainable=True)(inp)
    x = Dropout(0.1)(x)
    x = Bidirectional(CuDNNLSTM(units, return_sequences=True))(x)
    x = GlobalMaxPool1D()(x)
    x = Dropout(0.2)(x)
    x = Dense(75, activation="relu")(x)
    x = Dropout(0.1)(x)
    x = Dense(6, activation="sigmoid")(x)

    model = Model(inputs=inp, outputs=x)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    return model


# Build one model to inspect the architecture and keep its initial weights around.
model = get_model()
init_weights = model.get_weights()
model.summary()
plot_model(model, to_file='model.png',show_shapes=True)

# Callbacks for a single train/validation run: keep the best weights on disk,
# stop after 20 epochs without improvement, and halve the learning rate after
# one epoch without improvement.
file_path="weights_base.best.hdf5"
checkpoint = ModelCheckpoint(file_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
early = EarlyStopping(monitor="val_loss", mode="min", patience=20)
# min_lr has to sit below the optimizer's starting rate (Adam defaults to 0.001);
# with min_lr=0.001 the callback could never lower the rate at all.
learning_rate_reduction = ReduceLROnPlateau(monitor='val_loss',  patience=1, verbose=1, factor=0.5, min_lr=1e-5)

callbacks_list = [checkpoint, early, learning_rate_reduction]

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_12 (InputLayer)        (None, 150)               0         
_________________________________________________________________
embedding_12 (Embedding)     (None, 150, 100)          3000000   
_________________________________________________________________
dropout_34 (Dropout)         (None, 150, 100)          0         
_________________________________________________________________
bidirectional_12 (Bidirectio (None, 150, 200)          161600    
_________________________________________________________________
global_max_pooling1d_12 (Glo (None, 200)               0         
_________________________________________________________________
dropout_35 (Dropout)         (None, 200)               0         
_________________________________________________________________
dense_23 (Dense)             (None, 75)                15075     
__________

In [7]:
#### Train/validation split without CV
# batch_size = 256
# epochs = 5
# model.set_weights(init_weights)
# X_train, X_valid, y_train, y_valid = train_test_split(X_t, y, test_size=0.1, random_state=42)
# model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, validation_data=(X_valid, y_valid), callbacks=callbacks_list)
# logloss = metric(y_valid,model.predict(X_valid, verbose=1))
# print(logloss)
# model.load_weights(file_path)
# y_test = model.predict(X_te, verbose=1)
# sample_submission = pd.read_csv("./sample_submission.csv")
# sample_submission[list_classes] = y_test
# sample_submission.to_csv("toxic.csv", index=False)

In [11]:
batch_size = 512
epochs = 10
num_folds = 10
n_classes = len(list_classes)
# random_state only has an effect together with shuffle=True, and recent
# scikit-learn versions reject the combination shuffle=False + random_state.
kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)
# Test predictions are accumulated over folds and averaged at the end;
# valpred collects the out-of-fold prediction for every training row.
y_test = np.zeros((len(X_te), n_classes))
valpred = np.zeros((len(X_t), n_classes))
for j, (train_index, val_index) in enumerate(kf.split(X_t)):
    print("Training on fold {:d}".format(j))
    fold_weights = "fold_" + str(j) + ".hdf5"
    # A brand-new model per fold, so no weights leak from one fold to the next.
    model = get_model()
    # NOTE(review): with patience=20 and only 10 epochs this callback can never
    # fire, and min_lr=0.005 is above Adam's default starting rate anyway.
    # Left as-is because it looks deliberately disabled - confirm.
    learning_rate_reduction = ReduceLROnPlateau(monitor='val_loss',  patience=20, verbose=1, factor=0.5, min_lr=0.005)
    checkpoint = ModelCheckpoint(fold_weights, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
    model.fit(X_t[train_index], y[train_index], batch_size=batch_size, epochs=epochs, validation_data=(X_t[val_index], y[val_index]), callbacks=[checkpoint, learning_rate_reduction])

    # Score and predict with the best epoch of this fold, not the last one.
    model.load_weights(fold_weights)

    valpred[val_index,:]=model.predict(X_t[val_index], verbose=1, batch_size=batch_size)
    logloss = metric(y[val_index],valpred[val_index,:])
    print("Loss on fold {:d} is {:f}".format(j,logloss))
    y_test += model.predict(X_te, verbose=1, batch_size=batch_size)

y_test = y_test/num_folds
logloss = metric(y,valpred)
print("CV loss is {:f}".format(logloss))
sample_submission = pd.read_csv("./sample_submission.csv")
sample_submission[list_classes] = y_test
sample_submission.to_csv("submission.csv", index=False)

# valpred is ordered like the shuffled `train` frame (X_t and y were derived
# from it), NOT like train.csv on disk. Writing it into a freshly re-read file
# would attach every prediction to the wrong comment, so start from `train`
# and restore the on-disk row order through its preserved index.
temp = train.copy()
temp[list_classes] = valpred
temp = temp.sort_index()
temp.to_csv("validation_predictions.csv", index = False)

Training on fold 0
Train on 86265 samples, validate on 9586 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Loss on fold 0 is 0.046822
Training on fold 1
Train on 86266 samples, validate on 9585 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Loss on fold 1 is 0.049805
Training on fold 2
Train on 86266 samples, validate on 9585 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10


Epoch 8/10
Epoch 9/10
Epoch 10/10
Loss on fold 2 is 0.043874
Training on fold 3
Train on 86266 samples, validate on 9585 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Loss on fold 3 is 0.049083
Training on fold 4
Train on 86266 samples, validate on 9585 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Loss on fold 4 is 0.050122
Training on fold 5
Train on 86266 samples, validate on 9585 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10


Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Loss on fold 5 is 0.045346
Training on fold 6
Train on 86266 samples, validate on 9585 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Loss on fold 6 is 0.049437
Training on fold 7
Train on 86266 samples, validate on 9585 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Loss on fold 7 is 0.045427
Training on fold 8
Train on 86266 samples, validate on 9585 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Loss on fold 8 is 0.046630
Training on fold 9
Train on 86266 samples, validate on 9585 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Loss on fold 9 is 0.045540
CV loss is 0.047209


In [12]:
# valpred follows the row order of the shuffled `train` frame, not the order of
# train.csv on disk, so the predictions are attached to `train` itself and the
# original file order is restored through the preserved index before saving.
temp = train.copy()
temp[list_classes] = valpred
temp = temp.sort_index()
temp.to_csv("validation_predictions.csv", index = False)