In [11]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import roc_auc_score

from keras.models import Model
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, concatenate
from keras.layers import GRU, Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D
from keras.preprocessing import text, sequence
from keras.callbacks import Callback


In [18]:
# Path to the pretrained word-vector text file that is parsed into
# `embeddings_index` when the embedding matrix is built.
EMBEDDING_FILE = './dataset/crawl-300d-2M.vec'

In [13]:
# Load the labelled training comments, the unlabelled test comments and the
# sample submission file from the local dataset directory.
train = pd.read_csv('./dataset/train.csv', header=0)
test = pd.read_csv('./dataset/test.csv', header=0)
submission = pd.read_csv("./dataset/sample_submission.csv")

In [14]:
# Raw comment text for both splits; missing comments are replaced by the
# placeholder string "fillna" so every row is a string.
X_train = train["comment_text"].fillna("fillna")
X_test = test["comment_text"].fillna("fillna")

# Six binary label columns as a 2-D array (one row per training comment).
y_train = train[
    ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
].values

In [15]:
# Vocabulary cap: passed to the Tokenizer as num_words and used as the
# input dimension of the Embedding layer.
max_features = 30000
# Fixed sequence length: comments are padded/truncated to this many tokens
# and it is the model's input shape.
max_len = 100
# Width of each word vector (columns of the embedding matrix).
embed_size = 300

# Mini-batch size passed to model.fit.
batch_size = 32
# Number of training epochs per fold passed to model.fit.
epochs = 2

In [16]:
# Fit a single tokenizer on the combined train and test comments;
# num_words=max_features limits the vocabulary it uses when encoding.
tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(X_train) + list(X_test))

# Encode each comment as a sequence of word indices, then pad/truncate every
# sequence to exactly max_len entries.
X_train = sequence.pad_sequences(tokenizer.texts_to_sequences(X_train), maxlen=max_len)
X_test = sequence.pad_sequences(tokenizer.texts_to_sequences(X_test), maxlen=max_len)

In [19]:
def get_coefs(word, *arr):
    """Pair a word with its coefficients converted to a float32 array.

    Parameters
    ----------
    word : the token (first field of an embedding-file line).
    *arr : the remaining fields; each is converted to float32.

    Returns
    -------
    tuple of (word, 1-D float32 numpy array built from ``arr``).
    """
    vector = np.asarray(arr, dtype='float32')
    return word, vector

# Parse the embedding file into {word: vector}. The file is opened in a
# `with` block so the handle is always closed, and decoded explicitly as
# UTF-8 instead of relying on the platform default encoding.
embeddings_index = {}
with open(EMBEDDING_FILE, encoding='utf-8') as embedding_file:
    for line in embedding_file:
        word, coefs = get_coefs(*line.rstrip().rsplit(' '))
        # Keep only full embed_size-dimensional vectors. This drops the
        # "<count> <dim>" header line of .vec files (and blank lines), which
        # would otherwise be stored as a short vector and broadcast across a
        # whole matrix row if that token appears in the vocabulary.
        if coefs.shape[0] != embed_size:
            continue
        embeddings_index[word] = coefs

word_index = tokenizer.word_index
# Tokenizer indices start at 1 (index 0 is reserved for padding), so the
# number of rows actually in use is len(word_index) + 1, capped at max_features.
nb_words = min(max_features, len(word_index) + 1)
# The matrix always has max_features rows so it matches the Embedding layer's
# input dimension, even when the vocabulary is smaller than max_features.
# Rows for padding and for words without a pretrained vector stay all-zero.
embedding_matrix = np.zeros((max_features, embed_size))
for word, i in word_index.items():
    if i >= max_features:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [20]:
class RocAucEvaluation(Callback):
    """Keras callback that prints the ROC-AUC on a held-out set after epochs.

    Parameters
    ----------
    validation_data : two-item iterable ``(X_val, y_val)``; unpacking fails
        with ValueError if it does not contain exactly two items.
    interval : evaluate on every epoch whose zero-based index is a multiple
        of this value (1 = every epoch).
    """

    def __init__(self, validation_data=(), interval=1):
        # Name this subclass in super() so that Callback.__init__ itself runs;
        # super(Callback, self) would skip the base-class initialiser.
        super(RocAucEvaluation, self).__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs=None):
        """Predict on the validation set and print its ROC-AUC score.

        ``epoch`` is zero-based as passed by Keras; the printed epoch number
        is one-based. ``logs`` is accepted for interface compatibility and is
        not used (None default avoids a shared mutable default argument).
        """
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            print("\n ROC-AUC - epoch: %d - score: %.6f \n" % (epoch+1, score))

In [21]:
def get_model():
    """Build and compile the bidirectional-GRU multi-label classifier.

    Reads ``max_len``, ``max_features``, ``embed_size`` and
    ``embedding_matrix`` from the notebook's global scope.

    Architecture: embedding initialised from ``embedding_matrix`` ->
    spatial dropout (0.2) -> bidirectional GRU (80 units, full sequence
    output) -> concatenation of global average and global max pooling ->
    6 sigmoid outputs.

    Returns
    -------
    A compiled Keras ``Model`` (binary cross-entropy loss, Adam optimizer,
    accuracy metric).
    """
    tokens_in = Input(shape=(max_len,))

    embedded = Embedding(max_features, embed_size, weights=[embedding_matrix])(tokens_in)
    embedded = SpatialDropout1D(0.2)(embedded)

    encoded = Bidirectional(GRU(80, return_sequences=True))(embedded)

    # Summarise the sequence output two ways and join the summaries.
    pooled = concatenate([
        GlobalAveragePooling1D()(encoded),
        GlobalMaxPooling1D()(encoded),
    ])
    probabilities = Dense(6, activation='sigmoid')(pooled)

    net = Model(inputs=tokens_in, outputs=probabilities)
    net.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return net
model = get_model()
# summary() prints the layer table itself and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" line.
model.summary()

Instructions for updating:
keep_dims is deprecated, use keepdims instead
Instructions for updating:
keep_dims is deprecated, use keepdims instead
Instructions for updating:
keep_dims is deprecated, use keepdims instead
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, 100)          0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 100, 300)     9000000     input_2[0][0]                    
__________________________________________________________________________________________________
spatial_dropout1d_1 (SpatialDro (None, 100, 300)     0           embedding_1[0][0]                
________________________________________________________________________________________

In [10]:
# Number of cross-validation folds; one model is trained per fold.
num_models = 5
# random_state only has an effect when shuffling is enabled. Without
# shuffle=True the seed is ignored by older scikit-learn releases and
# rejected with a ValueError by newer ones.
kf = KFold(n_splits=num_models, shuffle=True, random_state=2222)

In [13]:
def get_oof(X_train, y_train, X_test):
    """Train one model per fold and collect out-of-fold predictions.

    Uses the global ``kf`` splitter, ``get_model``, ``RocAucEvaluation``,
    ``batch_size`` and ``epochs``.

    Parameters
    ----------
    X_train : array of training inputs; rows are selected with the fold
        index arrays produced by ``kf.split``.
    y_train : 2-D array of training targets, one column per class.
    X_test : array of test inputs; predicted by every fold's model.

    Returns
    -------
    oof_train : ndarray of shape (n_train, n_classes); each row holds the
        prediction from the model whose validation fold contained that row.
    oof_test : ndarray of shape (n_test, n_classes); test predictions
        averaged over all fold models.
    """
    # Take the fold count from the splitter itself rather than a separate
    # global, so the buffer below can never disagree with the number of
    # iterations (a mismatch would average in all-zero rows or overflow).
    n_folds = kf.get_n_splits()
    # Number of output columns follows the targets instead of a literal.
    n_classes = y_train.shape[1]

    oof_train = np.zeros((X_train.shape[0], n_classes))
    oof_test = np.zeros((n_folds, X_test.shape[0], n_classes))

    for i, (train_index, val_index) in enumerate(kf.split(X_train)):
        print("Fold: {0}".format(i))
        kf_X_train = X_train[train_index]
        kf_y_train = y_train[train_index]
        kf_X_val = X_train[val_index]
        kf_y_val = y_train[val_index]

        # A fresh model per fold so no weights leak between folds.
        RocAuc = RocAucEvaluation(validation_data=(kf_X_val, kf_y_val), interval=1)
        model = get_model()
        model.fit(kf_X_train, kf_y_train,
                  batch_size=batch_size,
                  epochs=epochs,
                  validation_data=(kf_X_val, kf_y_val),
                  callbacks=[RocAuc],
                  verbose=1)

        # Validation rows get predictions from a model that never saw them.
        oof_train[val_index, :] = model.predict(kf_X_val)
        oof_test[i, :, :] = model.predict(X_test)

    # Average the per-fold test predictions into a single matrix.
    oof_test = np.mean(oof_test, axis=0)
    assert oof_test.shape == (X_test.shape[0], n_classes)
    assert oof_train.shape == (X_train.shape[0], n_classes)

    return oof_train, oof_test

In [14]:
# Run the cross-validated training: one model is fitted per fold, yielding
# out-of-fold predictions for the training rows and fold-averaged
# predictions for the test rows.
oof_train, oof_test = get_oof(X_train, y_train, X_test)

Fold: 0
Train on 127656 samples, validate on 31915 samples
Epoch 1/2
 ROC-AUC - epoch: 1 - score: 0.986696 

Epoch 2/2
 ROC-AUC - epoch: 2 - score: 0.986499 

Fold: 1
Train on 127657 samples, validate on 31914 samples
Epoch 1/2
 ROC-AUC - epoch: 1 - score: 0.984124 

Epoch 2/2
 ROC-AUC - epoch: 2 - score: 0.985445 

Fold: 2
Train on 127657 samples, validate on 31914 samples
Epoch 1/2
 ROC-AUC - epoch: 1 - score: 0.985557 

Epoch 2/2
 ROC-AUC - epoch: 2 - score: 0.984419 

Fold: 3
Train on 127657 samples, validate on 31914 samples
Epoch 1/2
 ROC-AUC - epoch: 1 - score: 0.986274 

Epoch 2/2
 ROC-AUC - epoch: 2 - score: 0.986978 

Fold: 4
Train on 127657 samples, validate on 31914 samples
Epoch 1/2
 ROC-AUC - epoch: 1 - score: 0.986582 

Epoch 2/2
 ROC-AUC - epoch: 2 - score: 0.987199 



In [15]:
# Sanity check: expect one row per training sample and one column per label.
oof_train.shape

(159571, 6)

In [23]:
# Wrap the out-of-fold training predictions in a DataFrame indexed by the
# training ids, with one column per label.
stacked_train = pd.DataFrame(
    data=oof_train,
    index=train.id,
    columns=["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"],
)
stacked_train.head(5)

Unnamed: 0_level_0,toxic,severe_toxic,obscene,threat,insult,identity_hate
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0000997932d777bf,0.00027,7e-06,9.7e-05,4e-06,5.3e-05,3.9e-05
000103f0d9cfb60f,0.003815,6.5e-05,0.00126,2.5e-05,0.000603,0.000101
000113f07ec002fd,0.001735,3.3e-05,0.000326,1.5e-05,0.000185,6.5e-05
0001b41b1c6bb37e,7.3e-05,4e-06,3.5e-05,5e-06,5.9e-05,2.2e-05
0001d958c54c6e35,0.03605,0.000212,0.0035,0.000354,0.006007,0.000311


In [26]:
# Wrap the fold-averaged test predictions in a DataFrame indexed by the
# test ids, with one column per label.
stacked_test = pd.DataFrame(
    data=oof_test,
    index=test.id,
    columns=["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"],
)
stacked_test.head(5)

Unnamed: 0_level_0,toxic,severe_toxic,obscene,threat,insult,identity_hate
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
00001cee341fdb12,0.996893,0.51209,0.981384,0.108628,0.94505,0.247938
0000247867823ef7,0.000193,1.5e-05,9.7e-05,7e-06,7.2e-05,1.7e-05
00013b17ad220c46,0.001615,0.000128,0.000695,7.5e-05,0.000439,0.000136
00017563c3f7919a,0.000435,3.1e-05,0.000276,6.4e-05,0.000232,2.5e-05
00017695ad8997eb,0.009032,0.00023,0.001415,0.000177,0.000834,0.000126
