In [1]:
import pandas as pd
import numpy as np
np.random.seed(223)
from matplotlib import pyplot as plt
from tqdm import tqdm

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

from keras.models import Model
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, concatenate, Dropout
from keras.layers import LSTM, GRU, Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D
from keras.preprocessing import text, sequence
from keras.callbacks import Callback
from keras import backend as K

In [3]:
# Vocabulary cap: passed to the tokenizer as `num_words` and compared against word indices
# when the embedding matrix is filled.
max_features = 60000
# Sequence length handed to `pad_sequences` and used as the model's input length.
maxlen = 300
# Width of each row of the embedding matrix (presumably the dimensionality of the
# pretrained vectors in crawl-300d-2M.vec — confirm if the vector file changes).
embed_size = 300

In [4]:
# Read the three competition CSV files from the current working directory.
train, test, submission = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

In [5]:
# Comment texts as arrays; missing comments are replaced by the literal string "fillna".
X_train, X_test = (
    frame["comment_text"].fillna("fillna").values
    for frame in (train, test)
)
# Target matrix: one column per label, in the order listed here.
y_train = train[[
    "toxic", "severe_toxic", "obscene",
    "threat", "insult", "identity_hate",
]].values

In [6]:
# Fit a single vocabulary on the train and test comments together, then turn each
# comment into a list of integer word indices.
tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(np.concatenate([X_train, X_test]).tolist())
X_train_token, X_test_token = map(tokenizer.texts_to_sequences, (X_train, X_test))

In [7]:
# Bring every index sequence to exactly `maxlen` entries.
X_train_token, X_test_token = (
    sequence.pad_sequences(token_ids, maxlen=maxlen)
    for token_ids in (X_train_token, X_test_token)
)

In [8]:
# load the whole embedding into memory
# NOTE(review): the path below is absolute and machine-specific; consider a configurable data directory.
embeddings_index = dict()
# `with` guarantees the file handle is released even if a line fails to parse.
with open('/home/paperspace/Desktop/Kaggle/crawl-300d-2M.vec', encoding="utf8") as f:
    for line in tqdm(f):
        values = line.split()
        # Keep only well-formed "<word> <embed_size floats>" rows. This skips rows such as
        # the "<count> <dim>" header that .vec files start with; storing it would create a
        # 1-element "vector" that silently broadcasts when copied into the embedding matrix.
        if len(values) != embed_size + 1:
            continue
        embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')
print('Loaded %s word vectors.' % len(embeddings_index))

2000001it [02:14, 14867.70it/s]

Loaded 2000000 word vectors.





In [9]:
# Mapping word -> integer index produced by the fitted tokenizer.
word_index = tokenizer.word_index
# The Keras tokenizer numbers words from 1 (index 0 is reserved), so a vocabulary of
# N words needs N + 1 rows; without the +1 the largest index overflows the matrix
# whenever the vocabulary is smaller than `max_features`.
nb_words = min(max_features, len(word_index) + 1)
# Rows stay all-zero for indices that have no pretrained vector.
embedding_matrix = np.zeros((nb_words, embed_size))

In [10]:
# create a weight matrix for words in training docs
n_rows, n_dims = embedding_matrix.shape
for word, i in word_index.items():
    # Bound the index by the matrix that is actually being written to, so an index
    # beyond the allocated rows is skipped instead of raising IndexError.
    if i >= n_rows:
        continue
    embedding_vector = embeddings_index.get(word)
    # Only copy vectors of the expected width: NumPy would otherwise broadcast a
    # shorter (e.g. 1-element) array across the whole row without complaint.
    if embedding_vector is not None and embedding_vector.shape == (n_dims,):
        embedding_matrix[i] = embedding_vector

In [11]:
from keras.engine.topology import Layer
from keras import initializers, regularizers, constraints


class Attention(Layer):
    """Attention-weighted sum over the time axis of a 3D input.

    For an input of shape (batch, step_dim, features_dim) the layer scores each
    timestep with a learned vector ``W`` (plus an optional per-timestep bias),
    applies ``tanh``, normalises ``exp(score)`` over the time axis and returns
    the weighted sum of the timesteps, a tensor of shape (batch, features_dim).

    Arguments:
        step_dim: number of timesteps in the input; used to reshape the scores.
        W_regularizer: regularizer for the scoring vector ``W``.
        b_regularizer: regularizer for the bias ``b``.
        W_constraint: constraint for ``W``.
        b_constraint: constraint for ``b``.
        bias: whether to add a learned per-timestep bias to the scores.
        **kwargs: forwarded to the base ``Layer`` constructor.
    """

    def __init__(self, step_dim,
                 W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True, **kwargs):
        super(Attention, self).__init__(**kwargs)
        # Set after the base constructor: the base class may (re)initialise this
        # flag, which would undo a value assigned before calling it.
        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        self.step_dim = step_dim
        # Filled in by build() once the input shape is known.
        self.features_dim = 0

    def build(self, input_shape):
        """Create ``W`` (one weight per feature) and, if enabled, ``b`` (one per timestep)."""
        assert len(input_shape) == 3

        # `shape` is passed by keyword so the call does not depend on the
        # positional order of add_weight's arguments.
        self.W = self.add_weight(shape=(input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        self.features_dim = input_shape[-1]

        if self.bias:
            self.b = self.add_weight(shape=(input_shape[1],),
                                     initializer='zero',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None

        self.built = True

    def compute_mask(self, input, input_mask=None):
        """Return no mask: the time axis is summed away, so nothing is propagated."""
        return None

    def call(self, x, mask=None):
        """Return the attention-weighted sum of ``x`` over axis 1."""
        features_dim = self.features_dim
        step_dim = self.step_dim

        # Flatten (batch, steps, features) to (batch * steps, features), project every
        # timestep onto W, then fold the scores back to (batch, steps).
        eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)),
                        K.reshape(self.W, (features_dim, 1))), (-1, step_dim))

        if self.bias:
            eij += self.b

        eij = K.tanh(eij)

        a = K.exp(eij)

        # Zero the weight of masked timesteps before normalising.
        if mask is not None:
            a *= K.cast(mask, K.floatx())

        # epsilon keeps the division finite when every weight in a row is zero.
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        a = K.expand_dims(a)
        weighted_input = x * a
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        """Output shape is (batch, features_dim)."""
        return input_shape[0],  self.features_dim

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created from its config.

        Without this, ``step_dim`` (a required argument) is missing from the
        serialized layer and a saved model cannot be rebuilt.
        """
        config = {
            'step_dim': self.step_dim,
            'W_regularizer': regularizers.serialize(self.W_regularizer),
            'b_regularizer': regularizers.serialize(self.b_regularizer),
            'W_constraint': constraints.serialize(self.W_constraint),
            'b_constraint': constraints.serialize(self.b_constraint),
            'bias': self.bias,
        }
        base_config = super(Attention, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

In [12]:
class RocAucEvaluation(Callback):
    """Keras callback that prints ROC-AUC on a held-out set at the end of epochs.

    Arguments:
        validation_data: ``(X_val, y_val)`` pair used for scoring. It must
            unpack into exactly two items, otherwise a ValueError is raised.
        interval: score only on epochs whose zero-based index is a multiple
            of this value.
    """

    def __init__(self, validation_data=(), interval=1):
        # Initialise the Callback base class itself; super(Callback, self) would
        # start the lookup *after* Callback and skip its constructor.
        super(RocAucEvaluation, self).__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs=None):
        """Predict on the held-out set and print the ROC-AUC (epoch shown 1-based)."""
        # `logs` is accepted for interface compatibility and is not used; None
        # replaces the shared mutable `{}` default.
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            print("\n ROC-AUC - epoch: %d - score: %.6f \n" % (epoch+1, score))

In [16]:
def get_model():
    """Build and compile the bidirectional-GRU + attention classifier.

    The network maps a padded sequence of `maxlen` word indices to six sigmoid
    outputs. The embedding layer is initialised from the module-level
    `embedding_matrix`.

    Returns:
        A compiled Keras ``Model`` (binary cross-entropy loss, Adam optimizer,
        accuracy metric).
    """
    # Size the embedding layer from the weight matrix it is initialised with, so
    # the two cannot disagree (e.g. when the vocabulary has fewer rows than
    # `max_features`).
    vocab_rows, vector_dim = embedding_matrix.shape

    inp = Input(shape=(maxlen, ))
    x = Embedding(vocab_rows, vector_dim, weights=[embedding_matrix])(inp)
    x = SpatialDropout1D(0.2)(x)
    x = Bidirectional(GRU(80, return_sequences=True))(x)
    # Collapse the time axis with attention pooling.
    x = Attention(maxlen)(x)
    x = Dense(24, activation='relu')(x)
    x = Dropout(0.2)(x)
    outp = Dense(6, activation="sigmoid")(x)

    model = Model(inputs=inp, outputs=outp)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    return model

In [17]:
# Build and compile a fresh model instance.
model = get_model()

In [18]:
# Print the per-layer output shapes and parameter counts.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 300)               0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 300, 300)          18000000  
_________________________________________________________________
spatial_dropout1d_2 (Spatial (None, 300, 300)          0         
_________________________________________________________________
bidirectional_2 (Bidirection (None, 300, 160)          182880    
_________________________________________________________________
attention_1 (Attention)      (None, 160)               460       
_________________________________________________________________
dense_1 (Dense)              (None, 24)                3864      
_________________________________________________________________
dropout_1 (Dropout)          (None, 24)                0         
__________

In [19]:
# Mini-batch size, and the epoch count used when sizing the learning-rate schedule.
batch_size, epochs = 1600, 2

# Keep 95% of the rows for training and hold out the rest for validation (fixed seed).
X_tra, X_val, y_tra, y_val = train_test_split(
    X_train_token, y_train, train_size=0.95, random_state=233
)
# Callback that reports ROC-AUC on the held-out rows after every epoch.
RocAuc = RocAucEvaluation(validation_data=(X_val, y_val), interval=1)



In [20]:
def exp_decay(init, fin, steps):
    """Return the per-step rate d such that (1 + d) ** (steps - 1) == init / fin.

    Raises ZeroDivisionError when ``steps`` is 1.
    """
    ratio = init / fin
    exponent = 1 / (steps - 1)
    return ratio ** exponent - 1
# Number of optimizer steps the schedule is sized for: full batches per epoch times `epochs`.
# NOTE(review): `epochs` is 2 at this point, while the fit call in this notebook trains
# for 10 epochs — confirm which horizon the schedule should cover.
steps = int(len(X_tra)/batch_size) * epochs
# Starting learning rate and the rate to reach at the end of the schedule.
lr_init, lr_fin = 0.001, 0.0005
lr_decay = exp_decay(lr_init, lr_fin, steps)
# Write both values directly into the optimizer variables of the already-compiled model.
K.set_value(model.optimizer.lr, lr_init)
# NOTE(review): exp_decay yields a per-step *exponential* rate; Keras' built-in `decay`
# is, as far as I know, applied as lr / (1 + decay * iterations) — confirm the formula
# matches how the optimizer interprets this value.
K.set_value(model.optimizer.decay, lr_decay)

In [21]:
# Train on the 95% split, validating on the held-out rows; the RocAuc callback prints
# ROC-AUC after each epoch.
# NOTE(review): the epoch count is hard-coded to 10 here, whereas the `epochs`
# variable (2) was used to size the learning-rate decay — confirm which is intended.
hist = model.fit(X_tra, y_tra, batch_size=batch_size, epochs=10, validation_data=(X_val, y_val),
                 callbacks=[RocAuc], verbose=2)

Train on 151592 samples, validate on 7979 samples
Epoch 1/10

 ROC-AUC - epoch: 1 - score: 0.949933 

 - 132s - loss: 0.2546 - acc: 0.9260 - val_loss: 0.0865 - val_acc: 0.9687
Epoch 2/10

 ROC-AUC - epoch: 2 - score: 0.973148 

 - 131s - loss: 0.0680 - acc: 0.9759 - val_loss: 0.0521 - val_acc: 0.9812
Epoch 3/10

 ROC-AUC - epoch: 3 - score: 0.974494 

 - 132s - loss: 0.0516 - acc: 0.9814 - val_loss: 0.0498 - val_acc: 0.9813
Epoch 4/10

 ROC-AUC - epoch: 4 - score: 0.975638 

 - 132s - loss: 0.0464 - acc: 0.9831 - val_loss: 0.0500 - val_acc: 0.9820
Epoch 5/10

 ROC-AUC - epoch: 5 - score: 0.976121 

 - 130s - loss: 0.0427 - acc: 0.9841 - val_loss: 0.0496 - val_acc: 0.9818
Epoch 6/10

 ROC-AUC - epoch: 6 - score: 0.978199 

 - 130s - loss: 0.0395 - acc: 0.9851 - val_loss: 0.0511 - val_acc: 0.9817
Epoch 7/10

 ROC-AUC - epoch: 7 - score: 0.978468 

 - 131s - loss: 0.0370 - acc: 0.9858 - val_loss: 0.0512 - val_acc: 0.9814
Epoch 8/10

 ROC-AUC - epoch: 8 - score: 0.980453 

 - 132s - loss: 

In [22]:
# Score the padded test comments in batches of 2048 rows.
y_pred = model.predict(X_test_token, batch_size=2048)

# Copy the six output columns into the sample submission frame, then write it to disk.
submission[[
    "toxic", "severe_toxic", "obscene",
    "threat", "insult", "identity_hate",
]] = y_pred
submission.to_csv('submission.csv', index=False)