In [1]:
import pandas as pd
import numpy as np
np.random.seed(223)
from matplotlib import pyplot as plt
from tqdm import tqdm

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

from keras.models import Model
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, concatenate
from keras.layers import LSTM, GRU, Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D
from keras.preprocessing import text, sequence
from keras.callbacks import Callback
from keras import backend as K

Using TensorFlow backend.


In [3]:
max_features = 60000
maxlen = 300
embed_size = 300

In [4]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
submission = pd.read_csv('sample_submission.csv')

In [5]:
X_train = train["comment_text"].fillna("fillna").values
y_train = train[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]].values
X_test = test["comment_text"].fillna("fillna").values

In [6]:
tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(X_train) + list(X_test))
X_train_token = tokenizer.texts_to_sequences(X_train)
X_test_token = tokenizer.texts_to_sequences(X_test)

In [7]:
X_train_token = sequence.pad_sequences(X_train_token, maxlen=maxlen)
X_test_token = sequence.pad_sequences(X_test_token, maxlen=maxlen)

In [8]:
# load the whole embedding into memory
embeddings_index = dict()
f = open('/home/paperspace/Desktop/Kaggle/crawl-300d-2M.vec', encoding="utf8")
for line in tqdm(f):
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print('Loaded %s word vectors.' % len(embeddings_index))

2000001it [02:12, 15150.35it/s]

Loaded 2000000 word vectors.





In [9]:
word_index = tokenizer.word_index
nb_words = min(max_features, len(word_index))
embedding_matrix = np.zeros((nb_words, embed_size))

In [10]:
# create a weight matrix for words in training docs
for word, i in word_index.items():
    if i >= max_features: continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None: embedding_matrix[i] = embedding_vector

In [11]:
CONTEXT_DIM = 100
from keras.engine.topology import Layer
from keras import initializers, regularizers, constraints


class Attention(Layer):

    def __init__(self, regularizer=regularizers.l2(1e-10), **kwargs):
        self.regularizer = regularizer
        self.supports_masking = True
        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        assert len(input_shape) == 3        
        self.W = self.add_weight(name='W',
                                 shape=(input_shape[-1], CONTEXT_DIM),
                                 initializer='normal',
                                 trainable=True, 
                                 regularizer=self.regularizer)
        self.b = self.add_weight(name='b',
                                 shape=(CONTEXT_DIM,),
                                 initializer='normal',
                                 trainable=True, 
                                 regularizer=self.regularizer)
        self.u = self.add_weight(name='u',
                                 shape=(CONTEXT_DIM,),
                                 initializer='normal',
                                 trainable=True, 
                                 regularizer=self.regularizer)        
        super(Attention, self).build(input_shape)

    @staticmethod
    def softmax(x, dim):
        """Computes softmax along a specified dim. Keras currently lacks this feature.
        """
        if K.backend() == 'tensorflow':
            import tensorflow as tf
            return tf.nn.softmax(x, dim)
        elif K.backend() == 'theano':
            # Theano cannot softmax along an arbitrary dim.
            # So, we will shuffle `dim` to -1 and un-shuffle after softmax.
            perm = np.arange(K.ndim(x))
            perm[dim], perm[-1] = perm[-1], perm[dim]
            x_perm = K.permute_dimensions(x, perm)
            output = K.softmax(x_perm)

            # Permute back
            perm[dim], perm[-1] = perm[-1], perm[dim]
            output = K.permute_dimensions(x, output)
            return output
        else:
            raise ValueError("Backend '{}' not supported".format(K.backend()))

    def call(self, x, mask=None):
        ut = K.tanh(K.bias_add(K.dot(x, self.W), self.b)) * self.u

        # Collapse `attention_dims` to 1. This indicates the weight for each time_step.
        ut = K.sum(ut, axis=-1, keepdims=True)

        # Convert those weights into a distribution but along time axis.
        # i.e., sum of alphas along `time_steps` axis should be 1.
        self.at = self.softmax(ut, dim=1)
        if mask is not None:
            self.at *= K.cast(K.expand_dims(mask, -1), K.floatx())

        # Weighted sum along `time_steps` axis.
        return K.sum(x * self.at, axis=-2)

    def compute_output_shape(self, input_shape):
        return (input_shape[0], input_shape[-1])
    
    def get_config(self):
        config = {}
        base_config = super(Attention, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

    def compute_mask(self, inputs, mask):
        return None

In [12]:
class RocAucEvaluation(Callback):
    def __init__(self, validation_data=(), interval=1):
        super(Callback, self).__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs={}):
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            print("\n ROC-AUC - epoch: %d - score: %.6f \n" % (epoch+1, score))

In [13]:
def get_model():
    inp = Input(shape=(maxlen, ))
    x = Embedding(max_features, embed_size, weights=[embedding_matrix])(inp)
    x = SpatialDropout1D(0.2)(x)
    x = Bidirectional(GRU(80, return_sequences=True))(x)
    #avg_pool = GlobalAveragePooling1D()(x)
    #max_pool = GlobalMaxPooling1D()(x)
    #conc = concatenate([avg_pool, max_pool])
    x = Attention()(x)
    x = Dense(24, activation = 'relu')(x)
    outp = Dense(6, activation="sigmoid")(x)
    
    model = Model(inputs=inp, outputs=outp)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    return model

In [14]:
model = get_model()

In [15]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 300)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 300, 300)          18000000  
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 300, 300)          0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 300, 160)          182880    
_________________________________________________________________
attention_1 (Attention)      (None, 160)               16200     
_________________________________________________________________
dense_1 (Dense)              (None, 24)                3864      
_________________________________________________________________
dense_2 (Dense)              (None, 6)                 150       
Total para

In [16]:
batch_size = 1600
epochs = 2

[X_tra, X_val, y_tra, y_val] = train_test_split(X_train_token, y_train, train_size=0.95, random_state=233)
RocAuc = RocAucEvaluation(validation_data=(X_val, y_val), interval=1)



In [17]:
exp_decay = lambda init, fin, steps: (init/fin)**(1/(steps-1)) - 1
steps = int(len(X_tra)/batch_size) * epochs
lr_init, lr_fin = 0.001, 0.0005
lr_decay = exp_decay(lr_init, lr_fin, steps)
K.set_value(model.optimizer.lr, lr_init)
K.set_value(model.optimizer.decay, lr_decay)

In [18]:
hist = model.fit(X_tra, y_tra, batch_size=1024, epochs=10, validation_data=(X_val, y_val),
                 callbacks=[RocAuc], verbose=2)

Train on 151592 samples, validate on 7979 samples
Epoch 1/10

 ROC-AUC - epoch: 1 - score: 0.975412 

 - 160s - loss: 0.1548 - acc: 0.9552 - val_loss: 0.0517 - val_acc: 0.9807
Epoch 2/10

 ROC-AUC - epoch: 2 - score: 0.981173 

 - 156s - loss: 0.0453 - acc: 0.9830 - val_loss: 0.0460 - val_acc: 0.9830
Epoch 3/10

 ROC-AUC - epoch: 3 - score: 0.985387 

 - 155s - loss: 0.0391 - acc: 0.9848 - val_loss: 0.0446 - val_acc: 0.9830
Epoch 4/10

 ROC-AUC - epoch: 4 - score: 0.986700 

 - 155s - loss: 0.0346 - acc: 0.9863 - val_loss: 0.0458 - val_acc: 0.9829
Epoch 5/10

 ROC-AUC - epoch: 5 - score: 0.987227 

 - 155s - loss: 0.0310 - acc: 0.9876 - val_loss: 0.0462 - val_acc: 0.9822
Epoch 6/10

 ROC-AUC - epoch: 6 - score: 0.984374 

 - 155s - loss: 0.0276 - acc: 0.9889 - val_loss: 0.0505 - val_acc: 0.9823
Epoch 7/10

 ROC-AUC - epoch: 7 - score: 0.985240 

 - 155s - loss: 0.0248 - acc: 0.9901 - val_loss: 0.0536 - val_acc: 0.9817
Epoch 8/10

 ROC-AUC - epoch: 8 - score: 0.984199 

 - 156s - loss: 

In [19]:
y_pred = model.predict(X_test_token, batch_size=1024)

submission[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]] = y_pred

submission.to_csv('submission.csv', index=False)