In [1]:
import numpy as np
np.random.seed(42)
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from keras.optimizers import *
from keras.models import Model
from keras.layers import *
from keras.preprocessing import text, sequence
from keras.callbacks import Callback
import keras
from keras import backend as K, activations, initializers, regularizers, constraints
from keras.engine.topology import Layer, InputSpec
import numpy as np
import warnings
import os
import tensorflow as tf
from keras.backend.tensorflow_backend import set_session
config = tf.ConfigProto()
config.gpu_options.per_process_gpu_memory_fraction = 0.3
set_session(tf.Session(config=config))
warnings.filterwarnings('ignore')
os.environ['OMP_NUM_THREADS'] = '4'

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
import re
more_than_2_sequential_characters = re.compile(r'(.)\1{3,}', flags=re.IGNORECASE)
def preprocess(x):
    return x.fillna("fillna") \
    .values

In [3]:
EMBEDDING_FILE = '../../../embeddings/crawl-300d-2M.vec'

train = pd.read_csv('../data/preprocessed/train.csv')
test = pd.read_csv('../data/preprocessed/test.csv')
submission = pd.read_csv('../submissions/sample_submission.csv')

X_train = preprocess(train["clean_text"])
y_train = train[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]].values
X_test = preprocess(test["clean_text"])

def build_input_data(sentences, labels, vocabulary):
    x = np.array([[embeddings_index[vocabulary_inv[vocabulary['word']]] if word in vocabulary.keys() else len(vocabulary) - 1 for word in sentence] for sentence in sentences])
    y = np.array(labels)
    return [x, y]

In [4]:
train.head()

Unnamed: 0.1,Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,clean_text,nlp,tokens,lemmata,no_stopwords
0,0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,explanation why the edits made under my userna...,Explanation\nWhy the edits made under my usern...,"['Explanation', '\n', 'Why', 'the', 'edits', '...","['explanation', '\n', 'why', 'the', 'edit', 'm...","['Explanation', '\n', 'Why', 'edits', 'made', ..."
1,1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,d'aww! he matches this background colour i'm s...,D'aww! He matches this background colour I'm s...,"[""D'aww"", '!', 'He', 'matches', 'this', 'backg...","[""d'aww"", '!', '-PRON-', 'match', 'this', 'bac...","[""D'aww"", '!', 'He', 'matches', 'background', ..."
2,2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,"hey man, i'm really not trying to edit war. it...","Hey man, I'm really not trying to edit war. It...","['Hey', 'man', ',', 'I', ""'m"", 'really', 'not'...","['hey', 'man', ',', '-PRON-', 'be', 'really', ...","['Hey', 'man', ',', 'I', ""'m"", 'really', 'tryi..."
3,3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,""" more i can't make any real suggestions on im...","""\nMore\nI can't make any real suggestions on ...","['""', '\n', 'More', '\n', 'I', 'ca', ""n't"", 'm...","['""', '\n', 'more', '\n', '-PRON-', 'can', 'no...","['""', '\n', 'More', '\n', 'I', 'ca', ""n't"", 'm..."
4,4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,"you, sir, are my hero. any chance you remember...","You, sir, are my hero. Any chance you remember...","['You', ',', 'sir', ',', 'are', 'my', 'hero', ...","['-PRON-', ',', 'sir', ',', 'be', '-PRON-', 'h...","['You', ',', 'sir', ',', 'hero', '.', 'Any', '..."


In [5]:
max_features = 30000
maxlen = 100
embed_size = 300
tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(X_train) + list(X_test))
X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_test_sequences = tokenizer.texts_to_sequences(X_test)
x_train = sequence.pad_sequences(X_train_sequences, maxlen=maxlen)
x_test = sequence.pad_sequences(X_test_sequences, maxlen=maxlen)

In [6]:
def get_coefs(word, *arr): return word, np.asarray(arr, dtype='float32')
embeddings_index = dict(get_coefs(*o.rstrip().rsplit(' ')) for o in open(EMBEDDING_FILE))

word_index = tokenizer.word_index
nb_words = min(max_features, len(word_index))
embedding_matrix = np.zeros((nb_words, embed_size))
for word, i in word_index.items():
    if i >= max_features: continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None: embedding_matrix[i] = embedding_vector

In [7]:
class RocAucEvaluation(Callback):
    def __init__(self, validation_data=(), interval=1):
        super(Callback, self).__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs={}):
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            print("\n ROC-AUC - epoch: %d - score: %.6f \n" % (epoch+1, score))

In [8]:
X_tra, X_val, y_tra, y_val = train_test_split(x_train, y_train, train_size=0.95, random_state=233)

In [9]:
def dot_product(x, kernel):
    """
    Wrapper for dot product operation, in order to be compatible with both
    Theano and Tensorflow
    Args:
        x (): input
        kernel (): weights
    Returns:
    """
    if K.backend() == 'tensorflow':
        # todo: check that this is correct
        return K.squeeze(K.dot(x, K.expand_dims(kernel)), axis=-1)
    else:
        return K.dot(x, kernel)

In [10]:
class SelfAttention(Layer):
    @interfaces.legacy_dense_support
    def __init__(self, units,
                 activation='tanh',
                 use_bias=True,
                 kernel_initializer='glorot_uniform',
                 bias_initializer='zeros',
                 v_kernel_initializer='glorot_uniform',
                 kernel_regularizer=None,
                 bias_regularizer=None,
                 v_kernel_regularizer = None,
                 activity_regularizer=None,
                 kernel_constraint=None,
                 bias_constraint=None,
                 v_kernel_constraint=None,
                 **kwargs):
        if 'input_shape' not in kwargs and 'input_dim' in kwargs:
            kwargs['input_shape'] = (kwargs.pop('input_dim'),)
        super(SelfAttention, self).__init__(**kwargs)
        self.units = units
        self.activation = activations.get(activation)
        self.use_bias = use_bias
        self.kernel_initializer = initializers.get(kernel_initializer)
        self.bias_initializer = initializers.get(bias_initializer)
        self.v_kernel_initializer = initializers.get(v_kernel_initializer)
        self.kernel_regularizer = regularizers.get(kernel_regularizer)
        self.bias_regularizer = regularizers.get(bias_regularizer)
        self.v_kernel_regularizer = regularizers.get(v_kernel_regularizer)
        self.activity_regularizer = regularizers.get(activity_regularizer)
        self.kernel_constraint = constraints.get(kernel_constraint)
        self.bias_constraint = constraints.get(bias_constraint)
        self.v_kernel_constraint =  constraints.get(v_kernel_constraint)
        self.input_spec = InputSpec(min_ndim=2)
        self.supports_masking = True

    def build(self, input_shape):
        assert len(input_shape) == 3

        self.kernel = self.add_weight(shape=(input_shape[-1],),
                                      initializer=self.kernel_initializer,
                                      name='kernel',
                                      regularizer=self.kernel_regularizer,
                                      constraint=self.kernel_constraint)

        self.v_kernel = self.add_weight(shape=(1, self.units),
                                      initializer=self.v_kernel_initializer,
                                      name='v_kernel',
                                      regularizer=self.v_kernel_regularizer,
                                      constraint=self.v_kernel_constraint)

        if self.use_bias:
            self.bias = self.add_weight(shape=(self.units,),
                                        initializer=self.bias_initializer,
                                        name='bias',
                                        regularizer=self.bias_regularizer,
                                        constraint=self.bias_constraint)
        else:
            self.bias = None
#         self.input_spec = InputSpec(min_ndim=2, axes={-1: input_dim})
        self.built = True

    def call(self, inputs):
#         reshape_layer = K.reshape(inputs,(-1,100,160))
#         print(reshape_layer.shape)
#         output = K.dot(self.kernel, reshape_layer)%%
        print('H', inputs.shape)
#         inputs = K.reshape(inputs, (-1, 160))
#         print('H reshape', inputs.shape)
        print('H^T', K.transpose(inputs).shape)
        print('W_s1', self.kernel.shape)
#         output = dot_product(self.kernel, K.transpose(inputs))
#         BatchM = K.repeat_elements(x=self.kernel,rep=1,axis=0)
        tmp = K.batch_dot(inputs, self.kernel, axes=[1,2])

        #we also need to transpose x[1]:
        inputs1T = K.permute_dimensions(inputs[1],(0,2,1))

        #and the second multiplication:
        output = K.batch_dot(tmp, inputs1T, axes=[1,2])
#         output = K.sum(self.kernel * inputs,axis=-1,keepdims=True)
#         if self.use_bias:
#             output = K.bias_add(output, self.bias)
        if self.activation is not None:
            output = self.activation(output)
        print('output', output.shape)
        v_kernel_output = dot_product(self.v_kernel, output)
        return dot_product(activations.softmax(v_kernel_output), inputs)



    def compute_output_shape(self, input_shape):
        return input_shape[0], input_shape[-1]
        
    def get_config(self):
        config = {
            'units': self.units,
            'activation': activations.serialize(self.activation),
            'use_bias': self.use_bias,
            'kernel_initializer': initializers.serialize(self.kernel_initializer),
            'bias_initializer': initializers.serialize(self.bias_initializer),
            'v_kernel_initializer': initializers.serialize(self.v_kernel_initializer),
            'kernel_regularizer': regularizers.serialize(self.kernel_regularizer),
            'bias_regularizer': regularizers.serialize(self.bias_regularizer),
            'v_kernel_regularizer': regularizers.serialize(self.v_kernel_regularizer),
            'activity_regularizer': regularizers.serialize(self.activity_regularizer),
            'kernel_constraint': constraints.serialize(self.kernel_constraint),
            'bias_constraint': constraints.serialize(self.bias_constraint),
            'v_kernel_constraint':constraints.serialize(self.v_kernel_constraint)
        }
        base_config = super(SelfAttention, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))


$d_a$ = 13
$n$ = 100
$u$ = 80

In [11]:
def get_model():
    inp = Input(shape=(maxlen, ))
    x = Embedding(max_features, embed_size, weights=[embedding_matrix], trainable=True)(inp)
    lstm = Bidirectional(CuDNNLSTM(80, return_sequences=True))(x)
    att_1 = TimeDistributed(Dense(13))(lstm) # tanh(W_{s_1} * H^T)
    att_2 = TimeDistributed(Dense(1, activation = 'softmax'))(att_1) # A = softmax(w_{s_2}*tanh(W_s * H^T)
    att_3 = Multiply()([att_2, lstm]) # AH
    flat = Flatten()(att_3)
    output = Dense(units=6, activation='sigmoid')(flat)
    
    model = Model(inputs=inp, outputs=output)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    return model

In [216]:
model_func = get_model
epochs = 5
batch_size = 32

In [217]:
model = model_func()
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_53 (InputLayer)           (None, 100)          0                                            
__________________________________________________________________________________________________
embedding_53 (Embedding)        (None, 100, 300)     9000000     input_53[0][0]                   
__________________________________________________________________________________________________
bidirectional_53 (Bidirectional (None, 100, 160)     244480      embedding_53[0][0]               
__________________________________________________________________________________________________
time_distributed_5 (TimeDistrib (None, 100, 13)      2093        bidirectional_53[0][0]           
__________________________________________________________________________________________________
time_distr

In [218]:
RocAuc = RocAucEvaluation(validation_data=(X_val, y_val), interval=1)
hist = model.fit(X_tra, y_tra, batch_size=batch_size, epochs=epochs, validation_data=(X_val, y_val),
                 callbacks=[RocAuc], verbose=1)

Train on 151592 samples, validate on 7979 samples
Epoch 1/5

 ROC-AUC - epoch: 1 - score: 0.982803 

Epoch 2/5

 ROC-AUC - epoch: 2 - score: 0.981408 

Epoch 3/5

 ROC-AUC - epoch: 3 - score: 0.974178 

Epoch 4/5

 ROC-AUC - epoch: 4 - score: 0.968631 

Epoch 5/5

 ROC-AUC - epoch: 5 - score: 0.966636 



In [219]:

y_val_pred = model.predict(X_val, batch_size=2, verbose=1)
score = roc_auc_score(y_val, y_val_pred)
print(score)

0.9666361492889011


In [225]:
def mean_roc_auc(y_true, y_pred):
    roc_auc_scores = []
    for i in range(0, y_true.shape[1]):
        roc_auc_scores.append(roc_auc_score(y_true[:, i], y_pred[:, i]))
    print(roc_auc_scores)
    return np.mean(roc_auc_scores)

In [226]:
mean_roc_auc(y_val, y_val_pred)

[0.9554555662776835, 0.98057301288454, 0.9673196255313045, 0.9714082358122563, 0.9651489811370993, 0.9599114740905229]


0.9666361492889011

In [None]:
# y_val_pred = ensemble_model.predict(X_val, batch_size=4, verbose=1)
# score = roc_auc_score(y_val, y_val_pred)
# print(score)

# y_pred = ensemble_model.predict(x_test, batch_size=4, verbose=1)
# submission[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]] = y_pred
# submission.to_csv('clean_text_lstm_7_ensemble_adamax005.csv', index=False)

# print(x_test.shape, y_pred.shape, y_val.shape)