In [1]:
import numpy as np
np.random.seed(42)
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

from keras.optimizers import *
from keras.models import Model
from keras.layers import *
from keras.preprocessing import text, sequence
from keras.callbacks import Callback

import warnings
warnings.filterwarnings('ignore')

import os
os.environ['OMP_NUM_THREADS'] = '4'
import tensorflow as tf
from keras.backend.tensorflow_backend import set_session
# config = tf.ConfigProto()
# config.gpu_options.per_process_gpu_memory_fraction = 0.3
# set_session(tf.Session(config=config))

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# import keras
from keras import backend as K, activations, initializers, regularizers, constraints
from keras.engine.topology import Layer, InputSpec
import numpy as np

class SelfAttention(Layer):

    """Experimental self-attention layer derived from the `Dense` layer template.

    `build` creates the weights from the size of the last input axis
    (`input_dim`):

    - `kernel` with shape `(units, input_dim)`
    - `v_kernel` with shape `(input_dim, units)`
    - `bias` with shape `(units,)` (only if `use_bias` is `True`)

    `call` evaluates, in this order:

    ```python
        reshaped = Reshape(newdim)(inputs)   # newdim: input dims that are not None / 1
        hidden = activation(dot(kernel, reshaped) + bias)
        scores = softmax(dot(hidden, v_kernel))
        output = dot(inputs, scores)
    ```

    NOTE(review): for a `(batch, timesteps, features)` input the operand
    shapes of the `K.dot` calls in `call` do not obviously line up, and
    `compute_output_shape` reports `units` on the last axis, which may not
    match what `call` actually returns. This layer looks unfinished -
    verify the shapes before using it in a model.

    # Arguments
        units: Positive integer; sizes `kernel`, `v_kernel` and `bias`
            as listed above.
        activation: Activation applied to the first projection,
            resolved with `activations.get` (default `'tanh'`).
        use_bias: Boolean, whether a bias vector is created and added.
        kernel_initializer: Initializer for `kernel`.
        bias_initializer: Initializer for `bias`.
        v_kernel_initializer: Initializer for `v_kernel`.
        kernel_regularizer: Regularizer applied to `kernel`.
        bias_regularizer: Regularizer applied to `bias`.
        v_kernel_regularizer: Regularizer applied to `v_kernel`.
        activity_regularizer: Stored on the layer as
            `self.activity_regularizer`.
        kernel_constraint: Constraint applied to `kernel`.
        bias_constraint: Constraint applied to `bias`.
        v_kernel_constraint: Constraint applied to `v_kernel`.

    # Input shape
        Tensor with at least 2 dimensions (`InputSpec(min_ndim=2)`); the
        size of the last axis is fixed once the layer is built.

    # Output shape
        `compute_output_shape` returns the input shape with the last axis
        replaced by `units` (see the review note above).
    """

    # NOTE(review): `interfaces` is not imported explicitly in this cell; it is
    # presumably made available by one of the notebook's wildcard imports -
    # confirm. The decorator was written for `Dense`'s legacy argument names.
    @interfaces.legacy_dense_support
    def __init__(self, units,
                 activation='tanh',
                 use_bias=True,
                 kernel_initializer='glorot_uniform',
                 bias_initializer='zeros',
                 v_kernel_initializer='glorot_uniform',
                 kernel_regularizer=None,
                 bias_regularizer=None,
                 v_kernel_regularizer = None,
                 activity_regularizer=None,
                 kernel_constraint=None,
                 bias_constraint=None,
                 v_kernel_constraint=None,
                 **kwargs):
        """Store the hyper-parameters; weights are created later in `build`."""
        # Accept the `input_dim=n` shorthand by turning it into `input_shape=(n,)`.
        if 'input_shape' not in kwargs and 'input_dim' in kwargs:
            kwargs['input_shape'] = (kwargs.pop('input_dim'),)
        super(SelfAttention, self).__init__(**kwargs)
        self.units = units
        # Resolve string identifiers / None into the corresponding Keras objects.
        self.activation = activations.get(activation)
        self.use_bias = use_bias
        self.kernel_initializer = initializers.get(kernel_initializer)
        self.bias_initializer = initializers.get(bias_initializer)
        self.v_kernel_initializer = initializers.get(v_kernel_initializer)
        self.kernel_regularizer = regularizers.get(kernel_regularizer)
        self.bias_regularizer = regularizers.get(bias_regularizer)
        self.v_kernel_regularizer = regularizers.get(v_kernel_regularizer)
        self.activity_regularizer = regularizers.get(activity_regularizer)
        self.kernel_constraint = constraints.get(kernel_constraint)
        self.bias_constraint = constraints.get(bias_constraint)
        self.v_kernel_constraint =  constraints.get(v_kernel_constraint)
        self.input_spec = InputSpec(min_ndim=2)
        self.supports_masking = True

    def build(self, input_shape):
        """Create `kernel`, `v_kernel` and (optionally) `bias` for the given input shape."""
        assert len(input_shape) >= 2
        input_dim = input_shape[-1]

        # First projection: shape (units, input_dim).
        self.kernel = self.add_weight(shape=(self.units, input_dim),
                                      initializer=self.kernel_initializer,
                                      name='kernel',
                                      regularizer=self.kernel_regularizer,
                                      constraint=self.kernel_constraint)

        # Second projection: shape (input_dim, units).
        self.v_kernel = self.add_weight(shape=(input_dim, self.units),
                                      initializer=self.v_kernel_initializer,
                                      name='v_kernel',
                                      regularizer=self.v_kernel_regularizer,
                                      constraint=self.v_kernel_constraint)

        if self.use_bias:
            self.bias = self.add_weight(shape=(self.units,),
                                        initializer=self.bias_initializer,
                                        name='bias',
                                        regularizer=self.bias_regularizer,
                                        constraint=self.bias_constraint)
        else:
            self.bias = None
        # From now on the last input axis must keep the size seen here.
        self.input_spec = InputSpec(min_ndim=2, axes={-1: input_dim})
        self.built = True

    def call(self, inputs):
        """Apply the two projections and the softmax weighting to `inputs`.

        NOTE(review): see the class docstring - the shapes used here should be
        verified; the `print` calls are debugging traces.
        """
        print(self.kernel.shape, inputs.shape, flush=True)
        # Keep only the static input dimensions that are neither unknown (None) nor 1.
        newdim = tuple([x for x in inputs.shape.as_list() if x != 1 and x is not None])
        #newdim is now (15, 36). Reshape does not take batch size as an input dimension.
        reshape_layer = Reshape(newdim) (inputs)
        print(reshape_layer.shape)
        output = K.dot(self.kernel, reshape_layer)
        if self.use_bias:
            output = K.bias_add(output, self.bias)
        if self.activation is not None:
            output = self.activation(output)
        v_kernel_output = K.dot(output, self.v_kernel)
        # Weight the original inputs by the softmax-normalised scores.
        return K.dot(inputs, activations.softmax(v_kernel_output))

    def compute_output_shape(self, input_shape):
        """Return `input_shape` with its last axis replaced by `units`."""
        assert input_shape and len(input_shape) >= 2
        assert input_shape[-1]
        output_shape = list(input_shape)
        output_shape[-1] = self.units
        return tuple(output_shape)

    def get_config(self):
        """Return the serialisable constructor arguments merged with the base layer config."""
        config = {
            'units': self.units,
            'activation': activations.serialize(self.activation),
            'use_bias': self.use_bias,
            'kernel_initializer': initializers.serialize(self.kernel_initializer),
            'bias_initializer': initializers.serialize(self.bias_initializer),
            'v_kernel_initializer': initializers.serialize(self.v_kernel_initializer),
            'kernel_regularizer': regularizers.serialize(self.kernel_regularizer),
            'bias_regularizer': regularizers.serialize(self.bias_regularizer),
            'v_kernel_regularizer': regularizers.serialize(self.v_kernel_regularizer),
            'activity_regularizer': regularizers.serialize(self.activity_regularizer),
            'kernel_constraint': constraints.serialize(self.kernel_constraint),
            'bias_constraint': constraints.serialize(self.bias_constraint),
            'v_kernel_constraint':constraints.serialize(self.v_kernel_constraint)
        }
        base_config = super(SelfAttention, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))


In [3]:
import re
# Matches one character followed by three or more repeats of itself
# (case-insensitive), i.e. runs of four or more.
# NOTE(review): the name suggests runs of three or more - confirm which is
# intended. Only the disabled clean-up steps below `preprocess` reference it.
more_than_2_sequential_characters = re.compile(r'(.)\1{3,}', flags=re.IGNORECASE)


def preprocess(x):
    """Fill missing entries with the placeholder text "fillna" and return the values.

    # Arguments
        x: Object providing `fillna` (a pandas Series of comment texts in
            this notebook).

    # Returns
        The `.values` of the filled object.
    """
    filled = x.fillna("fillna")
    return filled.values
#     .map(lambda string: string.lower()) \
#     .map(lambda string: more_than_2_sequential_characters.sub(r'\1\1', string)) \
#     .map(lambda string: string.replace("f*ck", "fuck")) \
#     .map(lambda string: string.replace("b*tch", "bitch")) \
#     .map(lambda string: string.replace("c*nt", "cunt")) \

In [4]:
# train_ling = pd.read_csv("../data/" + "preprocessed/train_ling.csv")

# Path of the pre-trained word-vector file parsed by the embedding cell below.
EMBEDDING_FILE = '../../../embeddings/glove.42B.300d.txt'
# EMBEDDING_FILE = '../../../embeddings/crawl-300d-2M.vec'

# The preprocessed CSVs are used instead of the raw competition files
# (raw variants kept commented out for reference).
# train = pd.read_csv('../data/train.csv')
# test = pd.read_csv('../data/test.csv')
train = pd.read_csv('../data/preprocessed/train.csv')
test = pd.read_csv('../data/preprocessed/test.csv')
# Template frame whose label columns are overwritten when the submission is written.
submission = pd.read_csv('../submissions/sample_submission.csv')

# Model input is the `comment_text` column with missing values filled by
# `preprocess`; alternative input columns are kept commented out.
X_train = preprocess(train["comment_text"])
# X_train = preprocess(train["clean_text"])
# X_train_nostopwords = train["no_stopwords"].apply(" ".join).values
# Six label columns, in the same order used when writing the submission.
y_train = train[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]].values
X_test = preprocess(test["comment_text"])
# X_test = preprocess(test["clean_text"])
# X_test_nostopwords = test["no_stopwords"].apply(" ".join).values

# meta_features = ['count_sent', 'count_word', 'count_unique_word', 'count_letters',
#        'count_punctuations', 'count_words_upper', 'count_words_title',
#        'count_stopwords', 'mean_word_len', 'word_unique_percent',
#        'punct_percent', 'count_swear_words']

# X_meta_features = train_ling[meta_features]

def build_input_data(sentences, labels, vocabulary):
    """Map tokenised sentences to embedding vectors and wrap the labels in an array.

    Relies on two module-level names: `embeddings_index` (word -> vector) and
    `vocabulary_inv` (index -> word).
    NOTE(review): `vocabulary_inv` is not defined anywhere in the visible
    notebook and this function is never called - confirm it is still needed.

    # Arguments
        sentences: Iterable of sentences, each an iterable of words.
        labels: Labels, converted with `np.array`.
        vocabulary: Mapping from word to vocabulary index.

    # Returns
        `[x, y]` where `x` holds one entry per word of every sentence and
        `y` is `np.array(labels)`.
    """
    # NOTE(review): the fallback for unknown words is an integer index while
    # known words map to vectors, so `x` mixes two kinds of entries - confirm
    # the intended out-of-vocabulary handling.
    unknown_entry = len(vocabulary) - 1

    def encode(word):
        # Look up the loop variable itself; the previous code indexed the
        # literal string 'word', so every known word resolved to the same entry
        # (or raised KeyError).
        if word in vocabulary:
            return embeddings_index[vocabulary_inv[vocabulary[word]]]
        return unknown_entry

    x = np.array([[encode(word) for word in sentence] for sentence in sentences])
    y = np.array(labels)
    return [x, y]

In [5]:
# Preview the first rows of the loaded training frame.
train.head()

Unnamed: 0.1,Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,clean_text,nlp,tokens,lemmata,no_stopwords
0,0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,explanation why the edits made under my userna...,Explanation\nWhy the edits made under my usern...,"['Explanation', '\n', 'Why', 'the', 'edits', '...","['explanation', '\n', 'why', 'the', 'edit', 'm...","['Explanation', '\n', 'Why', 'edits', 'made', ..."
1,1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,d'aww! he matches this background colour i'm s...,D'aww! He matches this background colour I'm s...,"[""D'aww"", '!', 'He', 'matches', 'this', 'backg...","[""d'aww"", '!', '-PRON-', 'match', 'this', 'bac...","[""D'aww"", '!', 'He', 'matches', 'background', ..."
2,2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,"hey man, i'm really not trying to edit war. it...","Hey man, I'm really not trying to edit war. It...","['Hey', 'man', ',', 'I', ""'m"", 'really', 'not'...","['hey', 'man', ',', '-PRON-', 'be', 'really', ...","['Hey', 'man', ',', 'I', ""'m"", 'really', 'tryi..."
3,3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,""" more i can't make any real suggestions on im...","""\nMore\nI can't make any real suggestions on ...","['""', '\n', 'More', '\n', 'I', 'ca', ""n't"", 'm...","['""', '\n', 'more', '\n', '-PRON-', 'can', 'no...","['""', '\n', 'More', '\n', 'I', 'ca', ""n't"", 'm..."
4,4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,"you, sir, are my hero. any chance you remember...","You, sir, are my hero. Any chance you remember...","['You', ',', 'sir', ',', 'are', 'my', 'hero', ...","['-PRON-', ',', 'sir', ',', 'be', '-PRON-', 'h...","['You', ',', 'sir', ',', 'hero', '.', 'Any', '..."


In [6]:

# Vocabulary cap passed to the tokenizer and used as the Embedding input size.
max_features = 30000
# max_features = 5159 #only this many with stopwords
# Every comment is padded / cut to this many token ids.
maxlen = 100
# Width of one embedding vector; must match the vectors in EMBEDDING_FILE.
embed_size = 300

tokenizer = text.Tokenizer(num_words=max_features)
# The vocabulary is fitted on train AND test comments together.
tokenizer.fit_on_texts(list(X_train) + list(X_test))
X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_test_sequences = tokenizer.texts_to_sequences(X_test)
# Fixed-length integer matrices that feed the models' Input(shape=(maxlen,)).
x_train = sequence.pad_sequences(X_train_sequences, maxlen=maxlen)
x_test = sequence.pad_sequences(X_test_sequences, maxlen=maxlen)



In [7]:
# glove embeddings
# def get_coefs(word, *arr): return word, np.asarray(arr, dtype='float32')
# embeddings_index = dict(get_coefs(*o.strip().split()) for o in open(EMBEDDING_FILE))

# all_embs = np.stack(embeddings_index.values())
# emb_mean, emb_std = all_embs.mean(), all_embs.std()

# word_index = tokenizer.word_index
# nb_words = min(max_features, len(word_index))
# embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
# for word, i in word_index.items():
#     if i >= max_features: continue
#     embedding_vector = embeddings_index.get(word)
#     if embedding_vector is not None: embedding_matrix[i] = embedding_vector



# Pre-trained word vectors, read from whichever file EMBEDDING_FILE selects
# (one token followed by its coefficients per line, space separated).

def get_coefs(word, *arr):
    """Return `(word, vector)` with the remaining fields parsed as a float32 array."""
    return word, np.asarray(arr, dtype='float32')


# `with` closes the file once parsing is done, and the explicit encoding makes
# the result independent of the machine's locale. Splitting at most
# `embed_size` times from the right keeps tokens that contain spaces intact
# instead of feeding part of the token to the float conversion.
with open(EMBEDDING_FILE, encoding='utf-8') as embedding_file:
    embeddings_index = dict(
        get_coefs(*line.rstrip().rsplit(' ', embed_size)) for line in embedding_file
    )

word_index = tokenizer.word_index
# Tokenizer indices start at 1 (0 is the padding id), so a vocabulary of V
# words needs V + 1 rows; without the + 1 the highest index would fall outside
# the matrix whenever the vocabulary is smaller than `max_features`.
nb_words = min(max_features, len(word_index) + 1)
embedding_matrix = np.zeros((nb_words, embed_size))
for word, i in word_index.items():
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    # Words without a pre-trained vector keep their all-zero row. Entries of
    # the wrong length (e.g. the "<count> <dim>" header line of fastText .vec
    # files) are skipped too, because they would otherwise be broadcast across
    # the whole row.
    if embedding_vector is not None and embedding_vector.shape == (embed_size,):
        embedding_matrix[i] = embedding_vector


In [None]:
import tensorflow as tf
import keras



# `_Merge` is the Keras base class whose `call` dispatches to `_merge_function`.
# NOTE(review): depending on the installed Keras version, the public name
# `Merge` may be the legacy layer with its own `call`, in which case
# `_merge_function` is never invoked - hence the explicit base class here.
from keras.layers.merge import _Merge


class MinOrMax(_Merge):
    """Merge a list of same-shaped tensors by an element-wise majority vote.

    For every element, if more than half of the inputs exceed 0.5 the
    maximum over the inputs is returned, otherwise the minimum.

    NOTE(review): the previous draft of this layer was unfinished (stray
    `tf.c`, an `if` without a body that made the whole cell a syntax error).
    The majority rule implemented here is a reading of its `#take max`
    comment - confirm that this is the intended behaviour.
    """

    def _merge_function(self, inputs):
        # New leading axis: one slice per input tensor.
        stacked = tf.stack(inputs, axis=0)
        votes_high = tf.reduce_sum(tf.cast(stacked > 0.5, stacked.dtype), axis=0)
        take_max = votes_high > (len(inputs) / 2.0)
        return tf.where(take_max,
                        tf.reduce_max(stacked, axis=0),
                        tf.reduce_min(stacked, axis=0))


def min_or_max(inputs, **kwargs):
    """Functional interface to the `MinOrMax` layer.

    # Arguments
        inputs: A list of input tensors (at least 2).
        **kwargs: Standard layer keyword arguments.

    # Returns
        A tensor with the same shape as each input.
    """
    # Previously this instantiated `Median`, so `MinOrMax` was never used.
    return MinOrMax(**kwargs)(inputs)

def reduce_var(x, axis=None, keepdims=False):
    """Mean squared deviation of `x` from its mean along `axis`.

    # Arguments
        x: A tensor or variable.
        axis: An integer, the axis to reduce over; passed to
            `tf.reduce_mean` unchanged.
        keepdims: A boolean. If `True` the reduced axis is retained with
            length 1, otherwise it is dropped.

    # Returns
        A tensor with the variance of the elements of `x`.
    """
    # The mean always keeps its reduced axis so it broadcasts against `x`.
    centered = x - tf.reduce_mean(x, axis=axis, keep_dims=True)
    return tf.reduce_mean(tf.square(centered), axis=axis, keep_dims=keepdims)

def reduce_std(x, axis=None, keepdims=False):
    """Square root of `reduce_var` along `axis`.

    # Arguments
        x: A tensor or variable.
        axis: An integer, the axis to reduce over.
        keepdims: A boolean. If `True` the reduced axis is retained with
            length 1, otherwise it is dropped.

    # Returns
        A tensor with the standard deviation of the elements of `x`.
    """
    variance = reduce_var(x, axis=axis, keepdims=keepdims)
    return tf.sqrt(variance)


# class MLGaussian(Merge):
    
#     def _merge_function(self, inputs):
#         std = reduce_std(inputs)
#         mean = average(inputs)
        
    

# `_Merge` is the Keras base class whose `call` dispatches to `_merge_function`.
# NOTE(review): depending on the installed Keras version, the public name
# `Merge` may be the legacy layer with its own `call` (default mode: sum), in
# which case `_merge_function` is never invoked - confirm, and re-check any
# scores recorded as "median ensemble".
from keras.layers.merge import _Merge


class Median(_Merge):
    """Layer that takes the element-wise median of a list of inputs.

    It takes as input a list of tensors, all of the same shape, and returns
    a single tensor (also of the same shape). The 50th percentile is taken
    with `percentile`'s default interpolation.
    """

    def _merge_function(self, inputs):
        # Stack the inputs on a new leading axis and reduce over that axis
        # only. Without `axis`, `percentile` treats every dimension as a sample
        # dimension and collapses the whole batch to one scalar.
        stacked = tf.stack(inputs, axis=0)
        return tf.contrib.distributions.percentile(stacked, 50.0, axis=[0])


def median(inputs, **kwargs):
    """Functional interface to the `Median` layer.

    # Arguments
        inputs: A list of input tensors (at least 2).
        **kwargs: Standard layer keyword arguments.

    # Returns
        A tensor, the element-wise median of the inputs.
    """
    return Median(**kwargs)(inputs)

class RocAucEvaluation(Callback):
    """Keras callback that prints the validation ROC-AUC at the end of epochs.

    # Arguments
        validation_data: Tuple `(X_val, y_val)`. It is unpacked immediately,
            so the empty default raises `ValueError`; always pass both arrays.
        interval: The score is computed on epochs whose zero-based index is
            a multiple of `interval`.
    """

    def __init__(self, validation_data=(), interval=1):
        # Initialise the Callback base class itself. `super(Callback, self)`
        # started the lookup *after* Callback and so skipped its __init__.
        super(RocAucEvaluation, self).__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs=None):
        # `logs` is unused; None replaces the shared mutable `{}` default.
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            print("\n ROC-AUC - epoch: %d - score: %.6f \n" % (epoch+1, score))


In [None]:

def test_model():
    """Build and compile the bidirectional CuDNN-LSTM classifier (85 units per direction).

    Uses the notebook globals `maxlen`, `max_features`, `embed_size` and
    `embedding_matrix`.

    # Returns
        A compiled Keras `Model` with six sigmoid outputs.
    """
    token_ids = Input(shape=(maxlen, ))
    embedded = Embedding(max_features, embed_size, weights=[embedding_matrix], trainable=True)(token_ids)
    encoded = Bidirectional(CuDNNLSTM(85, return_sequences=True))(embedded)
    # Summarise the sequence by its average and its maximum over time.
    pooled = concatenate([GlobalAveragePooling1D()(encoded), GlobalMaxPooling1D()(encoded)])
    probabilities = Dense(units=6, activation='sigmoid')(pooled)

    network = Model(inputs=token_ids, outputs=probabilities)
    network.compile(loss='binary_crossentropy',
                    optimizer=Adamax(lr=0.005),
                    metrics=['accuracy'])
    return network


def ensemble(model_f, num_ensembles, input_length):
    """Create `num_ensembles` models and a wrapper that merges their outputs.

    # Arguments
        model_f: Zero-argument factory returning a compiled Keras model.
        num_ensembles: Number of member models to create.
        input_length: Length of the shared input fed to every member.

    # Returns
        `(models, ensemble)`: the list of member models and a compiled model
        whose output is `median(...)` over the member outputs.
    """
    members = [model_f() for _ in range(num_ensembles)]
    shared_input = Input(shape=(input_length,))
    member_outputs = [member(shared_input) for member in members]
    # Alternative merge tried earlier: average(member_outputs)
    merged = median(member_outputs)
    wrapper = Model(inputs=[shared_input], outputs=[merged])
    wrapper.compile(loss='binary_crossentropy',
                    optimizer='adam',
                    metrics=['acc'])
    return members, wrapper


def simple_lstm_model():
    """Build and compile a bidirectional CuDNN-LSTM model followed by `SelfAttention`.

    Uses the notebook globals `maxlen`, `max_features`, `embed_size` and
    `embedding_matrix`.

    NOTE(review): the LSTM returns full sequences and nothing pools over time
    before the final Dense layer, so whether the output matches the
    `(batch, 6)` labels depends on what `SelfAttention` returns - verify.

    # Returns
        A compiled Keras `Model`.
    """
    token_ids = Input(shape=(maxlen, ))
    embedded = Embedding(max_features, embed_size, weights=[embedding_matrix], trainable=True)(token_ids)
    encoded = Bidirectional(CuDNNLSTM(80, return_sequences=True))(embedded)
    attended = SelfAttention(50)(encoded)
    probabilities = Dense(units=6, activation='sigmoid')(attended)

    network = Model(inputs=token_ids, outputs=probabilities)
    network.compile(loss='binary_crossentropy',
                    optimizer='adam',
                    metrics=['accuracy'])
    return network

def highest_current_model():
    """Build and compile the best configuration recorded so far (80 LSTM units per direction).

    Recorded result: 7 ensemble, glove.42B.300d, seed42, 2 epochs, batch size 32
    with median ensemble val auc 0.9877301793160443

    Uses the notebook globals `maxlen`, `max_features`, `embed_size` and
    `embedding_matrix`.

    # Returns
        A compiled Keras `Model` with six sigmoid outputs.
    """
    token_ids = Input(shape=(maxlen, ))
    embedded = Embedding(max_features, embed_size, weights=[embedding_matrix], trainable=True)(token_ids)
    encoded = Bidirectional(CuDNNLSTM(80, return_sequences=True))(embedded)
    # Summarise the sequence by its average and its maximum over time.
    pooled = concatenate([GlobalAveragePooling1D()(encoded), GlobalMaxPooling1D()(encoded)])
    probabilities = Dense(units=6, activation='sigmoid')(pooled)

    network = Model(inputs=token_ids, outputs=probabilities)
    network.compile(loss='binary_crossentropy',
                    optimizer=Adamax(lr=0.005),
                    metrics=['accuracy'])
    return network

# --- Experiment configuration -------------------------------------------------
model_func = test_model                # factory returning a compiled model
ensemble_it = True                     # train several members and merge their outputs
num_ensembles = 7
train_ensemble_jointly_after = False   # optionally fine-tune the merged model afterwards
epochs = 2
batch_size = 32

# Hold out 5% of the training rows for validation (fixed split seed).
X_tra, X_val, y_tra, y_val = train_test_split(x_train, y_train, train_size=0.95, random_state=233)

if ensemble_it:
    models, ensemble_model = ensemble(model_func, num_ensembles, maxlen)
    # Every member is trained separately on the same split. The merged model
    # wraps these same member models, so it needs no training of its own here.
    for model in models:

        RocAuc = RocAucEvaluation(validation_data=(X_val, y_val), interval=1)

        hist = model.fit(X_tra, y_tra, batch_size=batch_size, epochs=epochs, validation_data=(X_val, y_val),
                         callbacks=[RocAuc], verbose=1)
    y_val_pred = ensemble_model.predict(X_val, batch_size=2, verbose=1)
    score = roc_auc_score(y_val, y_val_pred)
    print(score)

else:
    model = model_func()
    model.summary()
    RocAuc = RocAucEvaluation(validation_data=(X_val, y_val), interval=1)
    hist = model.fit(X_tra, y_tra, batch_size=batch_size, epochs=epochs, validation_data=(X_val, y_val),
                     callbacks=[RocAuc], verbose=1)
    y_val_pred = model.predict(X_val, batch_size=2, verbose=1)
    score = roc_auc_score(y_val, y_val_pred)
    print(score)

if ensemble_it and train_ensemble_jointly_after:
    # A separate name keeps the `epochs` setting above intact, so re-running
    # this cell trains the members for the configured number of epochs again.
    joint_epochs = 1
    RocAuc = RocAucEvaluation(validation_data=(X_val, y_val), interval=1)
    hist = ensemble_model.fit(X_tra, y_tra, batch_size=batch_size, epochs=joint_epochs, validation_data=(X_val, y_val),
                     callbacks=[RocAuc], verbose=1)
    # Score the jointly trained ensemble itself. The previous code predicted
    # with `model`, i.e. only the last individual member from the loop above.
    y_val_pred = ensemble_model.predict(X_val, batch_size=1, verbose=1)
    score = roc_auc_score(y_val, y_val_pred)
    print(score)

Train on 151592 samples, validate on 7979 samples
Epoch 1/2

 ROC-AUC - epoch: 1 - score: 0.985191 

Epoch 2/2

 ROC-AUC - epoch: 2 - score: 0.986591 

Train on 151592 samples, validate on 7979 samples
Epoch 1/2

 ROC-AUC - epoch: 1 - score: 0.986279 

Epoch 2/2

 ROC-AUC - epoch: 2 - score: 0.987510 

Train on 151592 samples, validate on 7979 samples
Epoch 1/2

 ROC-AUC - epoch: 1 - score: 0.985828 

Epoch 2/2

 ROC-AUC - epoch: 2 - score: 0.987234 

Train on 151592 samples, validate on 7979 samples
Epoch 1/2

 ROC-AUC - epoch: 1 - score: 0.985981 

Epoch 2/2

 ROC-AUC - epoch: 2 - score: 0.987500 

Train on 151592 samples, validate on 7979 samples
Epoch 1/2

 ROC-AUC - epoch: 1 - score: 0.984691 

Epoch 2/2

 ROC-AUC - epoch: 2 - score: 0.986752 

Train on 151592 samples, validate on 7979 samples
Epoch 1/2

 ROC-AUC - epoch: 1 - score: 0.984469 

Epoch 2/2

 ROC-AUC - epoch: 2 - score: 0.986208 

Train on 151592 samples, validate on 7979 samples
Epoch 1/2

 ROC-AUC - epoch: 1 - score

In [None]:
# epochs=1
# RocAuc = RocAucEvaluation(validation_data=(X_val, y_val), interval=1)
# hist = ensemble_model.fit(X_tra, y_tra, batch_size=2, epochs=epochs, validation_data=(X_val, y_val),
#                  callbacks=[RocAuc], verbose=1, verbose=1)
# y_val_pred = model.predict(X_val, batch_size=1, verbose=1)
# score = roc_auc_score(y_val, y_val_pred)
# print(score) 

In [None]:
# Re-score the merged ensemble on the held-out validation split.
# Requires `ensemble_model`, `X_val` and `y_val` from the training cell
# (i.e. that cell must have run with `ensemble_it = True`).
y_val_pred = ensemble_model.predict(X_val, batch_size=1, verbose=1)
score = roc_auc_score(y_val, y_val_pred)
print(score)

In [None]:
# Toggle for writing the submission file.
create_submission = True
if create_submission:
    # Predict the test comments with the merged ensemble and overwrite the six
    # label columns of the sample submission frame.
    y_pred = ensemble_model.predict(x_test, batch_size=16, verbose=1)
    submission[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]] = y_pred
    # NOTE(review): the file name mentions "clean_text" and "17_ensemble", while
    # this notebook reads the `comment_text` column and sets num_ensembles = 7 -
    # confirm the name reflects the run that produced the file.
    submission.to_csv('clean_text_lstm_17_ensemble_adamax005.csv', index=False)

In [None]:
# Sanity check of the array shapes. `y_pred` only exists if the submission
# cell above ran with `create_submission = True`.
print(x_test.shape, y_pred.shape, y_val.shape)