In [1]:
import numpy as np
# Seed NumPy's global RNG before anything else draws from it.
np.random.seed(42)
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

from keras.models import Model
# NOTE(review): later cells use `K`, `Layer`, `initializers`, `regularizers` and
# `constraints` without importing them explicitly; presumably they arrive through
# this star import -- confirm, and prefer explicit imports.
from keras.layers import *
from keras.preprocessing import text, sequence
from keras.callbacks import Callback

import warnings
# Suppress every warning for the rest of the session.
warnings.filterwarnings('ignore')

import os
# Set before TensorFlow is imported below; presumably intended to cap OpenMP
# worker threads at 4 -- confirm it is honoured by the installed build.
os.environ['OMP_NUM_THREADS'] = '4'
import tensorflow as tf
from keras.backend.tensorflow_backend import set_session
# Create a TF1-style session whose GPU option `per_process_gpu_memory_fraction`
# is 0.3, and register that session as the one Keras uses.
config = tf.ConfigProto()
config.gpu_options.per_process_gpu_memory_fraction = 0.3
set_session(tf.Session(config=config))

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
def dot_product(x, kernel):
    """
    Backend-agnostic dot product between an input tensor and a weight tensor.

    On the TensorFlow backend the kernel is given a trailing axis before the
    product and that axis is squeezed away again afterwards; every other
    backend uses a plain ``K.dot``.

    Args:
        x (): input
        kernel (): weights
    Returns:
        The result of the backend dot product.
    """
    # Anything that is not TensorFlow takes the direct route.
    if K.backend() != 'tensorflow':
        return K.dot(x, kernel)

    kernel_column = K.expand_dims(kernel)
    product = K.dot(x, kernel_column)
    return K.squeeze(product, axis=-1)
    

class AttentionWithContext(Layer):
    """
    Attention operation, with a context/query vector, for temporal data.
    Supports Masking.
    Follows the work of Yang et al. [https://www.cs.cmu.edu/~diyiy/docs/naacl16.pdf]
    "Hierarchical Attention Networks for Document Classification"
    by using a context vector to assist the attention
    # Input shape
        3D tensor with shape: `(samples, steps, features)`.
    # Output shape
        2D tensor with shape: `(samples, features)`.

    # Arguments
        W_regularizer, u_regularizer, b_regularizer: regularizers (or their
            identifiers/configs) for the projection matrix `W`, the context
            vector `u` and the bias `b`.
        W_constraint, u_constraint, b_constraint: constraints (or their
            identifiers/configs) for `W`, `u` and `b`.
        bias: whether to add the bias `b` before the tanh.

    How to use:
    Just put it on top of an RNN Layer (GRU/LSTM/SimpleRNN) with return_sequences=True.
    The dimensions are inferred based on the output shape of the RNN.

    Note: The layer has been tested with Keras 2.0.6

    Example:
        model.add(LSTM(64, return_sequences=True))
        model.add(AttentionWithContext())
        # next add a Dense layer (for classification/regression) or whatever...
    """

    def __init__(self,
                 W_regularizer=None, u_regularizer=None, b_regularizer=None,
                 W_constraint=None, u_constraint=None, b_constraint=None,
                 bias=True, **kwargs):
        super(AttentionWithContext, self).__init__(**kwargs)

        # Must be set AFTER the base constructor: the Keras 2 `Layer.__init__`
        # assigns `supports_masking = False`, which silently undid the flag
        # when it was set before the super() call.
        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.u_regularizer = regularizers.get(u_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.u_constraint = constraints.get(u_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias

    def build(self, input_shape):
        """Create `W` (features x features), optional `b` (features,) and `u` (features,)."""
        assert len(input_shape) == 3
        feature_dim = input_shape[-1]

        # `shape` is passed by keyword: the positional form is the legacy
        # Keras 1 calling convention for `add_weight`.
        self.W = self.add_weight(shape=(feature_dim, feature_dim),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        if self.bias:
            self.b = self.add_weight(shape=(feature_dim,),
                                     initializer='zero',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)

        self.u = self.add_weight(shape=(feature_dim,),
                                 initializer=self.init,
                                 name='{}_u'.format(self.name),
                                 regularizer=self.u_regularizer,
                                 constraint=self.u_constraint)

        super(AttentionWithContext, self).build(input_shape)

    def compute_mask(self, input, input_mask=None):
        # do not pass the mask to the next layers
        return None

    def call(self, x, mask=None):
        """Return the attention-weighted sum of `x` over axis 1 (the steps axis)."""
        # Hidden representation of every step: tanh(x . W + b)
        uit = dot_product(x, self.W)

        if self.bias:
            uit += self.b

        uit = K.tanh(uit)
        # Unnormalised score of each step against the context vector u.
        ait = dot_product(uit, self.u)

        a = K.exp(ait)

        # apply mask after the exp. will be re-normalized next
        if mask is not None:
            # Cast the mask to floatX to avoid float64 upcasting in theano
            a *= K.cast(mask, K.floatx())

        # in some cases especially in the early stages of training the sum may be almost zero
        # and this results in NaN's. A workaround is to add a very small positive number ε to the sum.
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        # Give the weights a trailing axis so they multiply against x elementwise.
        a = K.expand_dims(a)
        weighted_input = x * a
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        return input_shape[0], input_shape[-1]

    def get_config(self):
        """Serialise the constructor arguments so a saved model can rebuild this layer."""
        config = {
            'W_regularizer': regularizers.serialize(self.W_regularizer),
            'u_regularizer': regularizers.serialize(self.u_regularizer),
            'b_regularizer': regularizers.serialize(self.b_regularizer),
            'W_constraint': constraints.serialize(self.W_constraint),
            'u_constraint': constraints.serialize(self.u_constraint),
            'b_constraint': constraints.serialize(self.b_constraint),
            'bias': self.bias,
        }
        base_config = super(AttentionWithContext, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))


In [3]:
import re
# NOTE: despite the name, `\1{3,}` requires the character to repeat at least
# three more times, so only runs of FOUR or more identical characters match.
more_than_2_sequential_characters = re.compile(r'(.)\1{3,}', flags=re.IGNORECASE)


def preprocess(x):
    """Normalise a pandas Series of comments and return the values as an array.

    Steps, applied to every element in this order:
      1. missing values become the literal string "fillna";
      2. the text is lower-cased;
      3. every run matched by `more_than_2_sequential_characters` is collapsed
         to two copies of the repeated character;
      4. the censored spellings "f*ck", "b*tch" and "c*nt" are replaced by
         their uncensored forms.
    """
    uncensor_pairs = (
        ("f*ck", "fuck"),
        ("b*tch", "bitch"),
        ("c*nt", "cunt"),
    )

    def normalise(string):
        cleaned = more_than_2_sequential_characters.sub(r'\1\1', string.lower())
        for censored, plain in uncensor_pairs:
            cleaned = cleaned.replace(censored, plain)
        return cleaned

    without_missing = x.fillna("fillna")
    return without_missing.map(normalise).values

In [4]:
# train_ling = pd.read_csv("../data/" + "preprocessed/train_ling.csv")

# Path to the pretrained word vectors; read later when the embedding matrix is built.
EMBEDDING_FILE = '../../../embeddings/glove.42B.300d.txt'

# The raw CSVs are disabled in favour of the preprocessed ones, which carry the
# `clean_text` column used below.
# train = pd.read_csv('../data/train.csv')
# test = pd.read_csv('../data/test.csv')
train = pd.read_csv('../data/preprocessed/train.csv')
test = pd.read_csv('../data/preprocessed/test.csv')
# Submission template; its six label columns are overwritten with predictions later.
submission = pd.read_csv('../submissions/sample_submission.csv')

# Model inputs: the `clean_text` column run through `preprocess`.
X_train = preprocess(train["clean_text"])
# X_train_nostopwords = train["no_stopwords"].apply(" ".join).values
# Targets: one column per label, in this fixed order.
y_train = train[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]].values
X_test = preprocess(test["clean_text"])
# X_test_nostopwords = test["no_stopwords"].apply(" ".join).values

# Disabled experiment: hand-crafted meta features from the `train_ling` frame.
# meta_features = ['count_sent', 'count_word', 'count_unique_word', 'count_letters',
#        'count_punctuations', 'count_words_upper', 'count_words_title',
#        'count_stopwords', 'mean_word_len', 'word_unique_percent',
#        'punct_percent', 'count_swear_words']

# X_meta_features = train_ling[meta_features]

def build_input_data(sentences, labels, vocabulary):
    """Map tokenised sentences to vocabulary indices and pair them with labels.

    The previous body could not run for any in-vocabulary word: it indexed
    `vocabulary` with the string literal 'word' instead of the loop variable,
    went through an undefined `vocabulary_inv`, and mixed embedding vectors
    with an integer fallback in the same array. Every token now maps to an
    integer index, consistent with the existing out-of-vocabulary fallback.

    Args:
        sentences: iterable of sentences, each an iterable of word tokens.
        labels: labels aligned with `sentences`.
        vocabulary: dict mapping word -> integer index.

    Returns:
        `[x, y]` where `x` is the array of per-sentence index lists and `y`
        is `labels` as an array. Words missing from `vocabulary` map to
        `len(vocabulary) - 1`.
    """
    unknown_index = len(vocabulary) - 1
    x = np.array([[vocabulary.get(word, unknown_index) for word in sentence]
                  for sentence in sentences])
    y = np.array(labels)
    return [x, y]

In [5]:
# Preview the first rows of the preprocessed training frame.
train.head()

Unnamed: 0.1,Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,clean_text,nlp,tokens,lemmata,no_stopwords
0,0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,explanation why the edits made under my userna...,Explanation\nWhy the edits made under my usern...,"['Explanation', '\n', 'Why', 'the', 'edits', '...","['explanation', '\n', 'why', 'the', 'edit', 'm...","['Explanation', '\n', 'Why', 'edits', 'made', ..."
1,1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,d'aww! he matches this background colour i'm s...,D'aww! He matches this background colour I'm s...,"[""D'aww"", '!', 'He', 'matches', 'this', 'backg...","[""d'aww"", '!', '-PRON-', 'match', 'this', 'bac...","[""D'aww"", '!', 'He', 'matches', 'background', ..."
2,2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,"hey man, i'm really not trying to edit war. it...","Hey man, I'm really not trying to edit war. It...","['Hey', 'man', ',', 'I', ""'m"", 'really', 'not'...","['hey', 'man', ',', '-PRON-', 'be', 'really', ...","['Hey', 'man', ',', 'I', ""'m"", 'really', 'tryi..."
3,3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,""" more i can't make any real suggestions on im...","""\nMore\nI can't make any real suggestions on ...","['""', '\n', 'More', '\n', 'I', 'ca', ""n't"", 'm...","['""', '\n', 'more', '\n', '-PRON-', 'can', 'no...","['""', '\n', 'More', '\n', 'I', 'ca', ""n't"", 'm..."
4,4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,"you, sir, are my hero. any chance you remember...","You, sir, are my hero. Any chance you remember...","['You', ',', 'sir', ',', 'are', 'my', 'hero', ...","['-PRON-', ',', 'sir', ',', 'be', '-PRON-', 'h...","['You', ',', 'sir', ',', 'hero', '.', 'Any', '..."


In [6]:

# Vocabulary cap handed to the tokenizer (`num_words`) and reused later as the
# embedding layer's input dimension.
max_features = 30000
# max_features = 5159 #only this many with stopwords
# Fixed sequence length passed to `pad_sequences` and used as the model input length.
maxlen = 100
# Embedding width; presumably chosen to match the 300d vectors in EMBEDDING_FILE.
embed_size = 300

tokenizer = text.Tokenizer(num_words=max_features)
# The vocabulary is fitted on train AND test text together.
tokenizer.fit_on_texts(list(X_train) + list(X_test))
X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_test_sequences = tokenizer.texts_to_sequences(X_test)
# Bring every sequence to exactly `maxlen` entries.
x_train = sequence.pad_sequences(X_train_sequences, maxlen=maxlen)
x_test = sequence.pad_sequences(X_test_sequences, maxlen=maxlen)



In [7]:

def get_coefs(word, *arr):
    """Turn the fields of one embedding line into ``(word, float32 vector)``."""
    return word, np.asarray(arr, dtype='float32')


# Parse the pretrained vectors: one whitespace-separated "word v1 v2 ..." record
# per line. The context manager closes the file handle (the previous bare
# `open` leaked it) and the explicit encoding keeps parsing independent of the
# platform's default locale.
with open(EMBEDDING_FILE, encoding='utf-8') as embedding_file:
    embeddings_index = dict(get_coefs(*line.strip().split()) for line in embedding_file)

# Materialise the dict view first: np.stack expects a sequence of arrays and
# newer NumPy releases reject other iterables.
all_embs = np.stack(list(embeddings_index.values()))
emb_mean, emb_std = all_embs.mean(), all_embs.std()

word_index = tokenizer.word_index
nb_words = min(max_features, len(word_index))
# Start from random rows drawn with the pretrained vectors' overall mean/std,
# so words without a pretrained vector still get similarly scaled values.
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    # Bound by the matrix's real row count. Comparing against `max_features`
    # raised IndexError whenever the vocabulary was smaller than `max_features`
    # (then nb_words == len(word_index) and the largest index equals nb_words).
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector


In [8]:


class RocAucEvaluation(Callback):
    """Keras callback that prints the ROC-AUC on a held-out set after epochs.

    Args:
        validation_data: `(X_val, y_val)` pair to score against.
        interval: score whenever the zero-based epoch index is a multiple of
            this value.
    """

    def __init__(self, validation_data=(), interval=1):
        # Initialise the actual parent class. `super(Callback, self)` started
        # the lookup *after* Callback and therefore skipped Callback.__init__.
        super(RocAucEvaluation, self).__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs=None):
        # `logs` is unused here; None replaces the shared mutable `{}` default.
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            print("\n ROC-AUC - epoch: %d - score: %.6f \n" % (epoch+1, score))


In [11]:
def test_model():
    """Build and compile the experimental classifier.

    Architecture: trainable embedding initialised from `embedding_matrix`
    -> bidirectional CuDNNLSTM(80) returning sequences -> Flatten
    -> Dense(6, sigmoid). Compiled with binary cross-entropy and Adam.

    Earlier variants tried at this point (attention, Conv1D + max pooling,
    a second BiLSTM, average/max pooling heads, an extra Dense layer) were
    commented out; only the plain Flatten head is active.
    """
    token_ids = Input(shape=(maxlen, ))
    embedded = Embedding(max_features, embed_size, weights=[embedding_matrix], trainable=True)(token_ids)
    recurrent_states = Bidirectional(CuDNNLSTM(80, return_sequences=True))(embedded)
    flattened = Flatten()(recurrent_states)
    probabilities = Dense(units=6, activation='sigmoid')(flattened)

    classifier = Model(inputs=token_ids, outputs=probabilities)
    classifier.compile(optimizer='adam',
                       loss='binary_crossentropy',
                       metrics=['accuracy'])
    return classifier


def ensemble(model_f, num_ensembles, input_length):
    """Create `num_ensembles` models and an averaging model on top of them.

    Args:
        model_f: zero-argument factory returning a model.
        num_ensembles: how many members to create.
        input_length: length of the shared input fed to every member.

    Returns:
        `(models, combined)`: the list of member models and a compiled model
        whose output is the average of the members' outputs on one shared input.
    """
    models = []
    for _ in range(num_ensembles):
        models.append(model_f())

    ensemble_input = Input(shape=(input_length,))
    member_outputs = [member(ensemble_input) for member in models]
    averaged = average(member_outputs)

    combined = Model(inputs=[ensemble_input], outputs=[averaged])
    combined.compile(optimizer='adam',
                     loss='binary_crossentropy',
                     metrics=['acc'])
    return models, combined


def simple_lstm_model():
    """Build and compile the BiLSTM classifier with pooled outputs.

    Architecture: trainable embedding initialised from `embedding_matrix`
    -> bidirectional CuDNNLSTM(80) returning sequences -> global average
    pooling and global max pooling, concatenated -> Dense(6, sigmoid).
    Compiled with binary cross-entropy and Adam.
    """
    token_ids = Input(shape=(maxlen, ))
    embedded = Embedding(max_features, embed_size, weights=[embedding_matrix], trainable=True)(token_ids)
    recurrent_states = Bidirectional(CuDNNLSTM(80, return_sequences=True))(embedded)

    # Summarise the step axis two ways and join the summaries.
    pooled = concatenate([
        GlobalAveragePooling1D()(recurrent_states),
        GlobalMaxPooling1D()(recurrent_states),
    ])
    probabilities = Dense(units=6, activation='sigmoid')(pooled)

    classifier = Model(inputs=token_ids, outputs=probabilities)
    classifier.compile(optimizer='adam',
                       loss='binary_crossentropy',
                       metrics=['accuracy'])
    return classifier

def get_model():
    """Build and compile the BiGRU classifier with pooled outputs.

    Architecture: trainable embedding initialised from `embedding_matrix`
    -> SpatialDropout1D(0.2) -> bidirectional CuDNNGRU(80) returning sequences
    -> global average pooling and global max pooling, concatenated
    -> Dense(6, sigmoid). Compiled with binary cross-entropy and Adam.
    """
    token_ids = Input(shape=(maxlen, ))

    hidden = Embedding(max_features, embed_size, weights=[embedding_matrix], trainable=True)(token_ids)
    hidden = SpatialDropout1D(0.2)(hidden)
    hidden = Bidirectional(CuDNNGRU(80, return_sequences=True))(hidden)

    mean_over_steps = GlobalAveragePooling1D()(hidden)
    max_over_steps = GlobalMaxPooling1D()(hidden)
    pooled = concatenate([mean_over_steps, max_over_steps])

    probabilities = Dense(6, activation="sigmoid")(pooled)

    classifier = Model(inputs=token_ids, outputs=probabilities)
    classifier.compile(optimizer='adam',
                       loss='binary_crossentropy',
                       metrics=['accuracy'])
    return classifier

# --- Run configuration -------------------------------------------------------
model_func = simple_lstm_model          # factory used for the single model or every ensemble member
ensemble_it = True                      # train an averaged ensemble instead of one model
num_ensembles = 7                       # number of ensemble members
train_ensemble_jointly_after = False    # additionally fit the averaged model end-to-end
epochs = 2
batch_size = 16

# Hold out 5% of the training rows for validation.
X_tra, X_val, y_tra, y_val = train_test_split(x_train, y_train, train_size=0.95, random_state=233)

if ensemble_it:
    models, ensemble_model = ensemble(model_func, num_ensembles, maxlen)
    # Each member is trained on its own; `ensemble_model` wraps the same member
    # objects, so it can be scored once the loop has finished.
    for model in models:

        RocAuc = RocAucEvaluation(validation_data=(X_val, y_val), interval=1)

        hist = model.fit(X_tra, y_tra, batch_size=batch_size, epochs=epochs, validation_data=(X_val, y_val),
                         callbacks=[RocAuc], verbose=1)
    y_val_pred = ensemble_model.predict(X_val, batch_size=2, verbose=1)
    score = roc_auc_score(y_val, y_val_pred)
    print(score)

else:
    model = model_func()
    RocAuc = RocAucEvaluation(validation_data=(X_val, y_val), interval=1)
    hist = model.fit(X_tra, y_tra, batch_size=batch_size, epochs=epochs, validation_data=(X_val, y_val),
                     callbacks=[RocAuc], verbose=1)
    y_val_pred = model.predict(X_val, batch_size=2, verbose=1)
    score = roc_auc_score(y_val, y_val_pred)
    print(score)

if ensemble_it and train_ensemble_jointly_after:
    epochs = 1
    RocAuc = RocAucEvaluation(validation_data=(X_val, y_val), interval=1)
    hist = ensemble_model.fit(X_tra, y_tra, batch_size=batch_size, epochs=epochs, validation_data=(X_val, y_val),
                              callbacks=[RocAuc], verbose=1)
    # Score the model that was just fitted. This used to call `model.predict`,
    # where `model` is only the last individual member left over from the loop
    # above, so the printed AUC did not describe the jointly trained ensemble.
    y_val_pred = ensemble_model.predict(X_val, batch_size=1, verbose=1)
    score = roc_auc_score(y_val, y_val_pred)
    print(score)

Train on 151592 samples, validate on 7979 samples
Epoch 1/2

 ROC-AUC - epoch: 1 - score: 0.986411 

Epoch 2/2

 ROC-AUC - epoch: 2 - score: 0.986454 

Train on 151592 samples, validate on 7979 samples
Epoch 1/2

 ROC-AUC - epoch: 1 - score: 0.986523 

Epoch 2/2

 ROC-AUC - epoch: 2 - score: 0.986579 

Train on 151592 samples, validate on 7979 samples
Epoch 1/2

 ROC-AUC - epoch: 1 - score: 0.985863 

Epoch 2/2

 ROC-AUC - epoch: 2 - score: 0.986642 

Train on 151592 samples, validate on 7979 samples
Epoch 1/2

 ROC-AUC - epoch: 1 - score: 0.985982 

Epoch 2/2

 ROC-AUC - epoch: 2 - score: 0.986489 

Train on 151592 samples, validate on 7979 samples
Epoch 1/2

 ROC-AUC - epoch: 1 - score: 0.985323 

Epoch 2/2

 ROC-AUC - epoch: 2 - score: 0.986011 

Train on 151592 samples, validate on 7979 samples
Epoch 1/2

 ROC-AUC - epoch: 1 - score: 0.985694 

Epoch 2/2

 ROC-AUC - epoch: 2 - score: 0.986792 

Train on 151592 samples, validate on 7979 samples
Epoch 1/2

 ROC-AUC - epoch: 1 - score

In [None]:
# Fit the averaged ensemble end-to-end for one extra epoch, then report its
# validation ROC-AUC.
epochs = 1
RocAuc = RocAucEvaluation(validation_data=(X_val, y_val), interval=1)
# `verbose=1` was passed twice here, which is a SyntaxError (repeated keyword
# argument) and made the whole cell unrunnable.
hist = ensemble_model.fit(X_tra, y_tra, batch_size=2, epochs=epochs, validation_data=(X_val, y_val),
                          callbacks=[RocAuc], verbose=1)
# Score the ensemble that was just fitted; `model` is only the last individual
# member left over from the training loop.
y_val_pred = ensemble_model.predict(X_val, batch_size=1, verbose=1)
score = roc_auc_score(y_val, y_val_pred)
print(score)

In [13]:
# Score the averaged ensemble on the held-out validation split and print its ROC-AUC.
y_val_pred = ensemble_model.predict(X_val, batch_size=4, verbose=1)
score = roc_auc_score(y_val, y_val_pred)
print(score)

0.9875603046662279


In [None]:
# Predict the six label probabilities for the test set with the averaged ensemble.
y_pred = ensemble_model.predict(x_test, batch_size=4, verbose=1)
# Overwrite the template's label columns; the column order matches the one used for y_train.
submission[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]] = y_pred
# NOTE(review): the filename advertises "98799valauc", but the evaluation cell
# above printed 0.98756 -- confirm which run this file belongs to.
submission.to_csv('clean_text_lstm_7_ensemble_98799valauc.csv', index=False)

In [None]:
# Sanity check: print the shapes of the test inputs, the test predictions and the validation labels.
print(x_test.shape, y_pred.shape, y_val.shape)