# Importing Libraries

In [15]:
import pandas as pd
import re
import numpy as np
from keras.preprocessing import sequence
from keras.regularizers import l2
from keras.models import Model
from keras.layers.merge import concatenate
from keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard
from keras.layers import Lambda, SpatialDropout1D, Dense, Flatten, GlobalMaxPooling1D, Activation, Dropout, GaussianNoise
from keras.layers import GlobalAveragePooling1D, Embedding, Input, BatchNormalization, SpatialDropout1D, Conv1D, CuDNNGRU, GRU, Bidirectional
from keras.optimizers import Adam
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from IPython.display import display
import itertools
from nltk.corpus import words
%matplotlib inline
import matplotlib.pyplot as plt

# Loading Cleaned Data

In [2]:
# Load data
# Label columns, in the order used for the target matrix.
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

train = pd.read_csv('../data/cleaned_train.csv')
test = pd.read_csv('../data/cleaned_test.csv')

# Raw comment strings for both splits; missing comments become the "_NaN_" placeholder.
list_sentences_train, list_sentences_test = (
    frame["comment_text"].fillna("_NaN_").values for frame in (train, test)
)

# Target matrix: one column per entry of list_classes.
y = train[list_classes].values

In [3]:
# Set parameters
# Dimensionality of each word vector.
embed_size = 300
# Number of unique words to keep (i.e. number of rows in the embedding matrix).
max_features = 100000
# Maximum number of words of a comment to use.
maxlen = 400

In [4]:
# Convert sentences to integer sequences and pad them to a fixed length
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(list_sentences_train))  # vocabulary is built from the training split only

# Map every comment of both splits to a list of word indices.
list_tokenized_train, list_tokenized_test = (
    tokenizer.texts_to_sequences(texts)
    for texts in (list_sentences_train, list_sentences_test)
)

# Same padding settings for both splits: fixed length `maxlen`, zeros appended at the end.
pad_kwargs = dict(maxlen=maxlen, padding='post')
X_train = pad_sequences(list_tokenized_train, **pad_kwargs)
X_test = pad_sequences(list_tokenized_test, **pad_kwargs)

In [30]:
# Sanity check: the second dimension of the padded training matrix, which is
# fixed by the `maxlen` argument passed to pad_sequences.
X_train.shape[1]

400

In [5]:
# Read the pre-trained GloVe vectors into a {word: vector} dictionary.
# Each line holds a token followed by `embed_size` space-separated floats.
embeddings_index = {}
# The context manager closes the file even if parsing fails part-way through;
# the explicit encoding avoids depending on the platform's default codec.
with open('../data/glove/glove.840B.300d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if len(values) <= embed_size:
            continue  # blank or truncated line: no token and/or incomplete vector
        # The coefficients are always the last `embed_size` fields; whatever
        # precedes them is the token, which may itself have been split on
        # whitespace, so its pieces are joined back together.
        word = ''.join(values[:-embed_size])
        coefs = np.asarray(values[-embed_size:], dtype='float32')
        embeddings_index[word] = coefs

In [6]:
# Create embeddings matrix
# Gather all pre-trained vectors to estimate their overall mean / std.
# The dict view is materialised as a list because np.stack requires a real
# sequence of arrays (newer NumPy versions reject views and generators).
all_embs = np.stack(list(embeddings_index.values()))
emb_mean, emb_std = all_embs.mean(), all_embs.std()

# Create embedding matrix using our vocabulary
word_index = tokenizer.word_index
# Tokenizer word indices start at 1 (row 0 is left for padding), so a
# vocabulary of N words needs N + 1 rows; the matrix is capped at max_features.
nb_words = min(max_features, len(word_index) + 1)

# Initialize embedding matrix with random vectors that follow the GloVe
# statistics, so words without a pre-trained vector still get a plausible row.
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))

# Loop through each word and copy its pre-trained vector into its row
for word, i in word_index.items():
    if i >= nb_words:
        continue  # index falls outside the matrix (word is beyond the vocabulary cap)
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [7]:
from keras.engine import InputSpec, Layer
import tensorflow as tf
def auc_roc(y_true, y_pred):
    """Streaming ROC-AUC metric usable as a Keras ``metrics=[...]`` entry.

    Args:
        y_true: tensor of ground-truth labels.
        y_pred: tensor of predicted scores.

    Returns:
        A tensor holding the AUC value; evaluating it also runs the metric's
        update op, so the running estimate advances on every evaluation.
    """
    # tf.metrics.auc replaces the deprecated tf.contrib.metrics.streaming_auc;
    # note that it takes (labels, predictions), i.e. the reverse argument order.
    value, update_op = tf.metrics.auc(y_true, y_pred)

    # Find all local variables created for this metric: those whose second
    # name-scope component contains 'auc_roc'. Variables whose name has no
    # scope component are skipped instead of raising IndexError.
    metric_vars = []
    for var in tf.local_variables():
        scopes = var.name.split('/')
        if len(scopes) > 1 and 'auc_roc' in scopes[1]:
            metric_vars.append(var)

    # Add metric variables to GLOBAL_VARIABLES collection.
    # They will be initialized for new session.
    for v in metric_vars:
        tf.add_to_collection(tf.GraphKeys.GLOBAL_VARIABLES, v)

    # Force the update op to run before the value is read.
    with tf.control_dependencies([update_op]):
        value = tf.identity(value)
        return value

In [13]:
from keras import initializers
from keras import backend as K
class AttentionWeightedAverage(Layer):
    """
    Computes a weighted average of the different channels across timesteps.
    Uses 1 parameter pr. channel to compute the attention value for a single timestep.

    Input is a 3-D tensor; the output is the attention-weighted sum over
    axis 1. When ``return_attention`` is True the layer returns
    ``[result, att_weights]`` instead of just ``result``.
    """

    def __init__(self, return_attention=False, **kwargs):
        """Create the layer.

        Args:
            return_attention: if True, ``call`` also returns the attention weights.
            **kwargs: forwarded to the base ``Layer`` constructor.
        """
        # Initialise the base Layer first so that the attributes assigned
        # below cannot be reset by the base constructor.
        super(AttentionWeightedAverage, self).__init__(**kwargs)
        self.init = initializers.get('uniform')
        self.supports_masking = True
        self.return_attention = return_attention

    def build(self, input_shape):
        """Create the attention weight ``W`` of shape ``(input_shape[2], 1)``."""
        self.input_spec = [InputSpec(ndim=3)]
        assert len(input_shape) == 3

        # add_weight registers W with the layer, so no manual assignment to
        # `trainable_weights` is needed (that attribute is a property and is
        # not assignable in every Keras version).
        self.W = self.add_weight(shape=(input_shape[2], 1),
                                 name='{}_W'.format(self.name),
                                 initializer=self.init)
        super(AttentionWeightedAverage, self).build(input_shape)

    def call(self, x, mask=None):
        """Return the attention-weighted sum of ``x`` over axis 1.

        Args:
            x: 3-D input tensor.
            mask: optional mask; where it is zero the timestep gets zero weight.

        Returns:
            ``result``, or ``[result, att_weights]`` when ``return_attention``
            is set.
        """
        # computes a probability distribution over the timesteps
        # uses 'max trick' for numerical stability
        # reshape is done to avoid issue with Tensorflow
        # and 1-dimensional weights
        logits = K.dot(x, self.W)
        x_shape = K.shape(x)
        logits = K.reshape(logits, (x_shape[0], x_shape[1]))
        ai = K.exp(logits - K.max(logits, axis=-1, keepdims=True))

        # masked timesteps have zero weight
        if mask is not None:
            mask = K.cast(mask, K.floatx())
            ai = ai * mask
        # epsilon guards against division by zero when every weight is zero
        att_weights = ai / (K.sum(ai, axis=1, keepdims=True) + K.epsilon())
        weighted_input = x * K.expand_dims(att_weights)
        result = K.sum(weighted_input, axis=1)
        if self.return_attention:
            return [result, att_weights]
        return result

    def get_output_shape_for(self, input_shape):
        """Alias that delegates to ``compute_output_shape``."""
        return self.compute_output_shape(input_shape)

    def compute_output_shape(self, input_shape):
        """Return ``(batch, channels)``, plus ``(batch, timesteps)`` for the
        attention weights when ``return_attention`` is set."""
        output_len = input_shape[2]
        if self.return_attention:
            return [(input_shape[0], output_len), (input_shape[0], input_shape[1])]
        return (input_shape[0], output_len)

    def compute_mask(self, input, input_mask=None):
        """Return no mask: ``None``, or a list of ``None`` matching a list-valued input mask."""
        if isinstance(input_mask, list):
            return [None] * len(input_mask)
        else:
            return None

    def get_config(self):
        """Return the layer configuration, including ``return_attention``,
        so the constructor argument is preserved when the layer is serialised."""
        config = super(AttentionWeightedAverage, self).get_config()
        config['return_attention'] = self.return_attention
        return config

# Training

In [25]:
# Two stacked bidirectional GRUs over frozen pre-trained embeddings, summarised
# by four pooled "views" and classified by a small dense head.
recurrent_units = 60
inp = Input(shape=(X_train.shape[1],), dtype='int32')
# The embedding's input dimension is taken from the weight matrix itself so the
# layer shape always matches the weights it is initialised with.
emb = Embedding(embedding_matrix.shape[0], embed_size, weights=[embedding_matrix],
                input_length=maxlen, trainable=False)(inp)
emb = SpatialDropout1D(0.2)(emb)
# Both recurrent layers must return full sequences: the second GRU and the
# axis-2 concatenate below both need 3-D (batch, timesteps, channels) input.
rnn_1 = Bidirectional(GRU(recurrent_units, return_sequences=True))(emb)
rnn_2 = Bidirectional(GRU(recurrent_units, return_sequences=True))(rnn_1)
x = concatenate([rnn_1, rnn_2], axis=2)

# Four summaries of the sequence: last timestep, max-pool, attention, mean-pool.
# NOTE(review): sequences are post-padded, so the last timestep is padding for
# short comments -- confirm this is intended.
last = Lambda(lambda t: t[:, -1], name='last')(x)
maxpool = GlobalMaxPooling1D()(x)
attn = AttentionWeightedAverage()(x)
average = GlobalAveragePooling1D()(x)
all_views = concatenate([last, maxpool, average, attn], axis=1)
x = Dropout(0.5)(all_views)
x = Dense(144, activation="relu")(x)
# The output layer is fed from the dense head `x`, not from `rnn_1`; wiring it
# to `rnn_1` would bypass the pooling, attention and dropout layers above.
out = Dense(y.shape[1], activation="sigmoid")(x)

In [26]:
# Compile
# Adam optimizer with a small per-update learning-rate decay.
adam = Adam(lr=1e-3, decay=1e-7)

# Assemble the graph into a trainable model; each label is an independent
# sigmoid output, hence binary cross-entropy.
model = Model(inputs=inp, outputs=out)
model.compile(
    loss='binary_crossentropy',
    optimizer=adam,
    metrics=['accuracy'],
)

In [27]:
# Set callback functions to early stop training and save the best model so far
import os

# ModelCheckpoint writes into ./models; create the directory up front so the
# first checkpoint does not fail after a full epoch of training.
os.makedirs('./models', exist_ok=True)

# NOTE(review): patience=10 exceeds the 5 epochs requested in the training
# cell, so early stopping cannot trigger with the current settings.
callbacks = [EarlyStopping(monitor='val_loss', patience=10),
             ModelCheckpoint(filepath='./models/best_RNN_model.h5', monitor='val_loss', save_best_only=True)]

In [28]:
# Train neural network
fit_options = dict(
    epochs=5,              # number of passes over the training data
    batch_size=256,        # observations per gradient update
    validation_split=0.1,  # fraction of the training data held out for validation
    shuffle=True,          # reshuffle the training samples every epoch
    callbacks=callbacks,   # early stopping + best-model checkpointing
    verbose=1,             # show progress for each epoch
)
history = model.fit(X_train, y, **fit_options)

Train on 143613 samples, validate on 15958 samples
Epoch 1/5

KeyboardInterrupt: 

In [None]:
from keras.models import load_model
# The saved architecture may contain the custom AttentionWeightedAverage layer,
# which Keras can only rebuild if the class is supplied via `custom_objects`.
model = load_model('./models/best_RNN_model.h5',
                   custom_objects={'AttentionWeightedAverage': AttentionWeightedAverage})

In [None]:
# Predict
# One row of per-class sigmoid scores for every padded test comment.
preds = model.predict(x=X_test)

In [None]:
# Create submission
# Predicted scores, one column per label in list_classes.
pred_frame = pd.DataFrame(preds, columns=list_classes)
# Comment ids from the test set form the first column.
submid = pd.DataFrame({'id': test["id"]})
# Place ids and scores side by side and write them out without the index.
submission = pd.concat([submid, pred_frame], axis=1)
submission.to_csv('rnn_glove_simple_sub.csv', index=False)