In [1]:
import numpy as np
import pandas as pd
from keras import backend as K
from keras import initializers, regularizers, constraints
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.engine.topology import Layer
from keras.layers import Dense, Input, Flatten
from keras.layers import GlobalMaxPool1D, Bidirectional, Convolution1D, Embedding, BatchNormalization,MaxPooling1D, Dropout, LSTM
from keras.layers.merge import Concatenate
from keras.models import Model
from keras.utils.vis_utils import plot_model

Using TensorFlow backend.


In [2]:
# Directory layout, relative to the notebook's working directory.
# Each value keeps its trailing slash because file names are appended with
# plain string concatenation (e.g. CACHE_PATH + 'data.npz').
# Read from: sample.csv
INPUT_PATH = '../input/'
# Read from: data.npz, embedding_matrix.npy; written to: best-model checkpoint
CACHE_PATH = '../cache/'
# Written to: attention.csv (the submission file)
OUTPUT_PATH ='../output/'

In [4]:
# Load the cached arrays. np.load on an .npz archive returns a lazy NpzFile that
# holds an open file handle, so it is read inside a `with` block; indexing a key
# materialises that array in memory, so the arrays stay usable after the
# archive is closed.
with np.load(CACHE_PATH + 'data.npz') as data:
    X_train = data['X_train']
    y_train = data['y_train']
    X_val = data['X_val']
    y_val = data['y_val']
    X_test = data['X_test']

# Embedding weights, later passed to the Embedding layer via `weights=[...]`.
embedding_matrix = np.load(CACHE_PATH + 'embedding_matrix.npy')

# Cheap sanity checks: every feature row needs exactly one target.
assert len(X_train) == len(y_train), "X_train / y_train length mismatch"
assert len(X_val) == len(y_val), "X_val / y_val length mismatch"

In [5]:
# Model-size constants shared by the cells below.
# Passed as the first argument (vocabulary size) of the Embedding layer.
MAX_FEATURES = 20000
# Length of the model's Input and the step_dim given to the Attention layer.
MAX_SEQUENCE_LENGTH = 100
# Passed as the second argument (vector size) of the Embedding layer.
EMBEDDING_DIM = 256

In [6]:
# Frozen embedding lookup initialised from the cached embedding matrix.
# trainable=False keeps the loaded vectors fixed during training.
embedding_layer = Embedding(
    input_dim=MAX_FEATURES,
    output_dim=EMBEDDING_DIM,
    input_length=MAX_SEQUENCE_LENGTH,
    weights=[embedding_matrix],
    trainable=False,
)

In [8]:
class Attention(Layer):
    """Keras layer that implements an attention mechanism for temporal data.

    Follows the work of Raffel et al. [https://arxiv.org/abs/1512.08756].
    Every timestep receives a scalar score ``tanh(x_t . W + b_t)``; the scores
    are exponentiated, normalised over the time axis and used to compute a
    weighted sum of the timesteps. Supports masking (the mask is consumed here
    and not forwarded to later layers).

    # Input shape
        3D tensor with shape: `(samples, steps, features)`.
    # Output shape
        2D tensor with shape: `(samples, features)`.

    Put it on top of an RNN layer (GRU/LSTM/SimpleRNN) created with
    return_sequences=True. The feature dimension is inferred from the input
    shape in `build`; the number of steps must be given explicitly.

    Example:
        model.add(LSTM(64, return_sequences=True))
        model.add(Attention(step_dim))  # step_dim == number of timesteps
    """

    def __init__(self, step_dim,
                 W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True, **kwargs):
        """Configure the layer.

        :param step_dim: number of timesteps of the input; used to reshape
            the per-step scores in `call`.
        :param W_regularizer: regularizer for the scoring vector W.
        :param b_regularizer: regularizer for the per-step bias b.
        :param W_constraint: constraint for the scoring vector W.
        :param b_constraint: constraint for the per-step bias b.
        :param bias: whether to add a per-step bias to the scores.
        :param kwargs: forwarded to the base `Layer` initializer.
        """
        # Initialise the base layer first so that its defaults cannot
        # overwrite the attributes assigned below (notably supports_masking).
        super(Attention, self).__init__(**kwargs)

        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        self.step_dim = step_dim
        self.features_dim = 0  # real value is set in build()

    def build(self, input_shape):
        """Create the scoring vector W (features,) and optional bias b (steps,)."""
        assert len(input_shape) == 3

        # All arguments are passed by keyword so the call does not depend on
        # the positional order of add_weight's parameters.
        self.W = self.add_weight(shape=(input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        self.features_dim = input_shape[-1]

        if self.bias:
            self.b = self.add_weight(shape=(input_shape[1],),
                                     initializer='zero',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None

        self.built = True

    def compute_mask(self, input, input_mask=None):
        """Return None: the mask is not passed on to the next layers."""
        return None

    def call(self, x, mask=None):
        """Collapse the time axis of `x` into an attention-weighted sum.

        :param x: 3D tensor `(samples, steps, features)`.
        :param mask: optional mask over the steps; masked steps get weight 0.
        :return: 2D tensor `(samples, features)`.
        """
        features_dim = self.features_dim
        step_dim = self.step_dim

        # A direct K.dot(x, self.W) is not supported by the TF backend, so the
        # input is flattened to (samples * steps, features), multiplied by W
        # as a column vector, and the scores reshaped back to (samples, steps).
        flat_x = K.reshape(x, (-1, features_dim))
        w_column = K.reshape(self.W, (features_dim, 1))
        eij = K.reshape(K.dot(flat_x, w_column), (-1, step_dim))

        if self.bias:
            eij += self.b

        eij = K.tanh(eij)

        a = K.exp(eij)

        # apply mask after the exp. will be re-normalized next
        if mask is not None:
            # Cast the mask to floatX to avoid float64 upcasting in theano
            a *= K.cast(mask, K.floatx())

        # in some cases especially in the early stages of training the sum may
        # be almost zero, hence the epsilon in the denominator
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        # (samples, steps) -> (samples, steps, 1) so it broadcasts over features
        a = K.expand_dims(a)
        weighted_input = x * a
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        """Return `(samples, features)` for an input of `(samples, steps, features)`."""
        # input_shape[-1] is the same value build() stores in features_dim,
        # but reading it here does not depend on build() having run.
        return input_shape[0], input_shape[-1]

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created
        when a model containing it is serialized."""
        config = {
            'step_dim': self.step_dim,
            'W_regularizer': regularizers.serialize(self.W_regularizer),
            'b_regularizer': regularizers.serialize(self.b_regularizer),
            'W_constraint': constraints.serialize(self.W_constraint),
            'b_constraint': constraints.serialize(self.b_constraint),
            'bias': self.bias,
        }
        base_config = super(Attention, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

In [7]:
def get_attention_model():
    """Build and compile the BiLSTM + attention regression network.

    Token ids go through the shared `embedding_layer`, a bidirectional LSTM
    that returns the full sequence, `Attention` pooling over the steps, and a
    dense head (ReLU -> dropout -> batch norm) ending in one linear unit.

    Returns:
        The compiled Keras `Model` (MSE loss, Adam optimizer).
    """
    token_ids = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')

    # Recurrent encoder over the embedded tokens; one output vector per step.
    recurrent = LSTM(250, dropout=0.25, recurrent_dropout=0.25, return_sequences=True)
    sequence_states = Bidirectional(recurrent)(embedding_layer(token_ids))

    # Attention collapses the step axis into a single vector per sample.
    hidden = Attention(MAX_SEQUENCE_LENGTH)(sequence_states)

    # Dense head applied in order: projection, dropout, batch normalisation.
    for head_layer in (Dense(256, activation='relu'), Dropout(0.25), BatchNormalization()):
        hidden = head_layer(hidden)

    score = Dense(1, activation='linear')(hidden)

    network = Model(inputs=token_ids, outputs=score)
    network.compile(loss='mse', optimizer='adam')
    return network

In [9]:
def train_attention_model(model):
    """Fit `model` on the training split and restore its best checkpoint.

    Training uses the notebook-level X_train/y_train and validates on
    X_val/y_val. The checkpoint callback saves whenever val_loss improves and
    early stopping ends training after 5 epochs without improvement; the best
    saved weights are loaded back before returning.

    Args:
        model: a compiled Keras model.

    Returns:
        The same model object, carrying the best checkpointed weights.
    """
    best_weights_file = CACHE_PATH + "attention_weights_best.hdf5"

    # Order matters only for readability here: checkpoint first, then early stop.
    callbacks = [
        ModelCheckpoint(best_weights_file, monitor='val_loss', verbose=1,
                        save_best_only=True, mode='min'),
        EarlyStopping(monitor="val_loss", mode="min", patience=5),
    ]

    model.fit(
        X_train, y_train,
        batch_size=128,
        epochs=100,
        validation_data=(X_val, y_val),
        callbacks=callbacks,
    )

    model.load_weights(best_weights_file)
    return model

In [10]:
# Build and compile the network (no training happens in this cell).
model = get_attention_model()

In [12]:
# Render the layer graph to an image file. plot_model is imported together with
# the other dependencies in the import cell at the top of the notebook.
# NOTE(review): the file name is spelled 'Atttention' (three t's); it is kept
# unchanged in case something else refers to that exact file.
plot_model(model, to_file='Atttention.png')

In [13]:
# Print the layer-by-layer output shapes and parameter counts.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 100)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 100, 256)          5120000   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 100, 500)          1014000   
_________________________________________________________________
attention_1 (Attention)      (None, 500)               600       
_________________________________________________________________
dense_1 (Dense)              (None, 256)               128256    
_________________________________________________________________
dropout_1 (Dropout)          (None, 256)               0         
_________________________________________________________________
batch_normalization_1 (Batch (None, 256)               1024      
__________

In [14]:
# Train with checkpointing + early stopping; the returned model carries the
# weights reloaded from the best checkpoint file.
model = train_attention_model(model)

Train on 209000 samples, validate on 11000 samples
Epoch 1/100

Epoch 00001: val_loss improved from inf to 0.64161, saving model to ../cache/attention_weights_best.hdf5
Epoch 2/100

Epoch 00002: val_loss improved from 0.64161 to 0.53503, saving model to ../cache/attention_weights_best.hdf5
Epoch 3/100

Epoch 00003: val_loss improved from 0.53503 to 0.44874, saving model to ../cache/attention_weights_best.hdf5
Epoch 4/100

Epoch 00004: val_loss improved from 0.44874 to 0.42988, saving model to ../cache/attention_weights_best.hdf5
Epoch 5/100

Epoch 00005: val_loss did not improve
Epoch 6/100

Epoch 00006: val_loss improved from 0.42988 to 0.41793, saving model to ../cache/attention_weights_best.hdf5
Epoch 7/100

Epoch 00007: val_loss improved from 0.41793 to 0.41618, saving model to ../cache/attention_weights_best.hdf5
Epoch 8/100

Epoch 00008: val_loss did not improve
Epoch 9/100

Epoch 00009: val_loss did not improve
Epoch 10/100

Epoch 00010: val_loss improved from 0.41618 to 0.41411

In [20]:
# Predict scores for the test split, then post-process them in place:
# anything below 1 is raised to 1, and anything above 4.7 is snapped to 5.
# NOTE(review): the 4.7 threshold looks like a hand-tuned heuristic — confirm.
y_test = model.predict(X_test, batch_size=128, verbose=1)
np.putmask(y_test, y_test < 1, 1)
np.putmask(y_test, y_test > 4.7, 5)



In [21]:
# Start from the header-less sample file (columns named Id / Score), replace
# its Score column with the predictions, and write the submission in the same
# header-less, index-less layout.
sub = (
    pd.read_csv(INPUT_PATH + 'sample.csv', header=None, names=['Id', 'Score'])
    .assign(Score=y_test)
)
sub.to_csv(OUTPUT_PATH + 'attention.csv', header=False, index=False)

In [22]:
# Preview only the first rows instead of rendering the whole submission frame.
sub.head(10)

Unnamed: 0,Id,Score
0,d1c0b520-43c2-3060-843f-711422be08e7,3.089528
1,fed809ea-6c05-3cb5-864f-0b10199f38cf,4.419052
2,62880c2f-19ad-367d-ba6b-285984fd2e1c,4.084131
3,7a35d3c4-f5ff-384e-b0f9-032f3be7d81b,4.546053
4,64f0cdd4-26f9-3034-b109-eac59a1f4f30,4.691096
5,8fa29e8b-49e4-3d65-8dc1-9402c61dabd9,4.645653
6,512fc844-4f4f-3cdc-93a7-2f209937420a,3.875635
7,eb68cf16-ae60-3a1e-b648-ad92cc12c9c4,4.045265
8,89e8804e-84d1-37ce-8af0-ae3cd38f7b06,4.462513
9,6b6abc03-ae74-35a1-9b4f-ec4525ab3b72,4.522406
