<a href="https://colab.research.google.com/github/gilgarad/nlp_nlu/blob/master/jupyter_colab/Attention.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Mixed code and reimplementations with additional implementation from 
# 1. Code Structure: https://machinelearningmastery.com/binary-classification-tutorial-with-the-keras-deep-learning-library/
# 2. Attention: https://github.com/philipperemy/keras-attention-mechanism
# 3. Dataset(Movie Review): https://keras.io/datasets/

In [1]:
import keras.backend.tensorflow_backend as K
from keras.models import Sequential, Model
from keras.layers import Dense, LSTM, Flatten, BatchNormalization, Activation, \
Dropout, Input, multiply, Permute, Reshape, merge, Concatenate, Lambda, \
RepeatVector, concatenate, TimeDistributed, Multiply
from keras.layers.wrappers import Bidirectional

import numpy as np
import pandas

from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

Using TensorFlow backend.


In [2]:
# fix random seed for reproducibility
seed = 20152018
np.random.seed(seed)

In [3]:
# from keras.datasets import imdb
from keras.datasets import reuters, imdb
from keras.utils import np_utils
from keras.layers import Embedding
from keras.preprocessing.sequence import pad_sequences

In [4]:
import tensorflow as tf

config = tf.ConfigProto()
config.gpu_options.allow_growth = True
session = tf.Session(config=config)

In [5]:
# Data download
print('Data download. Currently only for 1. imdb and 2. reuter')
# dataset_name = 'imdb'
dataset_name = 'reuters'

dataset_dict = {'imdb': imdb, 'reuters': reuters}

(x_train, y_train), (x_test, y_test) = dataset_dict[dataset_name].load_data(path=dataset_name + ".npz",
                                                                            num_words=None,
                                                                            skip_top=0,
                                                                            maxlen=None,
                                                                            seed=seed,
                                                                            start_char=1,
                                                                            oov_char=2,
                                                                            index_from=3)


Data download. Currently only for 1. imdb and 2. reuter


In [6]:
# # Label Normalize! encode class values as integers
# encoder = LabelEncoder()
# encoder.fit(y_train)
# encoded_Y = encoder.transform(y_train)
# y_train = np_utils.to_categorical(encoded_Y)
# encoded_Y = encoder.transform(y_test)
# y_test = np_utils.to_categorical(encoded_Y)


# Check input and output shape

print('x_train:')
print('Shape:', x_train.shape)
print('x_test:')
print('Shape:', x_test.shape)
print('Example:', x_train[:1])
print('')
print('y_train:')
print('Shape:', y_train.shape)
print('y_test:')
print('Shape:', y_test.shape)
print('Example', y_train[:1])

x_train:
Shape: (8982,)
x_test:
Shape: (2246,)
Example: [list([1, 195, 13780, 15752, 5, 141, 4287, 130, 71, 9262, 68, 5, 78, 3145, 561, 900, 6, 2156, 603, 141, 1211, 76, 8, 16, 33, 116, 10, 310, 7, 4, 49, 6, 83, 127, 561, 51, 36, 487, 6, 3391, 432, 67, 4, 561, 41, 263, 9, 118, 371, 77, 41, 45, 2912, 7, 25, 362, 9262, 9, 141, 5172, 1197, 1416, 71, 3666, 7, 50, 286, 1068, 9, 2094, 450, 9, 697, 7019, 103, 595, 119, 20, 4093, 55, 306, 6, 1934, 172, 10, 73, 4252, 6, 2570, 112, 6766, 6842, 6803, 111, 149, 17, 12])]

y_train:
Shape: (8982,)
y_test:
Shape: (2246,)
Example [4]


In [7]:
# Check Dictionary
word_index = dataset_dict[dataset_name].get_word_index(path=dataset_name + "_word_index.json")
num_words = len(word_index.keys())
print('Word Index "man":', word_index['man'])
print('Word Index "save":', word_index['save'])
print('Number of words:', num_words)

Word Index "man": 2792
Word Index "save": 2854
Number of words: 30979


In [8]:
# Check max length
print('Check up the length of each input data. I will choose the mid length of three measured out of lengths as seq_max_length')
x_train_legnth = [len(x) for x in x_train]
x_test_length = [len(x) for x in x_test]
max_length = np.max(np.concatenate([x_train_legnth, x_test_length]))
median_length = np.median(np.concatenate([x_train_legnth, x_test_length]))
average_length = np.mean(np.concatenate([x_train_legnth, x_test_length]))
print('Max Length:', max_length)
print('Median Length:', median_length)
print('Average Length:', average_length)
print('')

seq_max_length = np.median([max_length, median_length, average_length])
seq_max_length = int(seq_max_length // 1 + 1)
print('seq_max_length:', seq_max_length)


Check up the length of each input data. I will choose the mid length of three measured out of lengths as seq_max_length
Max Length: 2376
Median Length: 95.0
Average Length: 145.96419665122906

seq_max_length: 146


In [9]:
# Pad input sequences
print('Pad sequences with seq_max_length, and padding type will be "post"')
_x_train = pad_sequences(x_train, maxlen=seq_max_length, padding='post')
_x_test = pad_sequences(x_test, maxlen=seq_max_length, padding='post')
_y_train = np_utils.to_categorical(y_train)
_y_test = np_utils.to_categorical(y_test)

num_categories = _y_train.shape[1]
print('Number of categories:', num_categories)

Pad sequences with seq_max_length, and padding type will be "post"
Number of categories: 46


# Simple LSTM

In [11]:
embedding_dim = 200
lstm_dim = 128

In [None]:
# baseline model
def create_baseline():
    with K.tf.device('/gpu:0'):
        # create model
        model = Sequential()
        model.add(Embedding(num_words, embedding_dim))
        model.add(Bidirectional(LSTM(lstm_dim, return_sequences=False)))
        model.add(Dense(num_categories, activation='softmax'))
        # Compile model
        model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [None]:
# evaluate model with standardized dataset
# estimator = KerasClassifier(build_fn=create_baseline, epochs=30, batch_size=512, verbose=1)
# kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
# results = cross_val_score(estimator, _x_train, _y_train, cv=kfold)
# print(estimator.summary())



# print("Results: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

In [None]:
model = create_baseline()
print(model.summary())
model.fit(x=_x_train, y=_y_train, validation_data=(_x_test, _y_test), batch_size=256, verbose=1, epochs=100)
loss, metrics = model.evaluate(_x_test, _y_test, batch_size=256)
print(metrics)

# Attention Concept - Encoder / Decoder Style (Luong Version)

In [None]:
attention_score_dim = 128
embedding_dim = 200
lstm_dim = 128

def repeat_vector(seq_length, axis):
    return Lambda(lambda x: K.repeat_elements(K.expand_dims(x, axis), seq_length, axis))

def attention_baseline(input_shape):
    with K.tf.device('/gpu:0'):
        inputs = Input(input_shape)
        embeded_enc = Embedding(num_words, embedding_dim)(inputs)
#         encoder = Bidirectional(LSTM(lstm_dim, return_sequences=True, return_state=True))
        encoder = LSTM(lstm_dim, return_sequences=True, return_state=True)
        encoder_outputs, encoder_state_h, encoder_state_c = encoder(embeded_enc)
        encoder_output_states = [encoder_state_h, encoder_state_c]
        
        decoder_inputs = Input((1, ))
        embeded_dec = Embedding(1, embedding_dim)(decoder_inputs)
        decoder = LSTM(lstm_dim, return_sequences=True, return_state=True)
        decoder_outputs, decoder_state_h, encoder_state_c = decoder(embeded_dec, initial_state=encoder_output_states)
#         decoder_outputs, decoder_state_h, encoder_state_c = decoder(embeded_dec)
        
        # Make Attention Layer
        reapeat_d = repeat_vector(seq_length=seq_max_length, axis=2)(decoder_outputs)
        repeat_e = repeat_vector(seq_length=1, axis=1)(encoder_outputs)

        attention_concat_outputs = Concatenate()([reapeat_d, repeat_e])
        dense1_score = Dense(attention_score_dim, activation='tanh')(attention_concat_outputs)
        score_layer = Dense(1)(dense1_score)
        dense2_score = Reshape((1, seq_max_length))(score_layer) # reshape to be 2 dims
        softmax_score = Activation('softmax')(dense2_score)
        
        repeat_score_layer = repeat_vector(attention_score_dim, 2)
        repeat_score = repeat_score_layer(softmax_score)

        permute_e = Permute((2, 1))(encoder_outputs)
        repeat_e_layer = repeat_vector(1, 1)
        repeat_e = repeat_e_layer(permute_e)

        attended_mat_layer = Multiply()
        attended_mat = attended_mat_layer([repeat_score, repeat_e])

        context_layer = Lambda(lambda x: K.sum(x, axis=-1),
                                     lambda x: tuple(x[:-1]))
        context = context_layer(attended_mat)

        concat_context_layer = Concatenate(axis=-1)
        concat_context = concat_context_layer([context, decoder_outputs])

        attention_dense_output_layer = Dense(attention_score_dim, activation='tanh')
        attention_output_layer = TimeDistributed(attention_dense_output_layer)
        attention_output = attention_output_layer(concat_context)
        
        
        outputs = Dense(num_categories, activation='softmax')(attention_output)
        outputs = Flatten()(outputs)
        
        
        model = Model(inputs=[inputs, decoder_inputs], outputs=outputs)
        model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
        
#         encoder_model = Model(inputs=inputs, outputs=[decoder_outputs, encoder_output_states])
#         encoder_model.compile(loss='categorical_crossentropy', optimizer='adam')
        
#         decoder_model = Model(inputs=decoder_inputs)
        
        return model

In [None]:
# Make decoder inputs with dictionary '<s>'
decoder_x_train = np.array([[1, ] for _ in range(len(_x_train))])
decoder_x_test = np.array([[1, ] for _ in range(len(_x_test))])

print(decoder_x_train.shape)
print(decoder_x_test.shape)

In [None]:
model = attention_baseline(input_shape=_x_train.shape[1:])
print(model.summary())
model.fit(x=[_x_train, decoder_x_train], y=_y_train, validation_data=([_x_test, decoder_x_test], _y_test), 
          batch_size=256, verbose=1, epochs=100)
loss, metrics = model.evaluate([_x_test, decoder_x_test], _y_test, batch_size=256)
print(metrics)

# Attention Concept - Encoder / Decoder Style (Luong Version) with CORRECTED LAST OUTPUT

In [22]:
from keras.layers import GRU

In [21]:
# Sum of last dimension is 0, then that means it is padded !
def get_pad_index():
    return Lambda(lambda x: K.cast(K.not_equal(K.sum(x, axis=-1, keepdims=True), 0), 'float32'))

def get_last_outputs(inputs, outputs, dimension, seq_length):
    if dimension == 2:
        new_inputs = Reshape((seq_length, 1))(inputs)
    else:
        new_inputs = inputs
    pad_index = get_pad_index()(new_inputs)
    last_index = Lambda(lambda x: K.sum(x, axis=-2) - 1)(pad_index)

    # LAST RELEVANT OUTPUT
    # create the row index with tf.range
    row_idx = Lambda(lambda x: tf.reshape(tf.range(tf.shape(x)[0]), (-1,1)))(last_index)

    # stack with column index
    idx = Lambda(lambda x: tf.stack([row_idx, K.cast(x, 'int32')], axis=-1))(last_index)
    # extract the elements with gather_nd
    last_outputs = Lambda(lambda x: tf.gather_nd(x, idx))(outputs)
    
    last_outputs = Reshape((latent_dim, ))(last_outputs)
    return pad_index, last_outputs

In [33]:
attention_score_dim = 128
embedding_dim = 200
rnn_dim = 128
latent_dim = 128

def repeat_vector(seq_length, axis):
    return Lambda(lambda x: K.repeat_elements(K.expand_dims(x, axis), seq_length, axis))

def attention_baseline2(input_shape):
    with K.tf.device('/gpu:0'):
        encoder_inputs = Input(input_shape)
        
        embeded_enc = Embedding(num_words, embedding_dim)(encoder_inputs)
#         encoder = Bidirectional(LSTM(lstm_dim, return_sequences=True, return_state=True))
        encoder = GRU(rnn_dim, return_sequences=True, return_state=True)
        encoder_outputs, encoder_last_output = encoder(embeded_enc)
        
#         pad_index, last_outputs = get_last_outputs(encoder_inputs, encoder_outputs, dimension=2, seq_length=input_shape[0])
#         encoder_outputs = Multiply()([pad_index, encoder_outputs])
        
        decoder_inputs = Input((1, ))
        embeded_dec = Embedding(1, embedding_dim)(decoder_inputs)
        decoder = GRU(rnn_dim, return_sequences=True, return_state=True)
        decoder_outputs, decoder_last_output = decoder(embeded_dec, initial_state=encoder_last_output)
#         decoder_outputs, decoder_state_h, encoder_state_c = decoder(embeded_dec)
        
        # Make Attention Layer
        reapeat_d = repeat_vector(seq_length=seq_max_length, axis=2)(decoder_outputs)
        repeat_e = repeat_vector(seq_length=1, axis=1)(encoder_outputs)

        attention_concat_outputs = Concatenate()([reapeat_d, repeat_e])
        dense1_score = Dense(attention_score_dim, activation='tanh')(attention_concat_outputs)
        score_layer = Dense(1)(dense1_score)
        dense2_score = Reshape((1, seq_max_length))(score_layer) # reshape to be 2 dims
        softmax_score = Activation('softmax')(dense2_score)
        
        repeat_score_layer = repeat_vector(attention_score_dim, 2)
        repeat_score = repeat_score_layer(softmax_score)

        permute_e = Permute((2, 1))(encoder_outputs)
        repeat_e_layer = repeat_vector(1, 1)
        repeat_e = repeat_e_layer(permute_e)

        attended_mat_layer = Multiply()
        attended_mat = attended_mat_layer([repeat_score, repeat_e])

        context_layer = Lambda(lambda x: K.sum(x, axis=-1),
                                     lambda x: tuple(x[:-1]))
        context = context_layer(attended_mat)

        concat_context_layer = Concatenate(axis=-1)
        concat_context = concat_context_layer([context, decoder_outputs])

        attention_dense_output_layer = Dense(attention_score_dim, activation='tanh')
        attention_output_layer = TimeDistributed(attention_dense_output_layer)
        attention_output = attention_output_layer(concat_context)
        
        
        outputs = Dense(num_categories, activation='softmax')(attention_output)
        outputs = Flatten()(outputs)
        
        
        model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=outputs)
        model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
        
#         encoder_model = Model(inputs=inputs, outputs=[decoder_outputs, encoder_output_states])
#         encoder_model.compile(loss='categorical_crossentropy', optimizer='adam')
        
#         decoder_model = Model(inputs=decoder_inputs)
        
        return model

In [34]:
# Make decoder inputs with dictionary '<s>'
decoder_x_train = np.array([[1, ] for _ in range(len(_x_train))])
decoder_x_test = np.array([[1, ] for _ in range(len(_x_test))])

print(decoder_x_train.shape)
print(decoder_x_test.shape)

(8982, 1)
(2246, 1)


In [35]:
model = attention_baseline2(input_shape=_x_train.shape[1:])
print(model.summary())
model.fit(x=[_x_train, decoder_x_train], y=_y_train, validation_data=([_x_test, decoder_x_test], _y_test), 
          batch_size=256, verbose=1, epochs=100)
loss, metrics = model.evaluate([_x_test, decoder_x_test], _y_test, batch_size=256)
print(metrics)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_8 (InputLayer)            (None, 146)          0                                            
__________________________________________________________________________________________________
input_9 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
embedding_8 (Embedding)         (None, 146, 200)     6195800     input_8[0][0]                    
__________________________________________________________________________________________________
embedding_9 (Embedding)         (None, 1, 200)       200         input_9[0][0]                    
__________________________________________________________________________________________________
gru_5 (GRU

Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100


Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100
0.6718610868534656


In [36]:
attention_score_dim = 128
embedding_dim = 200
rnn_dim = 128
latent_dim = 128

def repeat_vector(seq_length, axis):
    return Lambda(lambda x: K.repeat_elements(K.expand_dims(x, axis), seq_length, axis))

def attention_baseline3(input_shape):
    with K.tf.device('/gpu:0'):
        encoder_inputs = Input(input_shape)
        
        embeded_enc = Embedding(num_words, embedding_dim)(encoder_inputs)
#         encoder = Bidirectional(LSTM(lstm_dim, return_sequences=True, return_state=True))
        encoder = GRU(rnn_dim, return_sequences=True, return_state=True)
        encoder_outputs, encoder_last_output = encoder(embeded_enc)
        
        pad_index, last_outputs = get_last_outputs(encoder_inputs, encoder_outputs, dimension=2, seq_length=input_shape[0])
        encoder_outputs = Multiply()([pad_index, encoder_outputs])
        
        decoder_inputs = Input((1, ))
        embeded_dec = Embedding(1, embedding_dim)(decoder_inputs)
        decoder = GRU(rnn_dim, return_sequences=True, return_state=True)
        decoder_outputs, decoder_last_output = decoder(embeded_dec, initial_state=last_outputs)
#         decoder_outputs, decoder_state_h, encoder_state_c = decoder(embeded_dec)
        
        # Make Attention Layer
        reapeat_d = repeat_vector(seq_length=seq_max_length, axis=2)(decoder_outputs)
        repeat_e = repeat_vector(seq_length=1, axis=1)(encoder_outputs)

        attention_concat_outputs = Concatenate()([reapeat_d, repeat_e])
        dense1_score = Dense(attention_score_dim, activation='tanh')(attention_concat_outputs)
        score_layer = Dense(1)(dense1_score)
        dense2_score = Reshape((1, seq_max_length))(score_layer) # reshape to be 2 dims
        softmax_score = Activation('softmax')(dense2_score)
        
        repeat_score_layer = repeat_vector(attention_score_dim, 2)
        repeat_score = repeat_score_layer(softmax_score)

        permute_e = Permute((2, 1))(encoder_outputs)
        repeat_e_layer = repeat_vector(1, 1)
        repeat_e = repeat_e_layer(permute_e)

        attended_mat_layer = Multiply()
        attended_mat = attended_mat_layer([repeat_score, repeat_e])

        context_layer = Lambda(lambda x: K.sum(x, axis=-1),
                                     lambda x: tuple(x[:-1]))
        context = context_layer(attended_mat)

        concat_context_layer = Concatenate(axis=-1)
        concat_context = concat_context_layer([context, decoder_outputs])

        attention_dense_output_layer = Dense(attention_score_dim, activation='tanh')
        attention_output_layer = TimeDistributed(attention_dense_output_layer)
        attention_output = attention_output_layer(concat_context)
        
        
        outputs = Dense(num_categories, activation='softmax')(attention_output)
        outputs = Flatten()(outputs)
        
        
        model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=outputs)
        model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
        
#         encoder_model = Model(inputs=inputs, outputs=[decoder_outputs, encoder_output_states])
#         encoder_model.compile(loss='categorical_crossentropy', optimizer='adam')
        
#         decoder_model = Model(inputs=decoder_inputs)
        
        return model

In [37]:
# Make decoder inputs with dictionary '<s>'
decoder_x_train = np.array([[1, ] for _ in range(len(_x_train))])
decoder_x_test = np.array([[1, ] for _ in range(len(_x_test))])

print(decoder_x_train.shape)
print(decoder_x_test.shape)

(8982, 1)
(2246, 1)


In [None]:
model = attention_baseline3(input_shape=_x_train.shape[1:])
print(model.summary())
model.fit(x=[_x_train, decoder_x_train], y=_y_train, validation_data=([_x_test, decoder_x_test], _y_test), 
          batch_size=256, verbose=1, epochs=100)
loss, metrics = model.evaluate([_x_test, decoder_x_test], _y_test, batch_size=256)
print(metrics)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_10 (InputLayer)           (None, 146)          0                                            
__________________________________________________________________________________________________
embedding_10 (Embedding)        (None, 146, 200)     6195800     input_10[0][0]                   
__________________________________________________________________________________________________
gru_7 (GRU)                     [(None, 146, 128), ( 126336      embedding_10[0][0]               
__________________________________________________________________________________________________
input_11 (InputLayer)           (None, 1)            0                                            
__________________________________________________________________________________________________
lambda_30 

Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100

In [None]:
attention_score_dim = 128
embedding_dim = 200
rnn_dim = 128
latent_dim = 128

def repeat_vector(seq_length, axis):
    return Lambda(lambda x: K.repeat_elements(K.expand_dims(x, axis), seq_length, axis))

def attention_baseline4(input_shape):
    with K.tf.device('/gpu:0'):
        encoder_inputs = Input(input_shape)
        
        embeded_enc = Embedding(num_words, embedding_dim)(encoder_inputs)
#         encoder = Bidirectional(LSTM(lstm_dim, return_sequences=True, return_state=True))
        encoder = GRU(rnn_dim, return_sequences=True, return_state=True)
        encoder_outputs, encoder_last_output = encoder(embeded_enc)
        
        pad_index, last_outputs = get_last_outputs(encoder_inputs, encoder_outputs, dimension=2, seq_length=input_shape[0])
#         encoder_outputs = Multiply()([pad_index, encoder_outputs])
        
        decoder_inputs = Input((1, ))
        embeded_dec = Embedding(1, embedding_dim)(decoder_inputs)
        decoder = GRU(rnn_dim, return_sequences=True, return_state=True)
        decoder_outputs, decoder_last_output = decoder(embeded_dec, initial_state=last_outputs)
#         decoder_outputs, decoder_state_h, encoder_state_c = decoder(embeded_dec)
        
        # Make Attention Layer
        reapeat_d = repeat_vector(seq_length=seq_max_length, axis=2)(decoder_outputs)
        repeat_e = repeat_vector(seq_length=1, axis=1)(encoder_outputs)

        attention_concat_outputs = Concatenate()([reapeat_d, repeat_e])
        dense1_score = Dense(attention_score_dim, activation='tanh')(attention_concat_outputs)
        score_layer = Dense(1)(dense1_score)
        dense2_score = Reshape((1, seq_max_length))(score_layer) # reshape to be 2 dims
        softmax_score = Activation('softmax')(dense2_score)
        
        repeat_score_layer = repeat_vector(attention_score_dim, 2)
        repeat_score = repeat_score_layer(softmax_score)

        permute_e = Permute((2, 1))(encoder_outputs)
        repeat_e_layer = repeat_vector(1, 1)
        repeat_e = repeat_e_layer(permute_e)

        attended_mat_layer = Multiply()
        attended_mat = attended_mat_layer([repeat_score, repeat_e])

        context_layer = Lambda(lambda x: K.sum(x, axis=-1),
                                     lambda x: tuple(x[:-1]))
        context = context_layer(attended_mat)

        concat_context_layer = Concatenate(axis=-1)
        concat_context = concat_context_layer([context, decoder_outputs])

        attention_dense_output_layer = Dense(attention_score_dim, activation='tanh')
        attention_output_layer = TimeDistributed(attention_dense_output_layer)
        attention_output = attention_output_layer(concat_context)
        
        
        outputs = Dense(num_categories, activation='softmax')(attention_output)
        outputs = Flatten()(outputs)
        
        
        model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=outputs)
        model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
        
#         encoder_model = Model(inputs=inputs, outputs=[decoder_outputs, encoder_output_states])
#         encoder_model.compile(loss='categorical_crossentropy', optimizer='adam')
        
#         decoder_model = Model(inputs=decoder_inputs)
        
        return model

In [None]:
# Make decoder inputs with dictionary '<s>'
decoder_x_train = np.array([[1, ] for _ in range(len(_x_train))])
decoder_x_test = np.array([[1, ] for _ in range(len(_x_test))])

print(decoder_x_train.shape)
print(decoder_x_test.shape)

In [None]:
model = attention_baseline4(input_shape=_x_train.shape[1:])
print(model.summary())
model.fit(x=[_x_train, decoder_x_train], y=_y_train, validation_data=([_x_test, decoder_x_test], _y_test), 
          batch_size=256, verbose=1, epochs=100)
loss, metrics = model.evaluate([_x_test, decoder_x_test], _y_test, batch_size=256)
print(metrics)

# Luong, My try

In [21]:
attention_score_dim = 128
embedding_dim = 200
lstm_dim = 128

def repeat_vector(seq_length, axis):
    return Lambda(lambda x: K.repeat_elements(K.expand_dims(x, axis), seq_length, axis))

def attention_baseline2(input_shape):
    with K.tf.device('/gpu:0'):
        inputs = Input(input_shape)
        embeded_enc = Embedding(num_words, embedding_dim)(inputs)
#         encoder = Bidirectional(LSTM(lstm_dim, return_sequences=True, return_state=True))
        encoder = LSTM(lstm_dim, return_sequences=True, return_state=True)
        encoder_outputs, encoder_state_h, encoder_state_c = encoder(embeded_enc)
        encoder_output_states = [encoder_state_h, encoder_state_c]
        
        decoder_inputs = Input((1, ))
        embeded_dec = Embedding(1, embedding_dim)(decoder_inputs)
        decoder = LSTM(lstm_dim, return_sequences=True, return_state=True)
        decoder_outputs, decoder_state_h, encoder_state_c = decoder(embeded_dec, initial_state=encoder_output_states)
        decoder_output_states = [decoder_state_h, encoder_state_c]
#         decoder_outputs, decoder_state_h, encoder_state_c = decoder(embeded_dec)
        
        # Make Attention Layer
        reapeat_d = repeat_vector(seq_length=seq_max_length, axis=2)(decoder_output_states)
        repeat_e = repeat_vector(seq_length=1, axis=1)(encoder_output_states)

        attention_concat_outputs = Concatenate()([reapeat_d, repeat_e])
        dense1_score = Dense(attention_score_dim, activation='tanh')(attention_concat_outputs)
        score_layer = Dense(1)(dense1_score)
        dense2_score = Reshape((1, seq_max_length))(score_layer) # reshape to be 2 dims
        softmax_score = Activation('softmax')(dense2_score)
        
        # Context Vector
        repeat_score_layer = repeat_vector(attention_score_dim, 2)
        repeat_score = repeat_score_layer(softmax_score)

        permute_e = Permute((2, 1))(encoder_output_states)
        repeat_e_layer = repeat_vector(1, 1)
        repeat_e = repeat_e_layer(permute_e)

        attended_mat_layer = Multiply()
        attended_mat = attended_mat_layer([repeat_score, repeat_e])

        context_layer = Lambda(lambda x: K.sum(x, axis=-1))
        context = context_layer(attended_mat)

        concat_context_layer = Concatenate(axis=-1)
        concat_context = concat_context_layer([context, decoder_output_states])

        attention_dense_output_layer = Dense(attention_score_dim, activation='tanh')
#         attention_output_layer = TimeDistributed(attention_dense_output_layer)
        attention_output = attention_dense_output_layer(concat_context)
        
        
        outputs = Dense(num_categories, activation='softmax')(attention_output)
        outputs = Flatten()(outputs)
        
        
        model = Model(inputs=[inputs, decoder_inputs], outputs=outputs)
        model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
        
#         encoder_model = Model(inputs=inputs, outputs=[decoder_outputs, encoder_output_states])
#         encoder_model.compile(loss='categorical_crossentropy', optimizer='adam')
        
#         decoder_model = Model(inputs=decoder_inputs)
        
        return model

In [22]:
# Make decoder inputs with dictionary '<s>'
decoder_x_train = np.array([[1, ] for _ in range(len(_x_train))])
decoder_x_test = np.array([[1, ] for _ in range(len(_x_test))])

print(decoder_x_train.shape)
print(decoder_x_test.shape)

(8982, 1)
(2246, 1)


In [23]:
model = attention_baseline(input_shape=_x_train.shape[1:])
print(model.summary())
model.fit(x=[_x_train, decoder_x_train], y=_y_train, validation_data=([_x_test, decoder_x_test], _y_test), 
          batch_size=256, verbose=1, epochs=30)
loss, metrics = model.evaluate([_x_test, decoder_x_test], _y_test, batch_size=256)
print(metrics)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_5 (InputLayer)            (None, 146)          0                                            
__________________________________________________________________________________________________
input_6 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
embedding_6 (Embedding)         (None, 146, 200)     6195800     input_5[0][0]                    
__________________________________________________________________________________________________
embedding_7 (Embedding)         (None, 1, 200)       200         input_6[0][0]                    
__________________________________________________________________________________________________
lstm_6 (LS

KeyboardInterrupt: 

# Attention Conecept without decoder

In [None]:
def attention_baseline(input_shape):
    with K.tf.device('/gpu:0'):
        # create model
        #     model = Model()
        inputs = Input(input_shape) # sequence length
        embeded_seq = Embedding(num_words, embedding_dim)(inputs) # sequence length x 100
        lstm = Bidirectional(LSTM(lstm_dim, return_sequences=True))(embeded_seq)
        attention_probs = Dense(256, activation='softmax')(lstm)
        attention_mul = multiply([lstm, attention_probs])

        f = Flatten()(attention_mul) 
        f = Dense(64)(f)

        y = Dense(46, activation='sigmoid')(f)
        model = Model(inputs=i, outputs=y)
        model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model
  
  

In [None]:
# evaluate model with standardized dataset
# estimator = KerasClassifier(build_fn=create_baseline, epochs=30, batch_size=512, verbose=1)
# kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
# results = cross_val_score(estimator, _x_train, y_train, cv=kfold)
# print(estimator.summary())
model = attention_baseline(input_shape=x_train.shape[1:])
print(model.summary())
model.fit(x=_x_train, y=_y_train, validation_split=0.1, batch_size=512, verbose=1, epochs=30)
loss, metrics = model.evaluate(_x_test, _y_test, batch_size=512)
print(metrics)
# results = model.predict(_x_test, y_test, batch_size=512)

# print("Results: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

In [None]:
def attention_3d_block(inputs):
  TIME_STEPS = 200
  # inputs.shape = (batch_size, time_steps, input_dim)
  input_dim = int(inputs.shape[2])
  a = Permute((2, 1))(inputs)
  a = Reshape((input_dim, TIME_STEPS))(a) # this line is not useful. It's just to know which dimension is what.
  a = Dense(TIME_STEPS, activation='softmax')(a)
#   if SINGLE_ATTENTION_VECTOR:
#       a = Lambda(lambda x: K.mean(x, axis=1), name='dim_reduction')(a)
#       a = RepeatVector(input_dim)(a)
  a_probs = Permute((2, 1), name='attention_vec')(a)
#   output_attention_mul = merge([inputs, a_probs], name='attention_mul', mode='mul')
  output_attention_mul = multiply([inputs, a_probs])
  return output_attention_mul


def model_attention_applied_after_lstm(input_shape):
  with K.tf.device('/gpu:0'):
    inputs = Input(shape=input_shape)
    embeded_seq = Embedding(30979, 100)(inputs) # sequence length x 100
    lstm_units = 128
    lstm_out = LSTM(lstm_units, return_sequences=True)(embeded_seq)
    attention_mul = attention_3d_block(lstm_out)
    attention_mul = Flatten()(attention_mul)
    output = Dense(46, activation='sigmoid')(attention_mul)
    model = Model(input=[inputs], output=output)
  model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
  return model

In [None]:
# evaluate model with standardized dataset
# estimator = KerasClassifier(build_fn=create_baseline, epochs=30, batch_size=512, verbose=1)
# kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
# results = cross_val_score(estimator, _x_train, y_train, cv=kfold)
# print(estimator.summary())
model = model_attention_applied_after_lstm(input_shape=(200, ))
print(model.summary())
model.fit(x=_x_train, y=_y_train, validation_split=0.1, batch_size=512, verbose=1, epochs=30)
loss, metrics = model.evaluate(_x_test, _y_test, batch_size=512)
print(metrics)
# results = model.predict(_x_test, y_test, batch_size=512)

# print("Results: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

# Seq2Seq

In [None]:
def seq2seq_baseline():
  with K.tf.device('/gpu:0'):
    model = Sequential()
    model.add(LSTM(150, input_shape=(n_timesteps_in, n_features)))
    model.add(RepeatVector(n_timesteps_in))
    model.add(LSTM(150, return_sequences=True))
    model.add(TimeDistributed(Dense(n_features, activation='softmax')))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])

# Attention Seq2Seq

In [None]:
from keras import backend as K
from keras import regularizers, constraints, initializers, activations
from keras.layers.recurrent import Recurrent
from keras.engine import InputSpec
 
tfPrint = lambda d, T: tf.Print(input_=T, data=[T, tf.shape(T)], message=d)
 

def _time_distributed_dense(x, w, b=None, dropout=None,
                            input_dim=None, output_dim=None,
                            timesteps=None, training=None):
    """Apply `y . w + b` for every temporal slice y of x.
    # Arguments
        x: input tensor.
        w: weight matrix.
        b: optional bias vector.
        dropout: wether to apply dropout (same dropout mask
            for every temporal slice of the input).
        input_dim: integer; optional dimensionality of the input.
        output_dim: integer; optional dimensionality of the output.
        timesteps: integer; optional number of timesteps.
        training: training phase tensor or boolean.
    # Returns
        Output tensor.
    """
    if not input_dim:
        input_dim = K.shape(x)[2]
    if not timesteps:
        timesteps = K.shape(x)[1]
    if not output_dim:
        output_dim = K.shape(w)[1]

    if dropout is not None and 0. < dropout < 1.:
        # apply the same dropout pattern at every timestep
        ones = K.ones_like(K.reshape(x[:, 0, :], (-1, input_dim)))
        dropout_matrix = K.dropout(ones, dropout)
        expanded_dropout_matrix = K.repeat(dropout_matrix, timesteps)
        x = K.in_train_phase(x * expanded_dropout_matrix, x, training=training)

    # collapse time dimension and batch dimension together
    x = K.reshape(x, (-1, input_dim))
    x = K.dot(x, w)
    if b is not None:
        x = K.bias_add(x, b)
    # reshape to 3D tensor
    if K.backend() == 'tensorflow':
        x = K.reshape(x, K.stack([-1, timesteps, output_dim]))
        x.set_shape([None, None, output_dim])
    else:
        x = K.reshape(x, (-1, timesteps, output_dim))
    return x

class AttentionDecoder(Recurrent):
 
    def __init__(self, units, output_dim,
                 activation='tanh',
                 return_probabilities=False,
                 name='AttentionDecoder',
                 kernel_initializer='glorot_uniform',
                 recurrent_initializer='orthogonal',
                 bias_initializer='zeros',
                 kernel_regularizer=None,
                 bias_regularizer=None,
                 activity_regularizer=None,
                 kernel_constraint=None,
                 bias_constraint=None,
                 **kwargs):
        """
        Implements an AttentionDecoder that takes in a sequence encoded by an
        encoder and outputs the decoded states
        :param units: dimension of the hidden state and the attention matrices
        :param output_dim: the number of labels in the output space
 
        references:
            Bahdanau, Dzmitry, Kyunghyun Cho, and Yoshua Bengio.
            "Neural machine translation by jointly learning to align and translate."
            arXiv preprint arXiv:1409.0473 (2014).
        """
        self.units = units
        self.output_dim = output_dim
        self.return_probabilities = return_probabilities
        self.activation = activations.get(activation)
        self.kernel_initializer = initializers.get(kernel_initializer)
        self.recurrent_initializer = initializers.get(recurrent_initializer)
        self.bias_initializer = initializers.get(bias_initializer)
 
        self.kernel_regularizer = regularizers.get(kernel_regularizer)
        self.recurrent_regularizer = regularizers.get(kernel_regularizer)
        self.bias_regularizer = regularizers.get(bias_regularizer)
        self.activity_regularizer = regularizers.get(activity_regularizer)
 
        self.kernel_constraint = constraints.get(kernel_constraint)
        self.recurrent_constraint = constraints.get(kernel_constraint)
        self.bias_constraint = constraints.get(bias_constraint)
 
        super(AttentionDecoder, self).__init__(**kwargs)
        self.name = name
        self.return_sequences = True  # must return sequences
 
    def build(self, input_shape):
        """
          See Appendix 2 of Bahdanau 2014, arXiv:1409.0473
          for model details that correspond to the matrices here.
        """
 
        self.batch_size, self.timesteps, self.input_dim = input_shape
 
        if self.stateful:
            super(AttentionDecoder, self).reset_states()
 
        self.states = [None, None]  # y, s
 
        """
            Matrices for creating the context vector
        """
 
        self.V_a = self.add_weight(shape=(self.units,),
                                   name='V_a',
                                   initializer=self.kernel_initializer,
                                   regularizer=self.kernel_regularizer,
                                   constraint=self.kernel_constraint)
        self.W_a = self.add_weight(shape=(self.units, self.units),
                                   name='W_a',
                                   initializer=self.kernel_initializer,
                                   regularizer=self.kernel_regularizer,
                                   constraint=self.kernel_constraint)
        self.U_a = self.add_weight(shape=(self.input_dim, self.units),
                                   name='U_a',
                                   initializer=self.kernel_initializer,
                                   regularizer=self.kernel_regularizer,
                                   constraint=self.kernel_constraint)
        self.b_a = self.add_weight(shape=(self.units,),
                                   name='b_a',
                                   initializer=self.bias_initializer,
                                   regularizer=self.bias_regularizer,
                                   constraint=self.bias_constraint)
        """
            Matrices for the r (reset) gate
        """
        self.C_r = self.add_weight(shape=(self.input_dim, self.units),
                                   name='C_r',
                                   initializer=self.recurrent_initializer,
                                   regularizer=self.recurrent_regularizer,
                                   constraint=self.recurrent_constraint)
        self.U_r = self.add_weight(shape=(self.units, self.units),
                                   name='U_r',
                                   initializer=self.recurrent_initializer,
                                   regularizer=self.recurrent_regularizer,
                                   constraint=self.recurrent_constraint)
        self.W_r = self.add_weight(shape=(self.output_dim, self.units),
                                   name='W_r',
                                   initializer=self.recurrent_initializer,
                                   regularizer=self.recurrent_regularizer,
                                   constraint=self.recurrent_constraint)
        self.b_r = self.add_weight(shape=(self.units, ),
                                   name='b_r',
                                   initializer=self.bias_initializer,
                                   regularizer=self.bias_regularizer,
                                   constraint=self.bias_constraint)
 
        """
            Matrices for the z (update) gate
        """
        self.C_z = self.add_weight(shape=(self.input_dim, self.units),
                                   name='C_z',
                                   initializer=self.recurrent_initializer,
                                   regularizer=self.recurrent_regularizer,
                                   constraint=self.recurrent_constraint)
        self.U_z = self.add_weight(shape=(self.units, self.units),
                                   name='U_z',
                                   initializer=self.recurrent_initializer,
                                   regularizer=self.recurrent_regularizer,
                                   constraint=self.recurrent_constraint)
        self.W_z = self.add_weight(shape=(self.output_dim, self.units),
                                   name='W_z',
                                   initializer=self.recurrent_initializer,
                                   regularizer=self.recurrent_regularizer,
                                   constraint=self.recurrent_constraint)
        self.b_z = self.add_weight(shape=(self.units, ),
                                   name='b_z',
                                   initializer=self.bias_initializer,
                                   regularizer=self.bias_regularizer,
                                   constraint=self.bias_constraint)
        """
            Matrices for the proposal
        """
        self.C_p = self.add_weight(shape=(self.input_dim, self.units),
                                   name='C_p',
                                   initializer=self.recurrent_initializer,
                                   regularizer=self.recurrent_regularizer,
                                   constraint=self.recurrent_constraint)
        self.U_p = self.add_weight(shape=(self.units, self.units),
                                   name='U_p',
                                   initializer=self.recurrent_initializer,
                                   regularizer=self.recurrent_regularizer,
                                   constraint=self.recurrent_constraint)
        self.W_p = self.add_weight(shape=(self.output_dim, self.units),
                                   name='W_p',
                                   initializer=self.recurrent_initializer,
                                   regularizer=self.recurrent_regularizer,
                                   constraint=self.recurrent_constraint)
        self.b_p = self.add_weight(shape=(self.units, ),
                                   name='b_p',
                                   initializer=self.bias_initializer,
                                   regularizer=self.bias_regularizer,
                                   constraint=self.bias_constraint)
        """
            Matrices for making the final prediction vector
        """
        self.C_o = self.add_weight(shape=(self.input_dim, self.output_dim),
                                   name='C_o',
                                   initializer=self.recurrent_initializer,
                                   regularizer=self.recurrent_regularizer,
                                   constraint=self.recurrent_constraint)
        self.U_o = self.add_weight(shape=(self.units, self.output_dim),
                                   name='U_o',
                                   initializer=self.recurrent_initializer,
                                   regularizer=self.recurrent_regularizer,
                                   constraint=self.recurrent_constraint)
        self.W_o = self.add_weight(shape=(self.output_dim, self.output_dim),
                                   name='W_o',
                                   initializer=self.recurrent_initializer,
                                   regularizer=self.recurrent_regularizer,
                                   constraint=self.recurrent_constraint)
        self.b_o = self.add_weight(shape=(self.output_dim, ),
                                   name='b_o',
                                   initializer=self.bias_initializer,
                                   regularizer=self.bias_regularizer,
                                   constraint=self.bias_constraint)
 
        # For creating the initial state:
        self.W_s = self.add_weight(shape=(self.input_dim, self.units),
                                   name='W_s',
                                   initializer=self.recurrent_initializer,
                                   regularizer=self.recurrent_regularizer,
                                   constraint=self.recurrent_constraint)
 
        self.input_spec = [
            InputSpec(shape=(self.batch_size, self.timesteps, self.input_dim))]
        self.built = True
 
    def call(self, x):
        # store the whole sequence so we can "attend" to it at each timestep
        self.x_seq = x
 
        # apply the a dense layer over the time dimension of the sequence
        # do it here because it doesn't depend on any previous steps
        # thefore we can save computation time:
        self._uxpb = _time_distributed_dense(self.x_seq, self.U_a, b=self.b_a,
                                             input_dim=self.input_dim,
                                             timesteps=self.timesteps,
                                             output_dim=self.units)
 
        return super(AttentionDecoder, self).call(x)
 
    def get_initial_state(self, inputs):
        # apply the matrix on the first time step to get the initial s0.
        s0 = activations.tanh(K.dot(inputs[:, 0], self.W_s))
 
        # from keras.layers.recurrent to initialize a vector of (batchsize,
        # output_dim)
        y0 = K.zeros_like(inputs)  # (samples, timesteps, input_dims)
        y0 = K.sum(y0, axis=(1, 2))  # (samples, )
        y0 = K.expand_dims(y0)  # (samples, 1)
        y0 = K.tile(y0, [1, self.output_dim])
 
        return [y0, s0]
 
    def step(self, x, states):
 
        ytm, stm = states
 
        # repeat the hidden state to the length of the sequence
        _stm = K.repeat(stm, self.timesteps)
 
        # now multiplty the weight matrix with the repeated hidden state
        _Wxstm = K.dot(_stm, self.W_a)
 
        # calculate the attention probabilities
        # this relates how much other timesteps contributed to this one.
        et = K.dot(activations.tanh(_Wxstm + self._uxpb),
                   K.expand_dims(self.V_a))
        at = K.exp(et)
        at_sum = K.sum(at, axis=1)
        at_sum_repeated = K.repeat(at_sum, self.timesteps)
        at /= at_sum_repeated  # vector of size (batchsize, timesteps, 1)
 
        # calculate the context vector
        context = K.squeeze(K.batch_dot(at, self.x_seq, axes=1), axis=1)
        # ~~~> calculate new hidden state
        # first calculate the "r" gate:
 
        rt = activations.sigmoid(
            K.dot(ytm, self.W_r)
            + K.dot(stm, self.U_r)
            + K.dot(context, self.C_r)
            + self.b_r)
 
        # now calculate the "z" gate
        zt = activations.sigmoid(
            K.dot(ytm, self.W_z)
            + K.dot(stm, self.U_z)
            + K.dot(context, self.C_z)
            + self.b_z)
 
        # calculate the proposal hidden state:
        s_tp = activations.tanh(
            K.dot(ytm, self.W_p)
            + K.dot((rt * stm), self.U_p)
            + K.dot(context, self.C_p)
            + self.b_p)
 
        # new hidden state:
        st = (1-zt)*stm + zt * s_tp
 
        yt = activations.softmax(
            K.dot(ytm, self.W_o)
            + K.dot(stm, self.U_o)
            + K.dot(context, self.C_o)
            + self.b_o)
 
        if self.return_probabilities:
            return at, [yt, st]
        else:
            return yt, [yt, st]
 
    def compute_output_shape(self, input_shape):
        """
            For Keras internal compatability checking
        """
        if self.return_probabilities:
            return (None, self.timesteps, self.timesteps)
        else:
            return (None, self.timesteps, self.output_dim)
 
    def get_config(self):
        """
            For rebuilding models on load time.
        """
        config = {
            'output_dim': self.output_dim,
            'units': self.units,
            'return_probabilities': self.return_probabilities
        }
        base_config = super(AttentionDecoder, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

In [None]:
def seq2seq_attention(n_timesteps_in):
  with K.tf.device('/gpu:0'):
    n_features = 100
    model = Sequential()
    model.add(Embedding(30979, n_features))
    model.add(LSTM(128, input_shape=(n_timesteps_in, n_features), return_sequences=True))
    model.add(AttentionDecoder(128, n_features))
#     model.add(Flatten())
    model.add(Dense(46))
  model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])
  return model

In [None]:
model = seq2seq_attention(200)
print(model.summary())
model.fit(x=_x_train, y=_y_train, validation_split=0.1, batch_size=512, verbose=1, epochs=30)
loss, metrics = model.evaluate(_x_test, _y_test, batch_size=512)
print(metrics)

In [None]:
# evaluate baseline model with standardized dataset
numpy.random.seed(seed)
estimators = []
estimators.append(('standardize', StandardScaler()))
estimators.append(('mlp', KerasClassifier(build_fn=create_baseline, epochs=30, batch_size=5, verbose=0)))
pipeline = Pipeline(estimators)
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
results = cross_val_score(pipeline, _x_train, y_train, cv=kfold)
print("Standardized: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

In [None]:
loss, metrics = estimator.fit(_x_test, y_test, batch_size=128)
print(loss, metrics)

In [None]:
from keras.layers import merge
from keras.layers.core import *
from keras.layers.recurrent import LSTM
from keras.models import *

from attention_utils import get_activations, get_data_recurrent

INPUT_DIM = 2
TIME_STEPS = 20
# if True, the attention vector is shared across the input_dimensions where the attention is applied.
SINGLE_ATTENTION_VECTOR = False
APPLY_ATTENTION_BEFORE_LSTM = False


def attention_3d_block(inputs):
    # inputs.shape = (batch_size, time_steps, input_dim)
    input_dim = int(inputs.shape[2])
    a = Permute((2, 1))(inputs)
    a = Reshape((input_dim, TIME_STEPS))(a) # this line is not useful. It's just to know which dimension is what.
    a = Dense(TIME_STEPS, activation='softmax')(a)
    if SINGLE_ATTENTION_VECTOR:
        a = Lambda(lambda x: K.mean(x, axis=1), name='dim_reduction')(a)
        a = RepeatVector(input_dim)(a)
    a_probs = Permute((2, 1), name='attention_vec')(a)
    output_attention_mul = merge([inputs, a_probs], name='attention_mul', mode='mul')
    return output_attention_mul


def model_attention_applied_after_lstm():
    inputs = Input(shape=(TIME_STEPS, INPUT_DIM,))
    lstm_units = 32
    lstm_out = LSTM(lstm_units, return_sequences=True)(inputs)
    attention_mul = attention_3d_block(lstm_out)
    attention_mul = Flatten()(attention_mul)
    output = Dense(1, activation='sigmoid')(attention_mul)
    model = Model(input=[inputs], output=output)
    return model


def model_attention_applied_before_lstm():
    inputs = Input(shape=(TIME_STEPS, INPUT_DIM,))
    attention_mul = attention_3d_block(inputs)
    lstm_units = 32
    attention_mul = LSTM(lstm_units, return_sequences=False)(attention_mul)
    output = Dense(1, activation='sigmoid')(attention_mul)
    model = Model(input=[inputs], output=output)
    return model


if __name__ == '__main__':

    N = 300000
    # N = 300 -> too few = no training
    inputs_1, outputs = get_data_recurrent(N, TIME_STEPS, INPUT_DIM)

    if APPLY_ATTENTION_BEFORE_LSTM:
        m = model_attention_applied_before_lstm()
    else:
        m = model_attention_applied_after_lstm()

    m.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    print(m.summary())

    m.fit([inputs_1], outputs, epochs=1, batch_size=64, validation_split=0.1)

    attention_vectors = []
    for i in range(300):
        testing_inputs_1, testing_outputs = get_data_recurrent(1, TIME_STEPS, INPUT_DIM)
        attention_vector = np.mean(get_activations(m,
                                                   testing_inputs_1,
                                                   print_shape_only=True,
                                                   layer_name='attention_vec')[0], axis=2).squeeze()
        print('attention =', attention_vector)
        assert (np.sum(attention_vector) - 1.0) < 1e-5
        attention_vectors.append(attention_vector)

    attention_vector_final = np.mean(np.array(attention_vectors), axis=0)
    # plot part.
    import matplotlib.pyplot as plt
    import pandas as pd

    pd.DataFrame(attention_vector_final, columns=['attention (%)']).plot(kind='bar',
                                                                         title='Attention Mechanism as '
                                                                               'a function of input'
                                                                               ' dimensions.')
    plt.show()
