# Bahdanau Attention Test 2

Let's test our Bahdanau Attention Layer with the following test code [1](https://machinelearningmastery.com/encoder-decoder-attention-sequence-to-sequence-prediction-keras/).

* Added a Bahdanau attention layer (`BahdanauGRU`), followed by a `TimeDistributed(Dense)` layer.
* Note: [2](https://arxiv.org/pdf/1409.0473.pdf) uses a bidirectional (BiLSTM) encoder; the code below uses a unidirectional `LSTM` encoder, as the model summaries show.

In [None]:
from random import randint
from numpy import array
from numpy import argmax
from numpy import array_equal
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import TimeDistributed
from keras.layers import RepeatVector
from keras_bahdanau.recurrent import BahdanauGRU


# generate a sequence of random integers
def generate_sequence(length, n_unique):
    """Return a list of `length` random integers in [0, n_unique - 1] (inclusive)."""
    values = []
    for _ in range(length):
        values.append(randint(0, n_unique - 1))
    return values


# one hot encode sequence
def one_hot_encode(sequence, n_unique):
    """Return a 2D array with one one-hot row of width `n_unique` per item of `sequence`.

    Each value in `sequence` is used as the index of the single 1 in its row.
    """
    rows = []
    for index in sequence:
        row = [0] * n_unique
        row[index] = 1
        rows.append(row)
    return array(rows)


# decode a one hot encoded string
def one_hot_decode(encoded_seq):
    """Return, for each row of `encoded_seq`, the index of its largest entry."""
    return list(map(argmax, encoded_seq))


# prepare data for the LSTM
def get_pair(n_in, n_out, cardinality):
    """Build one random (X, y) training pair as one-hot encoded 3D arrays.

    The target sequence keeps the first `n_out` values of the input sequence
    and replaces the rest with zeros, so both sequences have the same length.
    Each returned array has shape (1, timesteps, cardinality).
    """
    source = generate_sequence(n_in, cardinality)
    # echo the leading n_out values, zero-pad the rest up to n_in
    target = source[:n_out] + [0] * (n_in - n_out)
    encoded = [one_hot_encode(seq, cardinality) for seq in (source, target)]
    # add a leading batch axis of size 1 to each matrix
    X, y = (m.reshape((1, m.shape[0], m.shape[1])) for m in encoded)
    return X, y


# define the encoder-decoder model
def baseline_model(n_timesteps_in, n_features):
    """Build, compile and summarize the plain encoder-decoder model.

    The stack is: LSTM encoder (150 units) -> RepeatVector over the input
    length -> LSTM decoder (150 units, full sequence output) -> per-timestep
    softmax over `n_features` classes.
    """
    layers = [
        LSTM(150, input_shape=(n_timesteps_in, n_features)),
        RepeatVector(n_timesteps_in),
        LSTM(150, return_sequences=True),
        TimeDistributed(Dense(n_features, activation='softmax')),
    ]
    model = Sequential()
    for layer in layers:
        model.add(layer)
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
    # print the layer table as a side effect, as the notebook output expects
    model.summary()
    return model


# define the encoder-decoder with attention model
def attention_model(n_timesteps_in, n_features, n_attention_units=50):
    """Build, compile and summarize the encoder-decoder model with attention.

    The stack is: LSTM encoder (150 units, full sequence output) ->
    BahdanauGRU layer -> per-timestep softmax over `n_features` classes.

    Args:
        n_timesteps_in: number of timesteps in each input sequence.
        n_features: size of each one-hot vector (input width and output classes).
        n_attention_units: number of units passed to the BahdanauGRU layer.
            Defaults to 50, the value that was previously hard-coded, so
            existing two-argument calls behave as before.

    Returns:
        The compiled Sequential model.
    """
    model = Sequential()
    # the encoder must return the whole sequence so the next layer sees every timestep
    model.add(LSTM(150, input_shape=(n_timesteps_in, n_features), return_sequences=True))
    model.add(BahdanauGRU(n_attention_units, return_sequences=True))
    model.add(TimeDistributed(Dense(n_features, activation='softmax')))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])
    # print the layer table as a side effect
    model.summary()
    return model


# train and evaluate a model, return accuracy
def train_evaluate_model(model, n_timesteps_in, n_timesteps_out, n_features,
                         n_epochs=5000, n_eval=100):
    """Train `model` on freshly generated pairs and return its accuracy in percent.

    Args:
        model: compiled model exposing `fit` and `predict`.
        n_timesteps_in: length of each generated input sequence.
        n_timesteps_out: number of leading input values echoed in the target.
        n_features: number of distinct values (one-hot width).
        n_epochs: number of single-sample training steps. Defaults to 5000,
            the value that was previously hard-coded.
        n_eval: number of freshly generated samples used for evaluation.
            Defaults to 100, the value that was previously hard-coded.

    Returns:
        Percentage (0.0 - 100.0) of evaluation samples whose decoded
        prediction matches the decoded target at every timestep.

    Raises:
        ValueError: if `n_eval` is smaller than 1.
    """
    # fail before the (long) training loop rather than dividing by zero after it
    if n_eval < 1:
        raise ValueError('n_eval must be at least 1')
    # each training step fits on one newly generated random sequence
    for _ in range(n_epochs):
        X, y = get_pair(n_timesteps_in, n_timesteps_out, n_features)
        model.fit(X, y, epochs=1, verbose=0)
    # a sample counts as correct only if the whole decoded sequence matches
    correct = 0
    for _ in range(n_eval):
        X, y = get_pair(n_timesteps_in, n_timesteps_out, n_features)
        yhat = model.predict(X, verbose=0)
        if array_equal(one_hot_decode(y[0]), one_hot_decode(yhat[0])):
            correct += 1
    # multiply before dividing so e.g. 14/100 yields 14.0, not 14.000000000000002
    return 100.0 * correct / n_eval


# problem configuration
n_features = 50
n_timesteps_in = 5
n_timesteps_out = 2
n_repeats = 10

# run the plain encoder-decoder first, then the attention model;
# each experiment builds, trains and evaluates a fresh model n_repeats times
for title, build_model in (('Encoder-Decoder Model', baseline_model),
                           ('Attention Model', attention_model)):
    print(title)
    results = []
    for _ in range(n_repeats):
        model = build_model(n_timesteps_in, n_features)
        accuracy = train_evaluate_model(model, n_timesteps_in, n_timesteps_out, n_features)
        results.append(accuracy)
        print(accuracy)
    print('Mean Accuracy: %.2f%%' % (sum(results) / float(n_repeats)))

Here are the results and model summary of the baseline in the example:

In [None]:
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
lstm_1 (LSTM)                (None, 150)               120600    
_________________________________________________________________
repeat_vector_1 (RepeatVecto (None, 5, 150)            0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 5, 150)            180600    
_________________________________________________________________
time_distributed_1 (TimeDist (None, 5, 50)             7550      
=================================================================
Total params: 308,750
Trainable params: 308,750
Non-trainable params: 0
_________________________________________________________________

In [None]:
'Encoder-Decoder Model'
23.0
22.0
12.0
14.000000000000002
35.0
24.0
21.0
20.0
24.0
24.0
Mean Accuracy: 21.90%

Here are the results and model summary when the number of units in the Bahdanau layer is set to 150:

In [None]:
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
lstm_1 (LSTM)                (None, 5, 150)            120600    
_________________________________________________________________
bahdanau_gru_1 (BahdanauGRU) (None, 5, 150)            260250    
_________________________________________________________________
time_distributed_1 (TimeDist (None, 5, 50)             7550      
=================================================================
Total params: 388,400
Trainable params: 388,400
Non-trainable params: 0
_________________________________________________________________

In [None]:
'Attention Model'
100.0
100.0
100.0
100.0
100.0
100.0
100.0
100.0
100.0
100.0
Mean Accuracy: 100.00%

Here are the results and model summary when the number of units in the Bahdanau layer is set to 50:

In [None]:
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
lstm_1 (LSTM)                (None, 5, 150)            120600    
_________________________________________________________________
bahdanau_gru_1 (BahdanauGRU) (None, 5, 50)             66750     
_________________________________________________________________
time_distributed_1 (TimeDist (None, 5, 50)             2550      
=================================================================
Total params: 189,900
Trainable params: 189,900
Non-trainable params: 0
_________________________________________________________________

In [None]:
'Attention Model'
100.0
100.0
100.0
100.0
100.0
100.0
100.0
100.0
100.0
100.0
Mean Accuracy: 100.00%

Here are the results and model summary when the number of units in the Bahdanau layer is set to 10:

In [None]:
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
lstm_1 (LSTM)                (None, 5, 150)            120600    
_________________________________________________________________
bahdanau_gru_1 (BahdanauGRU) (None, 5, 10)             11750     
_________________________________________________________________
time_distributed_1 (TimeDist (None, 5, 50)             550       
=================================================================
Total params: 132,900
Trainable params: 132,900
Non-trainable params: 0
_________________________________________________________________

In [None]:
'Attention Model'
87.0
89.0
83.0
78.0
69.0
68.0
86.0
88.0
92.0
89.0
Mean Accuracy: 82.90%

We can observe that, even with fewer weights, the attention mechanism provides a large increase in accuracy. However, we should be cautious about overfitting. In addition, we should investigate other factors (e.g. number of features, sample size, model structure, etc.) in depth over a variety of domains. Still, this simple example helps us try out our attention layer.