In [1]:
# transformer implementation in Keras and TensorFlow 1.15

In [2]:
# base code for transformer block implements multi-head attention from Keras, only on TF 2X
# question: is it possible to change it to use multi-head attention from TF AddOns, on TF 1.15?

In [3]:
# IMPORTANT!
# there is no support for TensorFlow addons on TF 1.15
# code must be used from Python source
# then, complete the Transformer model using Keras MHA layer, on TF 2.4

# or just try using tf.compat.v1.keras.layers.MultiHeadAttention

In [4]:
# second experiment:
# TensorFlow 2.4
# ScaledDotProduct and InterpretableMultiHeadAttention from Temporal Fusion Transformer project
# Transformer-encoder only (autoencoder option)
# value embedding with Conv1D
# basic positional encoding with Keras embedding
# encoder layer with MHA
# encoder output to linear to multi-step target (TimeDistributed)

In [5]:
import numpy as np

In [6]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [7]:
print(tf.__version__)

2.4.1


In [8]:
# get datasets for selected substation, load them as NumPy arrays

In [9]:
x_train = np.load('data/256_to_24_train_hourly.npy')

In [10]:
y_train = np.load('data/256_to_24_train_targets.npy')

In [11]:
x_train.shape, y_train.shape

((17824, 256), (17824, 24))

In [12]:
x_eval = np.load('data/256_to_24_eval_hourly.npy')

In [13]:
y_eval = np.load('data/256_to_24_eval_targets.npy')

In [14]:
x_eval.shape, y_eval.shape

((1984, 256), (1984, 24))

In [15]:
# use number of timesteps in the input sequence as the limit for positional encoding
num_timesteps = 256

In [16]:
# input layer for Keras functional
# use embedding dimension from SLDB as the input dimensionality
input_layer = layers.Input(shape=(num_timesteps,))
input_layer

<KerasTensor: shape=(None, 256) dtype=float32 (created by layer 'input_1')>

In [17]:
# a layer to expand dimensions of input tensor,
# required to project to a d_model space with a convolutional layer
expanded_input_layer = tf.expand_dims(input_layer, axis=2)
expanded_input_layer

<KerasTensor: shape=(None, 256, 1) dtype=float32 (created by layer 'tf.expand_dims')>

In [18]:
# a simple Conv1D layer to project time series data (scalar) to d_model
value_embedding_layer = layers.Conv1D(filters=32,
                                      kernel_size=3,
                                      activation="relu",
                                      padding="same")(expanded_input_layer)
value_embedding_layer

<KerasTensor: shape=(None, 256, 32) dtype=float32 (created by layer 'conv1d')>

In [19]:
# start with a simple position encoding
# for instance, the one in Keras Transformer-encoder block for text classification

In [20]:
positions_to_encode = tf.range(start=0, limit=num_timesteps, delta=1)
positions_to_encode

<tf.Tensor: shape=(256,), dtype=int32, numpy=
array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
        13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
        26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
        39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
        52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
        65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
        78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
        91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
       104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
       117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
       130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
       143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155,
       156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168,
       169, 170, 1

In [21]:
# dimensionality of Q, K, V
embed_dim = 32

In [22]:
position_embedding_layer = layers.Embedding(input_dim=num_timesteps,
                                            output_dim=embed_dim) (positions_to_encode)
position_embedding_layer

<tf.Tensor: shape=(256, 32), dtype=float32, numpy=
array([[-0.00638062,  0.03770306,  0.02036254, ...,  0.04426051,
        -0.02193693,  0.00117866],
       [ 0.00312021, -0.01591484,  0.02912844, ..., -0.0019648 ,
        -0.02728313,  0.01884855],
       [ 0.03301282, -0.01530141,  0.0404519 , ...,  0.01020054,
         0.02359128, -0.01728591],
       ...,
       [-0.00277444, -0.00080197,  0.04465133, ..., -0.02851502,
         0.03069988,  0.00379462],
       [ 0.02876401, -0.03478192, -0.0308387 , ..., -0.03859157,
        -0.0405895 ,  0.00187563],
       [-0.01709599, -0.02897664,  0.02532138, ...,  0.03900309,
        -0.03712968, -0.00220356]], dtype=float32)>

In [23]:
input_to_transformer_block = value_embedding_layer + position_embedding_layer
input_to_transformer_block

<KerasTensor: shape=(None, 256, 32) dtype=float32 (created by layer 'tf.__operators__.add')>

In [24]:
# number of attention heads
num_heads = 2

In [25]:
# hidden layer size in feed forward network inside transformer
ff_dim = 32

In [26]:
# dropout rate inside the transformer block
rate = 0.1

In [27]:
# comment the following cell, use InterpretableMultiHeadAttention from TFT project

In [28]:
# transformer_block = TransformerBlock(embed_dim=embed_dim,
#                                      num_heads=num_heads,
#                                      ff_dim=ff_dim,
#                                      rate=rate)

In [66]:
# output_from_transformer_block = transformer_block(input_to_transformer_block)
# output_from_transformer_block

In [32]:
transformer_attention_layer = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)

In [40]:
attention_output_layer = transformer_attention_layer(input_to_transformer_block,
                                                     input_to_transformer_block)
attention_output_layer

<KerasTensor: shape=(None, 256, 32) dtype=float32 (created by layer 'multi_head_attention_1')>

In [38]:
transformer_dropout_1 = layers.Dropout(rate)

In [41]:
attention_dropout_1_layer = transformer_dropout_1(attention_output_layer)
attention_dropout_1_layer

<KerasTensor: shape=(None, 256, 32) dtype=float32 (created by layer 'dropout')>

In [36]:
transformer_layernorm_1 = layers.LayerNormalization(epsilon=1e-6)

In [43]:
layernorm_1_output_layer = transformer_layernorm_1(input_to_transformer_block + attention_output_layer)
layernorm_1_output_layer

<KerasTensor: shape=(None, 256, 32) dtype=float32 (created by layer 'layer_normalization_1')>

In [34]:
transformer_ffn_layer = keras.Sequential(
    [layers.Dense(units=ff_dim,
                  activation="relu"),
     layers.Dense(units=embed_dim)])

In [44]:
ffn_output_layer = transformer_ffn_layer(layernorm_1_output_layer)
ffn_output_layer

<KerasTensor: shape=(None, 256, 32) dtype=float32 (created by layer 'sequential')>

In [46]:
transformer_dropout_2 = layers.Dropout(rate)

In [48]:
dropout_2_layer_output = transformer_dropout_2(ffn_output_layer)
dropout_2_layer_output

<KerasTensor: shape=(None, 256, 32) dtype=float32 (created by layer 'dropout_2')>

In [49]:
transformer_layernorm_2 = layers.LayerNormalization(epsilon=1e-6)

In [50]:
output_from_transformer_block = transformer_layernorm_2(
    layernorm_1_output_layer + dropout_2_layer_output)
output_from_transformer_block

<KerasTensor: shape=(None, 256, 32) dtype=float32 (created by layer 'layer_normalization_3')>

In [51]:
# processing the output from transformer block towards the target
# case 1: based on TransformerBlock example at
# https://keras.io/examples/nlp/text_classification_with_transformer/

In [52]:
output_from_pooling = layers.GlobalAveragePooling1D()(output_from_transformer_block)
output_from_pooling

<KerasTensor: shape=(None, 32) dtype=float32 (created by layer 'global_average_pooling1d')>

In [53]:
num_targets = 24

In [54]:
repeated = layers.RepeatVector(num_targets)(output_from_pooling)
repeated

<KerasTensor: shape=(None, 24, 32) dtype=float32 (created by layer 'repeat_vector')>

In [55]:
first_dropout = layers.Dropout(0.1)

In [56]:
distributed_first_dropout = layers.TimeDistributed(first_dropout)(repeated)
distributed_first_dropout

<KerasTensor: shape=(None, 24, 32) dtype=float32 (created by layer 'time_distributed')>

In [57]:
units_in_first_dense = 16
first_dense = layers.Dense(units_in_first_dense, activation="relu")

In [58]:
distributed_first_dense = layers.TimeDistributed(first_dense)(distributed_first_dropout)
distributed_first_dense

<KerasTensor: shape=(None, 24, 16) dtype=float32 (created by layer 'time_distributed_1')>

In [59]:
second_dropout = layers.Dropout(0.1)

In [60]:
distributed_second_dropout = layers.TimeDistributed(second_dropout)(distributed_first_dense)
distributed_second_dropout

<KerasTensor: shape=(None, 24, 16) dtype=float32 (created by layer 'time_distributed_2')>

In [61]:
units_in_second_dense = 1
second_dense = layers.Dense(units_in_second_dense, activation="sigmoid")

In [62]:
distributed_second_dense = layers.TimeDistributed(second_dense)(distributed_second_dropout)
distributed_second_dense

<KerasTensor: shape=(None, 24, 1) dtype=float32 (created by layer 'time_distributed_3')>

In [63]:
model = keras.Model(inputs=input_layer, outputs=distributed_second_dense)

In [64]:
model.compile("adam", "mse", metrics=[tf.keras.metrics.RootMeanSquaredError()])

In [65]:
history = model.fit(
    x_train, y_train, batch_size=32, epochs=2, validation_data=(x_eval, y_eval)
)

Epoch 1/2
Epoch 2/2
