In [1]:
# transformer implementation in Keras and TensorFlow (initially planned for TF 1.15; the cells below run on TF 2.4)

In [2]:
# the base code for the transformer block uses the multi-head attention layer from Keras, which is only available on TF 2.x
# question: is it possible to change it to use multi-head attention from TensorFlow Addons, on TF 1.15?

In [3]:
# IMPORTANT!
# there is no support for TensorFlow addons on TF 1.15
# code must be used from Python source
# then, complete the Transformer model using Keras MHA layer, on TF 2.4

# or just try using tf.compat.v1.keras.layers.MultiHeadAttention

In [4]:
# first experiment:
# TensorFlow 2.4
# multi-head attention layer from Keras
# Transformer-encoder only (autoencoder option)
# value embedding
# positional embedding
# encoder layer with MHA
# encoder output to linear to multi-step target (vector output and TimeDistributed)

In [5]:
import numpy as np

In [6]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [7]:
# print the installed TensorFlow version so the run environment is recorded with the notebook
print(tf.__version__)

2.4.1


In [8]:
# transformer block as a layer, from
# https://keras.io/examples/nlp/text_classification_with_transformer/

class TransformerBlock(layers.Layer):
    """Single Transformer encoder block: self-attention followed by a feed-forward network.

    Each of the two sub-layers is followed by dropout, a residual connection
    and layer normalization.

    Args:
        embed_dim: size of the last axis of the inputs. It is also used as the
            per-head key dimension of the attention layer and as the output
            width of the feed-forward network, so the residual sums line up.
        num_heads: number of attention heads.
        ff_dim: width of the hidden layer of the feed-forward network.
        rate: dropout rate applied after the attention and feed-forward sub-layers.
        **kwargs: forwarded to ``layers.Layer`` (for example ``name`` or ``dtype``).
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
        super(TransformerBlock, self).__init__(**kwargs)
        # keep the constructor arguments so get_config() can rebuild the layer
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.rate = rate

        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential(
            [
                layers.Dense(ff_dim, activation="relu"),
                layers.Dense(embed_dim),
            ]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training=None):
        """Run the block on ``inputs`` and return a tensor of the same shape.

        Args:
            inputs: tensor whose last axis has size ``embed_dim``.
            training: forwarded to the dropout layers; ``None`` lets Keras
                decide from the current execution context.
        """
        # self-attention: the same tensor is used as query and value
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

    def get_config(self):
        """Return the constructor arguments so the layer can be serialized and cloned."""
        config = super(TransformerBlock, self).get_config()
        config.update(
            {
                "embed_dim": self.embed_dim,
                "num_heads": self.num_heads,
                "ff_dim": self.ff_dim,
                "rate": self.rate,
            }
        )
        return config

In [9]:
class PositionEmbedding(layers.Layer):
    """Adds a learned embedding of the time-step index to an embedded sequence.

    Args:
        maxlen: number of distinct positions the embedding table can hold.
        embed_dim: width of each position vector; it has to match the last
            axis of the tensor passed to ``call`` for the sum to broadcast.
    """

    def __init__(self, maxlen, embed_dim):
        super(PositionEmbedding, self).__init__()
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        """Return ``x`` plus one position vector per time step.

        Args:
            x: tensor shaped (..., time steps, embed_dim).
        """
        # the position table is (time steps, embed_dim), so the number of positions
        # must come from the second-to-last axis of x; the last axis is the embedding
        # width and would only give a broadcastable result when both sizes coincide
        seq_len = tf.shape(x)[-2]
        positions = tf.range(start=0, limit=seq_len, delta=1)
        positions = self.pos_emb(positions)
        return x + positions

In [10]:
# get datasets for selected substation, load them as NumPy arrays

In [11]:
# training inputs, read from a path relative to the notebook's working directory
x_train = np.load('data/256_to_24_train_hourly.npy')

In [12]:
# training targets that belong to x_train
y_train = np.load('data/256_to_24_train_targets.npy')

In [13]:
# sanity check: inputs and targets should have the same number of rows
x_train.shape, y_train.shape

((17824, 256), (17824, 24))

In [14]:
# evaluation inputs, read from a path relative to the notebook's working directory
x_eval = np.load('data/256_to_24_eval_hourly.npy')

In [15]:
# evaluation targets that belong to x_eval
y_eval = np.load('data/256_to_24_eval_targets.npy')

In [16]:
# sanity check: inputs and targets should have the same number of rows
x_eval.shape, y_eval.shape

((1984, 256), (1984, 24))

In [25]:
# number of time steps in each input sequence; also used as the maximal length
# (number of distinct positions) for the positional encoding
num_timesteps = 256

In [26]:
# input layer for the Keras functional API
# each sample is a sequence of num_timesteps scalar values; the embedding
# dimension is only introduced later, by the value-embedding layer
input_layer = layers.Input(shape=(num_timesteps,))
input_layer

<KerasTensor: shape=(None, 256) dtype=float32 (created by layer 'input_2')>

In [27]:
# append a trailing channel axis, (None, 256) -> (None, 256, 1), which the
# convolutional value-embedding layer needs
expanded_input_layer = tf.expand_dims(input_layer, axis=-1)
expanded_input_layer

<KerasTensor: shape=(None, 256, 1) dtype=float32 (created by layer 'tf.expand_dims_1')>

In [28]:
# value embedding: a width-3 1-D convolution (not a Dense layer) that lifts every
# scalar time step to 32 channels; "same" padding keeps the number of time steps
conv_value_embedding = layers.Conv1D(
    filters=32, kernel_size=3, padding="same", activation="relu"
)
value_embedding_layer = conv_value_embedding(expanded_input_layer)
value_embedding_layer

<KerasTensor: shape=(None, 256, 32) dtype=float32 (created by layer 'conv1d_1')>

In [29]:
# start with a simple position encoding
# for instance, the one in Keras Transformer-encoder block for text classification

In [32]:
# integer indices 0 .. num_timesteps - 1, one per time step of the input sequence
positions_to_encode = tf.range(num_timesteps)
positions_to_encode

<tf.Tensor: shape=(256,), dtype=int32, numpy=
array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
        13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
        26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
        39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
        52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
        65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
        78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
        91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
       104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
       117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
       130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
       143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155,
       156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168,
       169, 170, 1

In [33]:
# look up one embedding vector per position index
# the embedding width is read from the value embedding, so both tensors can be added
# element-wise; this also removes the dependency on `embed_dim`, which is only
# defined in a later cell and raises NameError on Restart & Run All
# NOTE(review): the layer is called on a concrete tensor, so the result is a concrete
# tf.Tensor holding the freshly initialised embedding values
position_embedding_layer = layers.Embedding(input_dim=num_timesteps,
                                            output_dim=value_embedding_layer.shape[-1])(positions_to_encode)
position_embedding_layer

<tf.Tensor: shape=(256, 32), dtype=float32, numpy=
array([[-0.00018823,  0.04932977,  0.03218402, ..., -0.034956  ,
        -0.03760637, -0.01789354],
       [ 0.01210878,  0.02822688,  0.03157374, ..., -0.00940269,
        -0.00340117,  0.01310406],
       [ 0.0178569 , -0.03671504,  0.04829026, ...,  0.01711998,
        -0.00582873,  0.00629357],
       ...,
       [-0.03976671,  0.03521476,  0.0437463 , ...,  0.00367503,
        -0.02330271,  0.00906388],
       [-0.00106432, -0.01607291,  0.00646626, ..., -0.03976947,
         0.00817459, -0.03963844],
       [ 0.04193385, -0.01860239, -0.0338046 , ..., -0.04548728,
        -0.03488844, -0.00129445]], dtype=float32)>

In [41]:
# add the positional embedding (256, 32) to the value embedding (None, 256, 32);
# the positional term is broadcast over the batch axis
# NOTE(review): position_embedding_layer is a concrete tf.Tensor (see its output above),
# so it enters the model as a fixed addend; its Embedding weights are presumably not
# trained together with the model — confirm, and consider applying the embedding
# inside the model instead
input_to_transformer_block = value_embedding_layer + position_embedding_layer
input_to_transformer_block

<KerasTensor: shape=(None, 256, 32) dtype=float32 (created by layer 'tf.__operators__.add_2')>

In [36]:
# width of every time-step vector entering the transformer block; TransformerBlock
# also uses it as the per-head key dimension of its attention layer
# it has to equal the number of filters of the value-embedding Conv1D (32)
embed_dim = 32

In [37]:
# number of attention heads in the transformer block
num_heads = 2

In [38]:
# hidden layer size of the feed-forward network inside the transformer block
ff_dim = 32

In [39]:
# dropout rate used by both dropout layers inside the transformer block
rate = 0.1

In [40]:
# build the encoder block from the hyperparameters defined in the cells above
transformer_block = TransformerBlock(
    embed_dim=embed_dim, num_heads=num_heads, ff_dim=ff_dim, rate=rate
)

In [42]:
# apply the transformer block; the output keeps the shape of its input
output_from_transformer_block = transformer_block(input_to_transformer_block)
output_from_transformer_block

<KerasTensor: shape=(None, 256, 32) dtype=float32 (created by layer 'transformer_block')>

In [None]:
# processing the output from transformer block towards the target
# case 1: based on TransformerBlock example at
# https://keras.io/examples/nlp/text_classification_with_transformer/

In [44]:
# average over the time axis: (None, 256, 32) -> (None, 32)
output_from_pooling = layers.GlobalAveragePooling1D()(output_from_transformer_block)
output_from_pooling

<KerasTensor: shape=(None, 32) dtype=float32 (created by layer 'global_average_pooling1d_1')>

In [55]:
# number of forecast steps, i.e. the length of each target vector
num_targets = 24

In [56]:
# copy the pooled vector once per forecast step: (None, 32) -> (None, 24, 32)
# NOTE(review): all num_targets steps receive the identical vector and the layers that
# follow are applied per step with shared weights, so with dropout inactive every
# forecast step would get the same prediction — confirm this is intended
repeated = layers.RepeatVector(num_targets)(output_from_pooling)
repeated

<KerasTensor: shape=(None, 24, 32) dtype=float32 (created by layer 'repeat_vector')>

In [59]:
# dropout placed ahead of the first dense layer of the output head
first_dropout = layers.Dropout(0.1)

In [60]:
# apply the dropout to each of the repeated forecast steps
distributed_first_dropout = layers.TimeDistributed(first_dropout)(repeated)
distributed_first_dropout

<KerasTensor: shape=(None, 24, 32) dtype=float32 (created by layer 'time_distributed')>

In [61]:
# hidden dense layer of the output head
units_in_first_dense = 16
first_dense = layers.Dense(units_in_first_dense, activation="relu")

In [62]:
# the same dense layer (shared weights) is applied to every forecast step
distributed_first_dense = layers.TimeDistributed(first_dense)(distributed_first_dropout)
distributed_first_dense

<KerasTensor: shape=(None, 24, 16) dtype=float32 (created by layer 'time_distributed_1')>

In [63]:
# dropout placed ahead of the final dense layer of the output head
second_dropout = layers.Dropout(0.1)

In [64]:
# apply the second dropout to each forecast step
distributed_second_dropout = layers.TimeDistributed(second_dropout)(distributed_first_dense)
distributed_second_dropout

<KerasTensor: shape=(None, 24, 16) dtype=float32 (created by layer 'time_distributed_2')>

In [65]:
# final layer: one value per forecast step
# the sigmoid keeps every prediction inside (0, 1)
# NOTE(review): assumes the targets are scaled to that range — TODO confirm
units_in_second_dense = 1
second_dense = layers.Dense(units_in_second_dense, activation="sigmoid")

In [66]:
# the same output layer (shared weights) is applied to every forecast step
distributed_second_dense = layers.TimeDistributed(second_dense)(distributed_second_dropout)
distributed_second_dense

<KerasTensor: shape=(None, 24, 1) dtype=float32 (created by layer 'time_distributed_3')>

In [67]:
# functional model from the raw sequence input to the per-step predictions
model = keras.Model(inputs=input_layer, outputs=distributed_second_dense)

In [73]:
# Adam optimizer, mean squared error loss, RMSE reported as an additional metric
model.compile(
    optimizer="adam",
    loss="mse",
    metrics=[tf.keras.metrics.RootMeanSquaredError()],
)

In [75]:
# train for a fixed 100 epochs and evaluate on the held-out set after every epoch
# NOTE(review): y_train / y_eval are 2-D (rows, 24) while the model output is
# (None, 24, 1); this presumably relies on Keras reconciling the trailing singleton
# axis inside the loss and the metric — confirm
history = model.fit(
    x_train, y_train, batch_size=32, epochs=100, validation_data=(x_eval, y_eval)
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100
