In [1]:
# transformer implementation in Keras and TensorFlow 1.15

In [2]:
# base code for transformer block implements multi-head attention from Keras, only on TF 2X
# question: is it possible to change it to use multi-head attention from TF AddOns, on TF 1.15?

In [3]:
# IMPORTANT!
# there is no support for TensorFlow addons on TF 1.15
# code must be used from Python source
# then, complete the Transformer model using Keras MHA layer, on TF 2.4

In [4]:
# second experiment:
# TensorFlow 2.4
# ScaledDotProduct and InterpretableMultiHeadAttention from Temporal Fusion Transformer project
# Transformer-encoder only (autoencoder option)
# value embedding with Conv1D
# basic positional encoding with Keras embedding
# encoder layer with MHA
# encoder output to linear to multi-step target (TimeDistributed)

In [5]:
import numpy as np

In [6]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [7]:
print(tf.__version__)

2.4.1


In [8]:
# get datasets for selected substation, load them as NumPy arrays

In [9]:
x_train = np.load('data/256_to_24_train_hourly.npy')

In [10]:
y_train = np.load('data/256_to_24_train_targets.npy')

In [11]:
x_train.shape, y_train.shape

((17824, 256), (17824, 24))

In [12]:
x_eval = np.load('data/256_to_24_eval_hourly.npy')

In [13]:
y_eval = np.load('data/256_to_24_eval_targets.npy')

In [14]:
x_eval.shape, y_eval.shape

((1984, 256), (1984, 24))

In [15]:
# use number of timesteps in the input sequence as the limit for positional encoding
num_timesteps = 256

In [16]:
# input layer for Keras functional
# use embedding dimension from SLDB as the input dimensionality
input_layer = layers.Input(shape=(num_timesteps,))
input_layer

<KerasTensor: shape=(None, 256) dtype=float32 (created by layer 'input_1')>

In [17]:
# a layer to expand dimensions of input tensor,
# required to project to a d_model space with a convolutional layer
expanded_input_layer = tf.expand_dims(input_layer, axis=2)
expanded_input_layer

<KerasTensor: shape=(None, 256, 1) dtype=float32 (created by layer 'tf.expand_dims')>

In [18]:
# a simple Conv1D layer to project time series data (scalar) to d_model
value_embedding_layer = layers.Conv1D(filters=32,
                                      kernel_size=3,
                                      activation="relu",
                                      padding="same")(expanded_input_layer)
value_embedding_layer

<KerasTensor: shape=(None, 256, 32) dtype=float32 (created by layer 'conv1d')>

In [19]:
# start with a simple position encoding
# for instance, the one in Keras Transformer-encoder block for text classification

In [20]:
positions_to_encode = tf.range(start=0, limit=num_timesteps, delta=1)
positions_to_encode

<tf.Tensor: shape=(256,), dtype=int32, numpy=
array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
        13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
        26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
        39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
        52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
        65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
        78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
        91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
       104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
       117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
       130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
       143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155,
       156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168,
       169, 170, 1

In [21]:
# dimensionality of Q, K, V
embed_dim = 32

In [22]:
position_embedding_layer = layers.Embedding(input_dim=num_timesteps,
                                            output_dim=embed_dim) (positions_to_encode)
position_embedding_layer

<tf.Tensor: shape=(256, 32), dtype=float32, numpy=
array([[-1.5643168e-02,  3.8953926e-02,  4.5481611e-02, ...,
         2.2824656e-02, -1.5913226e-02, -3.2707453e-02],
       [-2.7845129e-03, -1.9653786e-02, -8.0652013e-03, ...,
         8.2932413e-05, -4.7322180e-02, -5.0240867e-03],
       [-4.4209003e-02, -2.6102280e-02,  2.1990426e-03, ...,
        -3.9558709e-02, -2.6990287e-03, -3.3473648e-02],
       ...,
       [ 3.5214666e-02,  1.7538894e-02,  3.5113189e-02, ...,
         4.1705180e-02, -1.7804991e-02,  3.3349935e-02],
       [ 4.4760194e-02, -1.6459297e-02,  3.0569855e-02, ...,
        -2.7947832e-02,  4.6667445e-02, -2.3421491e-02],
       [-1.2169778e-02,  3.9804470e-02, -2.7731478e-02, ...,
         1.2008548e-02, -4.7910582e-02, -2.2382105e-02]], dtype=float32)>

In [23]:
input_to_transformer_block = value_embedding_layer + position_embedding_layer
input_to_transformer_block

<KerasTensor: shape=(None, 256, 32) dtype=float32 (created by layer 'tf.__operators__.add')>

In [24]:
# number of attention heads
num_heads = 2

In [25]:
# hidden layer size in feed forward network inside transformer
ff_dim = 32

In [26]:
# dropout rate inside the transformer block
rate = 0.1

In [28]:
# transformer_block = TransformerBlock(embed_dim=embed_dim,
#                                      num_heads=num_heads,
#                                      ff_dim=ff_dim,
#                                      rate=rate)

In [29]:
# output_from_transformer_block = transformer_block(input_to_transformer_block)
# output_from_transformer_block

In [44]:
import typing
import warnings

In [46]:
class MultiHeadAttention(tf.keras.layers.Layer):
    r"""MultiHead Attention layer.
    Defines the MultiHead Attention operation as described in
    [Attention Is All You Need](https://arxiv.org/abs/1706.03762) which takes
    in the tensors `query`, `key`, and `value`, and returns the dot-product attention
    between them:
    >>> mha = MultiHeadAttention(head_size=128, num_heads=12)
    >>> query = np.random.rand(3, 5, 4) # (batch_size, query_elements, query_depth)
    >>> key = np.random.rand(3, 6, 5) # (batch_size, key_elements, key_depth)
    >>> value = np.random.rand(3, 6, 6) # (batch_size, key_elements, value_depth)
    >>> attention = mha([query, key, value]) # (batch_size, query_elements, value_depth)
    >>> attention.shape
    TensorShape([3, 5, 6])
    If `value` is not given then internally `value = key` will be used:
    >>> mha = MultiHeadAttention(head_size=128, num_heads=12)
    >>> query = np.random.rand(3, 5, 5) # (batch_size, query_elements, query_depth)
    >>> key = np.random.rand(3, 6, 10) # (batch_size, key_elements, key_depth)
    >>> attention = mha([query, key]) # (batch_size, query_elements, key_depth)
    >>> attention.shape
    TensorShape([3, 5, 10])
    Args:
        head_size: int, dimensionality of the `query`, `key` and `value` tensors
            after the linear transformation.
        num_heads: int, number of attention heads.
        output_size: int, dimensionality of the output space, if `None` then the
            input dimension of `value` or `key` will be used,
            default `None`.
        dropout: float, `rate` parameter for the dropout layer that is
            applied to attention after softmax,
        default `0`.
        use_projection_bias: bool, whether to use a bias term after the linear
            output projection.
        return_attn_coef: bool, if `True`, return the attention coefficients as
            an additional output argument.
        kernel_initializer: initializer, initializer for the kernel weights.
        kernel_regularizer: regularizer, regularizer for the kernel weights.
        kernel_constraint: constraint, constraint for the kernel weights.
        bias_initializer: initializer, initializer for the bias weights.
        bias_regularizer: regularizer, regularizer for the bias weights.
        bias_constraint: constraint, constraint for the bias weights.
    Call Args:
        inputs:  List of `[query, key, value]` where
            * `query`: Tensor of shape `(..., query_elements, query_depth)`
            * `key`: `Tensor of shape '(..., key_elements, key_depth)`
            * `value`: Tensor of shape `(..., key_elements, value_depth)`, optional, if not given `key` will be used.
        mask: a binary Tensor of shape `[batch_size?, num_heads?, query_elements, key_elements]`
        which specifies which query elements can attendo to which key elements,
        `1` indicates attention and `0` indicates no attention.
    Output shape:
        * `(..., query_elements, output_size)` if `output_size` is given, else
        * `(..., query_elements, value_depth)` if `value` is given, else
        * `(..., query_elements, key_depth)`
    """

    def __init__(
        self,
        head_size: int,
        num_heads: int,
        output_size: int = None,
        dropout: float = 0.0,
        use_projection_bias: bool = True,
        return_attn_coef: bool = False,
        kernel_initializer: typing.Union[str, typing.Callable] = "glorot_uniform",
        kernel_regularizer: typing.Union[str, typing.Callable] = None,
        kernel_constraint: typing.Union[str, typing.Callable] = None,
        bias_initializer: typing.Union[str, typing.Callable] = "zeros",
        bias_regularizer: typing.Union[str, typing.Callable] = None,
        bias_constraint: typing.Union[str, typing.Callable] = None,
        **kwargs,
    ):
        warnings.warn(
            "`MultiHeadAttention` will be deprecated in Addons 0.13. "
            "Please use `tf.keras.layers.MultiHeadAttention` instead.",
            DeprecationWarning,
        )

        super().__init__(**kwargs)

        if output_size is not None and output_size < 1:
            raise ValueError("output_size must be a positive number")

        self.head_size = head_size
        self.num_heads = num_heads
        self.output_size = output_size
        self.use_projection_bias = use_projection_bias
        self.return_attn_coef = return_attn_coef

        self.kernel_initializer = tf.keras.initializers.get(kernel_initializer)
        self.kernel_regularizer = tf.keras.regularizers.get(kernel_regularizer)
        self.kernel_constraint = tf.keras.constraints.get(kernel_constraint)
        self.bias_initializer = tf.keras.initializers.get(bias_initializer)
        self.bias_regularizer = tf.keras.regularizers.get(bias_regularizer)
        self.bias_constraint = tf.keras.constraints.get(bias_constraint)

        self.dropout = tf.keras.layers.Dropout(dropout)
        self._droput_rate = dropout

    def build(self, input_shape):

        num_query_features = input_shape[0][-1]
        num_key_features = input_shape[1][-1]
        num_value_features = (
            input_shape[2][-1] if len(input_shape) > 2 else num_key_features
        )
        output_size = (
            self.output_size if self.output_size is not None else num_value_features
        )

        self.query_kernel = self.add_weight(
            name="query_kernel",
            shape=[self.num_heads, num_query_features, self.head_size],
            initializer=self.kernel_initializer,
            regularizer=self.kernel_regularizer,
            constraint=self.kernel_constraint,
        )
        self.key_kernel = self.add_weight(
            name="key_kernel",
            shape=[self.num_heads, num_key_features, self.head_size],
            initializer=self.kernel_initializer,
            regularizer=self.kernel_regularizer,
            constraint=self.kernel_constraint,
        )
        self.value_kernel = self.add_weight(
            name="value_kernel",
            shape=[self.num_heads, num_value_features, self.head_size],
            initializer=self.kernel_initializer,
            regularizer=self.kernel_regularizer,
            constraint=self.kernel_constraint,
        )
        self.projection_kernel = self.add_weight(
            name="projection_kernel",
            shape=[self.num_heads, self.head_size, output_size],
            initializer=self.kernel_initializer,
            regularizer=self.kernel_regularizer,
            constraint=self.kernel_constraint,
        )

        if self.use_projection_bias:
            self.projection_bias = self.add_weight(
                name="projection_bias",
                shape=[output_size],
                initializer=self.bias_initializer,
                regularizer=self.bias_regularizer,
                constraint=self.bias_constraint,
            )
        else:
            self.projection_bias = None

        super().build(input_shape)

    def call(self, inputs, training=None, mask=None):

        # einsum nomenclature
        # ------------------------
        # N = query elements
        # M = key/value elements
        # H = heads
        # I = input features
        # O = output features

        query = inputs[0]
        key = inputs[1]
        value = inputs[2] if len(inputs) > 2 else key

        # verify shapes
        if key.shape[-2] != value.shape[-2]:
            raise ValueError(
                "the number of elements in 'key' must be equal to the same as the number of elements in 'value'"
            )

        if mask is not None:
            if len(mask.shape) < 2:
                raise ValueError("'mask' must have atleast 2 dimensions")
            if query.shape[-2] != mask.shape[-2]:
                raise ValueError(
                    "mask's second to last dimension must be equal to the number of elements in 'query'"
                )
            if key.shape[-2] != mask.shape[-1]:
                raise ValueError(
                    "mask's last dimension must be equal to the number of elements in 'key'"
                )

        # Linear transformations
        query = tf.einsum("...NI , HIO -> ...NHO", query, self.query_kernel)
        key = tf.einsum("...MI , HIO -> ...MHO", key, self.key_kernel)
        value = tf.einsum("...MI , HIO -> ...MHO", value, self.value_kernel)

        # Scale dot-product, doing the division to either query or key
        # instead of their product saves some computation
        depth = tf.constant(self.head_size, dtype=query.dtype)
        query /= tf.sqrt(depth)

        # Calculate dot product attention
        logits = tf.einsum("...NHO,...MHO->...HNM", query, key)

        # apply mask
        if mask is not None:
            mask = tf.cast(mask, tf.float32)

            # possibly expand on the head dimension so broadcasting works
            if len(mask.shape) != len(logits.shape):
                mask = tf.expand_dims(mask, -3)

            logits += -10e9 * (1.0 - mask)

        attn_coef = tf.nn.softmax(logits)

        # attention dropout
        attn_coef_dropout = self.dropout(attn_coef, training=training)

        # attention * value
        multihead_output = tf.einsum("...HNM,...MHI->...NHI", attn_coef_dropout, value)

        # Run the outputs through another linear projection layer. Recombining heads
        # is automatically done.
        output = tf.einsum(
            "...NHI,HIO->...NO", multihead_output, self.projection_kernel
        )

        if self.projection_bias is not None:
            output += self.projection_bias

        if self.return_attn_coef:
            return output, attn_coef
        else:
            return output

    def compute_output_shape(self, input_shape):
        num_value_features = (
            input_shape[2][-1] if len(input_shape) > 2 else input_shape[1][-1]
        )
        output_size = (
            self.output_size if self.output_size is not None else num_value_features
        )

        output_shape = input_shape[0][:-1] + (output_size,)

        if self.return_attn_coef:
            num_query_elements = input_shape[0][-2]
            num_key_elements = input_shape[1][-2]
            attn_coef_shape = input_shape[0][:-2] + (
                self.num_heads,
                num_query_elements,
                num_key_elements,
            )

            return output_shape, attn_coef_shape
        else:
            return output_shape

    def get_config(self):
        config = super().get_config()

        config.update(
            head_size=self.head_size,
            num_heads=self.num_heads,
            output_size=self.output_size,
            dropout=self._droput_rate,
            use_projection_bias=self.use_projection_bias,
            return_attn_coef=self.return_attn_coef,
            kernel_initializer=tf.keras.initializers.serialize(self.kernel_initializer),
            kernel_regularizer=tf.keras.regularizers.serialize(self.kernel_regularizer),
            kernel_constraint=tf.keras.constraints.serialize(self.kernel_constraint),
            bias_initializer=tf.keras.initializers.serialize(self.bias_initializer),
            bias_regularizer=tf.keras.regularizers.serialize(self.bias_regularizer),
            bias_constraint=tf.keras.constraints.serialize(self.bias_constraint),
        )

        return config

In [47]:
transformer_attention_layer = MultiHeadAttention(head_size=embed_dim, num_heads=num_heads)



In [48]:
attention_output_layer = transformer_attention_layer(inputs=[input_to_transformer_block,
                                                             input_to_transformer_block])
attention_output_layer

<KerasTensor: shape=(None, 256, 32) dtype=float32 (created by layer 'multi_head_attention')>

In [49]:
transformer_dropout_1 = layers.Dropout(rate)

In [50]:
attention_dropout_1_layer = transformer_dropout_1(attention_output_layer)
attention_dropout_1_layer

<KerasTensor: shape=(None, 256, 32) dtype=float32 (created by layer 'dropout_3')>

In [51]:
transformer_layernorm_1 = layers.LayerNormalization(epsilon=1e-6)

In [52]:
layernorm_1_output_layer = transformer_layernorm_1(input_to_transformer_block + attention_output_layer)
layernorm_1_output_layer

<KerasTensor: shape=(None, 256, 32) dtype=float32 (created by layer 'layer_normalization')>

In [53]:
transformer_ffn_layer = keras.Sequential(
    [layers.Dense(units=ff_dim,
                  activation="relu"),
     layers.Dense(units=embed_dim)])

In [54]:
ffn_output_layer = transformer_ffn_layer(layernorm_1_output_layer)
ffn_output_layer

<KerasTensor: shape=(None, 256, 32) dtype=float32 (created by layer 'sequential')>

In [55]:
transformer_dropout_2 = layers.Dropout(rate)

In [56]:
dropout_2_layer_output = transformer_dropout_2(ffn_output_layer)
dropout_2_layer_output

<KerasTensor: shape=(None, 256, 32) dtype=float32 (created by layer 'dropout_4')>

In [57]:
transformer_layernorm_2 = layers.LayerNormalization(epsilon=1e-6)

In [58]:
output_from_transformer_block = transformer_layernorm_2(
    layernorm_1_output_layer + dropout_2_layer_output)
output_from_transformer_block

<KerasTensor: shape=(None, 256, 32) dtype=float32 (created by layer 'layer_normalization_1')>

In [59]:
# processing the output from transformer block towards the target
# case 1: based on TransformerBlock example at
# https://keras.io/examples/nlp/text_classification_with_transformer/

In [60]:
output_from_pooling = layers.GlobalAveragePooling1D()(output_from_transformer_block)
output_from_pooling

<KerasTensor: shape=(None, 32) dtype=float32 (created by layer 'global_average_pooling1d')>

In [61]:
num_targets = 24

In [62]:
repeated = layers.RepeatVector(num_targets)(output_from_pooling)
repeated

<KerasTensor: shape=(None, 24, 32) dtype=float32 (created by layer 'repeat_vector')>

In [63]:
first_dropout = layers.Dropout(0.1)

In [64]:
distributed_first_dropout = layers.TimeDistributed(first_dropout)(repeated)
distributed_first_dropout

<KerasTensor: shape=(None, 24, 32) dtype=float32 (created by layer 'time_distributed')>

In [65]:
units_in_first_dense = 16
first_dense = layers.Dense(units_in_first_dense, activation="relu")

In [66]:
distributed_first_dense = layers.TimeDistributed(first_dense)(distributed_first_dropout)
distributed_first_dense

<KerasTensor: shape=(None, 24, 16) dtype=float32 (created by layer 'time_distributed_1')>

In [67]:
second_dropout = layers.Dropout(0.1)

In [68]:
distributed_second_dropout = layers.TimeDistributed(second_dropout)(distributed_first_dense)
distributed_second_dropout

<KerasTensor: shape=(None, 24, 16) dtype=float32 (created by layer 'time_distributed_2')>

In [69]:
units_in_second_dense = 1
second_dense = layers.Dense(units_in_second_dense, activation="sigmoid")

In [70]:
distributed_second_dense = layers.TimeDistributed(second_dense)(distributed_second_dropout)
distributed_second_dense

<KerasTensor: shape=(None, 24, 1) dtype=float32 (created by layer 'time_distributed_3')>

In [71]:
model = keras.Model(inputs=input_layer, outputs=distributed_second_dense)

In [72]:
model.compile("adam", "mse", metrics=[tf.keras.metrics.RootMeanSquaredError()])

In [73]:
history = model.fit(
    x_train, y_train, batch_size=32, epochs=2, validation_data=(x_eval, y_eval)
)

Epoch 1/2
Epoch 2/2
