In [1]:
# transformer implementation in Keras and TensorFlow 1.15

In [2]:
# IMPORTANT!
# code must be used from Python source
# then, complete the Transformer model using Keras MHA layer, on TF 2.4

In [3]:
# fourth experiment:
# TensorFlow 1.15
# (there is no support for Keras MultiHeadAttention or TensorFlow addons on TF 1.15!)

# MultiHeadAttention from TensorFlow AddOns source
# Transformer-encoder only (autoencoder option)
# value embedding with Conv1D
# basic positional encoding with Keras embedding
# encoder output to multi-step target with TimeDistributed

In [4]:
import numpy as np

In [5]:
import tensorflow as tf
# from tensorflow import keras
# from tensorflow.keras import layers

In [6]:
print(tf.__version__)

2.4.1


In [7]:
# required for TFA MultiHeadAttention
import typing
import warnings

In [8]:
class MultiHeadAttention(tf.keras.layers.Layer):
    r"""MultiHead Attention layer.
    Defines the MultiHead Attention operation as described in
    [Attention Is All You Need](https://arxiv.org/abs/1706.03762) which takes
    in the tensors `query`, `key`, and `value`, and returns the dot-product attention
    between them:
    >>> mha = MultiHeadAttention(head_size=128, num_heads=12)
    >>> query = np.random.rand(3, 5, 4) # (batch_size, query_elements, query_depth)
    >>> key = np.random.rand(3, 6, 5) # (batch_size, key_elements, key_depth)
    >>> value = np.random.rand(3, 6, 6) # (batch_size, key_elements, value_depth)
    >>> attention = mha([query, key, value]) # (batch_size, query_elements, value_depth)
    >>> attention.shape
    TensorShape([3, 5, 6])
    If `value` is not given then internally `value = key` will be used:
    >>> mha = MultiHeadAttention(head_size=128, num_heads=12)
    >>> query = np.random.rand(3, 5, 5) # (batch_size, query_elements, query_depth)
    >>> key = np.random.rand(3, 6, 10) # (batch_size, key_elements, key_depth)
    >>> attention = mha([query, key]) # (batch_size, query_elements, key_depth)
    >>> attention.shape
    TensorShape([3, 5, 10])
    Args:
        head_size: int, dimensionality of the `query`, `key` and `value` tensors
            after the linear transformation.
        num_heads: int, number of attention heads.
        output_size: int, dimensionality of the output space, if `None` then the
            input dimension of `value` or `key` will be used,
            default `None`.
        dropout: float, `rate` parameter for the dropout layer that is
            applied to attention after softmax,
        default `0`.
        use_projection_bias: bool, whether to use a bias term after the linear
            output projection.
        return_attn_coef: bool, if `True`, return the attention coefficients as
            an additional output argument.
        kernel_initializer: initializer, initializer for the kernel weights.
        kernel_regularizer: regularizer, regularizer for the kernel weights.
        kernel_constraint: constraint, constraint for the kernel weights.
        bias_initializer: initializer, initializer for the bias weights.
        bias_regularizer: regularizer, regularizer for the bias weights.
        bias_constraint: constraint, constraint for the bias weights.
    Call Args:
        inputs:  List of `[query, key, value]` where
            * `query`: Tensor of shape `(..., query_elements, query_depth)`
            * `key`: `Tensor of shape '(..., key_elements, key_depth)`
            * `value`: Tensor of shape `(..., key_elements, value_depth)`, optional, if not given `key` will be used.
        mask: a binary Tensor of shape `[batch_size?, num_heads?, query_elements, key_elements]`
        which specifies which query elements can attendo to which key elements,
        `1` indicates attention and `0` indicates no attention.
    Output shape:
        * `(..., query_elements, output_size)` if `output_size` is given, else
        * `(..., query_elements, value_depth)` if `value` is given, else
        * `(..., query_elements, key_depth)`
    """

    def __init__(
        self,
        head_size: int,
        num_heads: int,
        output_size: int = None,
        dropout: float = 0.0,
        use_projection_bias: bool = True,
        return_attn_coef: bool = False,
        kernel_initializer: typing.Union[str, typing.Callable] = "glorot_uniform",
        kernel_regularizer: typing.Union[str, typing.Callable] = None,
        kernel_constraint: typing.Union[str, typing.Callable] = None,
        bias_initializer: typing.Union[str, typing.Callable] = "zeros",
        bias_regularizer: typing.Union[str, typing.Callable] = None,
        bias_constraint: typing.Union[str, typing.Callable] = None,
        **kwargs,
    ):
        warnings.warn(
            "`MultiHeadAttention` will be deprecated in Addons 0.13. "
            "Please use `tf.keras.layers.MultiHeadAttention` instead.",
            DeprecationWarning,
        )

        super().__init__(**kwargs)

        if output_size is not None and output_size < 1:
            raise ValueError("output_size must be a positive number")

        self.head_size = head_size
        self.num_heads = num_heads
        self.output_size = output_size
        self.use_projection_bias = use_projection_bias
        self.return_attn_coef = return_attn_coef

        self.kernel_initializer = tf.keras.initializers.get(kernel_initializer)
        self.kernel_regularizer = tf.keras.regularizers.get(kernel_regularizer)
        self.kernel_constraint = tf.keras.constraints.get(kernel_constraint)
        self.bias_initializer = tf.keras.initializers.get(bias_initializer)
        self.bias_regularizer = tf.keras.regularizers.get(bias_regularizer)
        self.bias_constraint = tf.keras.constraints.get(bias_constraint)

        self.dropout = tf.keras.layers.Dropout(dropout)
        self._dropout_rate = dropout

    def build(self, input_shape):

        num_query_features = input_shape[0][-1]
        num_key_features = input_shape[1][-1]
        num_value_features = (
            input_shape[2][-1] if len(input_shape) > 2 else num_key_features
        )
        output_size = (
            self.output_size if self.output_size is not None else num_value_features
        )

        self.query_kernel = self.add_weight(
            name="query_kernel",
            shape=[self.num_heads, num_query_features, self.head_size],
            initializer=self.kernel_initializer,
            regularizer=self.kernel_regularizer,
            constraint=self.kernel_constraint,
        )
        self.key_kernel = self.add_weight(
            name="key_kernel",
            shape=[self.num_heads, num_key_features, self.head_size],
            initializer=self.kernel_initializer,
            regularizer=self.kernel_regularizer,
            constraint=self.kernel_constraint,
        )
        self.value_kernel = self.add_weight(
            name="value_kernel",
            shape=[self.num_heads, num_value_features, self.head_size],
            initializer=self.kernel_initializer,
            regularizer=self.kernel_regularizer,
            constraint=self.kernel_constraint,
        )
        self.projection_kernel = self.add_weight(
            name="projection_kernel",
            shape=[self.num_heads, self.head_size, output_size],
            initializer=self.kernel_initializer,
            regularizer=self.kernel_regularizer,
            constraint=self.kernel_constraint,
        )

        if self.use_projection_bias:
            self.projection_bias = self.add_weight(
                name="projection_bias",
                shape=[output_size],
                initializer=self.bias_initializer,
                regularizer=self.bias_regularizer,
                constraint=self.bias_constraint,
            )
        else:
            self.projection_bias = None

        super().build(input_shape)

    def call(self, inputs, training=None, mask=None):

        # einsum nomenclature
        # ------------------------
        # N = query elements
        # M = key/value elements
        # H = heads
        # I = input features
        # O = output features

        query = inputs[0]
        key = inputs[1]
        value = inputs[2] if len(inputs) > 2 else key

        # verify shapes
        if key.shape[-2] != value.shape[-2]:
            raise ValueError(
                "the number of elements in 'key' must be equal to the same as the number of elements in 'value'"
            )

        if mask is not None:
            if len(mask.shape) < 2:
                raise ValueError("'mask' must have atleast 2 dimensions")
            if query.shape[-2] != mask.shape[-2]:
                raise ValueError(
                    "mask's second to last dimension must be equal to the number of elements in 'query'"
                )
            if key.shape[-2] != mask.shape[-1]:
                raise ValueError(
                    "mask's last dimension must be equal to the number of elements in 'key'"
                )

        # Linear transformations
        query = tf.einsum("...NI , HIO -> ...NHO", query, self.query_kernel)
        key = tf.einsum("...MI , HIO -> ...MHO", key, self.key_kernel)
        value = tf.einsum("...MI , HIO -> ...MHO", value, self.value_kernel)

        # Scale dot-product, doing the division to either query or key
        # instead of their product saves some computation
        depth = tf.constant(self.head_size, dtype=query.dtype)
        query /= tf.sqrt(depth)

        # Calculate dot product attention
        logits = tf.einsum("...NHO,...MHO->...HNM", query, key)

        # apply mask
        if mask is not None:
            mask = tf.cast(mask, tf.float32)

            # possibly expand on the head dimension so broadcasting works
            if len(mask.shape) != len(logits.shape):
                mask = tf.expand_dims(mask, -3)

            logits += -10e9 * (1.0 - mask)

        attn_coef = tf.nn.softmax(logits)

        # attention dropout
        attn_coef_dropout = self.dropout(attn_coef, training=training)

        # attention * value
        multihead_output = tf.einsum("...HNM,...MHI->...NHI", attn_coef_dropout, value)

        # Run the outputs through another linear projection layer. Recombining heads
        # is automatically done.
        output = tf.einsum(
            "...NHI,HIO->...NO", multihead_output, self.projection_kernel
        )

        if self.projection_bias is not None:
            output += self.projection_bias

        if self.return_attn_coef:
            return output, attn_coef
        else:
            return output

    def compute_output_shape(self, input_shape):
        num_value_features = (
            input_shape[2][-1] if len(input_shape) > 2 else input_shape[1][-1]
        )
        output_size = (
            self.output_size if self.output_size is not None else num_value_features
        )

        output_shape = input_shape[0][:-1] + (output_size,)

        if self.return_attn_coef:
            num_query_elements = input_shape[0][-2]
            num_key_elements = input_shape[1][-2]
            attn_coef_shape = input_shape[0][:-2] + (
                self.num_heads,
                num_query_elements,
                num_key_elements,
            )

            return output_shape, attn_coef_shape
        else:
            return output_shape

    def get_config(self):
        config = super().get_config()

        config.update(
            head_size=self.head_size,
            num_heads=self.num_heads,
            output_size=self.output_size,
            dropout=self._dropout_rate,
            use_projection_bias=self.use_projection_bias,
            return_attn_coef=self.return_attn_coef,
            kernel_initializer=tf.keras.initializers.serialize(self.kernel_initializer),
            kernel_regularizer=tf.keras.regularizers.serialize(self.kernel_regularizer),
            kernel_constraint=tf.keras.constraints.serialize(self.kernel_constraint),
            bias_initializer=tf.keras.initializers.serialize(self.bias_initializer),
            bias_regularizer=tf.keras.regularizers.serialize(self.bias_regularizer),
            bias_constraint=tf.keras.constraints.serialize(self.bias_constraint),
        )

        return config

In [9]:
class EncoderLayer(tf.keras.layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, dropout=0.1):
        super(EncoderLayer, self).__init__()
        self.attention_layer = MultiHeadAttention(head_size=embed_dim, num_heads=num_heads)
        self.ff_layer = tf.keras.Sequential(
            [tf.keras.layers.Dense(ff_dim, activation="relu"),
             tf.keras.layers.Dense(embed_dim)]
        )
        self.add_norm_layer_1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.add_norm_layer_2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.dropout_1 = tf.keras.layers.Dropout(dropout)
        self.dropout_2 = tf.keras.layers.Dropout(dropout)

    def call(self, inputs, training):
        attention_output = self.attention_layer([inputs, inputs])
        attention_output = self.dropout_1(attention_output, training=training)
        input_to_ffn = self.add_norm_layer_1(inputs + attention_output)
        ffn_output = self.ff_layer(input_to_ffn)
        ffn_output = self.dropout_2(ffn_output, training=training)
        return self.add_norm_layer_2(input_to_ffn + ffn_output)

In [10]:
# get datasets for selected substation, load them as NumPy arrays

In [11]:
x_train = np.load('data/256_to_24_train_hourly.npy')

In [12]:
y_train = np.load('data/256_to_24_train_targets.npy')

In [13]:
x_train.shape, y_train.shape

((17824, 256), (17824, 24))

In [14]:
x_eval = np.load('data/256_to_24_eval_hourly.npy')

In [15]:
y_eval = np.load('data/256_to_24_eval_targets.npy')

In [16]:
x_eval.shape, y_eval.shape

((1984, 256), (1984, 24))

In [17]:
# use number of timesteps in the input sequence as the limit for positional encoding
num_timesteps = 256

In [18]:
# input layer for Keras functional
# use embedding dimension from SLDB as the input dimensionality
input_layer = tf.keras.layers.Input(shape=(num_timesteps,))
input_layer

<KerasTensor: shape=(None, 256) dtype=float32 (created by layer 'input_1')>

In [19]:
# a layer to expand dimensions of input tensor,
# required to project to a d_model space with a convolutional layer
expanded_input_layer = tf.expand_dims(input_layer, axis=2)
expanded_input_layer

<KerasTensor: shape=(None, 256, 1) dtype=float32 (created by layer 'tf.expand_dims')>

In [20]:
# dimensionality of Q, K, V
embed_dim = 64

In [21]:
# a simple Conv1D layer to project time series data (scalar) to d_model
value_embedding_layer = tf.keras.layers.Conv1D(filters=embed_dim,
                                               kernel_size=3,
                                               activation="relu",
                                               padding="same")(expanded_input_layer)
value_embedding_layer

<KerasTensor: shape=(None, 256, 64) dtype=float32 (created by layer 'conv1d')>

In [22]:
# start with a simple position encoding
# for instance, the one in Keras Transformer-encoder block for text classification

In [1]:
positions_to_encode = tf.range(start=0, limit=num_timesteps, delta=1)

NameError: name 'tf' is not defined

In [24]:
position_embedding_layer = tf.keras.layers.Embedding(input_dim=num_timesteps,
                                                     output_dim=embed_dim) (positions_to_encode)
position_embedding_layer

<tf.Tensor: shape=(256, 64), dtype=float32, numpy=
array([[ 0.02658891, -0.03204795, -0.03786908, ..., -0.02055534,
        -0.02937177, -0.04134696],
       [ 0.04068748, -0.00821483,  0.03064803, ...,  0.03557043,
        -0.04187651,  0.00584674],
       [ 0.03897362,  0.03680462, -0.04512105, ...,  0.00641111,
        -0.04568533, -0.02206776],
       ...,
       [ 0.02469539, -0.00703088, -0.01136528, ...,  0.00360604,
         0.01088633,  0.02674219],
       [-0.01031395,  0.00929768, -0.00996647, ..., -0.00400395,
         0.00120878, -0.00935572],
       [ 0.02701252,  0.02953235,  0.00127114, ...,  0.02082128,
        -0.04961213,  0.005363  ]], dtype=float32)>

In [25]:
input_to_transformer_block = value_embedding_layer + position_embedding_layer
input_to_transformer_block

<KerasTensor: shape=(None, 256, 64) dtype=float32 (created by layer 'tf.__operators__.add')>

In [26]:
num_heads = 4
ff_dim = 64
dropout = 0.2

In [27]:
encoder_layer_1 = EncoderLayer(embed_dim=embed_dim, num_heads=num_heads, ff_dim=ff_dim, dropout=dropout)



In [28]:
encoder_layer_2 = EncoderLayer(embed_dim=embed_dim, num_heads=num_heads, ff_dim=ff_dim, dropout=dropout)



In [29]:
encoder_layer_3 = EncoderLayer(embed_dim=embed_dim, num_heads=num_heads, ff_dim=ff_dim, dropout=dropout)



In [30]:
encoder_layer_4 = EncoderLayer(embed_dim=embed_dim, num_heads=num_heads, ff_dim=ff_dim, dropout=dropout)



In [31]:
output_from_encoder_1 = encoder_layer_1(input_to_transformer_block)
output_from_encoder_2 = encoder_layer_2(output_from_encoder_1)
output_from_encoder_3 = encoder_layer_3(output_from_encoder_2)
output_from_encoder_4 = encoder_layer_4(output_from_encoder_3)

output_from_encoder_4

<KerasTensor: shape=(None, 256, 64) dtype=float32 (created by layer 'encoder_layer_3')>

In [32]:
# processing the output from transformer block towards the target
# case 1: based on TransformerBlock example at
# https://keras.io/examples/nlp/text_classification_with_transformer/

In [49]:
output_from_pooling = tf.keras.layers.GlobalAveragePooling1D()(output_from_encoder_4)
output_from_pooling

<KerasTensor: shape=(None, 64) dtype=float32 (created by layer 'global_average_pooling1d_1')>

In [50]:
num_targets = 24

In [51]:
repeated = tf.keras.layers.RepeatVector(num_targets)(output_from_pooling)
repeated

<KerasTensor: shape=(None, 24, 64) dtype=float32 (created by layer 'repeat_vector_1')>

In [52]:
first_dropout = tf.keras.layers.Dropout(0.1)

In [53]:
distributed_first_dropout = tf.keras.layers.TimeDistributed(first_dropout)(repeated)
distributed_first_dropout

<KerasTensor: shape=(None, 24, 64) dtype=float32 (created by layer 'time_distributed_4')>

In [54]:
units_in_first_dense = 32
first_dense = tf.keras.layers.Dense(units_in_first_dense, activation="relu")

In [55]:
distributed_first_dense = tf.keras.layers.TimeDistributed(first_dense)(distributed_first_dropout)
distributed_first_dense

<KerasTensor: shape=(None, 24, 32) dtype=float32 (created by layer 'time_distributed_5')>

In [56]:
second_dropout = tf.keras.layers.Dropout(0.1)

In [57]:
distributed_second_dropout = tf.keras.layers.TimeDistributed(second_dropout)(distributed_first_dense)
distributed_second_dropout

<KerasTensor: shape=(None, 24, 32) dtype=float32 (created by layer 'time_distributed_6')>

In [58]:
units_in_second_dense = 1
second_dense = tf.keras.layers.Dense(units_in_second_dense, activation="sigmoid")

In [59]:
distributed_second_dense = tf.keras.layers.TimeDistributed(second_dense)(distributed_second_dropout)
distributed_second_dense

<KerasTensor: shape=(None, 24, 1) dtype=float32 (created by layer 'time_distributed_7')>

In [60]:
squeezed_output = tf.keras.backend.squeeze(distributed_second_dense, axis=-1)
squeezed_output

<KerasTensor: shape=(None, 24) dtype=float32 (created by layer 'tf.compat.v1.squeeze_1')>

In [61]:
model = tf.keras.Model(inputs=input_layer, outputs=squeezed_output)

In [62]:
model.compile("adam", "mse", metrics=[tf.keras.metrics.RootMeanSquaredError()])

In [63]:
history = model.fit(
    x_train, y_train, batch_size=32, epochs=1, validation_data=(x_eval, y_eval)
)

