In [1]:
# transformer-decoder with no Seq2Seq component (autoregressive)

# no value embedding
# sine-cosine positional encoding on the hour, day, and month of timestamp
# modified transformer-encoder layer for masked self-attention

# so far the Auto Regressive Transformer Decoder architecture (ARTRFDC)
# based on Klingenbrunn is working in TensorFlow, TPU-based,
# with teacher-forcing training (only target true values are passed)

# it is intended now to include scheduled sampling functionality
# to confirm if better models can be generated passing target true values and predictions

# the basic problem with scheduled sampling on Keras is:
# the length of the input sequence to the multi-head attention component
# must be inferred outside the decoder layer,
# then this length is used to build the look-backwards mask to be passed to all decoder layers

In [2]:
import numpy as np
import pandas as pd

In [3]:
import tensorflow as tf

In [4]:
print(tf.__version__)

2.4.1


In [5]:
# required for TFA MultiHeadAttention
import typing
import warnings

In [6]:
# MHA class from TensorFlow AddOns source
# it is compatible with TF 1.15 for CloudTPU usage

In [7]:
class MultiHeadAttention(tf.keras.layers.Layer):
    r"""MultiHead Attention layer.
    Defines the MultiHead Attention operation as described in
    [Attention Is All You Need](https://arxiv.org/abs/1706.03762) which takes
    in the tensors `query`, `key`, and `value`, and returns the dot-product attention
    between them:
    >>> mha = MultiHeadAttention(head_size=128, num_heads=12)
    >>> query = np.random.rand(3, 5, 4) # (batch_size, query_elements, query_depth)
    >>> key = np.random.rand(3, 6, 5) # (batch_size, key_elements, key_depth)
    >>> value = np.random.rand(3, 6, 6) # (batch_size, key_elements, value_depth)
    >>> attention = mha([query, key, value]) # (batch_size, query_elements, value_depth)
    >>> attention.shape
    TensorShape([3, 5, 6])
    If `value` is not given then internally `value = key` will be used:
    >>> mha = MultiHeadAttention(head_size=128, num_heads=12)
    >>> query = np.random.rand(3, 5, 5) # (batch_size, query_elements, query_depth)
    >>> key = np.random.rand(3, 6, 10) # (batch_size, key_elements, key_depth)
    >>> attention = mha([query, key]) # (batch_size, query_elements, key_depth)
    >>> attention.shape
    TensorShape([3, 5, 10])
    Args:
        head_size: int, dimensionality of the `query`, `key` and `value` tensors
            after the linear transformation.
        num_heads: int, number of attention heads.
        output_size: int, dimensionality of the output space, if `None` then the
            input dimension of `value` or `key` will be used,
            default `None`.
        dropout: float, `rate` parameter for the dropout layer that is
            applied to attention after softmax,
            default `0`.
        use_projection_bias: bool, whether to use a bias term after the linear
            output projection.
        return_attn_coef: bool, if `True`, return the attention coefficients as
            an additional output argument.
        kernel_initializer: initializer, initializer for the kernel weights.
        kernel_regularizer: regularizer, regularizer for the kernel weights.
        kernel_constraint: constraint, constraint for the kernel weights.
        bias_initializer: initializer, initializer for the bias weights.
        bias_regularizer: regularizer, regularizer for the bias weights.
        bias_constraint: constraint, constraint for the bias weights.
    Call Args:
        inputs:  List of `[query, key, value]` where
            * `query`: Tensor of shape `(..., query_elements, query_depth)`
            * `key`: Tensor of shape `(..., key_elements, key_depth)`
            * `value`: Tensor of shape `(..., key_elements, value_depth)`, optional, if not given `key` will be used.
        mask: a binary Tensor of shape `[batch_size?, num_heads?, query_elements, key_elements]`
            which specifies which query elements can attend to which key elements,
            `1` indicates attention and `0` indicates no attention.
            It is cast to the dtype of the attention logits before being applied.
    Output shape:
        * `(..., query_elements, output_size)` if `output_size` is given, else
        * `(..., query_elements, value_depth)` if `value` is given, else
        * `(..., query_elements, key_depth)`
    Raises:
        ValueError: if `output_size` is given and smaller than 1, if `key` and
            `value` have a different number of elements, or if `mask` does not
            match the number of elements in `query` / `key`.
    """

    def __init__(
        self,
        head_size: int,
        num_heads: int,
        output_size: int = None,
        dropout: float = 0.0,
        use_projection_bias: bool = True,
        return_attn_coef: bool = False,
        kernel_initializer: typing.Union[str, typing.Callable] = "glorot_uniform",
        kernel_regularizer: typing.Union[str, typing.Callable] = None,
        kernel_constraint: typing.Union[str, typing.Callable] = None,
        bias_initializer: typing.Union[str, typing.Callable] = "zeros",
        bias_regularizer: typing.Union[str, typing.Callable] = None,
        bias_constraint: typing.Union[str, typing.Callable] = None,
        **kwargs,
    ):
        # warning kept from the TensorFlow AddOns source this class was copied from
        warnings.warn(
            "`MultiHeadAttention` will be deprecated in Addons 0.13. "
            "Please use `tf.keras.layers.MultiHeadAttention` instead.",
            DeprecationWarning,
        )

        super().__init__(**kwargs)

        if output_size is not None and output_size < 1:
            raise ValueError("output_size must be a positive number")

        self.head_size = head_size
        self.num_heads = num_heads
        self.output_size = output_size
        self.use_projection_bias = use_projection_bias
        self.return_attn_coef = return_attn_coef

        self.kernel_initializer = tf.keras.initializers.get(kernel_initializer)
        self.kernel_regularizer = tf.keras.regularizers.get(kernel_regularizer)
        self.kernel_constraint = tf.keras.constraints.get(kernel_constraint)
        self.bias_initializer = tf.keras.initializers.get(bias_initializer)
        self.bias_regularizer = tf.keras.regularizers.get(bias_regularizer)
        self.bias_constraint = tf.keras.constraints.get(bias_constraint)

        self.dropout = tf.keras.layers.Dropout(dropout)
        # the raw rate is kept separately so get_config can serialize it
        self._dropout_rate = dropout

    def build(self, input_shape):
        """Create the per-head projection kernels (and the optional output bias).

        `input_shape` is the list of shapes of `[query, key]` or
        `[query, key, value]`; only the last (feature) dimension of each is used.
        """

        num_query_features = input_shape[0][-1]
        num_key_features = input_shape[1][-1]
        num_value_features = (
            input_shape[2][-1] if len(input_shape) > 2 else num_key_features
        )
        output_size = (
            self.output_size if self.output_size is not None else num_value_features
        )

        self.query_kernel = self.add_weight(
            name="query_kernel",
            shape=[self.num_heads, num_query_features, self.head_size],
            initializer=self.kernel_initializer,
            regularizer=self.kernel_regularizer,
            constraint=self.kernel_constraint,
        )
        self.key_kernel = self.add_weight(
            name="key_kernel",
            shape=[self.num_heads, num_key_features, self.head_size],
            initializer=self.kernel_initializer,
            regularizer=self.kernel_regularizer,
            constraint=self.kernel_constraint,
        )
        self.value_kernel = self.add_weight(
            name="value_kernel",
            shape=[self.num_heads, num_value_features, self.head_size],
            initializer=self.kernel_initializer,
            regularizer=self.kernel_regularizer,
            constraint=self.kernel_constraint,
        )
        self.projection_kernel = self.add_weight(
            name="projection_kernel",
            shape=[self.num_heads, self.head_size, output_size],
            initializer=self.kernel_initializer,
            regularizer=self.kernel_regularizer,
            constraint=self.kernel_constraint,
        )

        if self.use_projection_bias:
            self.projection_bias = self.add_weight(
                name="projection_bias",
                shape=[output_size],
                initializer=self.bias_initializer,
                regularizer=self.bias_regularizer,
                constraint=self.bias_constraint,
            )
        else:
            self.projection_bias = None

        super().build(input_shape)

    def call(self, inputs, training=None, mask=None):
        """Compute (optionally masked) scaled dot-product multi-head attention.

        Returns the projected output, or `(output, attn_coef)` when
        `return_attn_coef` is `True`. `attn_coef` holds the softmax weights
        before dropout.
        """

        # einsum nomenclature
        # ------------------------
        # N = query elements
        # M = key/value elements
        # H = heads
        # I = input features
        # O = output features

        query = inputs[0]
        key = inputs[1]
        value = inputs[2] if len(inputs) > 2 else key

        # verify shapes
        if key.shape[-2] != value.shape[-2]:
            raise ValueError(
                "the number of elements in 'key' must be equal to the same as the number of elements in 'value'"
            )

        if mask is not None:
            if len(mask.shape) < 2:
                raise ValueError("'mask' must have atleast 2 dimensions")
            if query.shape[-2] != mask.shape[-2]:
                raise ValueError(
                    "mask's second to last dimension must be equal to the number of elements in 'query'"
                )
            if key.shape[-2] != mask.shape[-1]:
                raise ValueError(
                    "mask's last dimension must be equal to the number of elements in 'key'"
                )

        # Linear transformations
        query = tf.einsum("...NI , HIO -> ...NHO", query, self.query_kernel)
        key = tf.einsum("...MI , HIO -> ...MHO", key, self.key_kernel)
        value = tf.einsum("...MI , HIO -> ...MHO", value, self.value_kernel)

        # Scale dot-product, doing the division to either query or key
        # instead of their product saves some computation
        depth = tf.constant(self.head_size, dtype=query.dtype)
        query /= tf.sqrt(depth)

        # Calculate dot product attention
        logits = tf.einsum("...NHO,...MHO->...HNM", query, key)

        # apply mask
        if mask is not None:
            # Cast to the dtype of the logits (not a hard-coded float32) so the
            # addition below also works when the layer computes in
            # float16/bfloat16, e.g. under a mixed-precision policy.
            mask = tf.cast(mask, logits.dtype)

            # possibly expand on the head dimension so broadcasting works
            if len(mask.shape) != len(logits.shape):
                mask = tf.expand_dims(mask, -3)

            # -10e9 (the original constant) exceeds the float16 range; there it
            # would become -inf and `-inf * 0` would poison the attended
            # positions with NaN, so a finite float16 value is used instead.
            large_negative = -3e4 if logits.dtype == tf.float16 else -10e9
            logits += large_negative * (1.0 - mask)

        attn_coef = tf.nn.softmax(logits)

        # attention dropout
        attn_coef_dropout = self.dropout(attn_coef, training=training)

        # attention * value
        multihead_output = tf.einsum("...HNM,...MHI->...NHI", attn_coef_dropout, value)

        # Run the outputs through another linear projection layer. Recombining heads
        # is automatically done.
        output = tf.einsum(
            "...NHI,HIO->...NO", multihead_output, self.projection_kernel
        )

        if self.projection_bias is not None:
            output += self.projection_bias

        if self.return_attn_coef:
            return output, attn_coef
        else:
            return output

    def compute_output_shape(self, input_shape):
        """Return the output shape, plus the attention-coefficient shape when
        `return_attn_coef` is `True`."""
        num_value_features = (
            input_shape[2][-1] if len(input_shape) > 2 else input_shape[1][-1]
        )
        output_size = (
            self.output_size if self.output_size is not None else num_value_features
        )

        output_shape = input_shape[0][:-1] + (output_size,)

        if self.return_attn_coef:
            num_query_elements = input_shape[0][-2]
            num_key_elements = input_shape[1][-2]
            attn_coef_shape = input_shape[0][:-2] + (
                self.num_heads,
                num_query_elements,
                num_key_elements,
            )

            return output_shape, attn_coef_shape
        else:
            return output_shape

    def get_config(self):
        """Return the constructor arguments as a serializable dict."""
        config = super().get_config()

        config.update(
            head_size=self.head_size,
            num_heads=self.num_heads,
            output_size=self.output_size,
            dropout=self._dropout_rate,
            use_projection_bias=self.use_projection_bias,
            return_attn_coef=self.return_attn_coef,
            kernel_initializer=tf.keras.initializers.serialize(self.kernel_initializer),
            kernel_regularizer=tf.keras.regularizers.serialize(self.kernel_regularizer),
            kernel_constraint=tf.keras.constraints.serialize(self.kernel_constraint),
            bias_initializer=tf.keras.initializers.serialize(self.bias_initializer),
            bias_regularizer=tf.keras.regularizers.serialize(self.bias_regularizer),
            bias_constraint=tf.keras.constraints.serialize(self.bias_constraint),
        )

        return config

In [8]:
# the autoregressive version of the transformer-decoder does not use the Seq2Seq intermediate layer
# as there is no transformer-encoder component sending hidden states, therefore
# having only a self-attention layer and a position-wise feed-forward layer,
# the autoregressive transformer-decoder is, in fact, a transformer-encoder

# the only important modification is the masked self-attention layer

# masked self-attention layer seems to be already implemented in
# MHA module from TensorFlow AddOns, then will be added to the EncoderLayer class

In [8]:
# build a mask for self-attention on the autoregressive transformer decoder
def get_decoder_mask(self_attention_inputs):
    """Build the look-backwards mask for masked self-attention.

    Args:
        self_attention_inputs: tensor whose axis 1 is the sequence length;
            the expected shape is (?, n_timesteps, n_features).

    Returns:
        A float32 tensor of shape (n_timesteps, n_timesteps) that is
        lower-triangular: ones on and below the diagonal, zeros above it,
        so position i can only attend to positions 0..i.
    """
    # get the dimension value of n_timesteps and build the mask
    n_timesteps = self_attention_inputs.shape[1]

    if n_timesteps is None:
        # the sequence length is not known statically (np.ones would fail on None),
        # so build the same lower-triangular mask from the runtime shape instead
        n_timesteps = tf.shape(self_attention_inputs)[1]
        all_ones = tf.ones(tf.stack([n_timesteps, n_timesteps]), dtype=tf.float32)
        # band_part(x, -1, 0) keeps the full lower triangle and zeroes the rest
        return tf.linalg.band_part(all_ones, -1, 0)

    mask = tf.convert_to_tensor(np.tril(np.ones([n_timesteps, n_timesteps]), 0),
                                dtype=tf.float32)
    return mask

In [9]:
class ARDecoderLayer(tf.keras.layers.Layer):
    """One block of the autoregressive transformer decoder.

    Masked multi-head self-attention followed by a position-wise feed-forward
    network, each wrapped in dropout, a residual connection and layer
    normalization.

    Args:
        embed_dim: feature dimension of the inputs; also used as the head size
            of the attention layer and as the output width of the feed-forward
            network, so the residual additions are shape-compatible.
        num_heads: number of attention heads.
        ff_dim: hidden width of the feed-forward network.
        dropout: dropout rate applied after the attention and the feed-forward
            sub-layers.
        **kwargs: standard Keras layer arguments (e.g. `name`, `dtype`).
    """

    def __init__(self, embed_dim, num_heads, ff_dim, dropout=0.1, **kwargs):
        super(ARDecoderLayer, self).__init__(**kwargs)
        # keep the constructor arguments so get_config can report them
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.dropout_rate = dropout
        # multi-head attention initialization
        self.attention_layer = MultiHeadAttention(head_size=embed_dim, num_heads=num_heads)
        self.ff_layer = tf.keras.Sequential(
            [tf.keras.layers.Dense(ff_dim, activation="relu"),
             tf.keras.layers.Dense(embed_dim)]
        )
        self.add_norm_layer_1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.add_norm_layer_2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.dropout_1 = tf.keras.layers.Dropout(dropout)
        self.dropout_2 = tf.keras.layers.Dropout(dropout)

    def call(self, inputs, mask, training=None):
        """Apply masked self-attention and the feed-forward network.

        Args:
            inputs: tensor used as both query and key of the self-attention.
            mask: attention mask forwarded to the multi-head attention layer.
            training: Keras training flag, forwarded to the sub-layers so
                dropout is only active while training.

        Returns:
            The layer-normalized output of the second residual connection.
        """
        # mask for self-attention is passed to MHA on call
        attention_output = self.attention_layer([inputs, inputs], mask=mask, training=training)
        attention_output = self.dropout_1(attention_output, training=training)
        input_to_ffn = self.add_norm_layer_1(inputs + attention_output)
        ffn_output = self.ff_layer(input_to_ffn, training=training)
        ffn_output = self.dropout_2(ffn_output, training=training)
        return self.add_norm_layer_2(input_to_ffn + ffn_output)

    def get_config(self):
        """Return the constructor arguments so the layer can be serialized."""
        config = super(ARDecoderLayer, self).get_config()
        config.update(
            embed_dim=self.embed_dim,
            num_heads=self.num_heads,
            ff_dim=self.ff_dim,
            dropout=self.dropout_rate,
        )
        return config

In [10]:
# NOTE(review): absolute, machine-specific path; pd.read_pickle unpickles the file,
# so it should only be pointed at files from a trusted source
ts = pd.read_pickle("/home/developer/gcp/cbidmltsf/timeseries/CPE04115_H_kw_20201021084001/ts.pkl")

In [11]:
ts.head()

Unnamed: 0_level_0,kw_scaled
timestamp,Unnamed: 1_level_1
2016-01-01 00:00:00,0.274317
2016-01-01 01:00:00,0.217363
2016-01-01 02:00:00,0.168545
2016-01-01 03:00:00,0.122996
2016-01-01 04:00:00,0.08044


In [12]:
# length of time series
# (Series.count() counts only the non-NA values of 'kw_scaled')
total_lectures = ts['kw_scaled'].count()
total_lectures

22629

In [13]:
# build a training dataset
# features: m consecutive lectures with their timestamps
# target: m consecutive lectures (lectures in features, shifted by 1 to the future)

In [14]:
# FEATURE ENGINEERING

In [15]:
# define the embedding dimension and the length of the training, evaluation, and test datasets
# use the most of the 22K+ lectures in the original time series

In [16]:
# length of input sequence, hours in a week
m = 168

In [17]:
# prepare sine-cosine positional encoding for the input sequence
hours_in_day = 24
days_in_month = 30
months_in_year = 12

In [18]:
total_lectures*np.array([0.7, 0.2, 0.1])

array([15840.3,  4525.8,  2262.9])

In [19]:
# from the previous dataset split, use the following indexes for building datasets

# 15000 rows in training dataset
train_start = 0
train_end = 15000

# 4000 rows in evaluation dataset
eval_start = 16000
eval_end = 20000

# 1000 rows in test dataset
test_start = 21000
test_end = 22000

In [20]:
# load the training NumPy arrays
# (np.load opens and closes the file itself when given a path)
x_train = np.load('data/x_train.npy')
y_train = np.load('data/y_train.npy')

In [21]:
x_train.shape, y_train.shape

((15000, 168, 7), (15000, 168, 1))

In [22]:
# load the evaluation NumPy arrays
# (np.load opens and closes the file itself when given a path)
x_eval = np.load('data/x_eval.npy')
y_eval = np.load('data/y_eval.npy')

In [23]:
x_eval.shape, y_eval.shape

((4000, 168, 7), (4000, 168, 1))

In [24]:
# load the test NumPy arrays
# (np.load opens and closes the file itself when given a path)
x_test = np.load('data/x_test.npy')
y_test = np.load('data/y_test.npy')

In [25]:
x_test.shape, y_test.shape

((1000, 168, 7), (1000, 168, 1))

In [26]:
# architecture details according to the Klingenbrunn experiment
# (including notes to further modifications on the basic autoregressive model)

In [27]:
# the number of timesteps is the length of the input sequence,
# which is the embedding dimension from SLDB
num_timesteps = m

In [28]:
# number of features is the active load value (main feature)
# plus the six components of the sine-cosine positional encoding on hour, day, month

# important: there is no value embedding, therefore d_model is very low
d_model = 7

# ToDo: use value embedding to a high-dimensional space and compare results
# ToDo: use a different positional encoding system and compare results

In [29]:
# as long as there is no value embedding, neither convolutional nor dense layers are required

In [32]:
# input layer for Keras functional
input_layer = tf.keras.layers.Input(shape=(num_timesteps, d_model))
input_layer

<KerasTensor: shape=(None, 168, 7) dtype=float32 (created by layer 'input_1')>

In [33]:
input_to_transformer_block = input_layer
input_to_transformer_block

<KerasTensor: shape=(None, 168, 7) dtype=float32 (created by layer 'input_1')>

In [34]:
num_heads = 2
ff_dim = 1024
dropout = 0.2

In [35]:
# use the same mask for all the ARDecoderLayers in the ARDecoder
mask = get_decoder_mask(input_to_transformer_block)
mask

<tf.Tensor: shape=(168, 168), dtype=float32, numpy=
array([[1., 0., 0., ..., 0., 0., 0.],
       [1., 1., 0., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.],
       ...,
       [1., 1., 1., ..., 1., 0., 0.],
       [1., 1., 1., ..., 1., 1., 0.],
       [1., 1., 1., ..., 1., 1., 1.]], dtype=float32)>

In [36]:
ar_decoder_layer = ARDecoderLayer(embed_dim=d_model,
                                  num_heads=num_heads,
                                  ff_dim=ff_dim,
                                  dropout=dropout)



In [37]:
output_from_ar_decoder = ar_decoder_layer(input_to_transformer_block, mask=mask)
output_from_ar_decoder

<KerasTensor: shape=(None, 168, 7) dtype=float32 (created by layer 'ar_decoder_layer')>

In [38]:
# Klingenbrunn uses a linear layer to decode the output_from_encoder
# from (?, num_timesteps, num_features) to (?, num_timesteps, 1)

# the equivalent operation in TensorFlow is a TimeDistributed Dense layer to 1

In [39]:
# a single output unit: one predicted value per timestep
units_in_first_dense = 1
# NOTE(review): the sigmoid bounds every prediction to (0, 1); presumably this
# matches the scaling of the kw_scaled target — confirm
first_dense = tf.keras.layers.Dense(units_in_first_dense, activation="sigmoid")

In [40]:
distributed_first_dense = tf.keras.layers.TimeDistributed(first_dense)(output_from_ar_decoder)
distributed_first_dense

<KerasTensor: shape=(None, 168, 1) dtype=float32 (created by layer 'time_distributed')>

In [41]:
model = tf.keras.Model(inputs=input_layer, outputs=distributed_first_dense)

In [42]:
model.compile("adam", "mse", metrics=[tf.keras.metrics.RootMeanSquaredError()])

In [43]:
history = model.fit(
    x_train, y_train, batch_size=32, epochs=10, validation_data=(x_eval, y_eval)
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# the get_decoder_mask function is working,
# now use it as a base to code the scheduled sampling process

In [44]:
from random import random

In [46]:
def flip_from_probability(p):
    """Flip a biased coin.

    Draws one uniform sample from [0.0, 1.0) and returns True when the
    sample falls below `p`, False otherwise.
    """
    sample = random()
    return bool(sample < p)

In [None]:
# VERY IMPORTANT NOTE
# THERE IS NO SIMPLE WAY TO CODE THE SCHEDULED SAMPLING PROCESS FOR TPUESTIMATOR
# (AS IT CAN BE DONE IN KERAS OR IN LOW-LEVEL TENSORFLOW)
# BECAUSE THE INPUT TO THE MODEL IS DEFINED IN THE TRAINING INPUT FUNCTION
# AND THE TEMPORARY OUTPUT OF THE MODEL IS DEFINED IN THE MODEL FUNCTION,
# AND IT IS NOT CLEAR IF THESE SCOPES CAN BE COMBINED (MAYBE A HOOK?)

# AS A RESULT, THIS AUTOREGRESSIVE TRANSFORMER FOR TPUESTIMATOR WILL ALWAYS USE TRUE VALUES,
# AND THE TPU COMPUTING POWER WILL BE USED TO EXTEND BOTH THE SOURCE SEQUENCE AND THE FORECAST WINDOW
# TO ENSURE THE PREDICTIVE PERFORMANCE IS COMPETITIVE WITH THE STATE-OF-THE-ART

# LOOK FOR OTHER INTERESTING VARIATIONS TO SUPPORT THE THESIS
# FOR INSTANCE, MODIFY THE MULTI-HEAD-ATTENTION COMPONENT
# SUCH AS IN THE INFORMER, OR IN THE BOTTLENECK TRANSFORMER

In [None]:
# therefore, pause here coding for the autoregressive transformer
# and start a new notebook for making inferences with the TPU-based saved model