In [2]:
import numpy as np
import pandas as pd

In [3]:
import tensorflow as tf

In [4]:
print(tf.__version__)

2.4.1


In [5]:
# required for TFA MultiHeadAttention
import typing
import warnings

In [6]:
# MHA class from TensorFlow AddOns source
# it is compatible with TF 1.15 for CloudTPU usage

In [7]:
class MultiHeadAttention(tf.keras.layers.Layer):
    r"""MultiHead Attention layer.
    Defines the MultiHead Attention operation as described in
    [Attention Is All You Need](https://arxiv.org/abs/1706.03762) which takes
    in the tensors `query`, `key`, and `value`, and returns the dot-product attention
    between them:
    >>> mha = MultiHeadAttention(head_size=128, num_heads=12)
    >>> query = np.random.rand(3, 5, 4) # (batch_size, query_elements, query_depth)
    >>> key = np.random.rand(3, 6, 5) # (batch_size, key_elements, key_depth)
    >>> value = np.random.rand(3, 6, 6) # (batch_size, key_elements, value_depth)
    >>> attention = mha([query, key, value]) # (batch_size, query_elements, value_depth)
    >>> attention.shape
    TensorShape([3, 5, 6])
    If `value` is not given then internally `value = key` will be used:
    >>> mha = MultiHeadAttention(head_size=128, num_heads=12)
    >>> query = np.random.rand(3, 5, 5) # (batch_size, query_elements, query_depth)
    >>> key = np.random.rand(3, 6, 10) # (batch_size, key_elements, key_depth)
    >>> attention = mha([query, key]) # (batch_size, query_elements, key_depth)
    >>> attention.shape
    TensorShape([3, 5, 10])
    Args:
        head_size: int, dimensionality of the `query`, `key` and `value` tensors
            after the linear transformation.
        num_heads: int, number of attention heads.
        output_size: int, dimensionality of the output space, if `None` then the
            input dimension of `value` or `key` will be used,
            default `None`.
        dropout: float, `rate` parameter for the dropout layer that is
            applied to attention after softmax,
            default `0`.
        use_projection_bias: bool, whether to use a bias term after the linear
            output projection.
        return_attn_coef: bool, if `True`, return the attention coefficients as
            an additional output argument.
        kernel_initializer: initializer, initializer for the kernel weights.
        kernel_regularizer: regularizer, regularizer for the kernel weights.
        kernel_constraint: constraint, constraint for the kernel weights.
        bias_initializer: initializer, initializer for the bias weights.
        bias_regularizer: regularizer, regularizer for the bias weights.
        bias_constraint: constraint, constraint for the bias weights.
    Call Args:
        inputs:  List of `[query, key, value]` where
            * `query`: Tensor of shape `(..., query_elements, query_depth)`
            * `key`: Tensor of shape `(..., key_elements, key_depth)`
            * `value`: Tensor of shape `(..., key_elements, value_depth)`, optional, if not given `key` will be used.
        mask: a binary Tensor of shape `[batch_size?, num_heads?, query_elements, key_elements]`
            which specifies which query elements can attend to which key elements,
            `1` indicates attention and `0` indicates no attention. The mask may
            have any dtype that can be cast to float; it is converted to the
            dtype of the attention logits before being applied.
    Output shape:
        * `(..., query_elements, output_size)` if `output_size` is given, else
        * `(..., query_elements, value_depth)` if `value` is given, else
        * `(..., query_elements, key_depth)`
    """

    def __init__(
        self,
        head_size: int,
        num_heads: int,
        output_size: typing.Optional[int] = None,
        dropout: float = 0.0,
        use_projection_bias: bool = True,
        return_attn_coef: bool = False,
        kernel_initializer: typing.Union[str, typing.Callable] = "glorot_uniform",
        kernel_regularizer: typing.Union[str, typing.Callable] = None,
        kernel_constraint: typing.Union[str, typing.Callable] = None,
        bias_initializer: typing.Union[str, typing.Callable] = "zeros",
        bias_regularizer: typing.Union[str, typing.Callable] = None,
        bias_constraint: typing.Union[str, typing.Callable] = None,
        **kwargs,
    ):
        """Store the hyper-parameters; the weights are created later in `build`.

        Raises:
            ValueError: if `output_size` is given and is smaller than 1.
        """
        warnings.warn(
            "`MultiHeadAttention` will be deprecated in Addons 0.13. "
            "Please use `tf.keras.layers.MultiHeadAttention` instead.",
            DeprecationWarning,
        )

        super().__init__(**kwargs)

        if output_size is not None and output_size < 1:
            raise ValueError("output_size must be a positive number")

        self.head_size = head_size
        self.num_heads = num_heads
        self.output_size = output_size
        self.use_projection_bias = use_projection_bias
        self.return_attn_coef = return_attn_coef

        # resolve string identifiers / callables into Keras objects
        self.kernel_initializer = tf.keras.initializers.get(kernel_initializer)
        self.kernel_regularizer = tf.keras.regularizers.get(kernel_regularizer)
        self.kernel_constraint = tf.keras.constraints.get(kernel_constraint)
        self.bias_initializer = tf.keras.initializers.get(bias_initializer)
        self.bias_regularizer = tf.keras.regularizers.get(bias_regularizer)
        self.bias_constraint = tf.keras.constraints.get(bias_constraint)

        self.dropout = tf.keras.layers.Dropout(dropout)
        # the raw rate is kept separately so that `get_config` can serialize it
        self._dropout_rate = dropout

    def build(self, input_shape):
        """Create the per-head projection kernels and the optional output bias.

        Args:
            input_shape: list of the shapes of `[query, key]` or
                `[query, key, value]`; only the last (feature) dimension of
                each entry is used here.
        """

        num_query_features = input_shape[0][-1]
        num_key_features = input_shape[1][-1]
        # without an explicit `value`, `key` doubles as `value`
        num_value_features = (
            input_shape[2][-1] if len(input_shape) > 2 else num_key_features
        )
        output_size = (
            self.output_size if self.output_size is not None else num_value_features
        )

        self.query_kernel = self.add_weight(
            name="query_kernel",
            shape=[self.num_heads, num_query_features, self.head_size],
            initializer=self.kernel_initializer,
            regularizer=self.kernel_regularizer,
            constraint=self.kernel_constraint,
        )
        self.key_kernel = self.add_weight(
            name="key_kernel",
            shape=[self.num_heads, num_key_features, self.head_size],
            initializer=self.kernel_initializer,
            regularizer=self.kernel_regularizer,
            constraint=self.kernel_constraint,
        )
        self.value_kernel = self.add_weight(
            name="value_kernel",
            shape=[self.num_heads, num_value_features, self.head_size],
            initializer=self.kernel_initializer,
            regularizer=self.kernel_regularizer,
            constraint=self.kernel_constraint,
        )
        self.projection_kernel = self.add_weight(
            name="projection_kernel",
            shape=[self.num_heads, self.head_size, output_size],
            initializer=self.kernel_initializer,
            regularizer=self.kernel_regularizer,
            constraint=self.kernel_constraint,
        )

        if self.use_projection_bias:
            self.projection_bias = self.add_weight(
                name="projection_bias",
                shape=[output_size],
                initializer=self.bias_initializer,
                regularizer=self.bias_regularizer,
                constraint=self.bias_constraint,
            )
        else:
            self.projection_bias = None

        super().build(input_shape)

    def call(self, inputs, training=None, mask=None):
        """Apply scaled dot-product multi-head attention.

        Args:
            inputs: list `[query, key]` or `[query, key, value]`.
            training: forwarded to the attention dropout layer.
            mask: optional binary tensor whose last two dimensions are
                `(query_elements, key_elements)`; `1` keeps a position,
                `0` masks it out.

        Returns:
            The attention output, or the tuple `(output, attn_coef)` when
            `return_attn_coef` is `True`. The returned coefficients are the
            softmax output before dropout is applied.

        Raises:
            ValueError: if `key` and `value` disagree on their number of
                elements, or if `mask` has fewer than 2 dimensions or its last
                two dimensions do not match `query` / `key`.
        """

        # einsum nomenclature
        # ------------------------
        # N = query elements
        # M = key/value elements
        # H = heads
        # I = input features
        # O = output features

        query = inputs[0]
        key = inputs[1]
        value = inputs[2] if len(inputs) > 2 else key

        # verify shapes
        if key.shape[-2] != value.shape[-2]:
            raise ValueError(
                "the number of elements in 'key' must be equal to the same as the number of elements in 'value'"
            )

        if mask is not None:
            if len(mask.shape) < 2:
                raise ValueError("'mask' must have atleast 2 dimensions")
            if query.shape[-2] != mask.shape[-2]:
                raise ValueError(
                    "mask's second to last dimension must be equal to the number of elements in 'query'"
                )
            if key.shape[-2] != mask.shape[-1]:
                raise ValueError(
                    "mask's last dimension must be equal to the number of elements in 'key'"
                )

        # Linear transformations
        query = tf.einsum("...NI , HIO -> ...NHO", query, self.query_kernel)
        key = tf.einsum("...MI , HIO -> ...MHO", key, self.key_kernel)
        value = tf.einsum("...MI , HIO -> ...MHO", value, self.value_kernel)

        # Scale dot-product, doing the division to either query or key
        # instead of their product saves some computation
        depth = tf.constant(self.head_size, dtype=query.dtype)
        query /= tf.sqrt(depth)

        # Calculate dot product attention
        logits = tf.einsum("...NHO,...MHO->...HNM", query, key)

        # apply mask
        if mask is not None:
            mask = tf.cast(mask, tf.float32)

            # possibly expand on the head dimension so broadcasting works
            if len(mask.shape) != len(logits.shape):
                mask = tf.expand_dims(mask, -3)

            # The additive bias is computed in float32, where -10e9 is finite,
            # and only then cast to the logits dtype. Adding a float32 tensor
            # directly to float16/float64 logits would raise a dtype mismatch,
            # and computing the product in float16 would give inf * 0 = nan.
            logits += tf.cast(-10e9 * (1.0 - mask), logits.dtype)

        attn_coef = tf.nn.softmax(logits)

        # attention dropout
        attn_coef_dropout = self.dropout(attn_coef, training=training)

        # attention * value
        multihead_output = tf.einsum("...HNM,...MHI->...NHI", attn_coef_dropout, value)

        # Run the outputs through another linear projection layer. Recombining heads
        # is automatically done.
        output = tf.einsum(
            "...NHI,HIO->...NO", multihead_output, self.projection_kernel
        )

        if self.projection_bias is not None:
            output += self.projection_bias

        if self.return_attn_coef:
            return output, attn_coef
        else:
            return output

    def compute_output_shape(self, input_shape):
        """Return the output shape, plus the coefficient shape if requested.

        Args:
            input_shape: list of the shapes of `[query, key]` or
                `[query, key, value]`.

        Returns:
            `output_shape`, or `(output_shape, attn_coef_shape)` when
            `return_attn_coef` is `True`.
        """
        num_value_features = (
            input_shape[2][-1] if len(input_shape) > 2 else input_shape[1][-1]
        )
        output_size = (
            self.output_size if self.output_size is not None else num_value_features
        )

        # same leading dimensions as `query`, with the feature axis replaced
        output_shape = input_shape[0][:-1] + (output_size,)

        if self.return_attn_coef:
            num_query_elements = input_shape[0][-2]
            num_key_elements = input_shape[1][-2]
            attn_coef_shape = input_shape[0][:-2] + (
                self.num_heads,
                num_query_elements,
                num_key_elements,
            )

            return output_shape, attn_coef_shape
        else:
            return output_shape

    def get_config(self):
        """Return the constructor arguments as a serializable config dict."""
        config = super().get_config()

        config.update(
            head_size=self.head_size,
            num_heads=self.num_heads,
            output_size=self.output_size,
            dropout=self._dropout_rate,
            use_projection_bias=self.use_projection_bias,
            return_attn_coef=self.return_attn_coef,
            kernel_initializer=tf.keras.initializers.serialize(self.kernel_initializer),
            kernel_regularizer=tf.keras.regularizers.serialize(self.kernel_regularizer),
            kernel_constraint=tf.keras.constraints.serialize(self.kernel_constraint),
            bias_initializer=tf.keras.initializers.serialize(self.bias_initializer),
            bias_regularizer=tf.keras.regularizers.serialize(self.bias_regularizer),
            bias_constraint=tf.keras.constraints.serialize(self.bias_constraint),
        )

        return config

In [14]:
# lower-triangular (causal) mask for self-attention, built as a NumPy array:
# row i holds ones up to and including column i
d = 3
np.tril(np.ones((d, d)))

array([[1., 0., 0.],
       [1., 1., 0.],
       [1., 1., 1.]])

In [16]:
# the same lower-triangular (causal) mask for self-attention,
# built directly as a float32 TensorFlow tensor
d = 3
tf.linalg.band_part(tf.ones((d, d), dtype=tf.float32), -1, 0)

<tf.Tensor: shape=(3, 3), dtype=float32, numpy=
array([[1., 0., 0.],
       [1., 1., 0.],
       [1., 1., 1.]], dtype=float32)>

In [None]:
# the autoregressive (decoder-only) version of the transformer does not use the intermediate
# encoder-decoder attention layer of the Seq2Seq architecture, as there is no transformer-encoder
# component that sends encoding hidden states; therefore, having only a self-attention layer and
# a position-wise feed-forward layer, the autoregressive transformer-decoder is, in fact,
# a transformer-encoder

# the only important modification is the masked self-attention layer

# masked self-attention layer seems to be already implemented in
# MHA module from TensorFlow AddOns

In [31]:
# base transformer encoder layer from https://keras.io/examples/nlp/text_classification_with_transformer/
# modified to include masked self attention
class EncoderLayer(tf.keras.layers.Layer):
    """Transformer encoder layer with masked (causal) self-attention.

    The layer applies multi-head self-attention in which every position may
    only attend to itself and to earlier positions, followed by a position-wise
    feed-forward network. Both sub-layers are wrapped with dropout, a residual
    connection and layer normalization.

    Args:
        embed_dim: feature size of the inputs; it is used as the per-head size
            of the attention layer and as the output size of the feed-forward
            network, so that both residual additions are shape-compatible.
        num_heads: number of attention heads.
        ff_dim: hidden size of the feed-forward network.
        dropout: dropout rate applied to the output of each sub-layer.
        **kwargs: forwarded to `tf.keras.layers.Layer` (e.g. `name`).

    Call Args:
        inputs: tensor whose last two dimensions are `(sequence, embed_dim)`.
        training: whether dropout is active; `None` lets Keras decide.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, dropout=0.1, **kwargs):
        super(EncoderLayer, self).__init__(**kwargs)
        # multi-head attention initialization
        self.attention_layer = MultiHeadAttention(head_size=embed_dim, num_heads=num_heads)
        self.ff_layer = tf.keras.Sequential(
            [tf.keras.layers.Dense(ff_dim, activation="relu"),
             tf.keras.layers.Dense(embed_dim)]
        )
        self.add_norm_layer_1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.add_norm_layer_2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.dropout_1 = tf.keras.layers.Dropout(dropout)
        self.dropout_2 = tf.keras.layers.Dropout(dropout)
        # The self-attention mask is NOT built here: its shape must be
        # (sequence, sequence), and the sequence length is only known from the
        # inputs at call time (it is unrelated to embed_dim).

    @staticmethod
    def _causal_mask(inputs):
        """Build a lower-triangular (sequence, sequence) mask for `inputs`."""
        # Prefer the static length so the attention layer's static shape checks
        # see matching dimensions; fall back to the dynamic length otherwise.
        seq_len = inputs.shape[-2]
        if seq_len is None:
            seq_len = tf.shape(inputs)[-2]
        # keep the full lower band (-1) and no upper band (0): position i can
        # attend to positions 0..i only
        return tf.linalg.band_part(
            tf.ones((seq_len, seq_len), dtype=tf.float32), -1, 0
        )

    def call(self, inputs, training=None):
        # mask for self-attention is passed to MHA on call
        attention_output = self.attention_layer(
            [inputs, inputs], mask=self._causal_mask(inputs), training=training
        )
        attention_output = self.dropout_1(attention_output, training=training)
        input_to_ffn = self.add_norm_layer_1(inputs + attention_output)
        ffn_output = self.ff_layer(input_to_ffn)
        ffn_output = self.dropout_2(ffn_output, training=training)
        return self.add_norm_layer_2(input_to_ffn + ffn_output)

In [2]:
# get the active power time series as main data source
! ls -l /home/developer/gcp/cbidmltsf/timeseries

total 8
drwxrwxr-x 2 developer developer 4096 feb 11 13:08 CPE04115_H_kw_20201021084001
drwxrwxr-x 2 developer developer 4096 feb 16 13:17 CPE04115_H_kw_20201021084001_csv


In [3]:
! ls -l /home/developer/gcp/cbidmltsf/timeseries/CPE04115_H_kw_20201021084001

total 364
-rw-rw-r-- 1 developer developer    621 oct 21  2020 scaler.save
-rw-rw-r-- 1 developer developer    218 oct 21  2020 ts.json
-rw-rw-r-- 1 developer developer 362977 oct 21  2020 ts.pkl


In [4]:
# Load the pickled time series into a DataFrame.
# NOTE(review): unpickling can execute arbitrary code, so only load files from a trusted
# source; the absolute path also ties this cell to a single machine.
ts = pd.read_pickle("/home/developer/gcp/cbidmltsf/timeseries/CPE04115_H_kw_20201021084001/ts.pkl")

In [5]:
ts.head()

Unnamed: 0_level_0,kw_scaled
timestamp,Unnamed: 1_level_1
2016-01-01 00:00:00,0.274317
2016-01-01 01:00:00,0.217363
2016-01-01 02:00:00,0.168545
2016-01-01 03:00:00,0.122996
2016-01-01 04:00:00,0.08044


In [6]:
# build a training dataset
# features: m consecutive readings with their timestamps
# target: m consecutive readings (the readings in features, shifted by 1 to the future)

In [7]:
# length of the time series: count() returns the number of non-NA values in the column,
# so it equals the number of rows only when no readings are missing
ts['kw_scaled'].count()

22629

In [8]:
# length of input sequence
m = 8

In [9]:
# get a single example for testing purposes

In [10]:
start = 0

In [11]:
end = start + m

In [12]:
# scaled readings in the positional window [start, end), copied into a NumPy array
values = ts['kw_scaled'].iloc[start:end].to_numpy(copy=True)

In [13]:
values

array([0.27431688, 0.21736328, 0.16854513, 0.12299635, 0.08044036,
       0.04925277, 0.06771694, 0.04966028])

In [14]:
values.shape

(8,)

In [15]:
# get positional encoding for the input sequence

In [16]:
# periods used to map each timestamp component onto the unit circle
hours_in_day = 24
# NOTE(review): day-of-month runs from 1 to 31, so with a period of 30 day 31 receives
# the same sin/cos encoding as day 1 — confirm that this aliasing is acceptable
days_in_month = 30
months_in_year = 12

In [17]:
# calendar components of the timestamps in the positional window [start, end)
timestamps_hour = ts.index[start:end].hour
timestamps_day = ts.index[start:end].day
timestamps_month = ts.index[start:end].month

In [18]:
# convert each calendar component into an angle (one full turn per period) ...
hour_angle = 2*np.pi*timestamps_hour/hours_in_day
day_angle = 2*np.pi*timestamps_day/days_in_month
month_angle = 2*np.pi*timestamps_month/months_in_year

# ... and encode every angle as a (sin, cos) pair
sin_hour, cos_hour = np.sin(hour_angle), np.cos(hour_angle)
sin_day, cos_day = np.sin(day_angle), np.cos(day_angle)
sin_month, cos_month = np.sin(month_angle), np.cos(month_angle)

In [19]:
# turn every feature component into a column vector of shape (window, 1);
# reshape(-1, 1) is used instead of expand_dims because this cell rebinds its own
# inputs: re-running it must give (window, 1) again rather than (window, 1, 1)
values = np.array(ts[start:end]['kw_scaled']).reshape(-1, 1)
sin_hour = np.asarray(sin_hour).reshape(-1, 1)
cos_hour = np.asarray(cos_hour).reshape(-1, 1)
sin_day = np.asarray(sin_day).reshape(-1, 1)
cos_day = np.asarray(cos_day).reshape(-1, 1)
sin_month = np.asarray(sin_month).reshape(-1, 1)
cos_month = np.asarray(cos_month).reshape(-1, 1)

In [20]:
# stack the column vectors side by side: one row per timestep, one column per feature
feature_columns = [
    values,
    sin_hour, cos_hour,
    sin_day, cos_day,
    sin_month, cos_month,
]
features = np.concatenate(feature_columns, axis=1)

In [21]:
features.shape

(8, 7)

In [22]:
features

array([[ 2.74316881e-01,  0.00000000e+00,  1.00000000e+00,
         2.07911691e-01,  9.78147601e-01,  5.00000000e-01,
         8.66025404e-01],
       [ 2.17363279e-01,  2.58819045e-01,  9.65925826e-01,
         2.07911691e-01,  9.78147601e-01,  5.00000000e-01,
         8.66025404e-01],
       [ 1.68545132e-01,  5.00000000e-01,  8.66025404e-01,
         2.07911691e-01,  9.78147601e-01,  5.00000000e-01,
         8.66025404e-01],
       [ 1.22996351e-01,  7.07106781e-01,  7.07106781e-01,
         2.07911691e-01,  9.78147601e-01,  5.00000000e-01,
         8.66025404e-01],
       [ 8.04403573e-02,  8.66025404e-01,  5.00000000e-01,
         2.07911691e-01,  9.78147601e-01,  5.00000000e-01,
         8.66025404e-01],
       [ 4.92527716e-02,  9.65925826e-01,  2.58819045e-01,
         2.07911691e-01,  9.78147601e-01,  5.00000000e-01,
         8.66025404e-01],
       [ 6.77169441e-02,  1.00000000e+00,  6.12323400e-17,
         2.07911691e-01,  9.78147601e-01,  5.00000000e-01,
         8.6602540

In [23]:
# next-step targets: the same window of scaled readings, moved one position into the future
targets = ts['kw_scaled'].iloc[start + 1:end + 1].to_numpy(copy=True)

In [24]:
targets

array([0.21736328, 0.16854513, 0.12299635, 0.08044036, 0.04925277,
       0.06771694, 0.04966028, 0.02315827])

In [25]:
targets.shape

(8,)