# Temporal Featurewise Attention Network (TFAN)

Temporal Featurewise Attention Network (TFAN) is an interpretable architecture for multivariate time series forecasting. This notebook can be used to build TFAN, train it, and evaluate the model on a given test set.

In [None]:
import pandas as pd
import numpy as np
import os
import random

import tensorflow as tf
# Ask TensorFlow to allocate GPU memory on demand instead of reserving all
# of it up front.
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            # The setter lives under `tf.config.experimental`; there is no
            # `tf.config.set_memory_growth`, so the previous call raised an
            # AttributeError that the handler below did not catch.
            tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as e:
        # Per the TensorFlow docs, memory growth has to be configured before
        # the GPUs are initialized; report the problem and carry on.
        print(e)

## Load Data

In [None]:
# Directory (relative to the current working directory) holding the data files.
path = os.path.join(os.getcwd(), 'data')
# os.listdir returns entries in arbitrary order. The files are concatenated
# and then split without shuffling, so sort them to make the resulting
# train/validation/test split reproducible across machines and runs.
files = sorted(os.listdir(path))

def load_data(path, list_of_files):
    """Read the listed CSV files from a directory into DataFrames.

    Args:
        path: directory that contains the files.
        list_of_files: iterable of file names located inside ``path``.

    Returns:
        dict mapping each file name to the ``pandas.DataFrame`` parsed from
        it, in the order the names were given.
    """
    data = {}
    for file in list_of_files:
        filepath = os.path.join(path, file)
        # pandas has no `read_cdv`; `read_csv` is the CSV reader.
        df = pd.read_csv(filepath)
        data[file] = df
        print(f"Loaded {file} with shape {df.shape}")
    return data

# Parse every file found in the data directory (one DataFrame per file).
data = load_data(path, files)
# Stack the per-file frames row-wise, keeping the order in which they were loaded.
frames = list(data.values())
df = pd.concat(frames)

## Preprocessing

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split



# Chronological 80/20 split (no shuffling) on the raw value matrix.
values = df.to_numpy()
df_train, df_test = train_test_split(values, train_size=0.8, shuffle=False)

# Standardize every column except the last one (the target); the scaler is
# fitted on the training rows only and then applied to both parts.
scaler = StandardScaler().fit(df_train[:, :-1])
x_train = scaler.transform(df_train[:, :-1])
x_test = scaler.transform(df_test[:, :-1])

# Re-attach the untouched target column to the scaled features.
df_train = np.hstack((x_train, df_train[:, -1:]))
df_test = np.hstack((x_test, df_test[:, -1:]))

# First half of the held-out rows becomes the validation set, the rest stays test.
df_val, df_test = train_test_split(df_test, train_size=0.5, shuffle=False)



def create_dataset(df, window_size, batch_size, shuffle=False, shuffle_buffer=5000):
    """Create a windowed ``tf.data.Dataset`` from a 2-D numpy array.

    The array is cut into overlapping windows of ``window_size`` consecutive
    rows (shift 1, stride 1, incomplete windows dropped). Each window is split
    into a tuple ``(x, y)`` where ``x`` holds all columns but the last and
    ``y`` holds the last column reshaped to ``(window_size, 1)``.

    Args:
        df: numpy array with one row per timestep; the last column is used
            as the target. It is cast to ``float32``.
        window_size: number of consecutive rows per window.
        batch_size: number of windows per batch; a trailing incomplete batch
            is dropped.
        shuffle: if True, windows are shuffled before batching.
        shuffle_buffer: buffer size handed to ``Dataset.shuffle``. Defaults
            to 5000, the value that used to be hard-coded.

    Returns:
        A batched ``tf.data.Dataset`` of ``(x, y)`` tuples with a prefetch
        of 2 batches.
    """
    data = df.astype('float32')
    data = tf.data.Dataset.from_tensor_slices(data)
    data = data.window(window_size, shift=1, stride=1, drop_remainder=True)
    # `window` yields nested datasets; batching each one by its full length
    # turns it back into a single tensor per window.
    data = data.flat_map(lambda window: window.batch(window_size))
    if shuffle:
        data = data.shuffle(shuffle_buffer)
    # tuple (x, y); the reshape gives y an explicit trailing dimension of 1
    data = data.map(lambda k: (k[:, :-1],
                               tf.reshape(k[:, -1], [window_size, 1])))
    data = data.batch(batch_size, drop_remainder=True).prefetch(2)

    return data

In [None]:
from helpers.load_cfg import load_cfg

# Settings returned by load_cfg for 'cfg.txt'; indexed by key below.
cfg = load_cfg('cfg.txt')

# inputs
Tx = cfg['window_size']  # window size
batch_size = cfg['batch_size']
val_batch_size = cfg['val_batch_size']

# Windowed tf.data pipelines: only the training windows are shuffled, and
# validation and test share the same batch size.
train_data = create_dataset(df_train, window_size=Tx, batch_size=batch_size, shuffle=True)
val_data = create_dataset(df_val, window_size=Tx, batch_size=val_batch_size, shuffle=False)
test_data = create_dataset(df_test, window_size=Tx, batch_size=val_batch_size, shuffle=False)

## Model

In [None]:
from tfan_layers import *

# Hyper-parameters of the network, collected in one place.
tfan_config = dict(
    residual_blocks=3,
    residual_dropout=0.2,
    activation=tf.keras.activations.swish,
    depthwise_padding="causal",
    depthwise_kernel_size=2,
    depth_multiplier=1,
    Tx=Tx,
    kernel_initializer=tf.keras.initializers.HeUniform(),
    num_heads=8,
    d_model=Tx,
    regularization="dropout",
    p=0.25,
    final_filters=8,
    final_kernel_size=2,
    final_dilations=[1, 2],
    final_padding='causal',
    final_dropout=0.2,
)
# Instantiate the model from the configuration above.
model = TFAN(**tfan_config)

# Build the model for inputs of shape (batch, window, features). The feature
# count is every column except the target, so adjust it according to the data.
num_features = df.shape[1] - 1
model.build(input_shape=(batch_size, Tx, num_features))

## Optimizer

In [None]:
from tensorflow_addons.optimizers import ExponentialCyclicalLearningRate

# inputs
maximal_learning_rate = 0.005
initial_learning_rate = maximal_learning_rate * 0.1
# Optimizer steps per epoch. The training pipeline slides a window of length
# Tx with shift 1 over the rows (len - Tx + 1 complete windows) and drops the
# last incomplete batch, so the number of batches is the floor division below,
# not len(df_train) / batch_size. Clamped to 1 so the cycle length stays positive.
num_windows = len(df_train) - Tx + 1
epoch_iterations = max(1, num_windows // batch_size)

# Early stopping on the validation loss, plus a checkpoint that keeps only
# the weights of the best epoch so far.
checkpoint_filepath = os.path.join(os.getcwd(), 'tmp', 'checkpoint')
callbacks = [
    tf.keras.callbacks.EarlyStopping(monitor="val_loss", min_delta=0,
                                     patience=5, verbose=1),
    tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_filepath,
                                       save_weights_only=True,
                                       monitor='val_loss', mode='min',
                                       save_best_only=True, verbose=0),
]

# Cyclic learning rate with exponential decay; step_size is half a cycle,
# so one full cycle spans one epoch.
lr_schedule = ExponentialCyclicalLearningRate(
    initial_learning_rate=initial_learning_rate,
    maximal_learning_rate=maximal_learning_rate, gamma=0.97,
    step_size=(epoch_iterations / 2))

opt = tf.keras.optimizers.Adam(learning_rate=lr_schedule)
loss_fn = tf.keras.losses.MeanSquaredError()

# compile
model.compile(optimizer=opt, loss=loss_fn)

In [None]:
# Upper bound on the number of training epochs, taken from the config.
epochs = cfg['epochs']
# Train on the training windows and evaluate on the validation windows after
# every epoch; the callbacks handle early stopping and checkpointing.
history = model.fit(
    train_data,
    epochs=epochs,
    validation_data=val_data,
    callbacks=callbacks,
)
# To restore the best checkpointed weights, uncomment the next line:
# model.load_weights(checkpoint_filepath)

## History

In [None]:
import plotly.graph_objs as go

# data
train_loss = history.history['loss']
val_loss = history.history['val_loss']
# Number of epochs that actually ran. This is taken from the recorded history
# rather than from EarlyStopping.stopped_epoch: that attribute is the 0-based
# index of the last epoch, so using it as a count made the x-axis one point
# shorter than the loss curves whenever early stopping triggered.
epoch_stop = len(train_loss)
x_epochs = np.arange(1, epoch_stop + 1)

trace1 = go.Scatter(x=x_epochs,
                    y=train_loss,
                    mode="lines",
                    marker=dict(color='blue'),
                    name='train loss')
trace2 = go.Scatter(x=x_epochs,
                    y=val_loss,
                    mode="lines",
                    marker=dict(color='red'),
                    name='validation loss')

# Kept under its own name so the `data` dict of loaded frames is not
# overwritten and the loading/concat cell stays re-runnable.
traces = [trace1, trace2]
layout = dict(title="train- and validation loss",
              yaxis=dict(title="Loss", ticklen=5, zeroline=False),
              # one tick per epoch, starting at epoch 1
              xaxis=dict(title="Epochs", ticklen=5, zeroline=False,
                         tickmode='linear', tick0=1, dtick=1)
              )
fig = go.Figure(dict(data=traces, layout=layout))
fig.show()

## MSE

In [None]:
import scipy

import numpy.ma as ma

from sklearn.metrics import mean_squared_error

# inputs
test = True  # True : Test, False : Validation

# Pick the split to evaluate and open an iterator over its batches.
iterator = iter(test_data if test else val_data)

# Only the first batch of the chosen split is evaluated here.
x, y = iterator.get_next()
# The model returns three outputs; only the predictions are needed here.
y_pred, _, _ = model(x, training=False)
# Keep the last timestep of every window: prediction and matching target.
y_pred = y_pred[:, -1, :]  # y_pred == shape(batch, 1)
y_true = y[:, -1, :].numpy()

# Features at the last timestep, mapped back to their original scale.
x = scaler.inverse_transform(x[:, -1, :].numpy())

# Feature frame: every column of the raw data except the last (target) one.
cols = list(df.columns)[:-1]
x_features = pd.DataFrame(x, columns=cols)

# Frame with predictions and true targets side by side.
cols = ["Prediction", "True"]
predictions = pd.DataFrame(np.concatenate((y_pred, y_true), axis=1),
                           columns=cols)

# Mean squared error between the true targets and the predictions.
mse = mean_squared_error(predictions["True"], predictions["Prediction"])
print(f"MSE: {mse}")

## Predictions Plots

In [None]:
from plotly.subplots import make_subplots
import plotly.graph_objects as po

# inputs
start = 0
end = 50000

# Clamp the requested window to the predictions that actually exist, so the
# x-axis and the title describe the plotted samples.
end = min(end, len(predictions))

# Select the relevant window. It is stored under its own name so the raw data
# frame `df`, which other cells read for column names and the feature count,
# is not overwritten.
pred_window = predictions[start:end]
samples = np.arange(start, start + len(pred_window))

# True Target Values
trace1 = go.Scatter(
                    x=samples,
                    y=pred_window["True"],
                    mode="lines",
                    name="target",
                    marker=dict(color='rgb(20, 112, 204)'))
# Predicted TFAN values
trace2 = go.Scatter(
                    x=samples,
                    y=pred_window["Prediction"],
                    mode="lines",
                    name="TFAN",
                    marker=dict(color='rgb(255, 55, 55)'))

layout = dict(title="TFAN predictions on test trips - start: {} - end: {}".format(start, end),
              yaxis=dict(title="Target", ticklen=5, zeroline=False),
              xaxis=dict(title="Samples", ticklen=5, zeroline=False)
              )

fig = go.Figure(data=[trace1, trace2], layout=layout)
fig.show()

## Evaluate feature-wise attention

In the context of time series forecasting, the attention of the model is understood as the contribution to a certain prediction. This means that a high contribution is reflected in a high final attention weight, and a lower contribution leads to a smaller attention weight. To analyze this, the following components of the multi-head attention (MHA) are studied:

* attention weights a
* norm of the values ||v||
* norm of the scaled values ||a*v||


In [None]:
# Shapes of the tensors used below:
#   y_pred == (batch, Tx, 1)
#   att    == (batch, num_heads, features, features)
#   val    == (batch, num_heads, features, depth)
#   (Note: d_model = Tx = depth*num_heads)

# inputs
dataset = test_data

# Fetch the first batch of the selected dataset.
iterator = iter(dataset)
x, y = iterator.get_next()

# Forward pass in inference mode; the model returns the tuple
# (predictions, attention_weights, values).
y_pred, att_w, val = model(x, training=False)

### Attention 

In the following cells the notation will be:

* ||v|| : the feature-wise norm of shape == (batch, num_heads, features) scaled by its Wo' component,
    where Wo' is the segment of Wo (used to combine the different heads) applied to the specific head 
* a : the attention weights of shape == (batch, num_heads, features, features) 
* ||a*v|| : the norm of the feature-representations scaled by the attention weights

Note that the output of the MHA is given by the operation x_mha = a@v. This means that every row of x_mha is a different weighted sum of the values (each row is a different feature), with the weights taken from the attention matrix. The weights in row m of the matrix a reflect how much the feature representations in v align with the specific feature (row m in a). To give an example:

The 3rd row of the matrix a (with one attention weight per feature) is used to perform a weighted sum of the values v, based on how much each of the features aligns with the 3rd feature.

#### Scale values with Wo from linear combination of heads

In [None]:
# Get the layer whose weights combine the different heads of the MHA.
mha = model.layers[-2]
# Dimensions are read from the values tensor, (batch, num_heads, features, depth).
# The batch dimension is deliberately not stored: it is not needed here, and
# assigning it to `batch_size` would overwrite the training batch size that
# other cells rely on.
num_heads = val.shape[1]
depth = val.shape[-1]
d_model = mha.d_model

# Linearity of matrix multiplication is used to integrate the linear
# combination of heads into the values vector.
# A: scaled values  B: attention weights  C: values
# A = B@C
# A@Wo = (B@C)@Wo = B@(C@Wo)
# Note: the bias is omitted, as adding a fixed vector should not affect the
# inter-feature interactions.
if num_heads > 1:
    wo = mha.weights[-2]  # shape == (d_model, d_model) = (Tx, Tx) with d_model = num_heads*depth
    # Split Wo into one (depth, d_model) segment Wo' per head; the leading 1
    # broadcasts over the batch dimension.
    wo = tf.reshape(wo, [1, num_heads, depth, d_model])
    # compute v = v@Wo' for all heads
    v = tf.matmul(val, wo)  # shape == (batch, num_heads, features, d_model)
else:
    v = val  # no linear combination is used to combine heads
# Note: in the multi-head case each feature representation of v now has d_model dimensions

#### Compute ||v||, a and ||a*v||

In [None]:
def compute_attention(att, values):
    """Compute the attention analysis from attention weights and values.

    Args:
        att: attention weights of shape == (batch, num_heads, features, features)
        values: values scaled with Wo of shape == (batch, num_heads, features, d_model)

    Output:
        tuple (||v||, att, ||att*v||) with
        ||v|| of shape == (batch, num_heads, features),
        att returned unchanged, and
        ||att*v|| of shape == (batch, num_heads, features, features), where
        entry [i, j] is the norm of value j scaled by the weight att[i, j].
    """
    # ||v||
    v_norm = tf.norm(values, axis=-1)  # (batch, num_heads, features)
    # ||att*v||
    # Each row of att performs a different weighted sum of the values, so
    # entry [i, j] is the norm of att[i, j] * v[j]. A scalar factors out of a
    # norm, ||a*v|| = |a|*||v||, which gives the same result without building
    # the (batch, num_heads, features, d_model, features) intermediate tensor.
    # Expanding v_norm to (batch, num_heads, 1, features) broadcasts the value
    # norms along the columns, so the result has the same alignment as att.
    scaled_values_norm = tf.abs(att) * tf.expand_dims(v_norm, -2)
    # Each row of scaled_values_norm has one value per feature, determining
    # how to attend between the feature at that row and all input features.

    return v_norm, att, scaled_values_norm

# Run the attention analysis on the attention weights and the Wo-scaled values.
v_norm, att, scaled_values_norm = compute_attention(att_w, v)

# Report the shapes of the three results.
print(f"||v|| of shape == ({v_norm.shape[0]}, {v_norm.shape[1]}, {v_norm.shape[2]})"
      " -> (batch size, num_heads, features)")
print(f"att of shape == ({att.shape[0]}, {att.shape[1]}, {att.shape[2]}, {att.shape[3]})"
      " -> (batch size, num_heads, features, features)")
print(f"||att*v|| of shape == ({scaled_values_norm.shape[0]}, {scaled_values_norm.shape[1]}, "
      f"{scaled_values_norm.shape[2]}, {scaled_values_norm.shape[3]})"
      " -> (batch size, num_heads, features, features)")

## Plots

In [None]:
from plots import *

#### Attention Distribution

In [None]:
# Plot the distribution of the attention weights; the helper returns the
# figure together with a second object that is kept as `df_att`.
fig, df_att = attention_distribution(
    att,
    name_of_features=[],
    start=None,
    end=None,
)
fig.show()

## Featurewise Attention

In [None]:
# Plot the feature-wise attention, passing the de-standardized features and
# the true/predicted targets of the evaluated batch.
fig = featurewise_attention(
    att,
    x_features,
    predictions['True'],
    predictions['Prediction'],
    feature=None,
    show_target=False,
    start=None,
    end=None,
)
fig.show()