In [1]:
# Notebook for prediction and evaluation of multi-step forecasting ARTRFDC models

In [2]:
import os
import json
import numpy as np
import pandas as pd
import joblib
from datetime import datetime
from math import sqrt
from sklearn.metrics import mean_squared_error, mean_absolute_error

# uncomment the following line for compatibility with TensorFlow 1.15 (on GCP)
# import tensorflow.compat.v1 as tf
# uncomment the following line for TensorFlow 2.X (local execution)
import tensorflow as tf

# forecast model was saved in TensorFlow 1.15
# but, in order to make predictions locally, has to be loaded with TensorFlow 2
from tensorflow.saved_model import load

In [3]:
# Bokeh helpers to build figures, display them, and export them to HTML files
from bokeh.plotting import figure, show, output_file, save
from bokeh.io import output_notebook
# select a palette
from bokeh.palettes import d3
# configure Bokeh to render plots inline, as notebook cell outputs
output_notebook()

In [4]:
# symmetrical mean absolute percentage error
def smape(targets, predictions):
    '''
    Symmetrical mean absolute percentage error (sMAPE).

    Adds 2 * |prediction - target| / (|target| + |prediction|) over every
    element, and divides the sum by the length of the first axis.

    targets: a list (or array) with the actual values
    predictions: a list (or array) with the predicted values

    Returns the sMAPE as a float (0.0 for a perfect prediction).

    Raises ValueError when targets and predictions do not have the same shape.
    '''
    # lists to NumPy float arrays
    targets = np.asarray(targets, dtype=float)
    predictions = np.asarray(predictions, dtype=float)

    # verify predictions and targets have the same shape;
    # fail loudly instead of silently returning None
    if predictions.shape != targets.shape:
        raise ValueError(
            'targets and predictions must have the same shape, got {} and {}'.format(
                targets.shape, predictions.shape))

    numerator = 2 * np.abs(predictions - targets)
    denominator = np.abs(targets) + np.abs(predictions)

    # when target and prediction are both zero the term is 0/0;
    # that is a perfect prediction, so it contributes 0 instead of NaN
    terms = np.divide(numerator,
                      denominator,
                      out=np.zeros_like(numerator),
                      where=denominator != 0)

    return np.sum(terms) / predictions.shape[0]

In [42]:
def _parse_dataset_function(example_proto, objective_shapes, parse_timestamp):
    """Parse one serialized tf.Example into a (features, target) pair.

    example_proto: a serialized tf.Example, as read from a TFRecord file
    objective_shapes: a dictionary with the shapes used to reshape the parsed
        values, under the keys 'source' and 'target'
    parse_timestamp: when True, the parsed 'timestamp' feature is added to the
        features dictionary; read_features must then define a 'timestamp' entry

    Returns a tuple (feature_dict, target), where feature_dict holds the
    reshaped 'source' tensor (and optionally 'timestamp'), and target is the
    reshaped 'target' tensor.

    Raises KeyError when parse_timestamp is True and the parsed row
    does not contain a 'timestamp' feature.
    """
    # parse the input tf.Example proto using the module-level read_features dictionary
    row = tf.io.parse_single_example(example_proto, read_features)

    # features are read through their .values attribute (a flat list of values),
    # then reshaped to the objective shapes passed by the caller
    source = tf.reshape(row['source'].values, objective_shapes['source'])
    target = tf.reshape(row['target'].values, objective_shapes['target'])

    # the parsed dataset must have the shape {features}, target!!!
    # so:
    feature_dict = {
        'source': source
    }

    # Do not parse the timestamp for training!!! Strings are not supported in TPUs!!!,
    # or parse it as a number
    if parse_timestamp:
        # the timestamp must come from the parsed row
        if 'timestamp' not in row:
            raise KeyError(
                "parse_timestamp=True requires a 'timestamp' entry in read_features")
        feature_dict['timestamp'] = row['timestamp']

    # _parse_dataset_function returns:
    # features as a dictionary, and
    # target as a float tensor
    return feature_dict, target

In [41]:
# a function to encode float values for serialized examples
def _float_feature_from_list_of_values(list_of_values):
    """Wrap a list of floats / doubles in a tf.train.Feature holding a float_list."""
    float_list = tf.train.FloatList(value=list_of_values)
    return tf.train.Feature(float_list=float_list)

In [44]:
def tensor_to_tensor_example(float_tensor):
    """Serialize a float tensor as a tf.Example and return it as a string tensor.

    float_tensor: an eager tensor; its values are flattened and stored
        under the 'source' feature of the example

    Returns a string tensor that wraps the serialized tf.Example.
    """
    # flatten the tensor values into a one-dimensional NumPy array
    flat_values = float_tensor.numpy().flatten()

    # a single 'source' feature holds all the flattened values
    source_feature = _float_feature_from_list_of_values(flat_values)
    example_features = tf.train.Features(feature={'source': source_feature})
    proto = tf.train.Example(features=example_features)

    # serialized bytes -> NumPy-string array -> string tensor
    serialized_bytes = np.array(proto.SerializeToString(), dtype='S')
    return tf.convert_to_tensor(serialized_bytes)

In [43]:
# feature specification used to parse serialized examples:
# 'source' and 'target' are both variable-length float32 features
read_features = {
    feature_name: tf.io.VarLenFeature(dtype=tf.float32)
    for feature_name in ('source', 'target')
}

In [4]:
# project root directory; defaults to the original development location,
# and can be overridden with the CBIDMLTSF_ROOT environment variable
# so the notebook also runs on machines with a different layout
PROJECT_ROOT = os.environ.get('CBIDMLTSF_ROOT', '/home/developer/gcp/cbidmltsf')

# during batch prediction, the SLDB identifier is obtained via Abseil Flags
sldb_id = 'CPE04115_H_kw_20201021084001_ARTRFDC_168'

In [5]:
# build the path to the SLDB directory: <PROJECT_ROOT>/sldbs/<sldb_id>
data_dir = '{}/{}/{}'.format(PROJECT_ROOT, 'sldbs', sldb_id)

# then build the path to the sldb.json file in that directory
# (the time series identifier is read from this file in the following cells)
sldb_json_file = '{}/sldb.json'.format(data_dir)

In [6]:
# read the SLDB json file and decode its contents into a dictionary
with open(sldb_json_file, 'r') as inputfile:
    sldb_dict = json.loads(inputfile.read())

In [7]:
# and get the time series identifier, stored under the 'ts' key of the SLDB dictionary
ts_identifier = sldb_dict['ts']

In [8]:
# use the time series identifier to obtain the SK-Learn scaler used on it,
# stored at <PROJECT_ROOT>/timeseries/<ts_identifier>/scaler.save
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code;
# only load scaler files from a trusted source
scaler = joblib.load('{}/{}/{}/scaler.save'.format(PROJECT_ROOT,
                                                    'timeseries',
                                                    ts_identifier))

# confirm which time series the scaler belongs to
print('Scaler loaded for time series {}'.format(ts_identifier))

Scaler loaded for time series CPE04115_H_kw_20201021084001




In [9]:
# pass all the code to a single notebook cell, then to a function, later...

In [10]:
# during batch prediction, the model identifier is obtained via Abseil Flags;
# here it is set by hand, and later used to locate the saved model and its parameters file
model_id = 'ARTRFDC_TPU_001'

In [18]:
# during batch prediction, the dataset name is obtained via Abseil Flags;
# here it is set by hand, and later used as the TFRecord file name (<dataset>.tfrecord)
dataset = 'test'

In [11]:
# during batch prediction, the execution identifier is obtained via Abseil Flags;
# here it is set by hand, as an integer (it is formatted with two digits in the model directory name)
execution = 2

In [12]:
# use model identifier and execution number to build the model directory string,
# formatted as <model_id>_<two-digit execution>
model_dir = '{}_{:02d}'.format(model_id, execution)

In [13]:
# get the path to the saved model main directory:
# <PROJECT_ROOT>/models/<model_dir>/export/exporter
# (each exported model is a subdirectory of this path)
saved_model_path = '{}/{}/{}/export/exporter'.format(PROJECT_ROOT,
                                                     'models',
                                                     model_dir)

In [14]:
# get all the entries in the saved model path, to find the most recent export
all_files = os.listdir(saved_model_path)
# exported models are stored in directories named with a numeric timestamp;
# keep only the all-digit names, so stray entries (temporary or hidden files)
# are never selected, and compare them as numbers rather than as strings
latest_saved_model_id = max(
    (entry for entry in all_files if entry.isdigit()),
    key=int,
    default=None)
if latest_saved_model_id is None:
    raise FileNotFoundError(
        'No exported model (numeric directory) found in {}'.format(saved_model_path))

In [15]:
# build the full path for the latest saved model dir
# (this is the directory passed to the SavedModel loader in the next cell)
export_dir = '{}/{}'.format(saved_model_path, latest_saved_model_id)
print ('Exported model path is {}'.format(export_dir))

Exported model path is /home/developer/gcp/cbidmltsf/models/ARTRFDC_TPU_001_02/export/exporter/1621561318


In [16]:
# load the saved model from the export directory, using the 'serve' tag,
# then take its 'serving_default' signature as the prediction function
imported = load(export_dir=export_dir, tags='serve')
predict_fn = imported.signatures["serving_default"]

In [19]:
# build a path to the dataset for prediction: <data_dir>/<dataset>.tfrecord
dataset_path = '{}/{}.tfrecord'.format(data_dir, dataset)

# load the dataset as serialized records (they are parsed in a later cell)
tfrecord_dataset = tf.data.TFRecordDataset(dataset_path)

In [20]:
# build the path to the json file with the parameters of the forecasting model:
# <PROJECT_ROOT>/parameters/<model_id>.json
config_json_file = '{}/{}/{}.json'.format(PROJECT_ROOT,
                                          'parameters',
                                          model_id)

In [21]:
# read the model configuration dictionary from the json file in parameters/
with open(config_json_file, 'r') as inputfile:
    configuration = json.loads(inputfile.read())

In [22]:
# objective shapes used to reshape the parsed tensors:
# source and target share the same [num_timesteps, model_dimension] shape
_EXTRACTING_OBJECTIVE_SHAPES = {
    feature_name: [configuration['num_timesteps'], configuration['model_dimension']]
    for feature_name in ('source', 'target')
}

_EXTRACTING_OBJECTIVE_SHAPES

{'source': [168, 7], 'target': [168, 7]}

In [24]:
# parse every serialized record of the TFRecord dataset into a
# (features dictionary, target) pair, reshaped to the objective shapes;
# these features are later used to produce predictions iteratively

parsed_dataset = tfrecord_dataset.map(
    lambda serialized_example: _parse_dataset_function(
        serialized_example,
        _EXTRACTING_OBJECTIVE_SHAPES,
        # ToDo: parse the timestamps for plotting or additional positional encoding, later...
        False))

In [25]:
# from now on, inferences for ARTRFDC are produced in a very different way
# from the one used for DMSLSTM or EDALSTM models (the prediction process has to be iterative)

In [26]:
# each predicted row is a sequence of n_timesteps values,
# but only the first element in this sequence is used, as the first prediction,
# then it is added (along with its positional encodings) to the end of the input sequence
# (first entry of the input sequence is discarded to keep tensor shape)
# to get the second prediction, and so on up to the n_timesteps-th prediction,
# which completes the n_timesteps prediction sequence (the forecast window)
# that starts immediately after the source input sequence ends (in time dimension)

In [27]:
# the iterative process for inference over the ARTRFDC saved model can be initiated now:
# source feature (?, 168, 7) (unseen data) is on parsed_dataset[0]['source']
# target feature (?, 168, 7) (unseen data) is on parsed_dataset[1]

# uncomment and run the following two cells to confirm that

In [28]:
# it is not possible to iterate over a segment of a dataset, as required by iterative inference
# then the complete test dataset will be passed to two NumPy arrays:

# source_array, with shape (n_rows, n_timesteps, n_features), in this example (2095, 168, 7), and
# target_array, with shape (n_rows, n_timesteps, n_features), in this example (2095, 168, 7)

# remember source_array[1:, :, :] == target_array[:-1, :, :] (both slices have n_rows - 1 rows)

In [29]:
# temporary lists that collect the source (features) and target (labels) tensors
source_list = []
target_list = []

# a single pass over the parsed dataset fills both lists;
# every element is a (features dictionary, target tensor) pair
for element in parsed_dataset:
    source_list += [element[0]['source']]
    target_list += [element[1]]

# stack the collected tensors into the NumPy arrays that feed the model
source_array, target_array = np.array(source_list), np.array(target_list)

In [30]:
# verify shape of resulting arrays
source_array.shape, target_array.shape

((2095, 168, 7), (2095, 168, 7))

In [31]:
# now follow the inference process detailed in Klingenbrunn to:
# predict over the forecast window,
# calculate prediction error metrics, and
# plot prediction results

In [32]:
# define a forecast window to guide the iterative prediction process:
# it is the number of prediction steps run by the inference loop
# start with a hourly, day-ahead process
forecast_window = 24

In [33]:
# the first source or input to the model is the first source row
# that means, the true variable value, plus the six positional encodings for the timestamp
# in the first row of the test dataset

In [34]:
# uncomment the following line to get the source as a tensor with TensorShape([1, 168, 7])
# source_tensor = tf.expand_dims(source_array[0, :, :], axis=0)

In [35]:
# ToDo: include timestamps in train, eval, and test datasets to easily keep track of prediction dates!

In [36]:
# now please code all the complicated previous stuff into an easy Python function, would you?

In [37]:
# important, the inference cycle was coded for tensors, not for NumPy arrays
# then use source and prediction tensors and translate to tensor examples from float tensors

In [159]:
# which row of the test dataset will be used for inference?
row = 702

# verify the dataset is long enough to iteratively predict from that row:
# the true values of the forecast window are read from row + num_timesteps,
# so that index must exist in the dataset
max_row_index = len(source_list) - configuration['num_timesteps'] - 1
if not 0 <= row <= max_row_index:
    # stop here with a clear error, instead of failing later with an obscure one
    raise ValueError(
        'The source row index for iterative inference must be between 0 and {}.'.format(
            max_row_index))

In [160]:
# then again, build the initial source tensor:
# the selected row of the source array, with a leading batch axis
source_tensor = tf.expand_dims(source_array[row, :, :], axis=0)

# and build the prediction tensor, also with a leading batch axis:
# the first forecast_window timesteps of the row located num_timesteps rows after the source row
# (assumes consecutive rows are shifted by one timestep, so these are the true values
# that start right after the source sequence ends - TODO confirm against the SLDB builder)

prediction_tensor = tf.expand_dims(
    source_array[row + configuration['num_timesteps'], :forecast_window, :],
    axis=0)

# the first input to the model is the unmodified source tensor
next_input_model = source_tensor

# start from an empty predictions list
predictions_list = list()

# fill the predictions list over the forecast window, one prediction per iteration
for i in range(forecast_window):
    
    # serialize the current input tensor as a tf.Example, run the serving signature,
    # and get its 'forecast' output as a NumPy array
    # (presumably shaped (1, num_timesteps, 1) - confirm against the exported model)
    prediction = predict_fn(tensor_to_tensor_example(next_input_model))['forecast'].numpy()
    
    # keep the value at the last timestep of the forecast (first batch entry, first feature)
    # as the prediction number i
    predictions_list.append(prediction[:, -1, :][0][0])
    
    # from the source tensor, drop the first i+1 timesteps and keep features 1 onwards
    # (the positional encodings, according to the notes in this notebook)
    pos_encoding_old_values = source_tensor[:, i+1:, 1:]

    # from the prediction tensor, get the same features for its first i+1 timesteps
    pos_encoding_new_val = prediction_tensor[:, :i+1, 1:]

    # join old and new positional encodings along the time axis,
    # which restores the original number of timesteps while i+1 <= num_timesteps
    pos_encodings = tf.concat([pos_encoding_old_values, pos_encoding_new_val], axis=1)
    pos_encodings = tf.cast(pos_encodings, dtype=tf.float32)

    # build the values feature for the next input to the model:
    # drop the first i+1 values of feature 0 in the source tensor,
    # and restore the feature axis removed by the integer index
    value_feature_old_values = tf.expand_dims(source_tensor[:, i+1:, 0], axis=-1)
    value_feature_old_values = tf.cast(value_feature_old_values, dtype=tf.float32)

    # current predictions_list to NumPy array
    # (the list holds i+1 values at this point, so the slice takes all of them)
    value_feature_new_values = np.array(predictions_list[:i+1])

    # current prediction array to tensor
    value_feature_new_values = tf.convert_to_tensor(value_feature_new_values)

    # expand dimensions of current prediction tensor to single-value feature
    value_feature_new_values = tf.expand_dims(value_feature_new_values, axis=-1)

    # expand dimensions of current prediction tensor to single-value batch
    value_feature_new_values = tf.expand_dims(value_feature_new_values, axis=0)
    
    # build the value feature tensor: remaining source values followed by the predictions so far
    next_input_model = tf.concat([value_feature_old_values, value_feature_new_values], axis=1)
    
    # build the next input tensor for the model:
    # the value feature first, then the positional encodings, along the feature axis
    next_input_model = tf.concat([next_input_model, pos_encodings], axis=2)

    
# iterative predictions over the forecast window reside in predictions_list
predicted_values = np.array(predictions_list)

# and the true values remain in feature 0 of the prediction tensor, pass them to a NumPy array
true_values = prediction_tensor[0, :, 0].numpy()

# error metric between true and predicted values over the forecast window
# NOTE(review): the loaded scaler is not applied in this cell, so the metric is
# presumably computed on scaled values - confirm this is intended
resulting_smape = smape(true_values, predicted_values)


In [161]:
# plots are kept in a dictionary, indexed by an identifier
plots = {}

In [162]:
# figure for the multi-step prediction;
# the title reports model, execution, dataset, and the resulting SMAPE
# NOTE(review): plot_width / plot_height are named width / height in newer Bokeh
# releases - confirm against the installed version
plots['identifier'] = figure(
    title='Multi-step prediction for model {}, execution {}, on {} dataset, SMAPE = {}'.format(
        model_id, execution, dataset, resulting_smape),
    # x_axis_type='datetime',
    plot_height=400,
    plot_width=960)

# axis labels and a lighter grid
plots['identifier'].xaxis.axis_label = 'Timestep'
plots['identifier'].yaxis.axis_label = 'Value'
plots['identifier'].grid.grid_line_alpha = 0.5

# true values over the forecast window, in green
plots['identifier'].line(
    x=np.arange(len(true_values)),
    y=true_values,
    legend_label='real',
    color='green')

# iterative predictions over the forecast window, in red
plots['identifier'].line(
    x=np.arange(len(predicted_values)),
    y=predicted_values,
    legend_label='predicted',
    color='red')

# uncomment the following two lines to save plot
# output_file('/home/developer/gcp/cbidmltsf/datasets/cfe/{}_H_kw.html'.format(device))
# save(fig_kw)

# uncomment the following line to display plot
show(plots['identifier'])