In [1]:
# Notebook for prediction and evaluation of multi-step forecasting ARTRFDC models

In [2]:
import os
import json
import numpy as np
import pandas as pd
import joblib
from datetime import datetime
from math import sqrt
from sklearn.metrics import mean_squared_error, mean_absolute_error

# uncomment the following line for compatibility with TensorFlow 1.15 (on GCP)
# import tensorflow.compat.v1 as tf
# uncomment the following line for TensorFlow 2.X (local execution)
import tensorflow as tf

# forecast model was saved in TensorFlow 1.15
# but, in order to make predictions locally, has to be loaded with TensorFlow 2
from tensorflow.saved_model import load

In [3]:
# symmetrical mean absolute percentage error
def smape(targets, predictions):
    '''
    predictions: a list with the predicted values
    targets: a list with the actual values
    '''
    import numpy as np
    # lists to NumPy arrays
    targets, predictions = np.array(targets), np.array(predictions)
    # verify predictions and targets have the same shape
    if predictions.shape == targets.shape:
            return(np.sum(2*np.abs(predictions - targets) /
                          (np.abs(targets) + np.abs(predictions)))/predictions.shape[0])

In [4]:
PROJECT_ROOT = '/home/developer/gcp/cbidmltsf'

# during batch prediction, the SLDB identifier is obtained via Abseil Flags
sldb_id = 'CPE04115_H_kw_20201021084001_ARTRFDC_168'

In [5]:
# build a path to the SLDB json file
data_dir = '{}/{}/{}'.format(PROJECT_ROOT, 'sldbs', sldb_id)

# then get the ts_identifier from the json file in the sldb directory
sldb_json_file = '{}/sldb.json'.format(data_dir)

In [6]:
# open the json file
with open(sldb_json_file, 'r') as inputfile:
    sldb_dict = json.load(inputfile)

In [7]:
# and get the time series identifier
ts_identifier = sldb_dict['ts']

In [8]:
# use the time series identifier to obtain the SK-Learn scaler used on it
scaler = joblib.load('{}/{}/{}/scaler.save'.format(PROJECT_ROOT,
                                                    'timeseries',
                                                    ts_identifier))

print('Scaler loaded for time series {}'.format(ts_identifier))

Scaler loaded for time series CPE04115_H_kw_20201021084001




In [9]:
# this code block will be imported as the _parse_dataset_function

read_features = {
    'source': tf.io.VarLenFeature(dtype=tf.float32),
    'target': tf.io.VarLenFeature(dtype=tf.float32)
}


def _parse_dataset_function(example_proto, objective_shapes, parse_timestamp):
    # parse the input tf.Example proto using the dictionary above
    row = tf.io.parse_single_example(example_proto, read_features)
    
    # pass objective shape as a list of lists [hourly_shape, daily_shape, weekly_shape]
    source = tf.reshape(row['source'].values, objective_shapes['source'])
    target = tf.reshape(row['target'].values, objective_shapes['target'])

    # the parsed dataset must have the shape {features}, target!!!
    # so:
    feature_dict = {
        'source': source
    }
    
    # Do not parse the timestamp for training!!! Strings are not supported in TPUs!!!,
    # or parse it as a number
    if parse_timestamp:
        feature_dict['timestamp'] = timestamp

    # _parse_dataset_function returns:
    # features as a dictionary, and
    # target as a float vector
    return feature_dict, target

In [10]:
# pass all the code to a single notebook cell, then to a function, later...

In [11]:
# during batch prediction, the model identifier is obtained via Abseil Flags
model_id = 'ARTRFDC_TPU_000'

In [12]:
# during batch prediction, the dataset name is obtained via Abseil Flags
dataset = 'test'

In [13]:
# during batch prediction, the execution identifier is obtained via Abseil Flags
execution = 0

In [14]:
# use model identifier and execution number to build the model directory string
model_dir = '{}_{:02d}'.format(model_id, execution)

In [15]:
# get the path to the saved model main directory
saved_model_path = '{}/{}/{}/export/exporter'.format(PROJECT_ROOT,
                                                     'models',
                                                     model_dir)

In [16]:
# get all the files in the saved model path, to find the most recent one
all_files = os.listdir(saved_model_path)
# get the path to the most recent saved model
latest_saved_model_id = sorted(all_files)[-1]

In [17]:
# build the full path for the latest saved model dir
export_dir = '{}/{}'.format(saved_model_path, latest_saved_model_id)
print ('Exported model path is {}'.format(export_dir))

Exported model path is /home/developer/gcp/cbidmltsf/models/ARTRFDC_TPU_000_00/export/exporter/1621431330


In [18]:
# load the saved model and the prediction function
imported = load(export_dir=export_dir, tags='serve')
predict_fn = imported.signatures["serving_default"]

In [19]:
# on DMSLSTM or EDALSTM, predictions are obtained by applying the prediction function
# directly on the TFRECORD file, over th features of the test dataset, like this:

In [20]:
# build a path to the dataset for prediction
dataset_path = '{}/{}.tfrecord'.format(data_dir, dataset)

# load the dataset
tfrecord_dataset = tf.data.TFRecordDataset(dataset_path)

In [21]:
# a list to store prediction values
predictions_list = list()

In [22]:
# TensorFlow 2 eager execution allows to iterate over a dataset
for element in tfrecord_dataset:
    predictions_list.append(predict_fn(element))


In [23]:
# get prediction values from predictions list
predictions = [p['forecast'][0] for p in predictions_list]

In [24]:
# confirm the number of predictions and the tensor shape
len(predictions), predictions[0].shape

(2095, TensorShape([168, 1]))

In [25]:
# from now on, inferences for ARTRFDC are produced in a very different way
# from the used for DMSLSTM or EDALSTM models (prediction process has to be iterative)

In [26]:
# each predicted row is a sequence of n_timesteps values,
# but only the first element in this sequence is used, as the first prediction,
# then it is added (along with its positional encodings) to the end of the input sequence
# (first entry of the input sequence is discarded to keep tensor shape)
# to get the second prediction, and so on up to the n_timesteps-th prediction,
# which completes the n_timesteps prediction sequence (the forecast window)
# that starts immediately after the source input sequence ends (in time dimension)

In [27]:
# get the SLDB parameters for the forecasting model
config_json_file = '{}/{}/{}.json'.format(PROJECT_ROOT,
                                          'parameters',
                                          model_id)

In [28]:
# recover the sldb dictionary from the json file in parameters/
with open(config_json_file, 'r') as inputfile:
    configuration = json.load(inputfile)

In [29]:
# store the objective shapes for reshaping tensors in a dictionary
_EXTRACTING_OBJECTIVE_SHAPES = {
    'source': [configuration['num_timesteps'], configuration['model_dimension']],
    'target': [configuration['num_timesteps'], configuration['model_dimension']]
}

_EXTRACTING_OBJECTIVE_SHAPES

{'source': [168, 7], 'target': [168, 7]}

In [30]:
# parse the test dataset from the TFRecord file
# and use its features to produce predictions in a different way

parsed_dataset = tfrecord_dataset.map(
    lambda row: _parse_dataset_function(
        example_proto=row,
        objective_shapes=_EXTRACTING_OBJECTIVE_SHAPES,
        # ToDo: parse the timestamps for plotting or additional positional encoding, later...
        parse_timestamp=False
    )
)

In [31]:
parsed_dataset

<MapDataset shapes: ({source: (168, 7)}, (168, 7)), types: ({source: tf.float32}, tf.float32)>

In [32]:
# the iterative process for inference over the ARTRFDC saved model can be initiated now:
# source feature (?, 168, 7) (unseen data) is on parsed_dataset[0]['source']
# target feature (?, 168, 7) (unseen data) is on parsed_dataset[1]

# uncomment and run the following two cells to confirm that

In [33]:
# for element in parsed_dataset:
#     print (element[0]['source'])

# confirmed!

In [34]:
# for element in parsed_dataset:
#     print (element[1])
    
# confirmed!

In [38]:
# it is not possible to iterate over a segment of a dataset, as required by iterative inference
# then the complete test dataset will be passed to two NumPy arrays:

# source_array, with shape (n_rows, n_timesteps, n_features), in this example (2095, 168, 7), and
# target_array, with shape (n_rows, n_timesteps, n_features), in this example (2095, 168, 7)

# remember source_array[1:, :, :] = target_array[:-2, :, :]

In [39]:
source_list = list()
target_list = list()

for element in parsed_dataset:
    source_list.append(element[0]['source'])
    target_list.append(element[1])

source_array = np.array(source_list)
target_array = np.array(target_list)

In [40]:
# verify shape of resulting arrays
source_array.shape, target_array.shape

((2095, 168, 7), (2095, 168, 7))

In [35]:
# now follow the inference process detailed in Klingenbrunn to:
# predict over the forecast window,
# calculate prediction error metrics, and
# plot prediction results

In [36]:
# define a forecast window to guide the iterative prediction process
# start with a hourly, day-ahead process
forecast_window = 24

In [None]:
# the first source or input to the model is the first source row
# that means, the true variable value, plus the six positional encodings for the timestamp
# in the first row of the test dataset

In [126]:
# uncomment the following line to get the source as a tensor with TensorShape([1, 168, 7])
# source_tensor = tf.expand_dims(source_array[0, :, :], axis=0)

In [130]:
# in order to use the predict_fn built for TFRecords,
# it is better to keep the source as a NumPy array
source = source_array[:1, :, :]
source.shape

(1, 168, 7)

In [None]:
# break Klingenbrunn naming conventions here:
# the forecast-window-size array with true values (1, 24, 7)
# will not be called 'target', but 'prediction_target'

In [None]:
# ToDo: include timestamps in train, eval, and test datasets to easily keep tracking of prediction dates!

In [131]:
# get the prediction target from the first row of the test dataset
prediction_target = target_array[:1, :forecast_window, :]
prediction_target.shape

(1, 24, 7)

In [72]:
# take a look at the variable values of source and prediction_target
# prediction_target should be equal to source, trimmed to forecast window and shifted right

In [132]:
source[:, :forecast_window, 0]

array([[0.63997614, 0.6179838 , 0.61890036, 0.6585131 , 0.695615  ,
        0.6312844 , 0.52303356, 0.40595847, 0.2784648 , 0.20911317,
        0.15208519, 0.14222518, 0.1461949 , 0.19253159, 0.22639433,
        0.2271737 , 0.28913516, 0.37109476, 0.4790466 , 0.6021724 ,
        0.63034314, 0.64071214, 0.6699102 , 0.65957916]], dtype=float32)

In [133]:
prediction_target[:, :, 0]

array([[0.6179838 , 0.61890036, 0.6585131 , 0.695615  , 0.6312844 ,
        0.52303356, 0.40595847, 0.2784648 , 0.20911317, 0.15208519,
        0.14222518, 0.1461949 , 0.19253159, 0.22639433, 0.2271737 ,
        0.28913516, 0.37109476, 0.4790466 , 0.6021724 , 0.63034314,
        0.64071214, 0.6699102 , 0.65957916, 0.6380292 ]], dtype=float32)

In [None]:
# therefore, source and prediction_target arrays are ready for inference process

In [134]:
# however, predict_fn from saved model signatures only works with TFRecord datasets as input!

# basic question: how to get a prediction from a saved-model prediction function
# that expects as input example protocol-buffer rows from a TFRecord dataset
# when the data is dynamically obtained from a NumPy array???

# here is the answer from the ML-pipeline specialist I have turned into

In [135]:
# first, flat the NumPy array to pass it to a protobuffer example
flat_array = source.flatten()
flat_array.shape

(1176,)

In [136]:
# a function to encode float values for serialized examples
def _float_feature_from_list_of_values(list_of_values):
    """Returns a float_list from a list of floats / doubles."""
    return tf.train.Feature(float_list=tf.train.FloatList(value=list_of_values))

In [137]:
# second, build the protobuffer example
example = tf.train.Example(
    # features within the example
    features=tf.train.Features(
        # individual feature definition
        feature={'source': _float_feature_from_list_of_values(flat_array)
                 # 'target': _float_feature_from_list_of_values(results[stage]['target'][row].flatten())
                 }
    )
)

In [138]:
# third, serialize the example dictionary to a string
serialized_example = example.SerializeToString()

In [139]:
# fourth, wrap the serialized example as a NumPy-string array
numpy_example = np.array(serialized_example, dtype='S')

In [140]:
# fifth, wrap the NumPy-string array as a string tensor
tensor_example = tf.convert_to_tensor(numpy_example)

In [165]:
# now we can use the prediction function from the saved model
# on the generated tensor example, and get the prediction (1, 168, 1)
# show only a few of the 168 timesteps
predict_fn(tensor_example)['forecast'][:, :6, :]

<tf.Tensor: shape=(1, 6, 1), dtype=float32, numpy=
array([[[0.6134994 ],
        [0.59551287],
        [0.70238775],
        [0.65899956],
        [0.61110747],
        [0.48638266]]], dtype=float32)>

In [143]:
# as I said, I have turned into a serious ML-pipeline specialist (dark glasses with pixelated shine)

In [144]:
# now please code all the complicated previous stuff into an easy Python function, would you?

In [None]:
# important, the inference cycle was coded for tensors, not for NumPy arrays
# then use source and prediction tensors and translate to tensor examples from float tensors

In [167]:
# then again, build the initial source tensor
source_tensor = tf.expand_dims(source_array[0, :, :], axis=0)
source_tensor.shape

TensorShape([1, 168, 7])

In [168]:
# and build the initial prediction tensor
prediction_tensor = tf.expand_dims(target_array[0, :forecast_window, :], axis=0)
prediction_tensor.shape

TensorShape([1, 24, 7])

In [149]:
# a function to encode float values for serialized examples
def _float_feature_from_list_of_values(list_of_values):
    """Returns a float_list from a list of floats / doubles."""
    return tf.train.Feature(float_list=tf.train.FloatList(value=list_of_values))

In [161]:
def tensor_to_tensor_example(float_tensor):
    # first, pass the float tensor to NumPy array, then flatten it
    flat_array = float_tensor.numpy().flatten()
    # second, build the protobuffer example
    example = tf.train.Example(
        # features within the example
        features=tf.train.Features(
            # individual feature definition
            feature={'source': _float_feature_from_list_of_values(flat_array)}
        )
    )    
    # third, serialize the example dictionary to a string
    serialized_example = example.SerializeToString()
    # fourth, wrap the serialized example as a NumPy-string array
    numpy_example = np.array(serialized_example, dtype='S')
    # fifth, wrap the NumPy-string array as a string tensor
    tensor_example = tf.convert_to_tensor(numpy_example)

    return tensor_example

In [182]:
next_input_model = source_tensor

In [183]:
# re-initialize the prediction list previously used for prediction over TFRecords
predictions_list = list()

In [184]:
for i in range(forecast_window):
    
    # from the current next_input_model tensor (1, 168, 7)
    # get a prediction as NumPy array (1, 168, 1)
    prediction = predict_fn(tensor_to_tensor_example(next_input_model))['forecast'].numpy()
    
    # get the value of the most recent prediction (last timestep) into the predictions list
    predictions_list.append(prediction[:, -1, :][0][0])
    
    # from the source tensor, get the positional encodings for ti+1 to t167 (that is 168-i-1 values)
    pos_encoding_old_values = source_tensor[:, i+1:, 1:]

    # from target tensor, get the positional encodings for t168 to t168+i (that is i+1 values)
    pos_encoding_new_val = prediction_tensor[:, :i+1, 1:]

    # build new positional encodings with 168 values
    pos_encodings = tf.concat([pos_encoding_old_values, pos_encoding_new_val], axis=1)
    pos_encodings = tf.cast(pos_encodings, dtype=tf.float32)

    # build the values feature for the next input to the model
    # pop i+1 values at the beginning of the previous input
    value_feature_old_values = tf.expand_dims(source_tensor[:, i+1:, 0], axis=-1)
    value_feature_old_values = tf.cast(value_feature_old_values, dtype=tf.float32)

    # current predictions_list to NumPy array
    value_feature_new_values = np.array(predictions_list[:i+1])

    # current prediction array to tensor
    value_feature_new_values = tf.convert_to_tensor(value_feature_new_values)

    # expand dimensions of current prediction tensor to single-value feature
    value_feature_new_values = tf.expand_dims(value_feature_new_values, axis=-1)

    # expand dimensions of current prediction tensor to single-value batch
    value_feature_new_values = tf.expand_dims(value_feature_new_values, axis=0)
    
    # build the value feature tensor
    next_input_model = tf.concat([value_feature_old_values, value_feature_new_values], axis=1)
    
    # build the next input tensor for the model
    next_input_model = tf.concat([next_input_model, pos_encodings], axis=2)

In [186]:
# iterative predictions over the forecast window reside in predictions_list
np.array(predictions_list)

array([0.63384354, 0.3797493 , 0.3085147 , 0.27068666, 0.22977859,
       0.19375685, 0.16552916, 0.14017448, 0.11583665, 0.10005224,
       0.09879532, 0.10349613, 0.12018436, 0.14141637, 0.16745162,
       0.19853613, 0.23117605, 0.25464848, 0.2710778 , 0.28217995,
       0.28996986, 0.2943427 , 0.29726797, 0.3010141 ], dtype=float32)

In [188]:
# and the true values remain in the prediction tensor
prediction_tensor[0, :, 0]

<tf.Tensor: shape=(24,), dtype=float32, numpy=
array([0.6179838 , 0.61890036, 0.6585131 , 0.695615  , 0.6312844 ,
       0.52303356, 0.40595847, 0.2784648 , 0.20911317, 0.15208519,
       0.14222518, 0.1461949 , 0.19253159, 0.22639433, 0.2271737 ,
       0.28913516, 0.37109476, 0.4790466 , 0.6021724 , 0.63034314,
       0.64071214, 0.6699102 , 0.65957916, 0.6380292 ], dtype=float32)>

In [189]:
smape(prediction_tensor[0, :, 0], np.array(predictions_list))

0.5981172323226929