## Recursive inference (multi-step) for BSCTRFM models

### use time series instead of SLDB arrays for easier and more efficient timestamp management

In [1]:
import os
import json
import numpy as np
import pandas as pd
import joblib
from datetime import datetime
from math import sqrt
from sklearn.metrics import mean_squared_error, mean_absolute_error

# uncomment the following line for compatibility with TensorFlow 1.15 (on GCP)
# import tensorflow.compat.v1 as tf
# uncomment the following line for TensorFlow 2.X (local execution)
import tensorflow as tf

# forecast model was saved in TensorFlow 1.15
# but, in order to make predictions locally, has to be loaded with TensorFlow 2
from tensorflow.saved_model import load

In [2]:
# symmetric mean absolute percentage error
def symmetric_mean_absolute_percentage_error(targets, predictions):
    '''
    Symmetric mean absolute percentage error (SMAPE) between targets and predictions.

    Every element contributes 2*|prediction - target| / (|target| + |prediction|);
    the contributions are summed over all the elements and the sum is divided
    by the length of the first axis of the inputs.

    targets: a list (or array) with the actual values
    predictions: a list (or array) with the predicted values

    returns: the SMAPE value
    raises: ValueError when targets and predictions do not have the same shape
    '''
    # lists to NumPy float arrays
    targets = np.asarray(targets, dtype=float)
    predictions = np.asarray(predictions, dtype=float)

    # verify predictions and targets have the same shape,
    # and fail loudly instead of silently returning None
    if predictions.shape != targets.shape:
        raise ValueError(
            'targets and predictions must have the same shape, got {} and {}'.format(
                targets.shape, predictions.shape))

    numerator = 2 * np.abs(predictions - targets)
    denominator = np.abs(targets) + np.abs(predictions)

    # when target and prediction are both zero the ratio is 0/0;
    # the prediction is exact, so that element contributes zero error (not NaN)
    ratios = np.divide(numerator,
                       denominator,
                       out=np.zeros_like(numerator),
                       where=denominator != 0)

    return np.sum(ratios) / predictions.shape[0]

In [3]:
# a function to encode float values for serialized examples
def _float_feature_from_list_of_values(list_of_values):
    """Wraps a sequence of floats / doubles in a tf.train.Feature that holds a FloatList."""
    float_list = tf.train.FloatList(value=list_of_values)
    return tf.train.Feature(float_list=float_list)

In [4]:
# converts a set of tensors to a feature dict to a serialized example to pass it
# to the prediction function of the saved model 
def input_tensors_to_serialized_example(encoder_input_float_tensor,
                                        decoder_input_float_tensor,
                                        id_float_tensor):
    """Packs three float tensors into a serialized tf.train.Example string tensor.

    Each tensor is converted to a NumPy array and flattened, then stored as a
    float feature under the keys 'encoder_input', 'decoder_input' and 'id'.
    The example is serialized to a string, wrapped as a NumPy-string array
    and returned as a string tensor.
    """
    # feature key -> source tensor, in the order the features are built
    named_tensors = (
        ('encoder_input', encoder_input_float_tensor),
        ('decoder_input', decoder_input_float_tensor),
        ('id', id_float_tensor),
    )

    # flatten every tensor through NumPy and encode it as a float feature
    feature_map = {
        feature_key: _float_feature_from_list_of_values(float_tensor.numpy().flatten())
        for feature_key, float_tensor in named_tensors
    }

    # build the protobuffer example and serialize it to a string
    example_proto = tf.train.Example(features=tf.train.Features(feature=feature_map))
    serialized_bytes = example_proto.SerializeToString()

    # wrap the serialized example as a NumPy-string array, then as a string tensor
    return tf.convert_to_tensor(np.array(serialized_bytes, dtype='S'))

In [5]:
# root directory of the project on the local file system;
# it can be overridden with the CBIDMLTSF_PROJECT_ROOT environment variable,
# so the notebook does not have to be edited to run on a different machine
PROJECT_ROOT = os.environ.get('CBIDMLTSF_PROJECT_ROOT', '/home/developer/gcp/cbidmltsf')

In [6]:
# identifier of the forecasting model to run inference with
# (during batch prediction, the model identifier is obtained via Abseil Flags)
# remember this notebook is based on local execution,
# therefore the model directory must be downloaded from GS before running the notebook
model_id = 'BSCTRFM_TPU_021'

In [7]:
# execution number of the model; it is combined with model_id to locate the model directory
# (during batch prediction, the execution identifier is obtained via Abseil Flags)
execution = 0

In [8]:
# identifier of the SLDB directory, where the time series and the scalers are read from
# (during batch prediction, the SLDB identifier is obtained via Abseil Flags)
# THE SLDB FOR INFERENCE MUST BE THE SAME USED FOR TRAINING! (THE ONE SETUP IN THE CONFIGURATION FILE)
sldb_id = 'LD2011-2014_SEPARATED_MT_320-MT_330_BSCTRFM_168_168_07DB_MMX'

In [9]:
# name of the dataset used for inference; it is recorded in the 'dataset' column of the results
# (during batch prediction, the dataset name is obtained via Abseil Flags)
dataset = 'test'

In [10]:
# define a forecast window to guide the iterative prediction process:
# it is the number of recursive one-step predictions made for each starting row
# start with an hourly, day-ahead process
forecast_window = 24

In [11]:
# the inference identifier distinguishes the different inferences that can be produced
# from a single saved model of a transformer-based model
# (usually one per forecast window);
# it is the forecast window as a zero-padded, three-digit string
# during batch prediction, the inference identifier should be obtained via Abseil Flags
inference = format(forecast_window, '03d')

In [12]:
# directory of the SLDB, located under the 'sldbs' folder of the project
data_dir = '/'.join([PROJECT_ROOT, 'sldbs', sldb_id])

# the json file that stores the ts_identifier resides in the sldb directory
sldb_json_file = data_dir + '/sldb.json'

In [13]:
# open the json file and parse it into the SLDB dictionary
# (the encoding is set explicitly, so reading does not depend on the locale default)
with open(sldb_json_file, 'r', encoding='utf-8') as inputfile:
    sldb_dict = json.load(inputfile)

In [14]:
# and get the time series identifier
# (stored under the 'ts' key of the SLDB dictionary)
ts_identifier = sldb_dict['ts']
# display it for verification
ts_identifier

'LD2011-2014_SEPARATED_MT_320-MT_330'

In [15]:
# build the path to the parameters file of the forecasting model
config_json_file = '{}/{}/{}.json'.format(PROJECT_ROOT,
                                          'parameters',
                                          model_id)

# recover the model configuration dictionary from the json file in parameters/
# (the encoding is set explicitly, so reading does not depend on the locale default)
with open(config_json_file, 'r', encoding='utf-8') as inputfile:
    configuration = json.load(inputfile)

In [16]:
# m is read from the 'hourly' entry of the 'embedding' dictionary of the SLDB;
# the inference cell uses it as the number of rows of the encoder input
m = sldb_dict['embedding']['hourly']
m

168

In [17]:
# t is read from the 'no_targets' entry of the SLDB dictionary;
# the inference cell uses it as the number of rows of the decoder input
t = sldb_dict['no_targets']
t

168

In [18]:
# verify the values of the variables for batch inference:
# display the identifiers that parameterize this inference run
model_id, execution, dataset, inference

('BSCTRFM_TPU_021', 0, 'test', '024')

In [19]:
# directory with the models exported for the model_id and execution set in the previous cells
exporter_dir = '{}/models/{}_{:02d}/export/exporter'.format(PROJECT_ROOT, model_id, execution)
# every entry of that directory is taken as a saved model identifier
saved_models = os.listdir(exporter_dir)

In [20]:
# sort the saved model identifiers in ascending (string) order, then display them
saved_models = sorted(saved_models)
saved_models

['1633995397', '1633995589', '1633995770', '1633995965', '1633996143']

In [21]:
# columns of the time series that are selected to build the encoder input;
# the day-of-month and day-of-year encodings are currently disabled (commented out)
encoder_input_columns = [
    'kw_scaled',
    'sin_hours_from_start',
    'cos_hours_from_start',
    'sin_hour_day',
    'cos_hour_day',
    'sin_day_week',
    'cos_day_week',
    # 'sin_day_month',
    # 'cos_day_month',
    # 'sin_day_year',
    # 'cos_day_year'
]

In [22]:
# the decoder input uses the same columns as the encoder input;
# take a copy of the list, so a later change to one of the lists does not alter the other
decoder_input_columns = list(encoder_input_columns)

In [23]:
# column of the time series that is selected to build the id input (the id for the customer)
id_columns = ['token_id']

In [24]:
# now get the time series for the test dataset (unseen data)

In [25]:
# build the time series directory from the dataset name, so the folder that is read
# always matches the 'dataset' label recorded with the predictions
time_series_folder = '{}/{}'.format(data_dir, dataset)
time_series_folder

'/home/developer/gcp/cbidmltsf/sldbs/LD2011-2014_SEPARATED_MT_320-MT_330_BSCTRFM_168_168_07DB_MMX/test'

In [26]:
# sorted names of the files found in the time series directory
time_series_list = sorted(os.listdir(time_series_folder))
time_series_list

['MT_320.pkl',
 'MT_321.pkl',
 'MT_322.pkl',
 'MT_323.pkl',
 'MT_324.pkl',
 'MT_325.pkl',
 'MT_326.pkl',
 'MT_327.pkl',
 'MT_328.pkl',
 'MT_329.pkl',
 'MT_330.pkl']

In [27]:
# use a dictionary to keep the code consistent with the SLDB building process
# most of the time, only ts['test'] will be used for inference
# however, ts['eval'] might also be used, as it has not really been seen by the training process
# (no training modification resulted from the evaluation stage)

### rename the time series dictionary to ts_test, use the customer_id as key

In [28]:
# time series loaded for inference, keyed by customer_id
ts_test = {}

In [29]:
# predictions detail dataframes, keyed by customer id (one dataframe per customer)
predictions_detail = {}

## skip the following code to avoid inference process

### set customer_id, load time series and scaler on training data

In [30]:
# Recursive (multi-step) inference for every customer and every saved model.
#
# For each customer: load the time series and the scaler. For each saved model:
# load the prediction function, run a recursive forecast of forecast_window steps
# from every starting row, compute error metrics on the rescaled values, and
# persist one predictions detail dataframe per (customer, saved model).

# a second inference identifier to run more than one inference
# on the same combination model_id, execution, inference;
# to produce different inference processes because predict_fn is stochastic
event = 0

# type of the scaler to load: 'min_max' or 'standard'
# (it is the prefix of the scaler file name in the scalers/ directory)
scaler_type = 'min_max'
if scaler_type not in ('min_max', 'standard'):
    raise ValueError('Unsupported scaler type: {}'.format(scaler_type))

# hard-wired switcher to persist dataframe
persist_detail = True

# on every step the most recent prediction replaces one more row at the end of the
# decoder input (which has t rows), so at least one and at most t steps are possible
if not 1 <= forecast_window <= t:
    raise ValueError(
        'forecast_window must be between 1 and {}, got {}'.format(t, forecast_window))

# a columns list for the predictions dataframe
pred_df_columns = ['model_id',
                   'execution',
                   'dataset',
                   'inference',
                   'customer_id',
                   'string_timestamps',
                   'predictions',
                   'targets']

# the metrics columns that follow them in every row
metric_columns = ['mae', 'nd', 'rmse', 'nrmse', 'smape']

# use model identifier and execution number to build the model directory string
model_dir = '{}_{:02d}'.format(model_id, execution)

# get the path to the saved model main directory
saved_model_path = '{}/{}/{}/export/exporter'.format(PROJECT_ROOT,
                                                     'models',
                                                     model_dir)

# directory where the predictions detail dataframes are persisted;
# create it beforehand, so a long inference run does not fail when writing its results
predictions_detail_dir = '{}/{}/{}'.format(PROJECT_ROOT, 'database', 'predictions_detail')
if persist_detail:
    os.makedirs(predictions_detail_dir, exist_ok=True)

# iterate on a set of valid rows of the test dataset
starting_point = 0 # based on the inference dataset
span = 1 + 6*24 # number of days in the test dataset, expressed in hours
dataset_row_indexes_list = starting_point + np.arange(span)

# number of rows a time series needs, so that the last recursive step
# of the last starting row still gets a complete interval of m + t rows
required_rows = (starting_point + span - 1) + (m + t) + (forecast_window - 1)

for customer_id in ['MT_320', 'MT_321', 'MT_322', 'MT_323', 'MT_324', 'MT_325',
                    'MT_326', 'MT_327', 'MT_328', 'MT_329', 'MT_330']:

    # read the time series for the current customer, from the folder of the dataset
    # that is recorded with the predictions
    # NOTE(review): unpickling can execute arbitrary code, only load trusted files
    customer_series = pd.read_pickle('{}/{}/{}.pkl'.format(data_dir, dataset, customer_id))

    # for consistency, rename the column 'date' as 'timestamp',
    # then set the column 'timestamp' as index
    customer_series = customer_series.rename(columns={"date": "timestamp"}).set_index('timestamp')
    ts_test[customer_id] = customer_series

    # report start and end timestamp for the loaded time series
    print('Loaded {} time series for {}, which spans from {} to {}'.format(
        dataset,
        customer_id,
        str(customer_series.index[0]),
        str(customer_series.index[-1])))

    # a slice that runs past the end of the series is silently truncated by pandas,
    # which would misalign model inputs and targets; fail explicitly instead
    if len(customer_series) < required_rows:
        raise ValueError(
            'Time series for {} has {} rows, but {} rows are required'.format(
                customer_id, len(customer_series), required_rows))

    # build a path to the scaler of the time series, and load it
    scaler_path = '{}/scalers/{}_{}.save'.format(data_dir, scaler_type, customer_id)
    scaler = joblib.load(scaler_path)

    # pass the saved model identifier (it determines the number of training epochs)
    # and avoid using the latest saved model by default
    for saved_model_id in saved_models:

        # build the full path for the saved model dir
        export_dir = '{}/{}'.format(saved_model_path, saved_model_id)
        print('Exported model path is {}'.format(export_dir))

        # load the saved model and the prediction function
        imported = load(export_dir=export_dir, tags='serve')
        predict_fn = imported.signatures["serving_default"]

        # one record (a row of the predictions detail dataframe) per starting row;
        # the dataframe is built once from this list, instead of being grown
        # with a concatenation per row
        detail_records = []

        for start_index in dataset_row_indexes_list:

            # define first prediction interval with start- and end-index
            # given the interval time_series[start_index:end_index]
            # the conditioning range is the union of the encoder-input and the decoder-input
            # and the prediction range is only the last reading in the interval,
            # by means of a recursive inference process
            # on each step the last prediction is added to the decoder input
            # and the prediction range grows one step into the future

            # get the end-index of this recursive inference interval
            end_index = start_index + (m + t)

            # initialize a list to store recurrent predictions for this interval
            predictions_list = list()

            for i in range(forecast_window):

                # build the inference interval as a sub-series of the dataset
                sub_series = customer_series[start_index + i : end_index + i]

                # important: build sources as copies of the sub-series
                # (and therefore of the global time series)
                # to avoid overwriting the original dataset

                # the encoder input source
                encoder_input = sub_series[encoder_input_columns][:m].copy()

                # the decoder input source
                decoder_input = sub_series[decoder_input_columns][m-1:m-1+t].copy()

                # the id (integer) for the customer
                id_input = sub_series[id_columns][:1].copy()

                # on first step (i=0), the decoder input carries only true values
                # and the predictions list is empty
                # on subsequent steps, the decoder input includes all previous predictions
                # (stored in the predictions list)
                if i > 0:
                    # a single positional assignment; chained indexing
                    # (decoder_input['kw_scaled'][-i:] = ...) may write to a temporary copy
                    kw_position = decoder_input.columns.get_loc('kw_scaled')
                    decoder_input.iloc[-i:, kw_position] = predictions_list

                # the target source, for metrics calculation
                # the first part of the sub-series is the encoder input, and
                # the second part of the sub-series is the target (only the variable column!)
                target = sub_series['kw_scaled'][m:].copy()

                # build source tensors from the sub-series
                encoder_input_tensor = tf.expand_dims(encoder_input, axis=0)
                decoder_input_tensor = tf.expand_dims(decoder_input, axis=0)
                id_tensor = tf.expand_dims(id_input, axis=0)

                # make input example for the prediction function
                input_example = input_tensors_to_serialized_example(encoder_input_tensor,
                                                                    decoder_input_tensor,
                                                                    id_tensor)

                # get the output of the prediction function as a dictionary
                predict_output_dict = predict_fn(input_example)

                # get the prediction output tensor
                predict_output_tensor = predict_output_dict['forecast']

                # get the most recent prediction
                most_recent_prediction = predict_output_tensor[0, :, 0].numpy()[-1]

                # append the most recent prediction timestep to the predictions list
                predictions_list.append(most_recent_prediction)

            # iterative predictions over the forecast window reside in predictions_list
            # convert list to array, then expand feature dimension with value 1
            predicted_values = np.array(predictions_list).reshape(-1, 1)

            # inverse-scale predictions
            rescaled_predicted_values = scaler.inverse_transform(predicted_values)

            # the true values are the last forecast_window rows of the target built on
            # the last recursive step; expand feature dimension with value 1
            true_values = np.array(target[-forecast_window:]).reshape(-1, 1)

            # inverse-scale true values
            rescaled_true_values = scaler.inverse_transform(true_values)

            # calculate mean absolute error and normalized deviation
            mae = mean_absolute_error(rescaled_true_values, rescaled_predicted_values)
            true_values_average = np.mean(rescaled_true_values)

            # calculate root mean squared error and normalized root mean squared error
            rmse = sqrt(mean_squared_error(rescaled_true_values, rescaled_predicted_values))

            # the record built from the data in the current row
            # (np.ravel keeps predictions and targets as lists, even for a single step)
            detail_records.append({
                'model_id': model_id,
                'execution': execution,
                'dataset': dataset,
                'inference': inference,
                'customer_id': customer_id,
                'string_timestamps':
                    pd.to_datetime(target.index[-forecast_window:]).astype(str).tolist(),
                'predictions': np.ravel(rescaled_predicted_values).tolist(),
                'targets': np.ravel(rescaled_true_values).tolist(),
                'mae': mae,
                'nd': mae/true_values_average,
                'rmse': rmse,
                'nrmse': rmse/true_values_average,
                'smape': symmetric_mean_absolute_percentage_error(rescaled_true_values,
                                                                  rescaled_predicted_values)
            })

        # build the predictions detail dataframe as a key-value pair of the dictionary,
        # once all of its rows (dataset) have been processed; the index is 0..n-1
        predictions_detail[customer_id] = pd.DataFrame(detail_records,
                                                       columns=pred_df_columns + metric_columns)

        if persist_detail:
            # name of the persisted dataframe
            # (for electricity dataset, customer_id takes the place of dataset)
            detail_name = '{}_{:02d}_{}_{}_{}_{:02d}'.format(model_id,
                                                             execution,
                                                             saved_model_id,
                                                             customer_id,
                                                             inference,
                                                             event)

            # build a path to persist the dataframe to database/predictions_detail/
            detail_pickle_path = '{}/{}.pkl'.format(predictions_detail_dir, detail_name)

            # persist the Pandas dataframe to database/predictions_detail/
            predictions_detail[customer_id].to_pickle(detail_pickle_path)
            print('Persisted predictions detail of {}'.format(detail_name))

Loaded test time series for MT_320, which spans from 2014-08-18 01:00:00 to 2014-09-07 23:00:00
Exported model path is /home/developer/gcp/cbidmltsf/models/BSCTRFM_TPU_021_00/export/exporter/1633995397
Persisted predictions detail of BSCTRFM_TPU_021_00_1633995397_MT_320_024_00
Exported model path is /home/developer/gcp/cbidmltsf/models/BSCTRFM_TPU_021_00/export/exporter/1633995589
Persisted predictions detail of BSCTRFM_TPU_021_00_1633995589_MT_320_024_00
Exported model path is /home/developer/gcp/cbidmltsf/models/BSCTRFM_TPU_021_00/export/exporter/1633995770
Persisted predictions detail of BSCTRFM_TPU_021_00_1633995770_MT_320_024_00
Exported model path is /home/developer/gcp/cbidmltsf/models/BSCTRFM_TPU_021_00/export/exporter/1633995965
Persisted predictions detail of BSCTRFM_TPU_021_00_1633995965_MT_320_024_00
Exported model path is /home/developer/gcp/cbidmltsf/models/BSCTRFM_TPU_021_00/export/exporter/1633996143
Persisted predictions detail of BSCTRFM_TPU_021_00_1633996143_MT_320_0

Persisted predictions detail of BSCTRFM_TPU_021_00_1633995397_MT_328_024_00
Exported model path is /home/developer/gcp/cbidmltsf/models/BSCTRFM_TPU_021_00/export/exporter/1633995589
Persisted predictions detail of BSCTRFM_TPU_021_00_1633995589_MT_328_024_00
Exported model path is /home/developer/gcp/cbidmltsf/models/BSCTRFM_TPU_021_00/export/exporter/1633995770
Persisted predictions detail of BSCTRFM_TPU_021_00_1633995770_MT_328_024_00
Exported model path is /home/developer/gcp/cbidmltsf/models/BSCTRFM_TPU_021_00/export/exporter/1633995965
Persisted predictions detail of BSCTRFM_TPU_021_00_1633995965_MT_328_024_00
Exported model path is /home/developer/gcp/cbidmltsf/models/BSCTRFM_TPU_021_00/export/exporter/1633996143
Persisted predictions detail of BSCTRFM_TPU_021_00_1633996143_MT_328_024_00
Loaded test time series for MT_329, which spans from 2014-08-18 01:00:00 to 2014-09-07 23:00:00
Exported model path is /home/developer/gcp/cbidmltsf/models/BSCTRFM_TPU_021_00/export/exporter/16339