## Recursive inference (multi-step) for BSCTRFM models

### use time series instead of SLDB arrays for easier and more efficient timestamp management

In [1]:
import os
import json
import numpy as np
import pandas as pd
import joblib
from datetime import datetime
from math import sqrt
from sklearn.metrics import mean_squared_error, mean_absolute_error

# uncomment the following line for compatibility with TensorFlow 1.15 (on GCP)
# import tensorflow.compat.v1 as tf
# uncomment the following line for TensorFlow 2.X (local execution)
import tensorflow as tf

# forecast model was saved in TensorFlow 1.15
# but, in order to make predictions locally, has to be loaded with TensorFlow 2
from tensorflow.saved_model import load

In [2]:
from bokeh.plotting import figure, show, output_file, save
from bokeh.io import output_notebook
# select a palette
from bokeh.palettes import d3
output_notebook()

In [3]:
# symmetrical mean absolute percentage error
def smape(targets, predictions):
    """Return the symmetric mean absolute percentage error (SMAPE).

    Adds up 2 * |prediction - target| / (|target| + |prediction|) over all
    the elements and divides the sum by the length of the first axis.
    An element whose target and prediction are both zero is a perfect
    forecast and contributes 0 (instead of the NaN produced by 0/0).

    Args:
        targets: a list (or array-like) with the actual values
        predictions: a list (or array-like) with the predicted values

    Returns:
        The SMAPE value as a float.

    Raises:
        ValueError: if targets and predictions do not have the same shape.
    """
    import numpy as np
    # lists (or pandas Series, even with object dtype) to float NumPy arrays
    targets = np.asarray(targets, dtype=float)
    predictions = np.asarray(predictions, dtype=float)

    # fail loudly on a shape mismatch instead of silently returning None
    if predictions.shape != targets.shape:
        raise ValueError(
            'targets and predictions must have the same shape, got {} and {}'.format(
                targets.shape, predictions.shape))

    numerator = 2.0 * np.abs(predictions - targets)
    denominator = np.abs(targets) + np.abs(predictions)
    # the denominator is zero only when both values are zero, in which case
    # the numerator is zero as well: keep the pre-filled 0 for those terms
    ratios = np.divide(numerator,
                       denominator,
                       out=np.zeros_like(numerator),
                       where=denominator != 0)

    return np.sum(ratios) / predictions.shape[0]

In [4]:
# a function to encode float values for serialized examples
def _float_feature_from_list_of_values(list_of_values):
    """Wrap a list of floats / doubles in a tf.train.Feature holding a float_list."""
    float_list = tf.train.FloatList(value=list_of_values)
    return tf.train.Feature(float_list=float_list)

In [5]:
# converts a set of tensors into a feature dict, and then into a serialized example,
# to pass it to the prediction function of the saved model
def input_tensors_to_serialized_example(encoder_input_float_tensor,
                                        decoder_input_float_tensor,
                                        id_float_tensor):
    """Pack three float tensors into one serialized tf.train.Example tensor.

    Each tensor is converted with .numpy() and flattened, then stored as a
    float-list feature under the keys 'encoder_input', 'decoder_input' and
    'id'; the resulting example is serialized to a string.

    Returns:
        A string tensor built from the serialized example.
    """
    # flatten every input tensor to a one-dimensional NumPy array
    flat_arrays = {
        'encoder_input': encoder_input_float_tensor.numpy().flatten(),
        'decoder_input': decoder_input_float_tensor.numpy().flatten(),
        'id': id_float_tensor.numpy().flatten(),
    }

    # one float-list feature per flattened array
    feature_map = {
        name: _float_feature_from_list_of_values(values)
        for name, values in flat_arrays.items()
    }

    # protobuffer example -> serialized string
    example_proto = tf.train.Example(features=tf.train.Features(feature=feature_map))
    serialized_bytes = example_proto.SerializeToString()

    # NumPy-string array -> string tensor
    return tf.convert_to_tensor(np.array(serialized_bytes, dtype='S'))

In [6]:
# root directory of the project; the sldbs/, parameters/ and database/ paths
# used in this notebook are built relative to it
PROJECT_ROOT = '/home/developer/gcp/cbidmltsf'

In [7]:
# during batch prediction, the model identifier is obtained via Abseil Flags
# remember this notebook is based on local execution,
# therefore the model directory must be downloaded from GS before running the notebook
# (the identifier also names the json file read from parameters/)
model_id = 'BSCTRFM_TPU_014'

In [8]:
# during batch prediction, the SLDB identifier is obtained via Abseil Flags
# THE SLDB FOR INFERENCE MUST BE THE SAME USED FOR TRAINING! (THE ONE SETUP IN THE CONFIGURATION FILE)
# it names the directory under sldbs/ from which sldb.json is read
sldb_id = 'LD2011-2014_MT320-MT330_BSCTRFM_168_168'

In [9]:
# during batch prediction, the dataset name is obtained via Abseil Flags;
# here it is fixed to the test split
dataset = 'test'

In [10]:
# define a forecast window to guide the iterative prediction process
# start with an hourly, day-ahead process (24 steps)
forecast_window = 24

In [11]:
# an inference identifier is required because, for transformer-based models,
# several inferences can be produced from a single saved model
# (usually one per forecast window)
# during batch prediction, the inference identifier should be obtained via Abseil Flags
# here it is the forecast window, zero-padded to three digits
inference = format(forecast_window, '03d')

In [12]:
# directory of the SLDB, located under sldbs/ in the project root
data_dir = '/'.join([PROJECT_ROOT, 'sldbs', sldb_id])

# the json file in that directory stores the ts_identifier
sldb_json_file = data_dir + '/sldb.json'

In [13]:
# read the SLDB description from its json file
with open(sldb_json_file) as sldb_file:
    sldb_dict = json.load(sldb_file)

In [14]:
# and get the time series identifier, stored under the 'ts' key of sldb.json
# (the bare name on the last line only displays the value in the notebook)
ts_identifier = sldb_dict['ts']
ts_identifier

'LD2011-2014_MT320-MT330'

In [15]:
# path to the json file with the parameters of the forecasting model
config_json_file = '{root}/{folder}/{model}.json'.format(
    root=PROJECT_ROOT, folder='parameters', model=model_id)

# recover the model configuration dictionary from the json file in parameters/
with open(config_json_file) as config_file:
    configuration = json.load(config_file)

In [16]:
# value stored under 'embedding' -> 'hourly' in sldb.json (168 for this SLDB);
# presumably the number of hourly time steps fed to the model -- TODO confirm
m = sldb_dict['embedding']['hourly']
m

168

In [17]:
# value stored under 'no_targets' in sldb.json (168 for this SLDB);
# presumably the number of target time steps per example -- TODO confirm
t = sldb_dict['no_targets']
t

168

In [18]:
# verify the values of the variables for batch inference
model_id, dataset, inference

('BSCTRFM_TPU_014', 'test', '024')

In [19]:
# encoder inputs: the kw_scaled value followed by the sine/cosine pair
# of each of the four calendar cycles
encoder_input_columns = ['kw_scaled'] + [
    '{}_{}'.format(trig, cycle)
    for cycle in ('hour_day', 'day_week', 'day_month', 'day_year')
    for trig in ('sin', 'cos')
]

In [20]:
# the decoder uses the same feature columns as the encoder
# NOTE: this binds a second name to the SAME list object (no copy is made),
# so an in-place change made through either name is visible through both
decoder_input_columns = encoder_input_columns

In [21]:
# column(s) holding the series identifier; presumably they feed the 'id'
# feature of the serialized example -- TODO confirm
id_columns = ['token_id']

In [22]:
# now get the time series for the test dataset (unseen data)

In [23]:
# define an identifier string to access the preprocessed time series
identifier = 'LD2011-2014'

In [24]:
# build the time series directory from the project root, instead of
# repeating the absolute path already stored in PROJECT_ROOT
time_series_folder = '{}/timeseries/{}'.format(PROJECT_ROOT, identifier)

In [25]:
# use a dictionary to keep the code consistent with the SLDB building process
# most of the time, only ts['test'] will be used for inference
# however, ts['eval'] might also be used, as it has not really been seen by the training process
# (no training modification resulted from the evaluation stage)

### rename the time series dictionary to ts_test, use the customer_id as key

In [46]:
# time series for inference, keyed by customer_id
ts_test = {}

In [47]:
# predictions detail dataframes, keyed by customer id
predictions_detail = {}

In [48]:
# predictions summary dataframes, keyed by customer id
predictions_summary = {}

## skip the following code to avoid inference process

## load predictions detail dataframes from the database directory

In [49]:
# identifiers of the stored inference to load; together with the customer id
# they make up the name of each predictions-detail pickle file read below
model_id = 'BSCTRFM_TPU_014'
# formatted with two digits in the file name
execution = 4
saved_model_id = '1632841048'
# NOTE: overrides the value derived from forecast_window earlier in the notebook
inference = '024'

In [50]:
# customers MT_320 to MT_330 (both included)
customer_ids = ['MT_{}'.format(number) for number in range(320, 331)]

In [51]:
# load the stored predictions-detail dataframe of every customer
for customer_id in customer_ids:
    # file name: <model>_<execution, two digits>_<saved model>_<customer>_<inference>.pkl
    # (for the electricity dataset, the customer_id replaces the dataset name)
    pickle_name = '{}_{:02d}_{}_{}_{}.pkl'.format(
        model_id, execution, saved_model_id, customer_id, inference)

    detail_pickle_path = '/'.join(
        [PROJECT_ROOT, 'database', 'predictions_detail', pickle_name])

    predictions_detail[customer_id] = pd.read_pickle(detail_pickle_path)

In [52]:
# Bokeh plots, keyed by a label string
plots = {}

In [53]:
# one metrics figure per customer, with a line for each error metric
for customer_id in customer_ids:
    detail = predictions_detail[customer_id]

    # label the metrics plot per customer_id
    label = '{}_metrics'.format(customer_id)

    plots[label] = figure(
        title='ND, NRMSE, and SMAPE over the test set for {}'.format(customer_id),
        width=960,
        height=320,
        x_axis_label='Test Dataset Row Index',
        y_axis_label='Error')

    # (dataframe column, legend text, line color, line width)
    metric_lines = (
        ('nd', 'ND', 'orange', 2),
        ('nrmse', 'NRMSE', 'green', 1),
        ('smape', 'SMAPE', 'purple', 1),
    )
    for column, legend, color, width in metric_lines:
        plots[label].line(
            x=detail.index,
            y=detail[column],
            legend_label=legend,
            color=color,
            line_width=width)

    show(plots[label])

In [54]:
# now get a global, unique value for ND and NRMSE for the 7 days in the test dataset

In [55]:
# empty dataframe that accumulates one row per customer and predicted timestamp
global_df = pd.DataFrame(columns=['customer_id', 'timestamp', 'prediction', 'target'])

In [56]:
# starting row indexes to predict over a complete day:
# one per day of the week, 24 hourly rows apart
start_row_ids = list(range(0, 7 * 24, 24))

In [57]:
# build one long dataframe with the day-ahead forecasts of every customer
# the one-day frames are collected in a list and concatenated once: calling
# pd.concat inside the loop copies the accumulated dataframe on every iteration
daily_frames = []

# iterate over customer_ids
for customer_id in customer_ids:
    detail_df = predictions_detail[customer_id]

    # for each customer, iterate on starting rows for each day
    for start_row_id in start_row_ids:
        # each cell of these columns stores a whole forecast window (a list),
        # so one row of the detail dataframe yields one row per predicted timestamp
        # .iloc picks the row by position, which is the row the former
        # slice-then-label expression selected
        timestamps = pd.to_datetime(
            detail_df['string_timestamps'].iloc[start_row_id])

        # buffer dataframe for one-day predictions; its length follows the
        # stored window instead of a hard-coded 24
        buffer_df = pd.DataFrame({
            'customer_id': len(timestamps) * [customer_id],
            'timestamp': timestamps,
            'prediction': detail_df['predictions'].iloc[start_row_id],
            'target': detail_df['targets'].iloc[start_row_id],
        })

        daily_frames.append(buffer_df)

# a single concatenation, keeping whatever global_df already holds first
global_df = pd.concat([global_df] + daily_frames)

In [58]:
global_df

Unnamed: 0,customer_id,timestamp,prediction,target
0,MT_320,2014-09-01 00:00:00,72.091568,74.781215
1,MT_320,2014-09-01 01:00:00,71.266380,69.396600
2,MT_320,2014-09-01 02:00:00,68.512970,63.776477
3,MT_320,2014-09-01 03:00:00,62.013844,64.335284
4,MT_320,2014-09-01 04:00:00,59.939304,61.266722
...,...,...,...,...
19,MT_330,2014-09-07 19:00:00,127.521690,141.406250
20,MT_330,2014-09-07 20:00:00,128.070511,130.750000
21,MT_330,2014-09-07 21:00:00,120.315918,134.042188
22,MT_330,2014-09-07 22:00:00,89.463257,100.467187


### over the 7 days of the test dataset, daily forecasting windows, starting at midnight (2014-09-0X 00:00:00)

In [59]:
# get a global value for MAE
# arguments follow the scikit-learn order (y_true, y_pred); MAE is symmetric,
# so the value is the same as with the swapped order
global_mae = mean_absolute_error(global_df['target'], global_df['prediction'])
global_mae

38.146215024208715

In [60]:
# average of the true values over all customers and timestamps
global_true_values_average = global_df['target'].mean()
global_true_values_average

498.16469337378595

In [61]:
# get a global value for ND: the global MAE divided by the average of the true values
global_nd = global_mae/global_true_values_average
global_nd

0.07657350175875795

In [62]:
# get a global value for RMSE
# arguments follow the scikit-learn order (y_true, y_pred); the squared error
# is symmetric, so the value is the same as with the swapped order
global_rmse = sqrt(mean_squared_error(global_df['target'], global_df['prediction']))
global_rmse

78.38095501489329

In [63]:
# get a global value for NRMSE: the global RMSE divided by the average of the true values
global_nrmse = global_rmse/global_true_values_average
global_nrmse

0.15733944227171878

In [64]:
# get a global value for SMAPE
# arguments follow the smape(targets, predictions) signature; SMAPE is
# symmetric, so the value is the same as with the swapped order
global_smape = smape(global_df['target'], global_df['prediction'])
global_smape

0.07620411616860134

In [65]:
# one figure per customer comparing predictions against targets over time
for customer_id in customer_ids:

    label = '{}_predictions'.format(customer_id)

    # filter the rows of this customer once and reuse them for both lines
    customer_df = global_df[global_df['customer_id'] == customer_id]

    plots[label] = figure(
        title='Predictions over the test dataset for {}'.format(customer_id),
        width=960,
        height=320,
        x_axis_type='datetime',
        x_axis_label='Timestamp',
        y_axis_label='Value')

    plots[label].line(
        x=customer_df['timestamp'],
        y=customer_df['prediction'],
        legend_label='predictions',
        color='red',
        line_width=1)

    plots[label].line(
        x=customer_df['timestamp'],
        y=customer_df['target'],
        legend_label='targets',
        color='blue',
        line_width=1)

    show(plots[label])