## Recursive inference (multi-step) for BSCTRFM models

### use time series instead of SLDB arrays for easier and more efficient timestamp management

In [1]:
import os
import json
import numpy as np
import pandas as pd
import joblib
from datetime import datetime
from math import sqrt
from sklearn.metrics import mean_squared_error, mean_absolute_error

# uncomment the following line for compatibility with TensorFlow 1.15 (on GCP)
# import tensorflow.compat.v1 as tf
# uncomment the following line for TensorFlow 2.X (local execution)
import tensorflow as tf

# forecast model was saved in TensorFlow 1.15
# but, in order to make predictions locally, has to be loaded with TensorFlow 2
from tensorflow.saved_model import load

In [2]:
from bokeh.plotting import figure, show, output_file, save
from bokeh.io import output_notebook
# select a palette
from bokeh.palettes import d3
output_notebook()

In [3]:
# symmetrical mean absolute percentage error
def smape(targets, predictions):
    '''
    Symmetric mean absolute percentage error (SMAPE).

    targets: a list (or array-like) with the actual values
    predictions: a list (or array-like) with the predicted values

    Returns the sum of 2*|prediction - target| / (|target| + |prediction|)
    over all elements, divided by the length of the first axis.
    Elements where both target and prediction are zero contribute 0
    instead of producing NaN from a 0/0 division.

    Raises ValueError when targets and predictions differ in shape.
    '''
    # array-likes (lists, pandas Series) to float NumPy arrays
    targets = np.asarray(targets, dtype=float)
    predictions = np.asarray(predictions, dtype=float)

    # a shape mismatch is a caller error; do not silently return None
    if predictions.shape != targets.shape:
        raise ValueError(
            'targets and predictions must have the same shape, got {} and {}'.format(
                targets.shape, predictions.shape))

    numerator = 2 * np.abs(predictions - targets)
    denominator = np.abs(targets) + np.abs(predictions)

    # divide only where the denominator is non-zero; remaining entries stay at 0
    ratios = np.divide(numerator,
                       denominator,
                       out=np.zeros_like(numerator),
                       where=denominator != 0)

    return np.sum(ratios) / predictions.shape[0]

In [4]:
# a function to encode float values for serialized examples
def _float_feature_from_list_of_values(list_of_values):
    """Wrap a sequence of floats / doubles in a tf.train.Feature holding a FloatList."""
    float_list = tf.train.FloatList(value=list_of_values)
    return tf.train.Feature(float_list=float_list)

In [5]:
# converts a set of tensors to a feature dict to a serialized example to pass it
# to the prediction function of the saved model 
def input_tensors_to_serialized_example(encoder_input_float_tensor,
                                        decoder_input_float_tensor,
                                        id_float_tensor):
    """Pack three tensors into one serialized tf.train.Example string tensor.

    Each tensor is converted with .numpy() and flattened, then stored as a
    float-list feature under the keys 'encoder_input', 'decoder_input' and 'id'.
    The serialized example is wrapped in a NumPy bytes array (dtype 'S') and
    returned as a tensor.
    """
    # feature key -> source tensor, in the order the features are declared
    keyed_tensors = (
        ('encoder_input', encoder_input_float_tensor),
        ('decoder_input', decoder_input_float_tensor),
        ('id', id_float_tensor),
    )

    # flatten every tensor and encode it as a float-list feature
    feature_map = {
        key: _float_feature_from_list_of_values(tensor.numpy().flatten())
        for key, tensor in keyed_tensors
    }

    # build the protobuffer example and serialize it to a byte string
    example_proto = tf.train.Example(features=tf.train.Features(feature=feature_map))
    serialized_bytes = example_proto.SerializeToString()

    # NumPy-string array first, then string tensor
    return tf.convert_to_tensor(np.array(serialized_bytes, dtype='S'))

In [6]:
# now get the time series for the test dataset (unseen data)

In [7]:
# define an identifier string to access the preprocessed time series
identifier = 'LD2011-2014'

In [8]:
# build the time series directory
time_series_folder = '/home/developer/gcp/cbidmltsf/timeseries/{}'.format(identifier)

In [9]:
# use a dictionary to keep the code consistent with the SLDB building process
# most of the time, only ts['test'] will be used for inference
# however, ts['eval'] might also be used, as it has not really been seen by the training process
# (no training modification resulted from the evaluation stage)

### rename the time series dictionary to ts_test, use the customer_id as key

In [10]:
ts_test = dict()

In [11]:
customer_id = 'MT_320'

In [12]:
# build the time series filename
ts_filename = '{}_{}'.format(customer_id, 'test')
ts_filename

'MT_320_test'

In [16]:
ts_test[customer_id] = pd.read_pickle('{}/{}.pkl'.format(time_series_folder, ts_filename))

In [17]:
ts_test[customer_id]

Unnamed: 0,date,token_id,kw_scaled,sin_hour_day,cos_hour_day,sin_day_week,cos_day_week,sin_day_month,cos_day_month,sin_day_year,cos_day_year
9105368,2014-08-18 01:00:00,320,0.410926,0.258819,0.965926,0.000000,1.00000,-0.587785,-0.809017,-0.729558,-0.683919
9105369,2014-08-18 02:00:00,320,0.365127,0.500000,0.866025,0.000000,1.00000,-0.587785,-0.809017,-0.729558,-0.683919
9105370,2014-08-18 03:00:00,320,0.331876,0.707107,0.707107,0.000000,1.00000,-0.587785,-0.809017,-0.729558,-0.683919
9105371,2014-08-18 04:00:00,320,0.311730,0.866025,0.500000,0.000000,1.00000,-0.587785,-0.809017,-0.729558,-0.683919
9105372,2014-08-18 05:00:00,320,0.301258,0.965926,0.258819,0.000000,1.00000,-0.587785,-0.809017,-0.729558,-0.683919
...,...,...,...,...,...,...,...,...,...,...,...
9105866,2014-09-07 19:00:00,320,0.863875,-0.965926,0.258819,-0.781831,0.62349,0.994522,0.104528,-0.917584,-0.397543
9105867,2014-09-07 20:00:00,320,0.877647,-0.866025,0.500000,-0.781831,0.62349,0.994522,0.104528,-0.917584,-0.397543
9105868,2014-09-07 21:00:00,320,0.751592,-0.707107,0.707107,-0.781831,0.62349,0.994522,0.104528,-0.917584,-0.397543
9105869,2014-09-07 22:00:00,320,0.692665,-0.500000,0.866025,-0.781831,0.62349,0.994522,0.104528,-0.917584,-0.397543


In [27]:
# review start and end dates for test dataset, use relative-index slices instead of scalar index!!!
ts_test[customer_id]['date'][:1], ts_test[customer_id]['date'][-1:]

(9105368   2014-08-18 01:00:00
 Name: date, dtype: datetime64[ns],
 9105870   2014-09-07 23:00:00
 Name: date, dtype: datetime64[ns])

In [28]:
m, t = 168, 168

In [29]:
# build the first sub-series (first row of test SLDB)
# the first reading in this sub-series is the first reading in ts_test
# the last reading in this sub-series is the first reading in unseen data
row = 0
ts_test[customer_id][row:row+m+t]

Unnamed: 0,date,token_id,kw_scaled,sin_hour_day,cos_hour_day,sin_day_week,cos_day_week,sin_day_month,cos_day_month,sin_day_year,cos_day_year
9105368,2014-08-18 01:00:00,320,0.410926,0.258819,0.965926,0.000000,1.00000,-0.587785,-0.809017,-0.729558,-0.683919
9105369,2014-08-18 02:00:00,320,0.365127,0.500000,0.866025,0.000000,1.00000,-0.587785,-0.809017,-0.729558,-0.683919
9105370,2014-08-18 03:00:00,320,0.331876,0.707107,0.707107,0.000000,1.00000,-0.587785,-0.809017,-0.729558,-0.683919
9105371,2014-08-18 04:00:00,320,0.311730,0.866025,0.500000,0.000000,1.00000,-0.587785,-0.809017,-0.729558,-0.683919
9105372,2014-08-18 05:00:00,320,0.301258,0.965926,0.258819,0.000000,1.00000,-0.587785,-0.809017,-0.729558,-0.683919
...,...,...,...,...,...,...,...,...,...,...,...
9105699,2014-08-31 20:00:00,320,0.692665,-0.866025,0.500000,-0.781831,0.62349,0.207912,0.978148,-0.863142,-0.504961
9105700,2014-08-31 21:00:00,320,0.677332,-0.707107,0.707107,-0.781831,0.62349,0.207912,0.978148,-0.863142,-0.504961
9105701,2014-08-31 22:00:00,320,0.595795,-0.500000,0.866025,-0.781831,0.62349,0.207912,0.978148,-0.863142,-0.504961
9105702,2014-08-31 23:00:00,320,0.510218,-0.258819,0.965926,-0.781831,0.62349,0.207912,0.978148,-0.863142,-0.504961


In [30]:
# build the last sub-series (last row of test SLDB)
# the last reading in this sub-series is the last reading in unseen data
row = 167
ts_test[customer_id][row:row+m+t]

Unnamed: 0,date,token_id,kw_scaled,sin_hour_day,cos_hour_day,sin_day_week,cos_day_week,sin_day_month,cos_day_month,sin_day_year,cos_day_year
9105535,2014-08-25 00:00:00,320,0.396759,0.000000,1.000000,0.000000,1.00000,-0.866025,0.500000,-0.806480,-0.591261
9105536,2014-08-25 01:00:00,320,0.381225,0.258819,0.965926,0.000000,1.00000,-0.866025,0.500000,-0.806480,-0.591261
9105537,2014-08-25 02:00:00,320,0.371574,0.500000,0.866025,0.000000,1.00000,-0.866025,0.500000,-0.806480,-0.591261
9105538,2014-08-25 03:00:00,320,0.364089,0.707107,0.707107,0.000000,1.00000,-0.866025,0.500000,-0.806480,-0.591261
9105539,2014-08-25 04:00:00,320,0.343153,0.866025,0.500000,0.000000,1.00000,-0.866025,0.500000,-0.806480,-0.591261
...,...,...,...,...,...,...,...,...,...,...,...
9105866,2014-09-07 19:00:00,320,0.863875,-0.965926,0.258819,-0.781831,0.62349,0.994522,0.104528,-0.917584,-0.397543
9105867,2014-09-07 20:00:00,320,0.877647,-0.866025,0.500000,-0.781831,0.62349,0.994522,0.104528,-0.917584,-0.397543
9105868,2014-09-07 21:00:00,320,0.751592,-0.707107,0.707107,-0.781831,0.62349,0.994522,0.104528,-0.917584,-0.397543
9105869,2014-09-07 22:00:00,320,0.692665,-0.500000,0.866025,-0.781831,0.62349,0.994522,0.104528,-0.917584,-0.397543


### and the time series to produce the test SLDB is now verified

In [32]:
# from now on, all data required for inference will be extracted from the time series
# therefore, remove all references to SLDB datasets and TFRecord files

In [33]:
PROJECT_ROOT = '/home/developer/gcp/cbidmltsf'

In [34]:
# during batch prediction, the model identifier is obtained via Abseil Flags
# remember this notebook is based on local execution,
# therefore model directory must be downloaded from GS before running the notebook
model_id = 'BSCTRFM_TPU_014'

In [35]:
# during batch prediction, the SLDB identifier is obtained via Abseil Flags
# THE SLDB FOR INFERENCE MUST BE THE SAME USED FOR TRAINING! (THE ONE SETUP IN THE CONFIGURATION FILE)
sldb_id = 'LD2011-2014_MT320-MT330_BSCTRFM_168_168'

In [36]:
# during batch prediction, the dataset name is obtained via Abseil Flags
dataset = 'test'

In [37]:
# define a forecast window to guide the iterative prediction process
# start with a hourly, day-ahead process
forecast_window = 24

In [38]:
# ADD AN INFERENCE IDENTIFIER, BECAUSE FOR TRANSFORMER-BASED MODELS, DIFFERENT INFERENCES
# CAN BE PRODUCED FROM A SINGLE SAVED MODEL (USUALLY TO PRODUCE DIFFERENT FORECAST WINDOWS)
# during batch prediction, the inference identifier should be obtained via Abseil Flags
inference = '{:03d}'.format(forecast_window)

In [39]:
# build a path to the SLDB json file
data_dir = '{}/{}/{}'.format(PROJECT_ROOT, 'sldbs', sldb_id)

# then get the ts_identifier from the json file in the sldb directory
sldb_json_file = '{}/sldb.json'.format(data_dir)

In [40]:
# open the json file
with open(sldb_json_file, 'r') as inputfile:
    sldb_dict = json.load(inputfile)

In [41]:
# and get the time series identifier
ts_identifier = sldb_dict['ts']
ts_identifier

'LD2011-2014_MT320-MT330'

In [42]:
# load the scaler for the requested customer_id
scaler_path = '{}/scalers/{}_min_max.save'.format(data_dir, customer_id)
scaler_path

'/home/developer/gcp/cbidmltsf/sldbs/LD2011-2014_MT320-MT330_BSCTRFM_168_168/scalers/MT_320_min_max.save'

In [45]:
min_max_scaler = joblib.load(scaler_path)

In [46]:
# get the SLDB parameters for the forecasting model
config_json_file = '{}/{}/{}.json'.format(PROJECT_ROOT,
                                          'parameters',
                                          model_id)

# recover the sldb dictionary from the json file in parameters/
with open(config_json_file, 'r') as inputfile:
    configuration = json.load(inputfile)

In [47]:
sldb_dict

{'ts': 'LD2011-2014_MT320-MT330',
 'embedding': {'hourly': 168},
 'tau': {'hourly': 1},
 'no_targets': 168,
 'BSCTRFM': 1,
 'stats': {'train': {'n_rows': 54043},
  'eval': {'n_rows': 2739},
  'test': {'n_rows': 1848}}}

In [48]:
m = sldb_dict['embedding']['hourly']
m

168

In [49]:
t = sldb_dict['no_targets']
t

168

In [50]:
# verify the values of the variables for batch inference
model_id, dataset, inference

('BSCTRFM_TPU_014', 'test', '024')

In [51]:
encoder_input_columns = [
    'kw_scaled',
    'sin_hour_day',
    'cos_hour_day',
    'sin_day_week',
    'cos_day_week',
    'sin_day_month',
    'cos_day_month',
    'sin_day_year',
    'cos_day_year'
]

In [52]:
# the decoder uses the same columns as the encoder; take a copy so that
# editing one list later does not silently change the other
decoder_input_columns = list(encoder_input_columns)

In [53]:
id_columns = ['token_id']

### run inference process and build databases

In [43]:
# during batch prediction, the execution identifier is obtained via Abseil Flags
for execution in [4]:
    # descriptive columns of the predictions dataframe, followed by the
    # per-row metric columns (same column order as previously persisted dataframes)
    pred_df_columns = ['model_id',
                       'execution',
                       'dataset',
                       'inference',
                       'string_timestamps',
                       'predictions',
                       'targets']
    metric_columns = ['mae', 'nd', 'rmse', 'nrmse', 'smape']

    # all inference data comes from the time series loaded for this customer;
    # its index is an integer row id and the timestamps live in the 'date' column
    series = ts_test[customer_id]

    # use model identifier and execution number to build the model directory string
    model_dir = '{}_{:02d}'.format(model_id, execution)

    # get the path to the saved model main directory
    saved_model_path = '{}/{}/{}/export/exporter'.format(PROJECT_ROOT,
                                                         'models',
                                                         model_dir)

    # get all the files in the saved model path, to find the most recent one
    all_files = os.listdir(saved_model_path)
    # get the path to the most recent saved model
    latest_saved_model_id = sorted(all_files)[-1]

    # build the full path for the latest saved model dir
    export_dir = '{}/{}'.format(saved_model_path, latest_saved_model_id)
    print ('Exported model path is {}'.format(export_dir))

    # load the saved model and the prediction function
    imported = load(export_dir=export_dir, tags='serve')
    predict_fn = imported.signatures["serving_default"]

    # iterate on a set of valid rows of the test dataset
    starting_point = 0 # based on the inference dataset
    span = 1 + 6*24 # number of days in the test dataset, expressed in hours
    dataset_row_indexes_list = starting_point + np.arange(span)

    # fail early if the last recursive window would run past the end of the series
    # (a short slice would otherwise be passed to the model without any error)
    last_required_row = (starting_point + span - 1) + (forecast_window - 1) + (m + t)
    if last_required_row > len(series):
        raise ValueError(
            'time series has {} rows, but recursive inference needs {}'.format(
                len(series), last_required_row))

    # one dictionary per processed dataset row; the dataframe is built once at the end
    detail_rows = []

    for start_index in dataset_row_indexes_list:

        # define first prediction interval with start- and end-index
        # given the interval time_series[start_index:end_index]
        # the conditioning range is the union of the encoder-input and the decoder-input
        # and the prediction range is only the last reading in the interval,
        # by means of a recursive inference process
        # on each step the last prediction is added to the decoder input
        # and the prediction range grows one step into the future

        # get the end-index of this recursive inference interval
        end_index = start_index + (m + t)

        # initialize a list to store recurrent predictions for this interval
        predictions_list = list()

        for i in range(forecast_window):

            # build the inference interval as a sub-series of the dataset
            # (positional slicing, the integer index labels are not row positions)
            sub_series = series.iloc[start_index + i:end_index + i]

            # important: build sources as copies of the sub-series (and therefore of the global time series)
            # to avoid overwriting the original dataset

            # the encoder input source
            encoder_input = sub_series[encoder_input_columns].iloc[:m].copy()

            # the decoder input source
            decoder_input = sub_series[decoder_input_columns].iloc[m-1:m-1+t].copy()

            # the id (integer) for the customer
            id_input = sub_series[id_columns].iloc[:1].copy()

            # on first step (i=0), the decoder input carries only true values
            # and the predictions list is empty
            # on subsequent steps, the last i rows of the decoder input are
            # replaced with all previous predictions (stored in the predictions list)
            if i > 0:
                # single positional assignment; chained indexing could write to a temporary
                kw_scaled_position = decoder_input.columns.get_loc('kw_scaled')
                decoder_input.iloc[-i:, kw_scaled_position] = predictions_list

            # build source tensors from the sub-series
            encoder_input_tensor = tf.expand_dims(encoder_input, axis=0)
            decoder_input_tensor = tf.expand_dims(decoder_input, axis=0)
            id_tensor = tf.expand_dims(id_input, axis=0)

            # make input example for the prediction function
            input_example = input_tensors_to_serialized_example(encoder_input_tensor,
                                                                decoder_input_tensor,
                                                                id_tensor)

            # get the output of the prediction function as a dictionary
            predict_output_dict = predict_fn(input_example)

            # get the prediction output tensor
            predict_output_tensor = predict_output_dict['forecast']

            # get the most recent prediction
            most_recent_prediction = predict_output_tensor[0, :, 0].numpy()[-1]

            # append the most recent prediction timestep to the predictions list
            predictions_list.append(most_recent_prediction)

        # iterative predictions over the forecast window reside in predictions_list
        # convert list to array, then expand feature dimension with value 1
        predicted_values = np.array(predictions_list).reshape(-1, 1)

        # inverse-scale predictions
        rescaled_predicted_values = min_max_scaler.inverse_transform(predicted_values)

        # step i predicts the last row of its sub-series, i.e. series row end_index - 1 + i,
        # so the forecast window covers the rows below
        target_window = series.iloc[end_index - 1:end_index - 1 + forecast_window]

        # for the true values array, expand feature dimension with value 1
        true_values = target_window['kw_scaled'].to_numpy().reshape(-1, 1)

        # inverse-scale true values
        rescaled_true_values = min_max_scaler.inverse_transform(true_values)

        # timestamps are taken from the 'date' column of the time series
        string_timestamps = pd.to_datetime(
            target_window['date']).dt.strftime('%Y-%m-%d %H:%M:%S').tolist()

        # calculate mean absolute error and normalized deviation
        mae = mean_absolute_error(rescaled_true_values, rescaled_predicted_values)
        true_values_average = np.mean(rescaled_true_values)

        # calculate root mean squared error and normalized root mean squared error
        rmse = sqrt(mean_squared_error(rescaled_true_values, rescaled_predicted_values))

        # the record for the current dataset row
        # (ravel keeps predictions and targets as lists even for a one-step window)
        detail_rows.append({
            'model_id': model_id,
            'execution': execution,
            'dataset': dataset,
            'inference': inference,
            'string_timestamps': string_timestamps,
            'predictions': rescaled_predicted_values.ravel().tolist(),
            'targets': rescaled_true_values.ravel().tolist(),
            'mae': mae,
            'nd': mae/true_values_average,
            'rmse': rmse,
            'nrmse': rmse/true_values_average,
            'smape': smape(rescaled_true_values, rescaled_predicted_values),
        })

    # build the predictions detail dataframe once all rows (dataset) have been processed
    predictions_detail_df = pd.DataFrame(detail_rows,
                                         columns=pred_df_columns + metric_columns)

    # build a predictions summary dataframe, reset index to avoid making a multi-column index when grouping by
    # only the metric columns are averaged; the list-valued columns cannot be
    predictions_summary_df = predictions_detail_df.groupby(
        ['model_id',
         'execution',
         'dataset',
         'inference'])[metric_columns].mean().reset_index()

    # DO NOT CALCULATE THE VECTOR METRICS
    calculate_vector_metrics = False

    if calculate_vector_metrics:

        # one matrix row per dataset row, one column per step ahead
        targets_matrix = np.array(predictions_detail_df['targets'].tolist())
        predictions_matrix = np.array(predictions_detail_df['predictions'].tolist())

        # a range to iterate on prediction timesteps
        targets_range = np.arange(forecast_window)

        # vector metric (vector component to vector component)
        # an array forecast_window-d: metric for 1, 2,..., no_targets step-ahead
        # (target versus prediction for rows in dataset)
        mae_vector = [
            mean_absolute_error(targets_matrix[:, n], predictions_matrix[:, n])
            for n in targets_range
        ]
        predictions_summary_df['mae_vector'] = [mae_vector]

        rmse_vector = [
            sqrt(mean_squared_error(targets_matrix[:, n], predictions_matrix[:, n]))
            for n in targets_range
        ]
        predictions_summary_df['rmse_vector'] = [rmse_vector]

        smape_vector = [
            smape(targets_matrix[:, n], predictions_matrix[:, n])
            for n in targets_range
        ]
        predictions_summary_df['smape_vector'] = [smape_vector]

    # insert count of rows as a column value
    predictions_summary_df.insert(4, 'count', len(dataset_row_indexes_list))

    # a common tag for file names and log messages
    run_tag = '{}_{:02d}_{}_{}'.format(model_id, execution, dataset, inference)

    # build a path to persist the dataframe to database/predictions_detail/
    detail_pickle_path = '{}/{}/{}/{}.pkl'.format(PROJECT_ROOT,
                                                  'database',
                                                  'predictions_detail',
                                                  run_tag)

    # persist the Pandas dataframe to database/predictions_detail/
    predictions_detail_df.to_pickle(detail_pickle_path)
    print('Persisted Pandas dataframe for predictions detail of {}'.format(run_tag))

    # build a path to persist the dataframe to database/predictions_summary/
    summary_pickle_path = '{}/{}/{}/{}.pkl'.format(PROJECT_ROOT,
                                                   'database',
                                                   'predictions_summary',
                                                   run_tag)

    # persist the Pandas dataframe to database/predictions_summary/
    predictions_summary_df.to_pickle(summary_pickle_path)
    print('Persisted Pandas dataframe for predictions summary of {}'.format(run_tag))


Exported model path is /home/developer/gcp/cbidmltsf/models/BSCTRFM_TPU_014_04/export/exporter/1631533140
Persisted Pandas dataframe for predictions detail of BSCTRFM_TPU_014_04_test_024
Persisted Pandas dataframe for predictions summary of BSCTRFM_TPU_014_04_test_024


In [44]:
predictions_detail_df

Unnamed: 0,model_id,execution,dataset,inference,string_timestamps,predictions,targets,mae,nd,rmse,nrmse,smape
0,BSCTRFM_TPU_014,4,test,024,"[2014-09-01 00:00:00, 2014-09-01 01:00:00, 201...","[72.76553344726562, 71.9839096069336, 68.34590...","[74.78121516164995, 69.39659977703457, 63.7764...",10.207663,0.089616,13.194057,0.115834,0.086802
1,BSCTRFM_TPU_014,4,test,024,"[2014-09-01 01:00:00, 2014-09-01 02:00:00, 201...","[70.84458923339844, 66.64947509765625, 64.2269...","[69.39659977703457, 63.77647714604234, 64.3352...",8.689303,0.076265,11.762020,0.103234,0.072815
2,BSCTRFM_TPU_014,4,test,024,"[2014-09-01 02:00:00, 2014-09-01 03:00:00, 201...","[67.58816528320312, 60.70099639892578, 57.7665...","[63.77647714604234, 64.33528428093645, 61.2667...",8.664055,0.075874,11.760351,0.102989,0.073732
3,BSCTRFM_TPU_014,4,test,024,"[2014-09-01 03:00:00, 2014-09-01 04:00:00, 201...","[61.314876556396484, 56.57582092285156, 54.279...","[64.33528428093645, 61.266722408026745, 61.546...",9.911903,0.086624,13.170313,0.115100,0.086786
4,BSCTRFM_TPU_014,4,test,024,"[2014-09-01 04:00:00, 2014-09-01 05:00:00, 201...","[57.87807846069336, 54.23857116699219, 100.337...","[61.266722408026745, 61.54682274247492, 104.87...",8.426121,0.073530,11.528135,0.100599,0.071808
...,...,...,...,...,...,...,...,...,...,...,...,...
140,BSCTRFM_TPU_014,4,test,024,"[2014-09-06 20:00:00, 2014-09-06 21:00:00, 201...","[134.96995544433594, 130.708984375, 109.306526...","[150.7246376811595, 131.007525083612, 123.3193...",11.848900,0.099544,14.468149,0.121549,0.102696
141,BSCTRFM_TPU_014,4,test,024,"[2014-09-06 21:00:00, 2014-09-06 22:00:00, 201...","[130.36651611328125, 106.81233215332031, 80.35...","[131.007525083612, 123.31939799331099, 98.7221...",12.916692,0.108446,16.080924,0.135012,0.107242
142,BSCTRFM_TPU_014,4,test,024,"[2014-09-06 22:00:00, 2014-09-06 23:00:00, 201...","[111.77174377441406, 82.97529602050781, 69.259...","[123.31939799331099, 98.7221293199553, 72.0248...",11.112971,0.093311,13.410775,0.112605,0.094425
143,BSCTRFM_TPU_014,4,test,024,"[2014-09-06 23:00:00, 2014-09-07 00:00:00, 201...","[84.98181915283203, 77.36021423339844, 80.7196...","[98.7221293199553, 72.0248049052397, 72.824693...",12.565256,0.105609,15.247971,0.128156,0.108458


In [135]:
predictions_summary_df

Unnamed: 0,model_id,execution,dataset,inference,count,mae,nd,rmse,nrmse,smape
0,BSCTRFM_TPU_014,4,test,24,145,10.224511,0.087024,12.736132,0.108413,0.088882


In [46]:
# plot metrics for the set of predictions

In [139]:
p = figure(title='MAE and RMSE over the test set',
           width=960,
           height=320,
           x_axis_label='Test Dataset Row Index',
           y_axis_label='Error')

p.line(x=predictions_detail_df.index,
       y=predictions_detail_df['mae'],
       legend_label='MAE',
       color='orange',
       line_width=1)

p.line(x=predictions_detail_df.index,
       y=predictions_detail_df['rmse'],
       legend_label='RMSE',
       color='green',
       line_width=1)

show(p)

In [137]:
p = figure(title='ND, NRMSE, and SMAPE over the test set',
           width=960,
           height=320,
           x_axis_label='Test Dataset Row Index',
           y_axis_label='Error')

p.line(x=predictions_detail_df.index,
       y=predictions_detail_df['nd'],
       legend_label='ND',
       color='orange',
       line_width=2)

p.line(x=predictions_detail_df.index,
       y=predictions_detail_df['nrmse'],
       legend_label='NRMSE',
       color='green',
       line_width=1)

p.line(x=predictions_detail_df.index,
       y=predictions_detail_df['smape'],
       legend_label='SMAPE',
       color='purple',
       line_width=1)

show(p)

In [64]:
# now get a global, unique value for ND and NRMSE for the 7 days in the test dataset

In [118]:
global_df = pd.DataFrame(columns=['prediction', 'target'])

In [119]:
# starting timestamps to predict over a complete day
start_timestamps = [0, 24, 48, 72, 96, 120, 144]

In [120]:
# collect one dataframe per one-day forecast and concatenate them once;
# global_df is rebuilt from scratch so that re-running this cell
# does not append the same days a second time
daily_frames = []
for start_timestamp in start_timestamps:
    # predictions_detail_df has a reset (0..n-1) index, so the positional row
    # is the same row the label lookup would return
    detail_row = predictions_detail_df.iloc[start_timestamp]

    # make a buffer dataframe for one-day predictions, indexed by timestamp
    buffer_df = pd.DataFrame({
        'timestamp': pd.to_datetime(detail_row['string_timestamps']),
        'prediction': detail_row['predictions'],
        'target': detail_row['targets'],
    })
    daily_frames.append(buffer_df.set_index('timestamp'))

if daily_frames:
    global_df = pd.concat(daily_frames)
else:
    # nothing selected, keep an empty dataframe with the expected columns
    global_df = pd.DataFrame(columns=['prediction', 'target'])

In [121]:
global_df

Unnamed: 0,prediction,target
2014-09-01 00:00:00,72.765533,74.781215
2014-09-01 01:00:00,71.983910,69.396600
2014-09-01 02:00:00,68.345909,63.776477
2014-09-01 03:00:00,62.153027,64.335284
2014-09-01 04:00:00,57.610641,61.266722
...,...,...
2014-09-07 19:00:00,144.087250,150.165831
2014-09-07 20:00:00,127.316689,152.550167
2014-09-07 21:00:00,124.339233,130.726031
2014-09-07 22:00:00,98.572815,120.523969


In [126]:
# plot predictions against targets over the concatenated one-day forecasts
p = figure(title='Predictions over the test dataset',
           width=960,
           height=320,
           x_axis_type='datetime',
           x_axis_label='Timestamp',
           y_axis_label='Value')

p.line(x=global_df.index,
       y=global_df['prediction'],
       legend_label='predictions',
       color='red',
       line_width=1)

p.line(x=global_df.index,
       y=global_df['target'],
       legend_label='targets',
       color='blue',
       line_width=1)

show(p)

### over the 7 days of the test dataset, daily forecasting windows, starting at midnight (2014-09-0X 00:00:00)

In [128]:
# get a global value for MAE
# (true values first, then predictions, as in the per-row metrics; MAE is symmetric)
global_mae = mean_absolute_error(global_df['target'], global_df['prediction'])
global_mae

10.531039912489154

In [129]:
# get a global average for true values
global_true_values_average = np.mean(global_df['target'])
global_true_values_average

117.02499568668046

In [130]:
# get a global value for ND
global_nd = global_mae/global_true_values_average
global_nd

0.08998966289804167

In [132]:
# get a global value for RMSE
# (true values first, then predictions, as in the per-row metrics; MSE is symmetric)
global_rmse = sqrt(mean_squared_error(global_df['target'], global_df['prediction']))
global_rmse

13.36922310574982

In [133]:
# get a global value for NRMSE
global_nrmse = global_rmse/global_true_values_average
global_nrmse

0.11424245758183332

In [134]:
# get a global value for SMAPE
# (targets first, then predictions, matching the smape signature; SMAPE is symmetric)
global_smape = smape(global_df['target'], global_df['prediction'])
global_smape

0.09170034727617235