## consolidate predictions detail and get global metrics for electricity dataset

In [1]:
import os
import json
import numpy as np
import pandas as pd
import joblib
from datetime import datetime
from math import sqrt
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [85]:
from bokeh.plotting import figure, show, output_file, save
from bokeh.io import output_notebook
from bokeh.layouts import row, gridplot, layout
from bokeh.palettes import d3
from bokeh.models import Span

output_notebook()

In [3]:
# symmetric mean absolute percentage error
def symmetric_mean_absolute_percentage_error(targets, predictions):
    '''
    Compute the symmetric mean absolute percentage error (sMAPE):

        mean( 2*|prediction - target| / (|target| + |prediction|) )

    The result is a fraction (it is not multiplied by 100).

    targets: a list (or array-like) with the actual values
    predictions: a list (or array-like) with the predicted values

    Returns the sMAPE as a float. Positions where both the target and the
    prediction are zero contribute zero error instead of a 0/0 NaN.

    Raises ValueError if targets and predictions do not have the same shape
    (the previous version silently returned None in that case).
    '''
    # lists (or pandas Series) to NumPy float arrays
    targets = np.asarray(targets, dtype=float)
    predictions = np.asarray(predictions, dtype=float)

    # verify predictions and targets have the same shape
    if predictions.shape != targets.shape:
        raise ValueError(
            'targets and predictions must have the same shape, got {} and {}'.format(
                targets.shape, predictions.shape
            )
        )

    numerator = 2.0 * np.abs(predictions - targets)
    denominator = np.abs(targets) + np.abs(predictions)

    # divide only where the denominator is non-zero; a zero denominator means
    # target == prediction == 0, a perfect forecast, so its term stays 0
    ratios = np.divide(
        numerator,
        denominator,
        out=np.zeros_like(numerator),
        where=denominator != 0
    )

    return np.sum(ratios) / predictions.shape[0]

In [4]:
# let pandas display up to 200 rows of a dataframe before truncating the output
pd.set_option('display.max_rows', 200)

In [5]:
# root directory of the project; every other path in the notebook is built from it.
# Set the CBIDMLTSF_ROOT environment variable to run the notebook on another
# machine without editing this cell (the default is the original location).
PROJECT_ROOT = os.environ.get('CBIDMLTSF_ROOT', '/home/developer/gcp/cbidmltsf')

In [6]:
# customer identifiers go from MT_001 to MT_370 (number zero-padded to three digits)
start, end = 1, 370

customer_ids = list(map('MT_{:03d}'.format, range(start, end + 1)))

In [7]:
len(customer_ids)

370

In [8]:
# build separate lists for the available predictions, based on model_id/execution/saved_model_id combinations

In [9]:
# one entry per saved model, aligned position by position with
# the executions and saved_model_ids lists
model_ids = (
    # BSCTRFM_TPU_055_00, at 3K, 6K, 9K, 12K train_steps
    4 * ['BSCTRFM_TPU_055']
    # BSCTRFM_TPU_055_01, at 3K, 6K, 9K, 12K train_steps
    + 4 * ['BSCTRFM_TPU_055']
    # BSCTRFM_TPU_055_02, at 6K, 9K, 12K train_steps
    + 3 * ['BSCTRFM_TPU_055']
    # BSCTRFM_TPU_055_03, at 9K, 12K train_steps
    + 2 * ['BSCTRFM_TPU_055']
    # BSCTRFM_TPU_056_00, at 4K, 5K, 6K, 7K, 8K train_steps
    + 5 * ['BSCTRFM_TPU_056']
    # BSCTRFM_TPU_056_01, at 4K, 6K, 7K, 8K train_steps
    + 4 * ['BSCTRFM_TPU_056']
)

In [10]:
# execution number of each saved model, aligned position by position with
# the model_ids and saved_model_ids lists
executions = (
    # BSCTRFM_TPU_055_00, at 3K, 6K, 9K, 12K train_steps
    4 * [0]
    # BSCTRFM_TPU_055_01, at 3K, 6K, 9K, 12K train_steps
    + 4 * [1]
    # BSCTRFM_TPU_055_02, at 6K, 9K, 12K train_steps
    + 3 * [2]
    # BSCTRFM_TPU_055_03, at 9K, 12K train_steps
    + 2 * [3]
    # BSCTRFM_TPU_056_00, at 4K, 5K, 6K, 7K, 8K train_steps
    + 5 * [0]
    # BSCTRFM_TPU_056_01, at 4K, 6K, 7K, 8K train_steps
    + 4 * [1]
)

In [11]:
# saved model identifiers grouped by training run, then flattened into a single
# list aligned position by position with the model_ids and executions lists
saved_model_ids = [
    identifier
    for run_identifiers in (
        # BSCTRFM_TPU_055_00, at 3K, 6K, 9K, 12K train_steps
        ('1637863002', '1637863629', '1637864194', '1637865732'),
        # BSCTRFM_TPU_055_01, at 3K, 6K, 9K, 12K train_steps
        ('1637863302', '1637863905', '1637864474', '1637865979'),
        # BSCTRFM_TPU_055_02, at 6K, 9K, 12K train_steps
        ('1637864792', '1637865102', '1637866231'),
        # BSCTRFM_TPU_055_03, at 9K, 12K train_steps
        ('1637865427', '1637866499'),
        # BSCTRFM_TPU_056_00, at 4K, 5K, 6K, 7K, 8K train_steps
        ('1637866890', '1637867407', '1637867659', '1637867918', '1637868158'),
        # BSCTRFM_TPU_056_01, at 4K, 6K, 7K, 8K train_steps
        ('1637867159', '1637868463', '1637868718', '1637868965'),
    )
    for identifier in run_identifiers
]

In [25]:
# training runs as (model_id, execution, [(saved_model_id, train_steps), ...]);
# the order of the runs and of their checkpoints is the order of the dictionary keys
_TRAINING_RUNS = [
    ('BSCTRFM_TPU_055', 0, [('1637863002', 3000), ('1637863629', 6000),
                            ('1637864194', 9000), ('1637865732', 12000)]),
    ('BSCTRFM_TPU_055', 1, [('1637863302', 3000), ('1637863905', 6000),
                            ('1637864474', 9000), ('1637865979', 12000)]),
    ('BSCTRFM_TPU_055', 2, [('1637864792', 6000), ('1637865102', 9000),
                            ('1637866231', 12000)]),
    ('BSCTRFM_TPU_055', 3, [('1637865427', 9000), ('1637866499', 12000)]),
    ('BSCTRFM_TPU_056', 0, [('1637866890', 4000), ('1637867407', 5000),
                            ('1637867659', 6000), ('1637867918', 7000),
                            ('1637868158', 8000)]),
    ('BSCTRFM_TPU_056', 1, [('1637867159', 4000), ('1637868463', 6000),
                            ('1637868718', 7000), ('1637868965', 8000)]),
]

# dictionary to recover training information (model_id, execution, train_steps)
# from a saved model identifier
training_info = {
    checkpoint_id: {
        'model_id': run_model_id,
        'execution': run_execution,
        'train_steps': checkpoint_steps
    }
    for run_model_id, run_execution, checkpoints in _TRAINING_RUNS
    for checkpoint_id, checkpoint_steps in checkpoints
}

In [12]:
# list the aligned model_id / execution / saved_model_id combinations, one per line
for model_id, execution, saved_model_id in zip(model_ids, executions, saved_model_ids):
    print('{} {} {}'.format(model_id, execution, saved_model_id))

BSCTRFM_TPU_055 0 1637863002
BSCTRFM_TPU_055 0 1637863629
BSCTRFM_TPU_055 0 1637864194
BSCTRFM_TPU_055 0 1637865732
BSCTRFM_TPU_055 1 1637863302
BSCTRFM_TPU_055 1 1637863905
BSCTRFM_TPU_055 1 1637864474
BSCTRFM_TPU_055 1 1637865979
BSCTRFM_TPU_055 2 1637864792
BSCTRFM_TPU_055 2 1637865102
BSCTRFM_TPU_055 2 1637866231
BSCTRFM_TPU_055 3 1637865427
BSCTRFM_TPU_055 3 1637866499
BSCTRFM_TPU_056 0 1637866890
BSCTRFM_TPU_056 0 1637867407
BSCTRFM_TPU_056 0 1637867659
BSCTRFM_TPU_056 0 1637867918
BSCTRFM_TPU_056 0 1637868158
BSCTRFM_TPU_056 1 1637867159
BSCTRFM_TPU_056 1 1637868463
BSCTRFM_TPU_056 1 1637868718
BSCTRFM_TPU_056 1 1637868965


In [13]:
# identifier of the SLDB used for inference; it is also the name of
# the SLDB directory under <PROJECT_ROOT>/sldbs
sldb_id = 'LD2011-2014_SEPARATED_FULL_BSCTRFM_168_168_07DB_MMX'
sldb_id

'LD2011-2014_SEPARATED_FULL_BSCTRFM_168_168_07DB_MMX'

In [14]:
# number of rows per forecast, and the same value as a zero-padded three-digit
# token (the 'inference' field of the predictions detail file names)
forecast_window = 24
inference = format(forecast_window, '03d')

In [15]:
# build the path to the SLDB directory: <PROJECT_ROOT>/sldbs/<sldb_id>
data_dir = '/'.join((PROJECT_ROOT, 'sldbs', sldb_id))
data_dir

'/home/developer/gcp/cbidmltsf/sldbs/LD2011-2014_SEPARATED_FULL_BSCTRFM_168_168_07DB_MMX'

In [16]:
# the ts_identifier is read from the sldb.json file stored in the SLDB directory
sldb_json_file = data_dir + '/sldb.json'

In [17]:
# open the json file and parse it into a dictionary; the encoding is given
# explicitly (JSON is exchanged as UTF-8) so the result does not depend on
# the default encoding of the platform running the notebook
with open(sldb_json_file, 'r', encoding='utf-8') as inputfile:
    sldb_dict = json.load(inputfile)

In [18]:
# values recovered from the SLDB json file: the time series identifier,
# the 'hourly' embedding value (m) and the number of targets (t)
ts_identifier = sldb_dict['ts']
m, t = sldb_dict['embedding']['hourly'], sldb_dict['no_targets']

In [19]:
ts_identifier, m, t

('LD2011-2014_SEPARATED_FULL', 168, 168)

## skip the next code cells if predictions detail dataframe is already persisted!

In [48]:
# columns of the predictions detail dataframe, in display order:
# which saved model, which inference run, and the per-timestamp values
predictions_detail_columns = (
    ['model_id', 'execution', 'saved_model_id']
    + ['inference', 'event']
    + ['customer_id', 'timestamp', 'prediction', 'target']
)

In [110]:
# empty predictions detail dataframe that only carries the expected columns
predictions_detail_df = pd.DataFrame(columns=predictions_detail_columns)

In [111]:
# inference event value can be retrieved from the pickle filename
# define it here as only inference event zero has been run

event = 0

# Collect one dataframe of forecast_window rows per loaded row in a list and
# concatenate them once at the end. Calling pd.concat inside the innermost loop
# copies the whole accumulated dataframe on every iteration (quadratic time),
# and appending to a dataframe created in another cell duplicated the rows
# whenever this cell was run twice. Building the result from scratch makes
# the cell idempotent.
daily_frames = []

# iterate over model_id, execution, saved_model_id combinations
for model_id, execution, saved_model_id in zip(model_ids,
                                               executions,
                                               saved_model_ids):

    print('Processing predictions for model {}, execution {}, saved model {}...'.format(
        model_id, execution, saved_model_id))

    # for each saved_model_id, iterate over persisted predictions for the test dataset
    for customer_id in customer_ids:
        detail_pickle_path = '{}/{}/{}/{}_{:02d}_{}_{}_{}_{:02d}.pkl'.format(
            PROJECT_ROOT,
            'database',
            'predictions_detail',
            model_id,
            execution,
            saved_model_id,
            # for electricity dataset, replace dataset with customer_id
            customer_id,
            inference,
            event)

        # NOTE: unpickling can execute arbitrary code; only load files
        # produced by this project's own inference runs
        loaded_predictions_df = pd.read_pickle(detail_pickle_path)

        # each row of loaded_predictions_df holds the timestamps, predictions and
        # targets of one forecast (presumably one day per row, 7 rows per
        # customer -- confirm against the inference code); expand every row
        # into a dataframe of forecast_window rows
        for _, row in loaded_predictions_df.iterrows():
            daily_frames.append(
                pd.DataFrame(
                    {
                        # scalars are broadcast to every row of the frame
                        'model_id': model_id,
                        'execution': execution,
                        'saved_model_id': saved_model_id,
                        'inference': inference,
                        'event': event,
                        'customer_id': customer_id,
                        # array-like values must hold exactly forecast_window items
                        'timestamp': pd.to_datetime(row['string_timestamps']),
                        'prediction': row['predictions'],
                        'target': row['targets'],
                    },
                    index=pd.RangeIndex(forecast_window),
                )
            )

# once added all requested model_id/execution/saved_model_id combinations
# for all the customer_ids, concatenate everything and reset the index;
# as before, reset_index() keeps the position of each row within its
# forecast (0 to forecast_window - 1) in an 'index' column
if daily_frames:
    predictions_detail_df = pd.concat(daily_frames).reset_index()
else:
    # nothing was loaded: keep the empty frame that the original cell produced
    predictions_detail_df = pd.DataFrame(columns=predictions_detail_columns).reset_index()


Processing predictions for model BSCTRFM_TPU_055, execution 0, saved model 1637863002...
Processing predictions for model BSCTRFM_TPU_055, execution 0, saved model 1637863629...
Processing predictions for model BSCTRFM_TPU_055, execution 0, saved model 1637864194...
Processing predictions for model BSCTRFM_TPU_055, execution 0, saved model 1637865732...
Processing predictions for model BSCTRFM_TPU_055, execution 1, saved model 1637863302...
Processing predictions for model BSCTRFM_TPU_055, execution 1, saved model 1637863905...
Processing predictions for model BSCTRFM_TPU_055, execution 1, saved model 1637864474...
Processing predictions for model BSCTRFM_TPU_055, execution 1, saved model 1637865979...
Processing predictions for model BSCTRFM_TPU_055, execution 2, saved model 1637864792...
Processing predictions for model BSCTRFM_TPU_055, execution 2, saved model 1637865102...
Processing predictions for model BSCTRFM_TPU_055, execution 2, saved model 1637866231...
Processing prediction

In [107]:
# ToDo: persist the predictions_detail_df in /databases

In [112]:
# persist the consolidated predictions detail as <PROJECT_ROOT>/database/electricity.pkl;
# the path is built from PROJECT_ROOT, like the other paths in the notebook,
# instead of repeating the absolute location
predictions_detail_df.to_pickle('{}/{}/{}'.format(PROJECT_ROOT, 'database', 'electricity.pkl'))

In [None]:
# just load the predictions detail dataframe (previously persisted)

In [20]:
# load the consolidated predictions detail from <PROJECT_ROOT>/database/electricity.pkl;
# the path is built from PROJECT_ROOT, like the other paths in the notebook,
# instead of repeating the absolute location
predictions_detail_df = pd.read_pickle('{}/{}/{}'.format(PROJECT_ROOT, 'database', 'electricity.pkl'))

In [21]:
predictions_detail_df

Unnamed: 0,index,model_id,execution,saved_model_id,inference,event,customer_id,timestamp,prediction,target
0,0,BSCTRFM_TPU_055,0,1637863002,024,0,MT_001,2014-09-01 00:00:00,17.688816,16.180203
1,1,BSCTRFM_TPU_055,0,1637863002,024,0,MT_001,2014-09-01 01:00:00,18.235891,16.497462
2,2,BSCTRFM_TPU_055,0,1637863002,024,0,MT_001,2014-09-01 02:00:00,18.561188,16.180203
3,3,BSCTRFM_TPU_055,0,1637863002,024,0,MT_001,2014-09-01 03:00:00,20.365328,16.497462
4,4,BSCTRFM_TPU_055,0,1637863002,024,0,MT_001,2014-09-01 04:00:00,19.960730,16.180203
...,...,...,...,...,...,...,...,...,...,...
1367515,19,BSCTRFM_TPU_056,1,1637868965,024,0,MT_370,2014-09-07 19:00:00,22408.533203,20824.324324
1367516,20,BSCTRFM_TPU_056,1,1637868965,024,0,MT_370,2014-09-07 20:00:00,21819.689453,19527.027027
1367517,21,BSCTRFM_TPU_056,1,1637868965,024,0,MT_370,2014-09-07 21:00:00,21928.656250,20202.702703
1367518,22,BSCTRFM_TPU_056,1,1637868965,024,0,MT_370,2014-09-07 22:00:00,22943.761719,19851.351351


In [34]:
# get statistics from the predictions detail dataframe by saved model identifier

# a dictionary of dictionaries to store inference metrics
metrics = dict()

for saved_model_id in saved_model_ids:

    # rows of the predictions detail that belong to the current saved model
    results_df = predictions_detail_df[predictions_detail_df["saved_model_id"] == saved_model_id]
    targets = results_df['target']
    predictions = results_df['prediction']

    # scikit-learn metrics are declared as (y_true, y_pred) and the SMAPE helper
    # as (targets, predictions); pass the arguments in that order. The values
    # do not change, as these three metrics are symmetric in their arguments.
    mae = mean_absolute_error(targets, predictions)
    rmse = sqrt(mean_squared_error(targets, predictions))

    # ND and NRMSE are MAE and RMSE divided by the average of the true values
    true_values_average = np.mean(targets)

    # a dictionary for metrics on the current saved model
    # (the key order defines the column order of the metrics dataframe)
    metrics[saved_model_id] = {
        'mae': mae,
        'nd': mae/true_values_average,
        'rmse': rmse,
        'nrmse': rmse/true_values_average,
        'smape': symmetric_mean_absolute_percentage_error(targets, predictions),
    }

    print('Saved model {}, ND: {}, NRMSE: {}, SMAPE: {}'.format(saved_model_id,
                                                                metrics[saved_model_id]['nd'],
                                                                metrics[saved_model_id]['nrmse'],
                                                                metrics[saved_model_id]['smape']
                                                               )
         )

Saved model 1637863002, ND: 0.07499619477151644, NRMSE: 0.5351805232057555, SMAPE: 0.12990528454070893
Saved model 1637863629, ND: 0.07462521666374447, NRMSE: 0.5461752854472485, SMAPE: 0.12437068616939888
Saved model 1637864194, ND: 0.07535566283304043, NRMSE: 0.6165859837826502, SMAPE: 0.12583219950953217
Saved model 1637865732, ND: 0.06999339992058573, NRMSE: 0.5254253980467948, SMAPE: 0.12016170376072668
Saved model 1637863302, ND: 0.06989795845341049, NRMSE: 0.5499317942066688, SMAPE: 0.11716068951125673
Saved model 1637863905, ND: 0.07125180427594334, NRMSE: 0.5857794726129275, SMAPE: 0.11271499066952906
Saved model 1637864474, ND: 0.07052512455031132, NRMSE: 0.5763264989451125, SMAPE: 0.11374702626430361
Saved model 1637865979, ND: 0.06764749466511644, NRMSE: 0.5247670387886102, SMAPE: 0.11326571290095541
Saved model 1637864792, ND: 0.07447706486818038, NRMSE: 0.5776953525644812, SMAPE: 0.12442336554361476
Saved model 1637865102, ND: 0.0688196591235853, NRMSE: 0.5432375113433718

In [29]:
# merge training info and metrics dictionaries to a dataframe

In [37]:
# one row per saved model identifier (the dictionary keys become the index),
# with model_id, execution and train_steps as columns
training_info_df = pd.DataFrame.from_dict(training_info, orient='index')
training_info_df

Unnamed: 0,model_id,execution,train_steps
1637863002,BSCTRFM_TPU_055,0,3000
1637863629,BSCTRFM_TPU_055,0,6000
1637864194,BSCTRFM_TPU_055,0,9000
1637865732,BSCTRFM_TPU_055,0,12000
1637863302,BSCTRFM_TPU_055,1,3000
1637863905,BSCTRFM_TPU_055,1,6000
1637864474,BSCTRFM_TPU_055,1,9000
1637865979,BSCTRFM_TPU_055,1,12000
1637864792,BSCTRFM_TPU_055,2,6000
1637865102,BSCTRFM_TPU_055,2,9000


In [38]:
# one row per saved model identifier (the dictionary keys become the index),
# with one column per metric: mae, nd, rmse, nrmse, smape
metrics_df = pd.DataFrame.from_dict(metrics, orient='index')
metrics_df

Unnamed: 0,mae,nd,rmse,nrmse,smape
1637863002,56.777212,0.074996,405.167999,0.535181,0.129905
1637863629,56.496357,0.074625,413.49178,0.546175,0.124371
1637864194,57.049354,0.075356,466.79746,0.616586,0.125832
1637865732,52.989757,0.069993,397.782706,0.525425,0.120162
1637863302,52.917501,0.069898,416.335712,0.549932,0.117161
1637863905,53.942454,0.071252,443.474839,0.585779,0.112715
1637864474,53.392308,0.070525,436.318296,0.576326,0.113747
1637865979,51.213747,0.067647,397.284284,0.524767,0.113266
1637864792,56.384196,0.074477,437.354612,0.577695,0.124423
1637865102,52.101155,0.06882,411.267686,0.543238,0.112385


In [46]:
# put training info and metrics side by side; with axis=1 the rows of both
# dataframes are aligned on their index (the saved model identifier)
predictions_summary_df = pd.concat([training_info_df, metrics_df], axis=1)
predictions_summary_df

Unnamed: 0,model_id,execution,train_steps,mae,nd,rmse,nrmse,smape
1637863002,BSCTRFM_TPU_055,0,3000,56.777212,0.074996,405.167999,0.535181,0.129905
1637863629,BSCTRFM_TPU_055,0,6000,56.496357,0.074625,413.49178,0.546175,0.124371
1637864194,BSCTRFM_TPU_055,0,9000,57.049354,0.075356,466.79746,0.616586,0.125832
1637865732,BSCTRFM_TPU_055,0,12000,52.989757,0.069993,397.782706,0.525425,0.120162
1637863302,BSCTRFM_TPU_055,1,3000,52.917501,0.069898,416.335712,0.549932,0.117161
1637863905,BSCTRFM_TPU_055,1,6000,53.942454,0.071252,443.474839,0.585779,0.112715
1637864474,BSCTRFM_TPU_055,1,9000,53.392308,0.070525,436.318296,0.576326,0.113747
1637865979,BSCTRFM_TPU_055,1,12000,51.213747,0.067647,397.284284,0.524767,0.113266
1637864792,BSCTRFM_TPU_055,2,6000,56.384196,0.074477,437.354612,0.577695,0.124423
1637865102,BSCTRFM_TPU_055,2,9000,52.101155,0.06882,411.267686,0.543238,0.112385


In [90]:
p = figure(title='For Electricity/ND train over 10K steps', x_axis_label='Training steps', y_axis_label='ND')

# dashed red horizontal line at the objective value for ND
objective = Span(location=0.07,
                 dimension='width',
                 line_color='red',
                 line_dash='dashed',
                 line_width=2)

p.add_layout(objective)

# one line per training run, described as
# (first saved model, last saved model, line color, legend label);
# the rows of a run are contiguous in predictions_summary_df, so an inclusive
# label slice from its first to its last saved model selects exactly that run
nd_runs = [
    ('1637863002', '1637865732', 'blue', '055_00'),
    ('1637863302', '1637865979', 'orange', '055_01'),
    ('1637864792', '1637866231', 'cyan', '055_02'),
    ('1637865427', '1637866499', 'purple', '055_03'),
    ('1637866890', '1637868158', 'green', '056_00'),
    ('1637867159', '1637868965', 'brown', '056_01'),
]

for first_saved_model, last_saved_model, line_color, run_label in nd_runs:
    # .loc makes the row-label slicing explicit (same rows as df[first:last])
    run_df = predictions_summary_df.loc[first_saved_model:last_saved_model]
    p.line(
        x=run_df.train_steps,
        y=run_df.nd,
        color=line_color,
        legend_label=run_label
    )

show(p)

In [89]:
p = figure(title='For Electricity/NRMSE all training steps values reach the objective', x_axis_label='Training steps', y_axis_label='NRMSE')

# dashed red horizontal line at the objective value for NRMSE
objective = Span(location=1.0,
                 dimension='width',
                 line_color='red',
                 line_dash='dashed',
                 line_width=2)

p.add_layout(objective)

# one line per training run, described as
# (first saved model, last saved model, line color, legend label);
# the rows of a run are contiguous in predictions_summary_df, so an inclusive
# label slice from its first to its last saved model selects exactly that run
nrmse_runs = [
    ('1637863002', '1637865732', 'blue', '055_00'),
    ('1637863302', '1637865979', 'orange', '055_01'),
    ('1637864792', '1637866231', 'cyan', '055_02'),
    ('1637865427', '1637866499', 'purple', '055_03'),
    ('1637866890', '1637868158', 'green', '056_00'),
    ('1637867159', '1637868965', 'brown', '056_01'),
]

for first_saved_model, last_saved_model, line_color, run_label in nrmse_runs:
    # .loc makes the row-label slicing explicit (same rows as df[first:last])
    run_df = predictions_summary_df.loc[first_saved_model:last_saved_model]
    p.line(
        x=run_df.train_steps,
        y=run_df.nrmse,
        color=line_color,
        legend_label=run_label
    )

show(p)