In [1]:
import json
import numpy as np
import pandas as pd
import os

In [2]:
from bokeh.plotting import figure, show, output_file, save
from bokeh.io import output_notebook
output_notebook()

In [3]:
# run the following cell as code to build prediction metrics
# for all available pickle files in predictions_summary/

In [4]:
# List the prediction summary pickles through the same relative directory that the
# loading cell reads from ('../database/predictions_summary/'), instead of a hard-coded
# absolute path under one user's home directory, so every listed file name resolves to
# the file that is actually read later on.
predictions_summary_dir = '../database/predictions_summary/'

# os.listdir() returns names in arbitrary order; sort them for a stable listing
predictions_pkl_list = sorted(os.listdir(predictions_summary_dir))

print('{} pickle files available in database/predictions_summary/'.format(len(predictions_pkl_list)))

110 pickle files available in database/predictions_summary/


In [5]:
# run this cell as code to build prediction metrics
# for a selected set of pickle files in predictions_summary/
# (predictions_pkl_list is rebuilt from scratch here)

topologies = ['DMSLSTM']
infrastructures = ['TPU']
experiment_no_strings = ['005']
execution_no_strings = ['00', '01', '02', '03', '04', '05', '06', '07', '08', '09']
datasets = ['test']

# one file name per combination of the selections above, in the order
# topology > infrastructure > experiment > execution > dataset
pkl_name_template = '{}_{}_{}_{}_on_{}_tfrecord.pkl'
predictions_pkl_list = [
    pkl_name_template.format(
        topology,
        infrastructure,
        experiment_no_string,
        execution_no_string,
        dataset)
    for topology in topologies
    for infrastructure in infrastructures
    for experiment_no_string in experiment_no_strings
    for execution_no_string in execution_no_strings
    for dataset in datasets
]

In [6]:
# empty Pandas dataframe that will receive all predictions summary items:
# identification columns, then scalar values, then vector-valued metrics
key_columns = ['model_id', 'execution', 'dataset', 'inference']
scalar_columns = ['count', 'mae', 'rmse', 'smape']
vector_columns = ['mae_vector', 'rmse_vector', 'smape_vector']
df_columns = key_columns + scalar_columns + vector_columns

df = pd.DataFrame(columns=df_columns)

In [7]:
# collect prediction items from the selected pickle files into a single dataframe.
# The pickles are read into a list and concatenated once: DataFrame.append() was removed
# in pandas 2.0, and appending inside the loop re-copied the accumulated frame each time.
# NOTE(review): unpickling can execute arbitrary code; only load trusted pickle files.
buffer_dfs = [
    pd.read_pickle('../database/predictions_summary/{}'.format(predictions_pickle))
    for predictions_pickle in predictions_pkl_list
]

# start from a fresh empty frame with the expected columns, so that re-running this cell
# rebuilds df instead of stacking duplicate rows, and an empty file list still yields
# an (empty) dataframe rather than an error
df = pd.concat([pd.DataFrame(columns=df_columns)] + buffer_dfs, ignore_index=True)

In [8]:
# pandas displays at most 60 dataframe rows by default; raise the limit to 200
pd.options.display.max_rows = 200

In [9]:
# display the combined predictions summary dataframe
# (the display row limit was raised to 200 in the previous cell)
df

Unnamed: 0,model_id,execution,dataset,inference,count,mae,rmse,smape,mae_vector,rmse_vector,smape_vector
0,ARTRFDC_TPU_000,0,test,24,1872,101.04483,125.422415,0.039948,"[47.8510985415206, 67.55012551332132, 80.19862...","[65.86299546355286, 90.75086411245374, 107.060...","[0.019057982109775017, 0.027025182390041053, 0..."
1,ARTRFDC_TPU_000,0,test,48,1872,112.418121,141.127973,0.044304,"[47.8510985415206, 67.55012551332132, 80.19862...","[65.86299546355286, 90.75086411245374, 107.060...","[0.019057982109775017, 0.027025182390041053, 0..."
2,ARTRFDC_TPU_000,1,test,24,1872,110.30999,136.327681,0.043377,"[49.87253955286792, 72.11261064578325, 85.5869...","[70.5845094819496, 97.81143834029527, 114.9812...","[0.019672606282578733, 0.02864809898686916, 0...."
3,ARTRFDC_TPU_000,1,test,48,1872,121.703217,151.662819,0.047798,"[49.87253955286792, 72.11261064578325, 85.5869...","[70.5845094819496, 97.81143834029527, 114.9812...","[0.019672606282578733, 0.02864809898686916, 0...."
4,ARTRFDC_TPU_000,2,test,24,1872,100.571465,124.453318,0.039781,"[49.76701394105569, 70.28462714822884, 81.5607...","[68.74878122069825, 93.71048410590784, 108.443...","[0.019780438293549973, 0.028056132289439536, 0..."
5,ARTRFDC_TPU_000,2,test,48,1872,112.413933,140.537774,0.044403,"[49.76701394105569, 70.28462714822884, 81.5607...","[68.74878122069825, 93.71048410590784, 108.443...","[0.019780438293549973, 0.028056132289439536, 0..."
6,ARTRFDC_TPU_000,3,test,24,1872,105.786972,130.371412,0.041518,"[46.788078242896965, 66.75235070122613, 79.100...","[64.11180784511782, 88.31195158504795, 104.283...","[0.01856074971051215, 0.026616715342791038, 0...."
7,ARTRFDC_TPU_000,3,test,48,1872,118.552834,147.284341,0.046312,"[46.788078242896965, 66.75235070122613, 79.100...","[64.11180784511782, 88.31195158504795, 104.283...","[0.01856074971051215, 0.026616715342791038, 0...."
8,ARTRFDC_TPU_000,4,test,24,1872,106.291191,131.08898,0.041883,"[46.91391160753038, 66.69246843126085, 79.3359...","[64.01878025850338, 88.2317226635982, 104.8267...","[0.018770852065884018, 0.02684926106242952, 0...."
9,ARTRFDC_TPU_000,4,test,48,1872,125.716293,157.593682,0.04935,"[46.91391160753038, 66.69246843126085, 79.3359...","[64.01878025850338, 88.2317226635982, 104.8267...","[0.018770852065884018, 0.02684926106242952, 0...."


In [10]:
# based on its architecture, each model produces a different number of inference rows
# over the same test dataset

# in order to produce a fairer comparison
# use inferences for a given period only
# let's say, only 7, 14, or 21 days ahead (168, 336, 504 rows)

In [11]:
# check the target dimensionality for inference 024 by measuring the length of
# mae_vector in the first row of the summary dataframe (currently ARTRFDC_TPU_000/024)
len(df['mae_vector'].iloc[0])

24

In [12]:
# verify the target dimensionality for inference 048
# from the last row in the summary dataframe (currently EDSLSTM_TPU_012/048)
# len(df[-1:]['mae_vector'][79])

In [13]:
# pass the grouped dataframe to a temporary structure:
# mean mae, rmse and smape over all executions of each model_id-dataset-inference.
# The scalar metric columns are selected explicitly because the *_vector columns hold
# lists; since pandas 2.0, groupby().mean() raises on non-numeric columns instead of
# silently dropping them.
grouped = (
    df.groupby(['model_id', 'dataset', 'inference'])[['mae', 'rmse', 'smape']]
    .mean()
    .reset_index()
)

In [14]:
# display the metrics averaged per model_id, dataset and inference
grouped

Unnamed: 0,model_id,dataset,inference,mae,rmse,smape
0,ARTRFDC_TPU_000,test,24,105.266441,130.245657,0.041526
1,ARTRFDC_TPU_000,test,48,118.247803,147.931498,0.046519
2,ARTRFDC_TPU_001,test,24,114.289991,141.34779,0.045037
3,ARTRFDC_TPU_001,test,48,126.540334,157.872408,0.049771
4,BSCTRFM_TPU_009,test,24,91.161586,110.397489,0.035185
5,BSCTRFM_TPU_010,test,24,97.989032,119.423,0.037827
6,BSCTRFM_TPU_011,test,24,89.684376,107.684234,0.03459
7,DMSLSTM_TPU_006,test,24,82.055161,101.908486,0.033966
8,DMSLSTM_TPU_007,test,48,83.762919,105.949111,0.034689
9,EDSLSTM_TPU_013,test,24,80.708783,99.845779,0.031991


In [15]:
# column layout of the performance metrics table, one row per model_id-dataset-inference.
# Only the smape statistics are enabled at the moment; the mae_* / rmse_* statistics
# (min, mean, max, std) and the averaged mae/rmse/smape vectors are switched off, both
# here and in the cell that fills the table.
group_columns = ['model_id', 'dataset', 'inference', 'count']
smape_columns = ['smape_min', 'smape_mean', 'smape_max', 'smape_std']
metrics_columns = group_columns + smape_columns

# start with no rows
metrics_df = pd.DataFrame(columns=metrics_columns)

In [16]:
# Summarise smape over the executions of every model_id-dataset-inference combination.
# Rows are collected in a plain list and the dataframe is built once after the loop:
# DataFrame.append() was removed in pandas 2.0, it re-copied the whole frame on every
# iteration, and re-running the cell used to stack duplicate rows onto metrics_df.
metrics_rows = []

for _, row in grouped.iterrows():

    model_id, dataset, inference = row['model_id'], row['dataset'], row['inference']

    # boolean mask selecting the rows of df for this model_id, dataset, and inference
    flag = (
        df.model_id.eq(model_id)
        & df.dataset.eq(dataset)
        & df.inference.eq(inference)
    )

    filtered_df = df[flag]

    # values must follow the order of metrics_columns
    metrics_rows.append([
        model_id,
        dataset,
        inference,
        # number of executions found for this combination
        filtered_df.execution.count(),
        # filtered_df.mae.min(),
        # filtered_df.mae.mean(),
        # filtered_df.mae.max(),
        # filtered_df.mae.std(),
        # filtered_df.rmse.min(),
        # filtered_df.rmse.mean(),
        # filtered_df.rmse.max(),
        # filtered_df.rmse.std(),
        filtered_df.smape.min(),
        filtered_df.smape.mean(),
        filtered_df.smape.max(),
        filtered_df.smape.std(),
        # np.mean(np.array([row.mae_vector for _, row in filtered_df.iterrows()]), axis=0),
        # np.mean(np.array([row.rmse_vector for _, row in filtered_df.iterrows()]), axis=0),
        # np.mean(np.array([row.smape_vector for _, row in filtered_df.iterrows()]), axis=0),
    ])

# an empty row list still yields an empty dataframe with the expected columns
metrics_df = pd.DataFrame(metrics_rows, columns=metrics_columns)

In [17]:
# best metrics for 24-step-ahead: the minimum of every column is shaded light green
is_24_step = metrics_df['inference'].eq('024')
metrics_df[is_24_step].style.highlight_min(axis=0, color='lightgreen')

Unnamed: 0,model_id,dataset,inference,count,smape_min,smape_mean,smape_max,smape_std
0,ARTRFDC_TPU_000,test,24,10,0.039595,0.041526,0.044192,0.001538
2,ARTRFDC_TPU_001,test,24,10,0.043205,0.045037,0.047289,0.001371
4,BSCTRFM_TPU_009,test,24,10,0.03182,0.035185,0.037626,0.001728
5,BSCTRFM_TPU_010,test,24,10,0.035781,0.037827,0.040448,0.001351
6,BSCTRFM_TPU_011,test,24,10,0.032266,0.03459,0.036863,0.001282
7,DMSLSTM_TPU_006,test,24,10,0.033339,0.033966,0.034321,0.000312
9,EDSLSTM_TPU_013,test,24,10,0.031511,0.031991,0.032384,0.000251


In [18]:
# worst metrics for 24-step-ahead: the maximum of every column is shaded yellow
is_24_step = metrics_df['inference'].eq('024')
metrics_df[is_24_step].style.highlight_max(axis=0, color='yellow')

Unnamed: 0,model_id,dataset,inference,count,smape_min,smape_mean,smape_max,smape_std
0,ARTRFDC_TPU_000,test,24,10,0.039595,0.041526,0.044192,0.001538
2,ARTRFDC_TPU_001,test,24,10,0.043205,0.045037,0.047289,0.001371
4,BSCTRFM_TPU_009,test,24,10,0.03182,0.035185,0.037626,0.001728
5,BSCTRFM_TPU_010,test,24,10,0.035781,0.037827,0.040448,0.001351
6,BSCTRFM_TPU_011,test,24,10,0.032266,0.03459,0.036863,0.001282
7,DMSLSTM_TPU_006,test,24,10,0.033339,0.033966,0.034321,0.000312
9,EDSLSTM_TPU_013,test,24,10,0.031511,0.031991,0.032384,0.000251


In [25]:
# best metrics for 48-step-ahead: the minimum of every column is shaded light green
is_48_step = metrics_df['inference'].eq('048')
metrics_df[is_48_step].style.highlight_min(axis=0, color='lightgreen')

Unnamed: 0,model_id,dataset,inference,count,smape_min,smape_mean,smape_max,smape_std
1,ARTRFDC_TPU_000,test,48,10,0.044158,0.046519,0.04935,0.001819
3,ARTRFDC_TPU_001,test,48,10,0.047534,0.049771,0.051672,0.001323
7,DMSLSTM_TPU_007,test,48,10,0.034282,0.034689,0.035118,0.000246
9,EDSLSTM_TPU_014,test,48,10,0.034543,0.035487,0.036865,0.000866


In [26]:
# worst metrics for 48-step-ahead: the maximum of every column is shaded yellow
is_48_step = metrics_df['inference'].eq('048')
metrics_df[is_48_step].style.highlight_max(axis=0, color='yellow')

Unnamed: 0,model_id,dataset,inference,count,smape_min,smape_mean,smape_max,smape_std
1,ARTRFDC_TPU_000,test,48,10,0.044158,0.046519,0.04935,0.001819
3,ARTRFDC_TPU_001,test,48,10,0.047534,0.049771,0.051672,0.001323
7,DMSLSTM_TPU_007,test,48,10,0.034282,0.034689,0.035118,0.000246
9,EDSLSTM_TPU_014,test,48,10,0.034543,0.035487,0.036865,0.000866


In [21]:
# use Bokeh to plot prediction statistics

In [22]:
# empty dictionary named plots (presumably filled with Bokeh figures further down; not visible here)
plots = {}