### load predictions detail pickles and compare models over a consistent period

In [1]:
import json
import numpy as np
import pandas as pd
import os

In [2]:
# Bokeh plotting primitives and output helpers
from bokeh.plotting import figure, show, output_file, save
from bokeh.io import output_notebook
# select a palette
from bokeh.palettes import d3
# configure Bokeh to display its output inside the notebook
output_notebook()

In [3]:
# build the sorted list of files available in database/predictions_detail/;
# the directory is addressed relative to the notebook, exactly as it is when
# the files are read later on, so listing and loading cannot point to
# different locations and no user-specific absolute path is required
predictions_detail_dir = '../database/predictions_detail/'
avail_pkl_list = sorted(os.listdir(path=predictions_detail_dir))

print('{} pickle files available in database/predictions_detail/'.format(len(avail_pkl_list)))


80 pickle files available in database/predictions_detail/


In [4]:
# column layout of the dataframe that gathers all predictions detail items:
# identification keys first, then the per-row series, then the error metrics
key_fields = ['model_id', 'execution', 'dataset', 'inference']
series_fields = ['string_timestamps', 'predictions', 'targets']
metric_fields = ['mae', 'rmse', 'smape']

df_columns = key_fields + series_fields + metric_fields

# start from an empty dataframe that only carries the column names
df = pd.DataFrame(columns=df_columns)

### trim the inference rows produced over the test dataset to a given number

In [5]:
# number of leading rows kept from every predictions detail pickle
# NOTE(review): 504 presumably stands for 21 days of hourly rows (21 * 24) - confirm
rows_to_trim = 504

In [6]:
# collect prediction items from all available pickle files into a single dataframe
# NOTE(review): pd.read_pickle can execute arbitrary code while unpickling, so
# database/predictions_detail/ must only contain files produced by this project
prediction_frames = []
for available_pickle in avail_pkl_list:
    # keep at most the first rows_to_trim rows of every file
    buffer_df = pd.read_pickle('../database/predictions_detail/{}'.format(available_pickle))[:rows_to_trim]
    prediction_frames.append(buffer_df)

# concatenate once instead of growing df file by file: DataFrame.append was
# removed in pandas 2.0 and copied the accumulated rows on every call;
# rebuilding df from the files alone also keeps the result identical when the
# cell is executed more than once (no duplicated rows)
if prediction_frames:
    combined_df = pd.concat(prediction_frames, ignore_index=True)
    # columns already declared on df come first, then any extra column found in the files
    column_order = list(df.columns) + [name for name in combined_df.columns if name not in df.columns]
    df = combined_df.reindex(columns=column_order)

### group all selected rows for each model, execution, dataset, and inference

In [7]:
# build a predictions summary over the selected number of rows: one row per
# model_id/execution/dataset/inference holding the mean of every numeric column;
# numeric_only=True is spelled out because pandas >= 2.0 raises a TypeError when
# mean() meets a non-numeric column instead of silently dropping it as older
# versions did (string_timestamps presumably holds strings - confirm)
summary_keys = ['model_id', 'execution', 'dataset', 'inference']
trimmed_summary_df = (
    df.groupby(summary_keys)
    .mean(numeric_only=True)
    # reset index to get the grouping keys back as regular columns
    # rather than as a multi-level index
    .reset_index()
)

### now consolidate executions, grouping by model, dataset, and inference

In [8]:
# average the per-execution summaries for every model_id/dataset/inference;
# numeric_only=True keeps mean() working on pandas >= 2.0 should a remaining
# column (e.g. execution) hold non-numeric values
grouped = (
    trimmed_summary_df
    .groupby(['model_id', 'dataset', 'inference'])
    .mean(numeric_only=True)
    .reset_index()
)

In [9]:
# column layout of the per model_id-dataset-inference performance metrics:
# the grouping keys, the number of executions, and the smape statistics
# (the mae/rmse statistics and the mae/rmse/smape vectors are currently disabled)
smape_statistics = ('min', 'mean', 'max', 'std')

metrics_columns = ['model_id', 'dataset', 'inference', 'count']
metrics_columns += ['smape_{}'.format(statistic) for statistic in smape_statistics]

# an empty dataframe that only carries the metrics column names
metrics_df = pd.DataFrame(columns=metrics_columns)

In [10]:
# performance metrics per model_id-dataset-inference, computed across executions
# from the per-execution summary:
#   count      -> number of executions summarized in the group
#   smape_min  -> lowest per-execution smape
#   smape_mean -> mean of the per-execution smape values
#   smape_max  -> highest per-execution smape
#   smape_std  -> sample standard deviation of the per-execution smape values
#
# a single groupby/agg replaces the former row-by-row construction, which relied
# on DataFrame.append (removed in pandas 2.0) and re-filtered the whole summary
# for every group; groups are emitted sorted by their keys and reset_index()
# numbers the rows 0..n-1, as before
metrics_df = (
    trimmed_summary_df
    .groupby(['model_id', 'dataset', 'inference'])
    .agg(
        count=('execution', 'count'),
        smape_min=('smape', 'min'),
        smape_mean=('smape', 'mean'),
        smape_max=('smape', 'max'),
        smape_std=('smape', 'std'),
    )
    .reset_index()
)

In [11]:
# best metrics for 24-step-ahead: highlight the lowest value of every smape statistic;
# the highlight is limited to the smape_* columns so that the identifier columns
# (model_id, dataset, inference) and count are not marked as if they were metrics
smape_columns = [column for column in metrics_df.columns if column.startswith('smape_')]
metrics_df[metrics_df['inference'] == '024'].style.highlight_min(subset=smape_columns, color='lightgreen', axis=0)

Unnamed: 0,model_id,dataset,inference,count,smape_min,smape_mean,smape_max,smape_std
0,ARTRFDC_TPU_000,test,24,10,0.037532,0.041544,0.046844,0.003603
2,ARTRFDC_TPU_001,test,24,10,0.043127,0.046446,0.048679,0.001962
4,DMSLSTM_TPU_006,test,24,10,0.032036,0.032353,0.03254,0.000195
6,EDSLSTM_TPU_011,test,24,10,0.031777,0.068875,0.387734,0.112041


In [12]:
# worst metrics for 24-step-ahead: highlight the highest value of every smape statistic;
# the highlight is limited to the smape_* columns so that the identifier columns
# (model_id, dataset, inference) and count are not marked as if they were metrics
smape_columns = [column for column in metrics_df.columns if column.startswith('smape_')]
metrics_df[metrics_df['inference'] == '024'].style.highlight_max(subset=smape_columns, color='yellow', axis=0)

Unnamed: 0,model_id,dataset,inference,count,smape_min,smape_mean,smape_max,smape_std
0,ARTRFDC_TPU_000,test,24,10,0.037532,0.041544,0.046844,0.003603
2,ARTRFDC_TPU_001,test,24,10,0.043127,0.046446,0.048679,0.001962
4,DMSLSTM_TPU_006,test,24,10,0.032036,0.032353,0.03254,0.000195
6,EDSLSTM_TPU_011,test,24,10,0.031777,0.068875,0.387734,0.112041


In [13]:
# best metrics for 48-step-ahead: highlight the lowest value of every smape statistic;
# the highlight is limited to the smape_* columns so that the identifier columns
# (model_id, dataset, inference) and count are not marked as if they were metrics
smape_columns = [column for column in metrics_df.columns if column.startswith('smape_')]
metrics_df[metrics_df['inference'] == '048'].style.highlight_min(subset=smape_columns, color='lightgreen', axis=0)

Unnamed: 0,model_id,dataset,inference,count,smape_min,smape_mean,smape_max,smape_std
1,ARTRFDC_TPU_000,test,48,10,0.041624,0.047879,0.055085,0.004484
3,ARTRFDC_TPU_001,test,48,10,0.047635,0.052956,0.056466,0.00293
5,DMSLSTM_TPU_007,test,48,10,0.032846,0.033206,0.033623,0.000245
7,EDSLSTM_TPU_012,test,48,10,0.036294,0.087629,0.403293,0.118452


In [14]:
# worst metrics for 48-step-ahead: highlight the highest value of every smape statistic;
# the highlight is limited to the smape_* columns so that the identifier columns
# (model_id, dataset, inference) and count are not marked as if they were metrics
smape_columns = [column for column in metrics_df.columns if column.startswith('smape_')]
metrics_df[metrics_df['inference'] == '048'].style.highlight_max(subset=smape_columns, color='yellow', axis=0)

Unnamed: 0,model_id,dataset,inference,count,smape_min,smape_mean,smape_max,smape_std
1,ARTRFDC_TPU_000,test,48,10,0.041624,0.047879,0.055085,0.004484
3,ARTRFDC_TPU_001,test,48,10,0.047635,0.052956,0.056466,0.00293
5,DMSLSTM_TPU_007,test,48,10,0.032846,0.033206,0.033623,0.000245
7,EDSLSTM_TPU_012,test,48,10,0.036294,0.087629,0.403293,0.118452
