### load predictions detail pickles and compare models over a consistent period

In [14]:
import json
import numpy as np
import pandas as pd
import os

In [15]:
from bokeh.plotting import figure, show, output_file, save
from bokeh.io import output_notebook
# select a palette
from bokeh.palettes import d3
# configure bokeh so that plots are rendered inline in the notebook output cells
output_notebook()

In [16]:
# build a sorted list of all files available in database/predictions_detail/;
# the directory is given relative to the notebook, exactly as the loading loop reads it,
# so listing and loading always point at the same place (no machine-specific home path)
predictions_detail_dir = '../database/predictions_detail/'
avail_pkl_list = sorted(os.listdir(path=predictions_detail_dir))

print('{} pickle files available in database/predictions_detail/'.format(len(avail_pkl_list)))


80 pickle files available in database/predictions_detail/


In [17]:
# schema of the consolidated predictions detail dataframe:
# identification fields, then the per-row series, then the error metrics
df_columns = [
    'model_id', 'execution', 'dataset', 'inference',
    'string_timestamps', 'predictions', 'targets',
    'mae', 'rmse', 'smape',
]

# start from an empty dataframe that only carries the schema
df = pd.DataFrame(columns=df_columns)

### trim the inference rows produced over the test dataset to a given number

In [18]:
# number of leading rows kept from each pickle (applied as a [:rows_to_trim] slice when loading);
# 168 = 7 * 24 -- presumably one week of hourly inference rows, TODO confirm
rows_to_trim = 168

In [19]:
# collect prediction items from all available pickle files into a single dataframe;
# each pickle is trimmed to its first rows_to_trim rows so that all models are
# compared over the same number of inference rows.
# The trimmed frames are gathered in a list and concatenated once: DataFrame.append
# was removed in pandas 2.0 and copied the whole dataframe on every iteration.
# Rebuilding df from the list also makes this cell safe to re-run (no duplicated rows).
# NOTE(review): read_pickle can execute arbitrary code; only load trusted files.
prediction_frames = [
    pd.read_pickle('../database/predictions_detail/{}'.format(available_pickle))[:rows_to_trim]
    for available_pickle in avail_pkl_list
]

# pd.concat raises on an empty list; in that case keep the empty schema-only dataframe
if prediction_frames:
    df = pd.concat(prediction_frames, ignore_index=True)

In [20]:
# display the consolidated dataframe (one row per inference item of every model execution)
df

Unnamed: 0,model_id,execution,dataset,inference,string_timestamps,predictions,targets,mae,rmse,smape
0,ARTRFDC_TPU_000,0,test,024,"[2018-05-08 00:00:00, 2018-05-08 01:00:00, 201...","[2149.200439453125, 1996.2294921875, 1900.8508...","[2116.356689453125, 1893.3948974609375, 1790.4...",115.793541,178.249934,0.048070
1,ARTRFDC_TPU_000,0,test,024,"[2018-05-08 01:00:00, 2018-05-08 02:00:00, 201...","[1975.03076171875, 1885.2646484375, 1824.51171...","[1893.3948974609375, 1790.4100341796875, 1775....",111.819374,176.953179,0.046238
2,ARTRFDC_TPU_000,0,test,024,"[2018-05-08 02:00:00, 2018-05-08 03:00:00, 201...","[1822.8671875, 1772.877197265625, 1759.9060058...","[1790.4100341796875, 1775.4566650390625, 1747....",102.713097,175.296440,0.041710
3,ARTRFDC_TPU_000,0,test,024,"[2018-05-08 03:00:00, 2018-05-08 04:00:00, 201...","[1748.633056640625, 1742.730224609375, 1779.09...","[1775.4566650390625, 1747.455078125, 1770.9300...",104.318604,176.140087,0.042744
4,ARTRFDC_TPU_000,0,test,024,"[2018-05-08 04:00:00, 2018-05-08 05:00:00, 201...","[1761.5634765625, 1788.3353271484375, 1936.010...","[1747.455078125, 1770.9300537109375, 1974.2517...",106.615181,176.006486,0.043861
...,...,...,...,...,...,...,...,...,...,...
13435,EDSLSTM_TPU_012,9,test,048,"[2018-05-10 11:00:00, 2018-05-10 12:00:00, 201...","[2584.451171875, 2676.203857421875, 2717.97607...","[2599.05322265625, 2709.36328125, 2810.8715820...",87.010160,103.575266,0.038023
13436,EDSLSTM_TPU_012,9,test,048,"[2018-05-10 12:00:00, 2018-05-10 13:00:00, 201...","[2683.687255859375, 2736.33154296875, 2743.760...","[2709.36328125, 2810.87158203125, 2839.5864257...",86.092430,101.546492,0.037774
13437,EDSLSTM_TPU_012,9,test,048,"[2018-05-10 13:00:00, 2018-05-10 14:00:00, 201...","[2745.304931640625, 2766.29248046875, 2749.001...","[2810.87158203125, 2839.58642578125, 2773.3981...",83.922040,98.567697,0.037325
13438,EDSLSTM_TPU_012,9,test,048,"[2018-05-10 14:00:00, 2018-05-10 15:00:00, 201...","[2784.90087890625, 2776.83837890625, 2738.9902...","[2839.58642578125, 2773.398193359375, 2784.179...",84.850225,100.769601,0.038291


In [22]:
# pandas truncates dataframe displays beyond its default of 60 rows; raise the limit to 200
pd.options.display.max_rows = 200

In [26]:
### group all selected rows for each model, execution, dataset, and inference

In [24]:
# build a predictions summary over the selected number of rows:
# one row per model_id-execution-dataset-inference with the mean of each error metric.
# Only the scalar metric columns are averaged: the list-valued columns
# (string_timestamps, predictions, targets) cannot be averaged, and pandas >= 2.0
# raises a TypeError instead of silently dropping them.
# reset index to avoid making a multi-column index when grouping by
trimmed_summary_df = (
    df.groupby(['model_id', 'execution', 'dataset', 'inference'])[['mae', 'rmse', 'smape']]
    .mean()
    .reset_index()
)

In [25]:
# display the per-execution summary (mean mae, rmse, smape over the trimmed rows)
trimmed_summary_df

Unnamed: 0,model_id,execution,dataset,inference,mae,rmse,smape
0,ARTRFDC_TPU_000,0,test,24,100.628027,127.771021,0.042299
1,ARTRFDC_TPU_000,0,test,48,122.483976,156.383812,0.050592
2,ARTRFDC_TPU_000,1,test,24,118.881263,150.49299,0.0491
3,ARTRFDC_TPU_000,1,test,48,145.106863,181.366243,0.059062
4,ARTRFDC_TPU_000,2,test,24,99.546443,121.55309,0.042322
5,ARTRFDC_TPU_000,2,test,48,110.270467,136.435351,0.046266
6,ARTRFDC_TPU_000,3,test,24,124.510783,155.129013,0.050333
7,ARTRFDC_TPU_000,3,test,48,143.691809,181.774613,0.057502
8,ARTRFDC_TPU_000,4,test,24,98.15819,121.179498,0.040894
9,ARTRFDC_TPU_000,4,test,48,108.536374,135.17691,0.044649


### now consolidate executions, grouping by model, dataset, and inference

In [27]:
# consolidate all executions: mean of each error metric per model_id-dataset-inference.
# The metric columns are selected explicitly because the remaining columns are not
# averageable (list-valued series, execution identifier) and pandas >= 2.0 raises a
# TypeError on them instead of silently dropping them.
grouped = (
    df.groupby(['model_id', 'dataset', 'inference'])[['mae', 'rmse', 'smape']]
    .mean()
    .reset_index()
)

In [28]:
# display the metrics consolidated over executions, one row per model_id-dataset-inference
grouped

Unnamed: 0,model_id,dataset,inference,mae,rmse,smape
0,ARTRFDC_TPU_000,test,24,107.156134,133.350654,0.044568
1,ARTRFDC_TPU_000,test,48,124.854348,155.713426,0.051241
2,ARTRFDC_TPU_001,test,24,131.522431,165.179857,0.053697
3,ARTRFDC_TPU_001,test,48,155.488228,193.73682,0.062944
4,DMSLSTM_TPU_006,test,24,77.704618,94.0648,0.033249
5,DMSLSTM_TPU_007,test,48,79.411681,98.700116,0.034058
6,EDSLSTM_TPU_011,test,24,189.273018,216.585047,0.068776
7,EDSLSTM_TPU_012,test,48,202.067853,238.233974,0.082606


In [35]:
# an empty dataframe to store performance metrics values per model_id-dataset-inference;
# only the smape statistics are enabled for now (the mae_*/rmse_* statistics and the
# per-metric *_vector columns are intentionally left out)
identifier_columns = ['model_id', 'dataset', 'inference', 'count']
smape_statistic_columns = ['smape_{}'.format(statistic) for statistic in ('min', 'mean', 'max', 'std')]

metrics_columns = identifier_columns + smape_statistic_columns

metrics_df = pd.DataFrame(columns=metrics_columns)

In [36]:
# compute, for every model_id-dataset-inference combination, the statistics of smape
# across executions (taken from the per-execution rows of trimmed_summary_df).
# Rows are collected in a plain list and the dataframe is built once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on every call.
# Rebuilding metrics_df from scratch also keeps this cell idempotent on re-run.
# mae/rmse statistics can be added the same way, provided metrics_columns is extended in step.
metrics_rows = []

for _, row in grouped.iterrows():

    model_id, dataset, inference = row['model_id'], row['dataset'], row['inference']

    # boolean mask selecting the rows of trimmed_summary_df
    # for this model_id, dataset, and inference
    flag = (
        trimmed_summary_df.model_id.eq(model_id)
        & trimmed_summary_df.dataset.eq(dataset)
        & trimmed_summary_df.inference.eq(inference)
    )

    filtered_df = trimmed_summary_df[flag]

    # one record per combination; the values must follow the order of metrics_columns
    metrics_rows.append([
        model_id,
        dataset,
        inference,
        # number of executions found for this combination
        filtered_df.execution.count(),
        filtered_df.smape.min(),
        filtered_df.smape.mean(),
        filtered_df.smape.max(),
        # sample standard deviation; NaN when only one execution is available
        filtered_df.smape.std(),
    ])

metrics_df = pd.DataFrame(metrics_rows, columns=metrics_columns)

In [39]:
# best metrics for 24-step-ahead: highlight the lowest value of each smape statistic;
# the subset keeps identifier and count cells (model_id, dataset, inference, count)
# from being highlighted, since a "minimum" there carries no meaning
metrics_df[metrics_df['inference'] == '024'].style.highlight_min(
    subset=['smape_min', 'smape_mean', 'smape_max', 'smape_std'],
    color='lightgreen',
    axis=0)

Unnamed: 0,model_id,dataset,inference,count,smape_min,smape_mean,smape_max,smape_std
0,ARTRFDC_TPU_000,test,24,10,0.038437,0.044568,0.053467,0.004978
2,ARTRFDC_TPU_001,test,24,10,0.044816,0.053697,0.064652,0.005814
4,DMSLSTM_TPU_006,test,24,10,0.032882,0.033249,0.033502,0.000187
6,EDSLSTM_TPU_011,test,24,10,0.028936,0.068776,0.40749,0.119018


In [40]:
# worst metrics for 24-step-ahead: highlight the highest value of each smape statistic;
# the subset keeps identifier and count cells (model_id, dataset, inference, count)
# from being highlighted, which would wrongly flag a model name as "worst"
metrics_df[metrics_df['inference'] == '024'].style.highlight_max(
    subset=['smape_min', 'smape_mean', 'smape_max', 'smape_std'],
    color='yellow',
    axis=0)

Unnamed: 0,model_id,dataset,inference,count,smape_min,smape_mean,smape_max,smape_std
0,ARTRFDC_TPU_000,test,24,10,0.038437,0.044568,0.053467,0.004978
2,ARTRFDC_TPU_001,test,24,10,0.044816,0.053697,0.064652,0.005814
4,DMSLSTM_TPU_006,test,24,10,0.032882,0.033249,0.033502,0.000187
6,EDSLSTM_TPU_011,test,24,10,0.028936,0.068776,0.40749,0.119018


In [41]:
# best metrics for 48-step-ahead: highlight the lowest value of each smape statistic;
# the subset keeps identifier and count cells (model_id, dataset, inference, count)
# from being highlighted, since a "minimum" there carries no meaning
metrics_df[metrics_df['inference'] == '048'].style.highlight_min(
    subset=['smape_min', 'smape_mean', 'smape_max', 'smape_std'],
    color='lightgreen',
    axis=0)

Unnamed: 0,model_id,dataset,inference,count,smape_min,smape_mean,smape_max,smape_std
1,ARTRFDC_TPU_000,test,48,10,0.041292,0.051241,0.061494,0.007143
3,ARTRFDC_TPU_001,test,48,10,0.048469,0.062944,0.075184,0.008238
5,DMSLSTM_TPU_007,test,48,10,0.033687,0.034058,0.034627,0.000289
7,EDSLSTM_TPU_012,test,48,10,0.031649,0.082606,0.407689,0.120413


In [42]:
# worst metrics for 48-step-ahead: highlight the highest value of each smape statistic;
# the subset keeps identifier and count cells (model_id, dataset, inference, count)
# from being highlighted, which would wrongly flag a model name as "worst"
metrics_df[metrics_df['inference'] == '048'].style.highlight_max(
    subset=['smape_min', 'smape_mean', 'smape_max', 'smape_std'],
    color='yellow',
    axis=0)

Unnamed: 0,model_id,dataset,inference,count,smape_min,smape_mean,smape_max,smape_std
1,ARTRFDC_TPU_000,test,48,10,0.041292,0.051241,0.061494,0.007143
3,ARTRFDC_TPU_001,test,48,10,0.048469,0.062944,0.075184,0.008238
5,DMSLSTM_TPU_007,test,48,10,0.033687,0.034058,0.034627,0.000289
7,EDSLSTM_TPU_012,test,48,10,0.031649,0.082606,0.407689,0.120413
