In [1]:
import json
import numpy as np
import pandas as pd
import os

In [2]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [3]:
# symmetrical mean absolute percentage error
def smape(targets, predictions):
    '''
    predictions: a NumPy array with the predicted values
    targets: a NumPy array with the actual values
    '''
    # verify predictions and targets have the same shape
    if predictions.shape == targets.shape:
            return(np.sum(2*np.abs(predictions - targets) /
                          (np.abs(targets) + np.abs(predictions)))/predictions.shape[0])

In [4]:
# load all available pickles in database/predictions by building a files list
avail_pkl_list = os.listdir(path='/home/developer/gcp/cbidmltsf/database/predictions/')
avail_pkl_list.sort()

In [5]:
print('{} pickle files available in database/predictions/'.format(len(avail_pkl_list)))

679 pickle files available in database/predictions/


In [6]:
# a Pandas dataframe to store all prediction items
df_columns=['model_id', 'execution', 'dataset', 'string_timestamp', 'prediction', 'target']
df = pd.DataFrame(columns=df_columns)

In [7]:
# collect prediction items from all available pickle files into a single dataframe
for available_pickle in avail_pkl_list:
    buffer_df = pd.read_pickle('../database/predictions/{}'.format(available_pickle))
    # append buffer to final dataframe
    df = df.append(buffer_df, ignore_index=True)            

In [8]:
df

Unnamed: 0,model_id,execution,dataset,string_timestamp,prediction,target
0,DMSLSTM_TPU_000,0,test,2018-05-26 17:00:00,2914.691162,2902.731689
1,DMSLSTM_TPU_000,0,test,2018-05-26 18:00:00,2926.174561,2947.036621
2,DMSLSTM_TPU_000,0,test,2018-05-26 19:00:00,2939.863525,2989.810059
3,DMSLSTM_TPU_000,0,test,2018-05-26 20:00:00,3074.350586,3069.921631
4,DMSLSTM_TPU_000,0,test,2018-05-26 21:00:00,3112.340088,3197.853516
...,...,...,...,...,...,...
102875,DMSLSTM_TPU_98,9,test,2018-02-28 19:00:00,8.454431,9.289534
102876,DMSLSTM_TPU_98,9,test,2018-02-28 20:00:00,9.375312,9.598534
102877,DMSLSTM_TPU_98,9,test,2018-02-28 21:00:00,8.583482,6.592383
102878,DMSLSTM_TPU_98,9,test,2018-02-28 22:00:00,5.784236,6.462183


In [9]:
# group the global dataframe by model, execution number, and dataset used
grouped = df.groupby(['model_id', 'execution', 'dataset']).size().to_frame('count').reset_index()

In [10]:
grouped

Unnamed: 0,model_id,execution,dataset,count
0,DMSLSTM_TPU_000,0,test,1591
1,DMSLSTM_TPU_000,1,test,1591
2,DMSLSTM_TPU_000,2,test,1591
3,DMSLSTM_TPU_000,3,test,1591
4,DMSLSTM_TPU_000,4,test,1591
...,...,...,...,...
674,DMSLSTM_TPU_98,5,test,130
675,DMSLSTM_TPU_98,6,test,130
676,DMSLSTM_TPU_98,7,test,130
677,DMSLSTM_TPU_98,8,test,130


In [11]:
# an empty dataframe to store performance metrics values per model execution
metrics_columns = ['model_id', 'execution', 'dataset', 'smape', 'mae', 'mse']
metrics_df = pd.DataFrame(columns=metrics_columns)

In [12]:
# finally, build the predictive performance metrics dataframe
for _, row in grouped.iterrows():
# for index, row in grouped.iterrows():
    model_id, execution, dataset = row['model_id'], row['execution'], row['dataset']
    # print (model_id, execution, dataset)
    # get a boolean to filter prediction_results_df on the model_id, execution, and dataset
    flag = \
    df.model_id.eq(model_id) & \
    df.execution.eq(execution) & \
    df.dataset.eq(dataset)

    filtered_df = df[flag]
    # a row dataframe to calculate smape for a given model_id, counter, and dataset
    row_df = pd.DataFrame([[model_id,
                            execution,
                            dataset,
                            smape(filtered_df.target, filtered_df.prediction),
                            mean_absolute_error(filtered_df.target, filtered_df.prediction),
                            mean_squared_error(filtered_df.target, filtered_df.prediction)
                           ]],
                          columns=metrics_columns)

    metrics_df = metrics_df.append(row_df, ignore_index=True)

In [13]:
metrics_df

Unnamed: 0,model_id,execution,dataset,smape,mae,mse
0,DMSLSTM_TPU_000,0,test,0.016002,40.969452,3588.660063
1,DMSLSTM_TPU_000,1,test,0.016171,41.400739,3679.103367
2,DMSLSTM_TPU_000,2,test,0.016337,41.879196,3737.456876
3,DMSLSTM_TPU_000,3,test,0.016093,41.185813,3632.204805
4,DMSLSTM_TPU_000,4,test,0.016019,41.069029,3678.592751
...,...,...,...,...,...,...
674,DMSLSTM_TPU_98,5,test,0.143499,0.880179,1.323985
675,DMSLSTM_TPU_98,6,test,0.142462,0.868961,1.284152
676,DMSLSTM_TPU_98,7,test,0.141487,0.867386,1.295510
677,DMSLSTM_TPU_98,8,test,0.142928,0.876220,1.330054


In [14]:
# build a dataframe grouped on metrics_df to use as detail for the header
metrics_df_grouped = metrics_df.groupby(['model_id']).mean()

In [15]:
metrics_df_grouped

Unnamed: 0_level_0,smape,mae,mse
model_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
DMSLSTM_TPU_000,0.016170,41.417177,3689.267468
DMSLSTM_TPU_10,0.192778,1.188906,2.407209
DMSLSTM_TPU_11,0.192240,1.177254,2.294762
DMSLSTM_TPU_13,0.195072,1.183611,2.319429
DMSLSTM_TPU_14,0.185609,1.135878,2.226060
...,...,...,...
DMSLSTM_TPU_94,0.148921,0.922683,1.481626
DMSLSTM_TPU_95,0.147091,0.911686,1.434219
DMSLSTM_TPU_96,0.150894,0.934624,1.477289
DMSLSTM_TPU_97,0.145016,0.891851,1.370604


In [16]:
# repeat the previous steps with json files in stats/training_wall_times/

In [17]:
# first, build a batch script to transfer json files to Pandas pickles
# use the file list in /stats/training_wall_times
path = '/home/developer/gcp/cbidmltsf/stats/training_wall_times'

In [18]:
wall_times_json_files = [item[:-5] for item in os.listdir(path=path)]
wall_times_json_files.sort()

In [19]:
len(wall_times_json_files)

679

In [20]:
# now continue loading available pickle files

In [21]:
# load all available pickles in database/training_wall_times by building a files list
avail_wt_pkl_list = os.listdir(path='/home/developer/gcp/cbidmltsf/database/training_wall_times/')
avail_wt_pkl_list.sort()

In [22]:
print('{} pickle files available in database/training_wall_times/'.format(len(avail_wt_pkl_list)))

679 pickle files available in database/training_wall_times/


In [23]:
avail_wt_pkl_list[:5]

['DMSLSTM_TPU_000_00.pkl',
 'DMSLSTM_TPU_000_01.pkl',
 'DMSLSTM_TPU_000_02.pkl',
 'DMSLSTM_TPU_000_03.pkl',
 'DMSLSTM_TPU_000_04.pkl']

In [24]:
# an empty dataframe to store wall time values per model execution
wall_times_columns = ['model_id', 'execution', 'wall_time']
wall_times_df = pd.DataFrame(columns=wall_times_columns)

In [25]:
wall_times_df

Unnamed: 0,model_id,execution,wall_time


In [26]:
# collect wall times items from all available pickle files into a single dataframe
for available_wt_pickle in avail_wt_pkl_list:
    buffer_df = pd.read_pickle('../database/training_wall_times/{}'.format(available_wt_pickle))
    # append buffer to final dataframe
    wall_times_df = wall_times_df.append(buffer_df, ignore_index=True)    

In [27]:
wall_times_df

Unnamed: 0,model_id,execution,wall_time
0,DMSLSTM_TPU_000,0,23.646
1,DMSLSTM_TPU_000,1,24.4625
2,DMSLSTM_TPU_000,2,23.7454
3,DMSLSTM_TPU_000,3,25.0851
4,DMSLSTM_TPU_000,4,24.0802
...,...,...,...
674,DMSLSTM_TPU_98,5,4.856
675,DMSLSTM_TPU_98,6,5.07151
676,DMSLSTM_TPU_98,7,4.76948
677,DMSLSTM_TPU_98,8,4.99075


In [48]:
wt_grouped = wall_times_df.groupby('model_id').mean()

DataError: No numeric types to aggregate

In [30]:
wt_grouped

Unnamed: 0_level_0,execution,wall_time
model_id,Unnamed: 1_level_1,Unnamed: 2_level_1
DMSLSTM_TPU_000,10,10
DMSLSTM_TPU_10,10,0
DMSLSTM_TPU_11,10,0
DMSLSTM_TPU_13,10,0
DMSLSTM_TPU_14,10,0
...,...,...
DMSLSTM_TPU_94,10,10
DMSLSTM_TPU_95,10,10
DMSLSTM_TPU_96,10,10
DMSLSTM_TPU_97,10,10


In [31]:
# now use the code in consolidate_hyperparameters_database
# to build the header containing hyperparameters for each model_id/experiment

In [32]:
# a list to traverse all subdirectories in parameters/
parameters_folders = os.listdir(path='/home/developer/gcp/cbidmltsf/parameters')
parameters_folders.sort()

In [33]:
len(parameters_folders)

63

In [34]:
# build the hyperparameters header
hyperparameters_columns = ['model_id',
                           'm',
                           'hourly', 'daily', 'weekly', 'use_timestamps', 'dense',
                           'train_batch_size', 'train_steps', 'base_learning_rate', 'lrs_steps', 'lrs_weights', 'precision']
hyperparameters_df = pd.DataFrame(columns=hyperparameters_columns)

In [35]:
hyperparameters_df

Unnamed: 0,model_id,m,hourly,daily,weekly,use_timestamps,dense,train_batch_size,train_steps,base_learning_rate,lrs_steps,lrs_weights,precision


In [41]:
# load relevant parameter values for model comparison into hyperparameters header
for parameters_folder in parameters_folders:

    path = '/home/developer/gcp/cbidmltsf/parameters/{}/sldb_parameters.json'.format(parameters_folder)
    with open(path, 'r') as json_file:
        sldb_parameters = json.load(json_file)

    path = '/home/developer/gcp/cbidmltsf/parameters/{}/architecture_parameters.json'.format(parameters_folder)
    with open(path, 'r') as json_file:
        architecture_parameters = json.load(json_file)

    path = '/home/developer/gcp/cbidmltsf/parameters/{}/training_parameters.json'.format(parameters_folder)
    with open(path, 'r') as json_file:
        training_parameters = json.load(json_file)
        
    model_id = parameters_folder
        
    m = [sldb_parameters['embedding']['hourly'],
         sldb_parameters['embedding']['daily'],
         sldb_parameters['embedding']['weekly'],
        ]
    
    hourly = architecture_parameters['hourly']['structure']
    daily = architecture_parameters['daily']['structure']
    weekly = architecture_parameters['weekly']['structure']
    use_timestamps = architecture_parameters['use_timestamps']
    dense = architecture_parameters['dense']['structure']
    
    train_batch_size = training_parameters['train_batch_size']
    train_steps = training_parameters['train_steps']
    base_learning_rate = training_parameters['base_learning_rate']
    lrs_steps = training_parameters['lrs_steps']
    lrs_weights = training_parameters['lrs_weights']
    precision = training_parameters['precision']
    
    # build a hyperparameters_df row
    hp_row_df = pd.DataFrame([[model_id,
                               m,
                               hourly, daily, weekly, use_timestamps, dense,
                               train_batch_size, train_steps, base_learning_rate, lrs_steps, lrs_weights, precision]],
                          columns=hyperparameters_columns)

    hyperparameters_df = hyperparameters_df.append(hp_row_df, ignore_index=True)    

In [42]:
hyperparameters_df[['model_id', 'm', 'hourly', 'train_batch_size', 'train_steps', 'base_learning_rate']]

Unnamed: 0,model_id,m,hourly,train_batch_size,train_steps,base_learning_rate
0,DMSLSTM_TPU_000,"[8, 8, 4]","[64, 128]",1024,1563,0.002500
1,DMSLSTM_TPU_10,"[16, 8, 4]","[64, 64, 64, 64]",32,16000,0.040000
2,DMSLSTM_TPU_11,"[8, 4, 4]","[64, 64, 64, 64]",32,16000,0.040000
3,DMSLSTM_TPU_13,"[8, 4, 4]","[64, 64, 64, 64]",32,16000,0.064000
4,DMSLSTM_TPU_14,"[8, 8, 4]","[64, 64, 64, 64]",32,16000,0.064000
...,...,...,...,...,...,...
58,DMSLSTM_TPU_94,"[8, 8, 4]","[64, 128]",704,639,0.003636
59,DMSLSTM_TPU_95,"[8, 8, 4]","[64, 128]",768,586,0.003333
60,DMSLSTM_TPU_96,"[8, 8, 4]","[64, 128]",832,541,0.003077
61,DMSLSTM_TPU_97,"[8, 8, 4]","[64, 128]",896,502,0.002857


In [43]:
# so far, index for hyperparameters dataframe is a numerical range
hyperparameters_df.index

RangeIndex(start=0, stop=63, step=1)

In [44]:
# change index to model_id
hyperparameters_df = hyperparameters_df.set_index('model_id')

In [45]:
hyperparameters_df.index

Index(['DMSLSTM_TPU_000', 'DMSLSTM_TPU_10', 'DMSLSTM_TPU_11', 'DMSLSTM_TPU_13',
       'DMSLSTM_TPU_14', 'DMSLSTM_TPU_15', 'DMSLSTM_TPU_16', 'DMSLSTM_TPU_17',
       'DMSLSTM_TPU_18', 'DMSLSTM_TPU_19', 'DMSLSTM_TPU_20', 'DMSLSTM_TPU_21',
       'DMSLSTM_TPU_22', 'DMSLSTM_TPU_23', 'DMSLSTM_TPU_24', 'DMSLSTM_TPU_25',
       'DMSLSTM_TPU_26', 'DMSLSTM_TPU_27', 'DMSLSTM_TPU_30', 'DMSLSTM_TPU_31',
       'DMSLSTM_TPU_32', 'DMSLSTM_TPU_33', 'DMSLSTM_TPU_34', 'DMSLSTM_TPU_35',
       'DMSLSTM_TPU_36', 'DMSLSTM_TPU_37', 'DMSLSTM_TPU_38', 'DMSLSTM_TPU_39',
       'DMSLSTM_TPU_40', 'DMSLSTM_TPU_41', 'DMSLSTM_TPU_42', 'DMSLSTM_TPU_50',
       'DMSLSTM_TPU_51', 'DMSLSTM_TPU_52', 'DMSLSTM_TPU_70', 'DMSLSTM_TPU_71',
       'DMSLSTM_TPU_72', 'DMSLSTM_TPU_73', 'DMSLSTM_TPU_74', 'DMSLSTM_TPU_75',
       'DMSLSTM_TPU_76', 'DMSLSTM_TPU_77', 'DMSLSTM_TPU_78', 'DMSLSTM_TPU_79',
       'DMSLSTM_TPU_80', 'DMSLSTM_TPU_81', 'DMSLSTM_TPU_82', 'DMSLSTM_TPU_83',
       'DMSLSTM_TPU_84', 'DMSLSTM_TPU_85', 'DMSLSTM

In [54]:
# finally, build a header-detail dataframe for hyperparameters and metrics
statistics_df = pd.concat([hyperparameters_df, metrics_df_grouped, wt_grouped], axis=1, sort=False)

In [66]:
# show the final result with only the most significative columns
# part 2
statistics_df[9:][
    [
        # 'm',
        # 'hourly', 'daily', 'weekly',
        # 'use_timestamps', dense',
        'train_batch_size', 'train_steps',
        'base_learning_rate',
        # 'lrs_steps', 'lrs_weights',
        'precision',
        'smape',
        # 'mae',
        # 'mse',
        'wall_time'
    ]
]

Unnamed: 0_level_0,train_batch_size,train_steps,base_learning_rate,precision,smape,wall_time
model_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
DMSLSTM_TPU_20,32,16000,0.064,float32,0.179525,
DMSLSTM_TPU_21,32,16000,0.064,float32,0.182127,
DMSLSTM_TPU_22,32,16000,0.064,float32,0.185894,
DMSLSTM_TPU_23,32,16000,0.064,float32,0.176622,
DMSLSTM_TPU_24,32,16000,0.064,float32,0.175934,
DMSLSTM_TPU_25,32,16000,0.064,float32,0.178262,
DMSLSTM_TPU_26,32,16000,0.064,float32,0.178654,
DMSLSTM_TPU_27,32,16000,0.064,float32,0.180361,
DMSLSTM_TPU_30,32,16000,0.064,float32,0.177242,
DMSLSTM_TPU_31,32,16000,0.064,float32,0.17423,
