In [1]:
import pandas as pd

In [2]:
import numpy as np

In [3]:
import json

In [4]:
# use the notebook build_Pandas_dataframe_from_prediction_results_TPU_10_to_TPU_52
# as a reference to work with the results from TPU_80

In [5]:
# the model identifiers are assembled from these three pieces
architecture = 'DMSLSTM'
infrastructure = 'TPU'
# experiment numbers to process (the upper bound of range is exclusive)
exp_range_list = list(range(80, 81))

In [6]:
# one identifier per experiment number: architecture_infrastructure_number,
# with the number zero-padded to a width of two
model_ids_list = [
    '{}_{}_{:02}'.format(architecture, infrastructure, exp_number)
    for exp_number in exp_range_list
]

In [7]:
model_ids_list

['DMSLSTM_TPU_80']

In [8]:
# every experiment in model_ids_list was trained the same number of times;
# this value bounds the `counter` loops further down
num_executions = 10

In [9]:
# so far, predictions have only been produced for test.tfrecord,
# so 'test' is the only dataset to process
datasets = ['test']

In [10]:
# column layout shared by the per-file buffers and the final dataframe
columns = [
    'model_id',
    'counter',
    'dataset',
    'string_timestamp',
    'prediction',
    'target',
]
# start from an empty dataframe that already carries that layout
prediction_results_df = pd.DataFrame(columns=columns)

In [11]:
prediction_results_df

Unnamed: 0,model_id,counter,dataset,string_timestamp,prediction,target


In [12]:
# Load every prediction file and stack the results into one dataframe.
#
# The per-file frames are collected in a list and concatenated once at the end:
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0, and
# calling it inside the loop copied all accumulated rows on every iteration.
# Building the result from scratch also makes this cell safe to re-run (the
# previous version added the same rows again each time it was executed).
prediction_frames = []
for model_id in model_ids_list:
    for counter in range(num_executions):
        for dataset in datasets:
            # build a path to JSON files, that is
            # ../stats/model_id_counter/prediction_results_on_dataset_tfrecord.json
            path = '../stats/{}_{:02d}/prediction_results_on_{}_tfrecord.json'.format(model_id, counter, dataset)
            # load json file to dictionary
            with open(path, 'r', encoding='utf-8') as json_file:
                prediction_results = json.load(json_file)
            # buffer dataframe for this file: the three scalars are broadcast to
            # every row; the three lists must have the same length, otherwise
            # pandas raises ValueError instead of silently truncating rows
            df = pd.DataFrame({'model_id': model_id,
                               'counter': counter,
                               'dataset': dataset,
                               'string_timestamp': prediction_results['string_timestamps'],
                               'prediction': prediction_results['predictions'],
                               'target': prediction_results['targets']},
                              columns=columns)
            prediction_frames.append(df)

# single concatenation; fall back to an empty frame when nothing was loaded
# (pd.concat raises ValueError on an empty list)
if prediction_frames:
    prediction_results_df = pd.concat(prediction_frames, ignore_index=True)
else:
    prediction_results_df = pd.DataFrame(columns=columns)

In [13]:
prediction_results_df

Unnamed: 0,model_id,counter,dataset,string_timestamp,prediction,target
0,DMSLSTM_TPU_80,0,test,2018-02-23 14:00:00,7.613549,6.213650
1,DMSLSTM_TPU_80,0,test,2018-02-23 15:00:00,9.115017,8.161867
2,DMSLSTM_TPU_80,0,test,2018-02-23 16:00:00,8.168036,6.752000
3,DMSLSTM_TPU_80,0,test,2018-02-23 17:00:00,4.616117,5.244833
4,DMSLSTM_TPU_80,0,test,2018-02-23 18:00:00,6.697938,7.247250
...,...,...,...,...,...,...
1295,DMSLSTM_TPU_80,9,test,2018-02-28 19:00:00,7.819508,9.289534
1296,DMSLSTM_TPU_80,9,test,2018-02-28 20:00:00,9.248091,9.598534
1297,DMSLSTM_TPU_80,9,test,2018-02-28 21:00:00,8.735196,6.592383
1298,DMSLSTM_TPU_80,9,test,2018-02-28 22:00:00,5.480075,6.462183


In [14]:
# persist the Pandas dataframe so the JSON files do not have to be parsed again
prediction_results_df.to_pickle('../database/prediction_results_TPU_80.pkl')

In [15]:
# start here to read the persisted dataframe
# NOTE: read_pickle unpickles the file, so only load pickles from a trusted source
prediction_results_TPU_80_df = pd.read_pickle('../database/prediction_results_TPU_80.pkl')

In [16]:
# symmetrical mean absolute percentage error
def smape(predictions, targets):
    '''
    Symmetric mean absolute percentage error between predictions and targets.

    Each element contributes 2*|prediction - target| / (|target| + |prediction|);
    the contributions are summed and divided by the length of the first axis.

    predictions: array-like (NumPy array, list, pandas Series) with the predicted values
    targets: array-like with the actual values, same shape as predictions

    returns: the sMAPE as a float; NaN when the first axis is empty
    raises: ValueError when predictions and targets differ in shape
    '''
    # work on float arrays so lists, Series and object-dtype columns are accepted
    predictions = np.asarray(predictions, dtype=float)
    targets = np.asarray(targets, dtype=float)

    # a shape mismatch used to return None silently; fail loudly instead
    if predictions.shape != targets.shape:
        raise ValueError(
            'predictions and targets must have the same shape, got {} and {}'.format(
                predictions.shape, targets.shape))

    # nothing to average over
    if predictions.shape[0] == 0:
        return float('nan')

    numerator = 2.0 * np.abs(predictions - targets)
    denominator = np.abs(targets) + np.abs(predictions)
    # when prediction and target are both zero the term is 0/0; the prediction is
    # exact, so count it as zero error instead of letting NaN poison the sum
    terms = np.divide(numerator, denominator,
                      out=np.zeros_like(numerator),
                      where=denominator != 0)
    return np.sum(terms) / predictions.shape[0]

In [17]:
# empty dataframe that receives one smape value per model execution
smape_values_df = pd.DataFrame(
    columns=['model_id', 'counter', 'dataset', 'smape'],
)

In [18]:
smape_values_df

Unnamed: 0,model_id,counter,dataset,smape


In [19]:
# Compute one smape value per (model_id, counter, dataset).
#
# The rows are collected as plain records and turned into a dataframe once:
# DataFrame.append was removed in pandas 2.0, and appending row by row also
# meant that re-running this cell duplicated every row. The predictions are
# read from prediction_results_TPU_80_df, the frame recovered from the pickle
# written above, so this step works from the persisted results.
smape_records = []
for model_id in model_ids_list:
    for counter in range(num_executions):
        for dataset in datasets:
            # boolean mask selecting the rows of this model_id, counter, and dataset
            flag = (
                prediction_results_TPU_80_df.model_id.eq(model_id)
                & prediction_results_TPU_80_df.counter.eq(counter)
                & prediction_results_TPU_80_df.dataset.eq(dataset)
            )
            filtered_df = prediction_results_TPU_80_df[flag]
            # smape is documented to take NumPy arrays, so hand it float arrays
            smape_records.append({
                'model_id': model_id,
                'counter': counter,
                'dataset': dataset,
                'smape': smape(np.asarray(filtered_df.prediction, dtype=float),
                               np.asarray(filtered_df.target, dtype=float)),
            })

# explicit columns keep the layout stable even when there are no records
smape_values_df = pd.DataFrame(smape_records,
                               columns=['model_id', 'counter', 'dataset', 'smape'])

In [20]:
# persist the smape dataframe so the values do not have to be re-calculated
smape_values_df.to_pickle('../database/smape_values_TPU_80.pkl')

In [21]:
# once persisted, the dataframe can be recovered here
# NOTE: read_pickle unpickles the file, so only load pickles from a trusted source
smape_values_TPU_80_df = pd.read_pickle('../database/smape_values_TPU_80.pkl')

In [22]:
smape_values_TPU_80_df[:10]

Unnamed: 0,model_id,counter,dataset,smape
0,DMSLSTM_TPU_80,0,test,0.200182
1,DMSLSTM_TPU_80,1,test,0.18101
2,DMSLSTM_TPU_80,2,test,0.200149
3,DMSLSTM_TPU_80,3,test,0.18054
4,DMSLSTM_TPU_80,4,test,0.192912
5,DMSLSTM_TPU_80,5,test,0.204578
6,DMSLSTM_TPU_80,6,test,0.190282
7,DMSLSTM_TPU_80,7,test,0.188848
8,DMSLSTM_TPU_80,8,test,0.186462
9,DMSLSTM_TPU_80,9,test,0.182835


In [23]:
# smape_values_df.groupby(['model_id', 'dataset'])[['smape']].mean()

In [24]:
# recover statistics for the dataset that renders the best predictive performance:
# summary of the smape values across executions, per model, on the 'test' dataset.
# The smape column is selected explicitly so the table does not change shape
# depending on whether other columns (e.g. counter) happen to be numeric.
is_test = smape_values_TPU_80_df.dataset.eq('test')
smape_values_TPU_80_df[is_test].groupby(['model_id'])[['smape']].describe()

Unnamed: 0_level_0,smape,smape,smape,smape,smape,smape,smape,smape
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
model_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
DMSLSTM_TPU_80,10.0,0.19078,0.008545,0.18054,0.183742,0.189565,0.19834,0.204578
