In [1]:
# Import libraries
import datetime as dt
import numpy as np
import os
import pandas as pd

from IPython.display import display

In [2]:
# Work from inside the results folder so that the relative paths used by the
# helper functions below resolve. The guard keeps the cell idempotent:
# re-running it must not try to descend into the folder a second time.
if os.path.basename(os.getcwd()) != 'RC_2021_results':
    os.chdir('RC_2021_results')

In [3]:
# Dataset metadata
datasets = ['labour', 'traffic', 'tourism', 'tourismlarge', 'wiki']
# Number of hierarchy levels per dataset. 'tourismlarge' carries two entries
# ('geo' and 'trav') because its per-level loss columns are reported separately
# for each of them (see get_cols_of_interest).
levels_dict = {'labour': 4,
               'tourism': 4,
               'tourismlarge': {'geo': 4,
                                'trav': 5},
               'traffic': 4,
               'wiki': 5}


# Hyperparameter configuration
# Grid that is scanned per dataset: every value of epoch_set is combined with
# every context length factor listed for the dataset.
epoch_set = [10, 25, 50]
context_len_dict = {'tourism': [2, 3, 4],
                    'tourismlarge': [2, 3, 4],
                    'labour': [2, 3, 4],
                    'traffic': [15, 25, 40, 60],
                    'wiki': [15, 25, 40, 60]}

# Reference configurations per neural method and dataset (the "paper" values
# listed in the comparison section further down in this notebook).
optimal_hps = {'DeepVAR': {'traffic': {'epochs': 25,
                                       'context_len_factor': 40},
                           'wiki': {'epochs': 25,
                                    'context_len_factor': 15},
                           'labour': {'epochs': 50,
                                      'context_len_factor': 3},
                           'tourism': {'epochs': 10,
                                       'context_len_factor': 3},
                           'tourismlarge': {'epochs': 10,
                                            'context_len_factor': 3}},
               'DeepVARPlus': {'traffic': {'epochs': 25,
                                           'context_len_factor': 40},
                               'wiki': {'epochs': 25,
                                        'context_len_factor': 15},
                               'labour': {'epochs': 50,
                                          'context_len_factor': 3},
                               'tourism': {'epochs': 10,
                                           'context_len_factor': 2},
                               'tourismlarge': {'epochs': 25,
                                                'context_len_factor': 3}},
               'HierE2E': {'traffic': {'epochs': 50,
                                       'context_len_factor': 40},
                           'wiki': {'epochs': 50,
                                    'context_len_factor': 15},
                           'labour': {'epochs': 50,
                                      'context_len_factor': 3},
                           'tourism': {'epochs': 10,
                                       'context_len_factor': 3},
                           'tourismlarge': {'epochs': 50,
                                            'context_len_factor': 3}}}


# Additional parameters
num_runs = 5
mean_col = f'mean_of_{num_runs}_runs'
methods = ['ARIMA_NaiveBU', 'ETS_NaiveBU', 'ARIMA_MINT_shr', 'ARIMA_MINT_ols', 'ETS_MINT_shr', 'ETS_MINT_ols',
           'ARIMA_ERM', 'ETS_ERM', 'PERMBU_MINT', 'HierE2E', 'DeepVAR', 'DeepVARPlus']
# The neural methods are listed once; the classical ones are everything else,
# so the two lists cannot drift apart.
neural_methods = ['HierE2E', 'DeepVAR', 'DeepVARPlus']
classical_methods = [method for method in methods if method not in neural_methods]

### Helper functions

In [17]:
# Function which creates a list of the names of the losses we're interested in for a particular dataset
def get_cols_of_interest(dataset):
    """Return the names of the loss columns that are reported for `dataset`.

    The list always starts with the overall 'mean_wQuantileLoss', followed by
    one 'level_<i>_...' entry per hierarchy level taken from `levels_dict`.
    For 'tourismlarge' level 1 is listed once and the levels from 2 upwards are
    listed first for 'geo' and then for 'trav'.
    """
    overall = 'mean_wQuantileLoss'
    depth = levels_dict[dataset]

    if dataset == 'tourismlarge':
        names = [overall, 'level_1_mean_wQuantileLoss']
        for hierarchy in ('geo', 'trav'):
            names.extend(f'level_{lvl}_{hierarchy}_mean_wQuantileLoss'
                         for lvl in range(2, depth[hierarchy] + 1))
        return names

    return [overall] + [f'level_{lvl}_mean_wQuantileLoss' for lvl in range(1, depth + 1)]

def select_pickles(folder_name):
    """Return the result file names found in `folder_name`, sorted by name.

    Entries whose name contains '.txt' are skipped, and so are Jupyter's
    '.ipynb_checkpoints' folders (the same exclusions calculate_total_time
    applies). The listing is sorted because os.listdir returns entries in
    arbitrary order, which would make the 'run_<i>' labels assigned by the
    callers differ between machines and re-runs.
    """
    return sorted(entry for entry in os.listdir(folder_name)
                  if '.txt' not in entry and '.ipynb_checkpoints' not in entry)

# Function which evaluates the results for a particular model on a particular dataset
def get_model_results(folder_name, dataset):
    """Collect the losses of interest of every run stored in `folder_name`.

    Returns a DataFrame with one row per loss (see get_cols_of_interest), one
    'run_<i>' column per result file and a final `mean_col` column holding the
    row-wise mean over the runs.

    For the folder PERMBU_MINT/tourismlarge nothing is read; a single row
    labelled 'PERMBU_MINT' filled with the string 'NA' is returned instead
    (one column per loss).
    """
    loss_names = get_cols_of_interest(dataset)

    if folder_name == os.path.join('PERMBU_MINT', 'tourismlarge'):
        placeholder = pd.DataFrame(['NA'] * len(loss_names), index=loss_names).T
        placeholder.index = ['PERMBU_MINT']
        return placeholder

    # one single-column frame per run, glued together side by side at the end
    per_run_frames = [pd.DataFrame()]
    for run_idx, pickle_name in enumerate(select_pickles(folder_name)):
        run_results = pd.DataFrame(pd.read_pickle(os.path.join(folder_name, pickle_name)))

        run_losses = np.empty(len(loss_names))
        for pos, name in enumerate(loss_names):
            # the overall loss is read from row 0, every per-level loss from row 1
            source_row = 0 if name == 'mean_wQuantileLoss' else 1
            run_losses[pos] = run_results.loc[source_row, name]

        per_run_frames.append(
            pd.DataFrame(run_losses, index=loss_names, columns=[f'run_{run_idx}']))

    all_runs = pd.concat(per_run_frames, axis=1)

    # mean of every loss (overall and per level) across the runs
    all_runs[mean_col] = all_runs.mean(axis=1)
    return all_runs

# Function to calculate the average loss across multiple runs for a particular hyperparameter configuration
def get_mean_loss(folder_name, num_runs, calc_std_error):
    """Average the overall loss over the runs stored in `folder_name`.

    Parameters
    ----------
    folder_name : str
        Folder holding one result file per run.
    num_runs : int
        Number of result files the folder is expected to contain.
    calc_std_error : bool
        When truthy, the spread of the losses is returned as well.

    Returns
    -------
    mean_loss, or the tuple (mean_loss, loss_std_error) when `calc_std_error`
    is truthy. Note that loss_std_error is computed with np.std, i.e. it is the
    population standard deviation of the run losses.

    Raises
    ------
    ValueError
        If the folder does not contain exactly `num_runs` result files.
        Previously a fixed-size np.empty buffer was filled instead, so missing
        runs silently contributed uninitialised values to the mean.
    """
    files = select_pickles(folder_name)
    if len(files) != num_runs:
        raise ValueError(
            f'Expected {num_runs} result files in {folder_name!r} but found {len(files)}')

    # overall loss of each run, read from row 0 of the stored results
    losses = np.array(
        [pd.DataFrame(pd.read_pickle(os.path.join(folder_name, file))).loc[0, 'mean_wQuantileLoss']
         for file in files],
        dtype=np.float64)

    mean_loss = np.mean(losses)
    if calc_std_error:
        return mean_loss, np.std(losses)
    return mean_loss

# Function which evaluates the performance of a method over the entire hyperparameter space on a given dataset
def get_method_results_on_dataset(model, dataset):
    """Find the best hyperparameter configuration of `model` on `dataset`.

    Every combination of `epoch_set` and `context_len_dict[dataset]` is looked
    up in '<model>_<dataset>/epochs_<e>_context_len_factor_<c>'; the one with
    the lowest mean overall loss across `num_runs` runs wins (the first one
    encountered wins ties).

    Returns
    -------
    (optimal_epochs, optimal_context_len, best_results) where best_results is
    the per-run table produced by get_model_results for the winning folder.

    Raises
    ------
    AssertionError
        If a configuration folder does not hold exactly `num_runs` entries.
    ValueError
        If no configuration yields a finite mean loss.
    """
    folder_name = model + '_' + dataset
    # start from +inf so that any real loss, however large, is accepted
    min_loss = float('inf')
    optimal_epochs = 0
    optimal_context_len = 0
    for epochs in epoch_set:
        for context_len in context_len_dict[dataset]:
            subfolder_name = os.path.join(folder_name, f'epochs_{epochs}_context_len_factor_{context_len}')
            assert len(os.listdir(subfolder_name)) == num_runs, \
                   f"You didn't run the model epochs_{epochs}_context_len_factor_{context_len} for {dataset} {num_runs} times"

            # keep the configuration if its mean loss (across num_runs runs)
            # beats the best one seen so far
            mean_loss = get_mean_loss(subfolder_name, num_runs, calc_std_error=False)
            if mean_loss < min_loss:
                min_loss = mean_loss
                optimal_epochs = epochs
                optimal_context_len = context_len

    if min_loss == float('inf'):
        # nothing was selected (empty grid or only non-finite losses); fail
        # here instead of looking for a non-existent 'epochs_0_...' folder
        raise ValueError(f'No valid hyperparameter configuration found for {model} on {dataset}')

    # per-run results of the winning configuration
    subfolder_name = os.path.join(folder_name, f'epochs_{optimal_epochs}_context_len_factor_{optimal_context_len}')
    best_results = get_model_results(subfolder_name, dataset)

    return optimal_epochs, optimal_context_len, best_results

# Function which evaluates the performance of a method over the entire hyperparameter space on all datasets
def get_method_results_on_datasets(model):
    """Print and display the best configuration of `model` for every dataset.

    For each entry of `datasets` the hyperparameter grid is searched with
    get_method_results_on_dataset, and the winning number of epochs, context
    length factor and per-run results are shown. (The loop previously ran over
    a hard-coded ['wiki'] left over from debugging.)
    """
    for dataset in datasets:
        optimal_epochs, optimal_context_len, best_results = get_method_results_on_dataset(model, dataset)
        print(f'Method {model} on dataset {dataset}:\n\t')
        print(f'Most optimal number of epochs: {optimal_epochs}\n\t')
        print(f'Most optimal context length factor: {optimal_context_len}\n\t')
        print('The performance of this configuration:\n')
        display(best_results)

def correct_folder_names(folder_name):
    """Translate a '<method>/<dataset>' path to the folder name used on disk.

    'tourism' becomes 'tourismsmall' unless the path also contains 'large',
    and 'wiki' becomes 'wiki2'. Any other path is returned unchanged.
    """
    is_small_tourism = 'tourism' in folder_name and 'large' not in folder_name
    renamed = folder_name.replace('tourism', 'tourismsmall') if is_small_tourism else folder_name
    return renamed.replace('wiki', 'wiki2') if 'wiki' in renamed else renamed

def get_paper_results(model, datasets, display_results=False):
    """Load the per-run results of `model` for each of the given `datasets`.

    The results are read from '<model>/<dataset>' (after correct_folder_names)
    and, when `display_results` is set, printed and displayed per dataset.

    Returns the results table only when exactly one dataset was requested;
    otherwise nothing is returned.
    """
    for dataset in datasets:
        results_path = correct_folder_names(os.path.join(model, dataset))
        dataset_results = get_model_results(results_path, dataset)

        if not display_results:
            continue

        print(f'Performance of model {model} on dataset {dataset}:\n')
        display(dataset_results)
        print('\n\n')

    if len(datasets) != 1:
        return None
    return dataset_results

def create_paper_table_1(verbose=True, to_latex=False):
    """Build the table of overall losses: one row per method, one column per dataset.

    Each cell is the string '<mean> +/- <spread>' (both rounded to 4 decimals)
    as returned by get_mean_loss; the PERMBU_MINT/tourismlarge cell is 'NA'.
    With `verbose` a progress line is printed per cell; with `to_latex` the
    table is also written to 'paper_table_1.tex'.
    """
    method_rows = [pd.DataFrame()]
    for method in methods:
        row = pd.DataFrame(index=[method])
        for dataset in datasets:
            if verbose:
                print(f'Adding results for method {method} on dataset {dataset}..\n')

            results_path = correct_folder_names(os.path.join(method, dataset))
            if results_path == os.path.join('PERMBU_MINT', 'tourismlarge'):
                row[dataset] = 'NA'
                continue

            mean_loss, loss_spread = get_mean_loss(results_path, num_runs, calc_std_error=True)
            row[dataset] = ' +/- '.join([str(np.round(mean_loss, 4)), str(np.round(loss_spread, 4))])

        method_rows.append(row)

    # stack the one-row frames of all methods underneath each other
    results_df = pd.concat(method_rows)

    if to_latex:
        results_df.to_latex('paper_table_1.tex')

    return results_df

def examine_hp_search_results(method, dataset, num_runs):
    """Show the reference configuration next to the best one found by the search.

    First the configuration stored in `optimal_hps` for (`method`, `dataset`)
    is reported: mean loss, spread and the per-run table. Then the grid is
    searched with get_method_results_on_dataset and the winning configuration
    is reported the same way. Nothing is returned; everything is printed or
    displayed.
    """
    reference_hps = optimal_hps[method][dataset]
    reference_config = (f"epochs_{reference_hps['epochs']}"
                        f"_context_len_factor_{reference_hps['context_len_factor']}")
    folder_name = os.path.join(f'{method}_{dataset}', reference_config)

    mean_loss, loss_std_error = get_mean_loss(folder_name, num_runs, calc_std_error=True)
    header_lines = (
        f'Method {method} on dataset {dataset}:\n',
        f'{reference_config}:\n',
        f'mean_loss: {mean_loss}\n',
        f'loss_std_error: {loss_std_error}\n',
        f'Results across {num_runs} runs for this hyperparameter configuration:\n',
    )
    for line in header_lines:
        print(line)
    display(get_model_results(folder_name, dataset))
    print('\n\n')

    found_epochs, found_context_len_factor, found_results = get_method_results_on_dataset(method, dataset)
    print(f'Results across {num_runs} runs for the configuration we found to be the best:\n')
    display(found_results)
    print(f'optimal_epochs for this configuration: {found_epochs}\n')
    print(f'optimal_context_len_factor for this configuration: {found_context_len_factor}\n\n')

def get_methods_results(dataset):
    """Stack one row of '<mean> +/- <std>' strings per method for `dataset`.

    For every method the per-run table is loaded, the mean column is rounded to
    4 decimals and combined with the rounded standard deviation of the run
    columns, and the resulting column is appended as a row. The 'NA' row
    returned for PERMBU_MINT on tourismlarge is appended unchanged.
    """
    collected = pd.DataFrame()
    for method in methods:
        method_df = get_paper_results(method, [dataset])

        if not ((method == 'PERMBU_MINT') and (dataset == 'tourismlarge')):
            is_run_column = method_df.columns != mean_col
            spread = method_df.loc[:, is_run_column].std(axis=1).round(4)
            rounded_mean = method_df[mean_col].round(4)
            method_df[mean_col] = rounded_mean.astype('str') + ' +/- ' + spread.astype(str)
            # turn the mean column into a single row (one column per loss)
            method_df = method_df[mean_col].to_frame().T

        collected = pd.concat([collected, method_df])
    return collected

def get_supplementary_table(dataset, to_latex=False):
    """Return the per-method loss table of `dataset`, labelled for the supplement.

    Rows are labelled with the method names and every column name has its
    'wQuantileLoss' suffix replaced by 'CRPS'. With `to_latex` the table is
    also written to '<dataset>_supplementary_table.tex'.
    """
    table = get_methods_results(dataset)
    table.index = methods

    crps_names = []
    for col in table.columns:
        prefix = col.split('wQuantileLoss')[0]
        crps_names.append(prefix + 'CRPS')
    table.columns = crps_names

    if to_latex:
        table.to_latex(f'{dataset}_supplementary_table.tex')
    return table

def split_result(x):
    """Extract the mean from a '<mean> +/- <std>' cell; 'NA' is passed through."""
    if x == 'NA':
        return 'NA'
    mean_part, _, _ = x.partition(' +/- ')
    return float(mean_part)
    
def get_supplementary_subtable(dataset, to_latex=False):
    """Return the supplementary table restricted to the best classical method
    plus all neural methods, transposed (losses as rows, methods as columns).

    The best classical method is the one with the lowest mean loss on the most
    per-level columns (the first, overall column is not counted). Ties are
    broken by the lowest overall 'mean_CRPS'. With `to_latex` the result is
    also written to '<dataset>_supplementary_subtable.tex'.
    """
    complete_dataset_df = get_supplementary_table(dataset)
    # numeric view of the table: keep only the mean of each '<mean> +/- <std>' cell
    # NOTE(review): DataFrame.applymap is deprecated in recent pandas releases
    # in favour of DataFrame.map; kept for compatibility with older versions.
    dataset_df = complete_dataset_df.applymap(split_result)

    # count, per classical method, on how many per-level columns it is the best
    count_dict = dict.fromkeys(classical_methods, 0)
    dataset_df_tmp = dataset_df.iloc[:, 1:].loc[dataset_df.index.isin(classical_methods), :]
    if dataset == 'tourismlarge':
        # this row only holds 'NA' placeholders and cannot be ranked
        dataset_df_tmp = dataset_df_tmp.drop('PERMBU_MINT')
    dataset_df_tmp = dataset_df_tmp.astype('float64')
    for col in dataset_df_tmp.columns:
        best_method = dataset_df_tmp.loc[:, col].idxmin()
        count_dict[best_method] += 1

    max_num_levels = max(count_dict.values())
    best_methods = [method for method, num in count_dict.items() if num == max_num_levels]
    if len(best_methods) > 1:
        # The overall column is object-typed whenever the table contains an
        # 'NA' cell, and idxmin is not defined for object dtype, so cast the
        # tied candidates to float first (as done for the per-level frame).
        tied_overall = dataset_df.loc[best_methods, 'mean_CRPS'].astype('float64')
        best_methods = [tied_overall.idxmin()]
    complete_dataset_df = complete_dataset_df.loc[best_methods + neural_methods, :]

    complete_dataset_df = complete_dataset_df.T
    if to_latex:
        complete_dataset_df.to_latex(f'{dataset}_supplementary_subtable.tex')
    return complete_dataset_df

def convert_date(file):
    """Parse the timestamp embedded in a result file name.

    The part after 'run_' (with '.pkl' removed) is expected to read
    '<Y>_<m>_<d>_<H>_<M>_<S>'; it is returned as a datetime.
    """
    stamp = file.split('run_')[1].replace('.pkl', '')
    parts = stamp.split('_')
    day, clock = parts[:3], parts[3:]
    return dt.datetime.strptime(f"{'/'.join(day)} {':'.join(clock)}", '%Y/%m/%d %H:%M:%S')

def calculate_runtimes(folder_name):
    """Return the time in seconds between consecutive result files of a folder.

    The result files are ordered by name, their embedded timestamps are parsed
    with convert_date, and the gap between each file and its predecessor is
    stored. The returned array always has `num_runs - 1` entries; entries for
    which no pair of files exists stay at 0.

    The gaps are measured with timedelta.total_seconds(): the `.seconds`
    attribute used before only holds the part of the gap below one day, so any
    gap of 24 hours or more was under-reported.

    Raises IndexError if the folder holds more than `num_runs` result files.
    """
    runtimes = np.zeros((num_runs - 1,))
    timestamps = [convert_date(file) for file in sorted(select_pickles(folder_name))]
    for i, (previous, current) in enumerate(zip(timestamps, timestamps[1:])):
        runtimes[i] = (current - previous).total_seconds()
    return runtimes

def calculate_average_runtimes(to_latex=False, calc_std_error=False, minutes=False):
    """Tabulate the average runtime per method (rows) and dataset (columns).

    Parameters
    ----------
    to_latex : bool
        Also write the table to 'runtime_table.tex'.
    calc_std_error : bool
        Store '<mean> +/- <std>' strings (rounded to 2 decimals) instead of
        the plain mean.
    minutes : bool
        Report minutes instead of seconds.

    The two flags are honoured independently of each other. Previously they
    only had an effect when both were set; the output for the default call and
    for the call with both flags set is unchanged. The PERMBU_MINT/tourismlarge
    cell is left empty.
    """
    # the inner dict is shared between all keys, which is harmless here because
    # it is only used to shape the (empty) frame and never mutated
    runtime_dict = dict.fromkeys(methods, dict.fromkeys(datasets))
    runtime_df = pd.DataFrame(runtime_dict)

    seconds_per_unit = 60 if minutes else 1

    for method in methods:
        for dataset in datasets:
            folder_name = os.path.join(method, dataset)
            folder_name = correct_folder_names(folder_name)
            if ('PERMBU_MINT' in folder_name) and ('tourismlarge' in folder_name):
                continue
            runtimes = calculate_runtimes(folder_name)
            mean_runtime = runtimes.mean() / seconds_per_unit
            if calc_std_error:
                std_runtime = runtimes.std() / seconds_per_unit
                runtime_df.loc[dataset, method] = str(mean_runtime.round(2)) + ' +/- ' + str(std_runtime.round(2))
            else:
                runtime_df.loc[dataset, method] = mean_runtime

    if to_latex:
        runtime_df.T.to_latex('runtime_table.tex')
    return runtime_df.T

def calculate_total_time():
    """Return the total time per method in hours as a one-column frame.

    The average runtimes of a method are summed over all datasets and
    multiplied by `num_runs`. For the neural methods the time of every
    configuration folder below '<method>_<dataset>' is added on top: the mean
    of its first three runtimes, again multiplied by `num_runs`. The column is
    named 'total_time [h]' and rounded to 2 decimals.
    """
    total_seconds = calculate_average_runtimes().sum(axis=1) * num_runs

    for method in neural_methods:
        for dataset in datasets:
            search_root = method + '_' + dataset
            for entry in os.listdir(search_root):
                # skip text files and notebook checkpoint folders
                if '.txt' in entry or '.ipynb_checkpoints' in entry:
                    continue
                config_dir = os.path.join(search_root, entry)
                total_seconds.loc[method] += calculate_runtimes(config_dir)[: 3].mean() * num_runs

    total_hours = total_seconds.to_frame()
    total_hours.columns = ['total_time [h]']
    return (total_hours / 3600).round(2)

## Comparison of performance of authors' optimal configurations against mine

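**Best configurations** (each pair is *(epochs, context length factor)*; "found" is the best configuration from my own search, "paper" is the one reported by the authors):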

- *DeepVAR*: 
    - traffic: (found: 50, 60) (paper: 25, 40)
    - wiki: (found: 10, 15) (paper: 25, 15)
    - labour: (found: 25, 2) (paper: 50, 3)
    - tourism: (found: 10, 4) (paper: 10, 3)
    - tourismlarge: (found: 10, 2) (paper: 10, 3)
    
    
- *DeepVARPlus*:
    - traffic: (found: 25, 60) (paper: 25, 40)
    - wiki: (found: 10, 15) (paper: 25, 15)
    - labour: (found: 50, 3) (paper: 50, 3)
    - tourism: (found: 10, 4) (paper: 10, 2)
    - tourismlarge: (found: 50, 4) (paper: 25, 3)
    
    
- *HierE2E*:
    - traffic: (found: 50, 60) (paper: 50, 40)
    - wiki: (found: 50, 15) (paper: 50, 15)
    - labour: (found: 50, 2) (paper: 50, 3)
    - tourism: (found: 10, 3) (paper: 10, 3)
    - tourismlarge: (found: 50, 4) (paper: 50, 3)

In [8]:
# For every neural method and dataset, show the reference configuration from
# optimal_hps next to the best configuration found by the grid search.
for method in neural_methods:
    for dataset in datasets:
        examine_hp_search_results(method, dataset, num_runs)

Method HierE2E on dataset labour:

epochs_50_context_len_factor_3:

mean_loss: 0.014300808423089926

loss_std_error: 0.0015778704851996987

Results across 5 runs for this hyperparameter configuration:



Unnamed: 0,run_0,run_1,run_2,run_3,run_4,mean_of_5_runs
mean_wQuantileLoss,0.016347,0.012833,0.012536,0.015963,0.013825,0.014301
level_1_mean_wQuantileLoss,0.013146,0.007103,0.004938,0.012752,0.009868,0.009562
level_2_mean_wQuantileLoss,0.016205,0.012384,0.011673,0.014966,0.01312,0.01367
level_3_mean_wQuantileLoss,0.016418,0.013549,0.013822,0.016075,0.014256,0.014824
level_4_mean_wQuantileLoss,0.019619,0.018294,0.019711,0.020061,0.018056,0.019148





Results across 5 runs for the configuration we found to be the best:



Unnamed: 0,run_0,run_1,run_2,run_3,run_4,mean_of_5_runs
mean_wQuantileLoss,0.013594,0.010111,0.010189,0.011972,0.012797,0.011733
level_1_mean_wQuantileLoss,0.010769,0.005716,0.005274,0.007772,0.009029,0.007712
level_2_mean_wQuantileLoss,0.012021,0.007807,0.009091,0.010097,0.011514,0.010106
level_3_mean_wQuantileLoss,0.012987,0.010313,0.010322,0.01185,0.012422,0.011579
level_4_mean_wQuantileLoss,0.018599,0.016608,0.016069,0.01817,0.018224,0.017534


optimal_epochs for this configuration: 50

optimal_context_len_factor for this configuration: 2


Method HierE2E on dataset traffic:

epochs_50_context_len_factor_40:

mean_loss: 0.04742799064618816

loss_std_error: 0.002065178424655306

Results across 5 runs for this hyperparameter configuration:



Unnamed: 0,run_0,run_1,run_2,run_3,run_4,mean_of_5_runs
mean_wQuantileLoss,0.046812,0.05015,0.044501,0.04936,0.046317,0.047428
level_1_mean_wQuantileLoss,0.018283,0.028854,0.007417,0.020809,0.01652,0.018377
level_2_mean_wQuantileLoss,0.021671,0.024137,0.010803,0.021734,0.019432,0.019555
level_3_mean_wQuantileLoss,0.022901,0.025879,0.018965,0.022109,0.028024,0.023576
level_4_mean_wQuantileLoss,0.124394,0.12173,0.140818,0.132789,0.121292,0.128205





Results across 5 runs for the configuration we found to be the best:



Unnamed: 0,run_0,run_1,run_2,run_3,run_4,mean_of_5_runs
mean_wQuantileLoss,0.039715,0.045269,0.037944,0.043055,0.047821,0.042761
level_1_mean_wQuantileLoss,0.017834,0.011213,0.006453,0.022366,0.028333,0.01724
level_2_mean_wQuantileLoss,0.019216,0.015419,0.011109,0.021,0.026936,0.018736
level_3_mean_wQuantileLoss,0.017718,0.018217,0.012015,0.020936,0.025813,0.01894
level_4_mean_wQuantileLoss,0.104092,0.136227,0.122197,0.107917,0.110204,0.116128


optimal_epochs for this configuration: 50

optimal_context_len_factor for this configuration: 60


Method HierE2E on dataset tourism:

epochs_10_context_len_factor_3:

mean_loss: 0.09910454103029455

loss_std_error: 0.0031105099463855828

Results across 5 runs for this hyperparameter configuration:



Unnamed: 0,run_0,run_1,run_2,run_3,run_4,mean_of_5_runs
mean_wQuantileLoss,0.094799,0.097642,0.101626,0.097893,0.103563,0.099105
level_1_mean_wQuantileLoss,0.035314,0.040423,0.056249,0.043795,0.044476,0.044051
level_2_mean_wQuantileLoss,0.080806,0.087405,0.088572,0.086865,0.090355,0.0868
level_3_mean_wQuantileLoss,0.120507,0.118515,0.120121,0.119914,0.128484,0.121508
level_4_mean_wQuantileLoss,0.14257,0.144226,0.14156,0.140998,0.150937,0.144058





Results across 5 runs for the configuration we found to be the best:



Unnamed: 0,run_0,run_1,run_2,run_3,run_4,mean_of_5_runs
mean_wQuantileLoss,0.094799,0.097642,0.101626,0.097893,0.103563,0.099105
level_1_mean_wQuantileLoss,0.035314,0.040423,0.056249,0.043795,0.044476,0.044051
level_2_mean_wQuantileLoss,0.080806,0.087405,0.088572,0.086865,0.090355,0.0868
level_3_mean_wQuantileLoss,0.120507,0.118515,0.120121,0.119914,0.128484,0.121508
level_4_mean_wQuantileLoss,0.14257,0.144226,0.14156,0.140998,0.150937,0.144058


optimal_epochs for this configuration: 10

optimal_context_len_factor for this configuration: 3


Method HierE2E on dataset tourismlarge:

epochs_50_context_len_factor_3:

mean_loss: 0.3437197247053691

loss_std_error: 0.03442998884689549

Results across 5 runs for this hyperparameter configuration:



Unnamed: 0,run_0,run_1,run_2,run_3,run_4,mean_of_5_runs
mean_wQuantileLoss,0.317791,0.402651,0.306355,0.332184,0.359618,0.34372
level_1_mean_wQuantileLoss,0.211868,0.377659,0.182935,0.209457,0.333892,0.263162
level_2_geo_mean_wQuantileLoss,0.261528,0.37598,0.228817,0.296132,0.331952,0.298882
level_3_geo_mean_wQuantileLoss,0.291956,0.38048,0.27427,0.309838,0.342233,0.319755
level_4_geo_mean_wQuantileLoss,0.345158,0.395364,0.333715,0.350454,0.369968,0.358932
level_2_trav_mean_wQuantileLoss,0.228055,0.37003,0.242021,0.247915,0.294959,0.276596
level_3_trav_mean_wQuantileLoss,0.318664,0.39171,0.312301,0.340553,0.338822,0.34041
level_4_trav_mean_wQuantileLoss,0.394731,0.435388,0.390485,0.408027,0.392118,0.40415
level_5_trav_mean_wQuantileLoss,0.490372,0.494597,0.486293,0.495094,0.472997,0.487871





Results across 5 runs for the configuration we found to be the best:



Unnamed: 0,run_0,run_1,run_2,run_3,run_4,mean_of_5_runs
mean_wQuantileLoss,0.253579,0.189734,0.183214,0.210749,0.205488,0.208553
level_1_mean_wQuantileLoss,0.18279,0.102093,0.107139,0.147698,0.127934,0.133531
level_2_geo_mean_wQuantileLoss,0.20388,0.128386,0.126963,0.167131,0.156043,0.156481
level_3_geo_mean_wQuantileLoss,0.232825,0.171535,0.161721,0.188797,0.184315,0.187839
level_4_geo_mean_wQuantileLoss,0.27772,0.219927,0.202432,0.227496,0.220798,0.229675
level_2_trav_mean_wQuantileLoss,0.195043,0.134551,0.144792,0.158394,0.159187,0.158393
level_3_trav_mean_wQuantileLoss,0.24173,0.178687,0.180192,0.20087,0.200879,0.200472
level_4_trav_mean_wQuantileLoss,0.305628,0.25116,0.237389,0.2616,0.261982,0.263552
level_5_trav_mean_wQuantileLoss,0.389014,0.331536,0.305086,0.334002,0.33277,0.338482


optimal_epochs for this configuration: 50

optimal_context_len_factor for this configuration: 4


Method HierE2E on dataset wiki:

epochs_50_context_len_factor_15:

mean_loss: 0.1629297977755499

loss_std_error: 0.005636358432241553

Results across 5 runs for this hyperparameter configuration:



Unnamed: 0,run_0,run_1,run_2,run_3,run_4,mean_of_5_runs
mean_wQuantileLoss,0.152346,0.166696,0.166175,0.161913,0.167519,0.16293
level_1_mean_wQuantileLoss,0.05994,0.075128,0.065996,0.064342,0.068762,0.066834
level_2_mean_wQuantileLoss,0.108434,0.122134,0.123058,0.116206,0.122407,0.118448
level_3_mean_wQuantileLoss,0.140586,0.159853,0.154944,0.151751,0.160812,0.153589
level_4_mean_wQuantileLoss,0.159291,0.17263,0.173783,0.174147,0.175465,0.171063
level_5_mean_wQuantileLoss,0.293477,0.303737,0.313096,0.303118,0.310151,0.304716





Results across 5 runs for the configuration we found to be the best:



Unnamed: 0,run_0,run_1,run_2,run_3,run_4,mean_of_5_runs
mean_wQuantileLoss,0.152346,0.166696,0.166175,0.161913,0.167519,0.16293
level_1_mean_wQuantileLoss,0.05994,0.075128,0.065996,0.064342,0.068762,0.066834
level_2_mean_wQuantileLoss,0.108434,0.122134,0.123058,0.116206,0.122407,0.118448
level_3_mean_wQuantileLoss,0.140586,0.159853,0.154944,0.151751,0.160812,0.153589
level_4_mean_wQuantileLoss,0.159291,0.17263,0.173783,0.174147,0.175465,0.171063
level_5_mean_wQuantileLoss,0.293477,0.303737,0.313096,0.303118,0.310151,0.304716


optimal_epochs for this configuration: 50

optimal_context_len_factor for this configuration: 15


Method DeepVAR on dataset labour:

epochs_50_context_len_factor_3:

mean_loss: 0.011548858910635323

loss_std_error: 0.0030584552430236362

Results across 5 runs for this hyperparameter configuration:



Unnamed: 0,run_0,run_1,run_2,run_3,run_4,mean_of_5_runs
mean_wQuantileLoss,0.009509,0.010835,0.017487,0.010897,0.009016,0.011549
level_1_mean_wQuantileLoss,0.004359,0.007657,0.020475,0.005981,0.003966,0.008487
level_2_mean_wQuantileLoss,0.006691,0.009864,0.015092,0.010306,0.007452,0.009881
level_3_mean_wQuantileLoss,0.010945,0.010074,0.015179,0.009635,0.009825,0.011132
level_4_mean_wQuantileLoss,0.016041,0.015746,0.019202,0.017665,0.014822,0.016695





Results across 5 runs for the configuration we found to be the best:



Unnamed: 0,run_0,run_1,run_2,run_3,run_4,mean_of_5_runs
mean_wQuantileLoss,0.010073,0.008849,0.00883,0.009336,0.009129,0.009243
level_1_mean_wQuantileLoss,0.004782,0.004903,0.004744,0.004617,0.004237,0.004656
level_2_mean_wQuantileLoss,0.009663,0.007381,0.008194,0.008542,0.008856,0.008527
level_3_mean_wQuantileLoss,0.010778,0.009055,0.00859,0.009501,0.009701,0.009525
level_4_mean_wQuantileLoss,0.015068,0.014056,0.013792,0.014682,0.013721,0.014264


optimal_epochs for this configuration: 25

optimal_context_len_factor for this configuration: 2


Method DeepVAR on dataset traffic:

epochs_25_context_len_factor_40:

mean_loss: 0.05267169566895079

loss_std_error: 0.007235754742415189

Results across 5 runs for this hyperparameter configuration:



Unnamed: 0,run_0,run_1,run_2,run_3,run_4,mean_of_5_runs
mean_wQuantileLoss,0.052198,0.058956,0.062487,0.043508,0.046209,0.052672
level_1_mean_wQuantileLoss,0.018487,0.038146,0.012058,0.016423,0.018355,0.020694
level_2_mean_wQuantileLoss,0.015407,0.027034,0.033125,0.011538,0.011025,0.019626
level_3_mean_wQuantileLoss,0.026748,0.016647,0.048897,0.022865,0.020674,0.027166
level_4_mean_wQuantileLoss,0.148152,0.153997,0.155866,0.123207,0.134782,0.143201





Results across 5 runs for the configuration we found to be the best:



Unnamed: 0,run_0,run_1,run_2,run_3,run_4,mean_of_5_runs
mean_wQuantileLoss,0.044064,0.053324,0.042105,0.049561,0.038154,0.045442
level_1_mean_wQuantileLoss,0.038004,0.031915,0.008191,0.013327,0.013218,0.020931
level_2_mean_wQuantileLoss,0.010964,0.030159,0.014015,0.035775,0.015981,0.021379
level_3_mean_wQuantileLoss,0.021772,0.027639,0.0323,0.025931,0.017526,0.025034
level_4_mean_wQuantileLoss,0.105517,0.123582,0.113915,0.123211,0.105891,0.114423


optimal_epochs for this configuration: 50

optimal_context_len_factor for this configuration: 60


Method DeepVAR on dataset tourism:

epochs_10_context_len_factor_3:

mean_loss: 0.10459290603250262

loss_std_error: 0.0023672514278107004

Results across 5 runs for this hyperparameter configuration:



Unnamed: 0,run_0,run_1,run_2,run_3,run_4,mean_of_5_runs
mean_wQuantileLoss,0.108756,0.105187,0.103911,0.103441,0.10167,0.104593
level_1_mean_wQuantileLoss,0.04636,0.056701,0.04995,0.054228,0.046062,0.05066
level_2_mean_wQuantileLoss,0.100404,0.074869,0.087056,0.087945,0.091635,0.088382
level_3_mean_wQuantileLoss,0.130767,0.130868,0.124941,0.123322,0.122191,0.126418
level_4_mean_wQuantileLoss,0.157491,0.158312,0.153696,0.148268,0.14679,0.152912





Results across 5 runs for the configuration we found to be the best:



Unnamed: 0,run_0,run_1,run_2,run_3,run_4,mean_of_5_runs
mean_wQuantileLoss,0.093323,0.094188,0.098453,0.107639,0.105104,0.099741
level_1_mean_wQuantileLoss,0.04305,0.050961,0.043422,0.05214,0.05336,0.048587
level_2_mean_wQuantileLoss,0.079907,0.073462,0.077583,0.101779,0.093966,0.085339
level_3_mean_wQuantileLoss,0.110021,0.112786,0.123426,0.124005,0.119943,0.118036
level_4_mean_wQuantileLoss,0.140314,0.139545,0.149381,0.152634,0.153145,0.147004


optimal_epochs for this configuration: 10

optimal_context_len_factor for this configuration: 4


Method DeepVAR on dataset tourismlarge:

epochs_10_context_len_factor_3:

mean_loss: 0.15566938858166185

loss_std_error: 0.004186473899370983

Results across 5 runs for this hyperparameter configuration:



Unnamed: 0,run_0,run_1,run_2,run_3,run_4,mean_of_5_runs
mean_wQuantileLoss,0.156711,0.153582,0.156627,0.149339,0.162088,0.155669
level_1_mean_wQuantileLoss,0.097376,0.092374,0.094747,0.079553,0.099604,0.092731
level_2_geo_mean_wQuantileLoss,0.118511,0.107491,0.111901,0.10642,0.123082,0.113481
level_3_geo_mean_wQuantileLoss,0.143466,0.135979,0.137246,0.130309,0.145694,0.138539
level_4_geo_mean_wQuantileLoss,0.178757,0.174166,0.175426,0.172259,0.183829,0.176887
level_2_trav_mean_wQuantileLoss,0.103743,0.108332,0.110831,0.095705,0.116387,0.106999
level_3_trav_mean_wQuantileLoss,0.142487,0.142096,0.148505,0.139429,0.149042,0.144312
level_4_trav_mean_wQuantileLoss,0.199868,0.199553,0.203074,0.199387,0.205418,0.20146
level_5_trav_mean_wQuantileLoss,0.269482,0.268668,0.271284,0.271649,0.273648,0.270946





Results across 5 runs for the configuration we found to be the best:



Unnamed: 0,run_0,run_1,run_2,run_3,run_4,mean_of_5_runs
mean_wQuantileLoss,0.133555,0.139335,0.140325,0.144346,0.149957,0.141504
level_1_mean_wQuantileLoss,0.062633,0.068672,0.070121,0.08549,0.085688,0.074521
level_2_geo_mean_wQuantileLoss,0.078425,0.090297,0.096098,0.094427,0.108039,0.093457
level_3_geo_mean_wQuantileLoss,0.115802,0.123489,0.126426,0.131657,0.134438,0.126363
level_4_geo_mean_wQuantileLoss,0.159623,0.163451,0.163724,0.169668,0.173322,0.165958
level_2_trav_mean_wQuantileLoss,0.078509,0.083949,0.085318,0.0901,0.098571,0.087289
level_3_trav_mean_wQuantileLoss,0.121115,0.126753,0.124967,0.126841,0.133577,0.12665
level_4_trav_mean_wQuantileLoss,0.18836,0.192232,0.190827,0.191833,0.196681,0.191987
level_5_trav_mean_wQuantileLoss,0.26397,0.26584,0.265116,0.264751,0.269341,0.265804


optimal_epochs for this configuration: 10

optimal_context_len_factor for this configuration: 2


Method DeepVAR on dataset wiki:

epochs_25_context_len_factor_15:

mean_loss: 0.19243037821639009

loss_std_error: 0.010899580749596311

Results across 5 runs for this hyperparameter configuration:



Unnamed: 0,run_0,run_1,run_2,run_3,run_4,mean_of_5_runs
mean_wQuantileLoss,0.203757,0.180093,0.192357,0.20548,0.180464,0.19243
level_1_mean_wQuantileLoss,0.184709,0.149912,0.20893,0.204792,0.110393,0.171747
level_2_mean_wQuantileLoss,0.184575,0.147806,0.144623,0.185714,0.178346,0.168213
level_3_mean_wQuantileLoss,0.198579,0.179111,0.176313,0.190427,0.172598,0.183405
level_4_mean_wQuantileLoss,0.208539,0.190666,0.195928,0.200505,0.182479,0.195623
level_5_mean_wQuantileLoss,0.242384,0.23297,0.235992,0.245962,0.258506,0.243163





Results across 5 runs for the configuration we found to be the best:



Unnamed: 0,run_0,run_1,run_2,run_3,run_4,mean_of_5_runs
mean_wQuantileLoss,0.147886,0.144989,0.154595,0.149937,0.142461,0.147974
level_1_mean_wQuantileLoss,0.074003,0.081025,0.084903,0.106296,0.067567,0.082759
level_2_mean_wQuantileLoss,0.129792,0.102511,0.121655,0.116853,0.127902,0.119743
level_3_mean_wQuantileLoss,0.145069,0.156839,0.162759,0.1415,0.156126,0.152459
level_4_mean_wQuantileLoss,0.149672,0.160402,0.161443,0.149057,0.146531,0.153421
level_5_mean_wQuantileLoss,0.240893,0.224171,0.242212,0.23598,0.214178,0.231487


optimal_epochs for this configuration: 10

optimal_context_len_factor for this configuration: 15


Method DeepVARPlus on dataset labour:

epochs_50_context_len_factor_3:

mean_loss: 0.011223655759198953

loss_std_error: 0.0025316868382062327

Results across 5 runs for this hyperparameter configuration:



Unnamed: 0,run_0,run_1,run_2,run_3,run_4,mean_of_5_runs
mean_wQuantileLoss,0.009654,0.015768,0.012047,0.010048,0.008602,0.011224
level_1_mean_wQuantileLoss,0.006146,0.013804,0.007655,0.006131,0.003836,0.007514
level_2_mean_wQuantileLoss,0.009008,0.014496,0.010138,0.008496,0.007524,0.009933
level_3_mean_wQuantileLoss,0.010068,0.015356,0.011906,0.01002,0.008854,0.011241
level_4_mean_wQuantileLoss,0.013392,0.019415,0.018489,0.015545,0.014195,0.016207





Results across 5 runs for the configuration we found to be the best:



Unnamed: 0,run_0,run_1,run_2,run_3,run_4,mean_of_5_runs
mean_wQuantileLoss,0.009654,0.015768,0.012047,0.010048,0.008602,0.011224
level_1_mean_wQuantileLoss,0.006146,0.013804,0.007655,0.006131,0.003836,0.007514
level_2_mean_wQuantileLoss,0.009008,0.014496,0.010138,0.008496,0.007524,0.009933
level_3_mean_wQuantileLoss,0.010068,0.015356,0.011906,0.01002,0.008854,0.011241
level_4_mean_wQuantileLoss,0.013392,0.019415,0.018489,0.015545,0.014195,0.016207


optimal_epochs for this configuration: 50

optimal_context_len_factor for this configuration: 3


Method DeepVARPlus on dataset traffic:

epochs_25_context_len_factor_40:

mean_loss: 0.04928521227617485

loss_std_error: 0.007431496895209996

Results across 5 runs for this hyperparameter configuration:



Unnamed: 0,run_0,run_1,run_2,run_3,run_4,mean_of_5_runs
mean_wQuantileLoss,0.039664,0.055673,0.042006,0.050386,0.058697,0.049285
level_1_mean_wQuantileLoss,0.009666,0.026317,0.006048,0.014762,0.033594,0.018077
level_2_mean_wQuantileLoss,0.010006,0.02937,0.013118,0.019847,0.031914,0.020851
level_3_mean_wQuantileLoss,0.014129,0.028525,0.016005,0.021611,0.032288,0.022512
level_4_mean_wQuantileLoss,0.124856,0.13848,0.132854,0.145323,0.13699,0.135701





Results across 5 runs for the configuration we found to be the best:



Unnamed: 0,run_0,run_1,run_2,run_3,run_4,mean_of_5_runs
mean_wQuantileLoss,0.041408,0.043333,0.038767,0.041181,0.041011,0.04114
level_1_mean_wQuantileLoss,0.014176,0.007857,0.003607,0.005604,0.013747,0.008998
level_2_mean_wQuantileLoss,0.013751,0.019948,0.006891,0.010094,0.014098,0.012956
level_3_mean_wQuantileLoss,0.015288,0.022415,0.026117,0.013705,0.016581,0.018821
level_4_mean_wQuantileLoss,0.122417,0.12311,0.118452,0.135321,0.119618,0.123784


optimal_epochs for this configuration: 25

optimal_context_len_factor for this configuration: 60


Method DeepVARPlus on dataset tourism:

epochs_10_context_len_factor_2:

mean_loss: 0.13189371295157248

loss_std_error: 0.02647111534991346

Results across 5 runs for this hyperparameter configuration:



Unnamed: 0,run_0,run_1,run_2,run_3,run_4,mean_of_5_runs
mean_wQuantileLoss,0.119256,0.132232,0.1826,0.116878,0.108502,0.131894
level_1_mean_wQuantileLoss,0.063189,0.069646,0.081338,0.061887,0.055955,0.066403
level_2_mean_wQuantileLoss,0.110691,0.121427,0.193129,0.106553,0.088116,0.123983
level_3_mean_wQuantileLoss,0.140196,0.159628,0.216305,0.138387,0.12921,0.156745
level_4_mean_wQuantileLoss,0.162949,0.178228,0.239629,0.160684,0.160728,0.180444





Results across 5 runs for the configuration we found to be the best:



Unnamed: 0,run_0,run_1,run_2,run_3,run_4,mean_of_5_runs
mean_wQuantileLoss,0.110683,0.092055,0.119135,0.104571,0.098634,0.105015
level_1_mean_wQuantileLoss,0.044764,0.040849,0.071525,0.05561,0.039491,0.050448
level_2_mean_wQuantileLoss,0.106311,0.074649,0.102562,0.080044,0.080792,0.088872
level_3_mean_wQuantileLoss,0.135716,0.111301,0.135654,0.129066,0.121528,0.126653
level_4_mean_wQuantileLoss,0.15594,0.141419,0.166797,0.153563,0.152727,0.154089


optimal_epochs for this configuration: 10

optimal_context_len_factor for this configuration: 4


Method DeepVARPlus on dataset tourismlarge:

epochs_25_context_len_factor_3:

mean_loss: 0.30731205965570135

loss_std_error: 0.030361293604412216

Results across 5 runs for this hyperparameter configuration:



Unnamed: 0,run_0,run_1,run_2,run_3,run_4,mean_of_5_runs
mean_wQuantileLoss,0.350208,0.276572,0.302138,0.274048,0.333594,0.307312
level_1_mean_wQuantileLoss,0.289516,0.200942,0.237206,0.231017,0.295522,0.250841
level_2_geo_mean_wQuantileLoss,0.292091,0.2209,0.254676,0.232718,0.299826,0.260042
level_3_geo_mean_wQuantileLoss,0.311764,0.249584,0.276073,0.254409,0.305771,0.27952
level_4_geo_mean_wQuantileLoss,0.358436,0.292238,0.30988,0.275884,0.339696,0.315227
level_2_trav_mean_wQuantileLoss,0.29364,0.241268,0.256697,0.234321,0.284745,0.262134
level_3_trav_mean_wQuantileLoss,0.337894,0.272773,0.296184,0.253402,0.317451,0.295541
level_4_trav_mean_wQuantileLoss,0.408818,0.328446,0.35764,0.320631,0.372636,0.357634
level_5_trav_mean_wQuantileLoss,0.509505,0.406421,0.428751,0.390003,0.453105,0.437557





Results across 5 runs for the configuration we found to be the best:



Unnamed: 0,run_0,run_1,run_2,run_3,run_4,mean_of_5_runs
mean_wQuantileLoss,0.279087,0.254442,0.24551,0.26344,0.27071,0.262638
level_1_mean_wQuantileLoss,0.213679,0.212013,0.177932,0.211078,0.229099,0.20876
level_2_geo_mean_wQuantileLoss,0.21793,0.209942,0.181102,0.215743,0.234699,0.211883
level_3_geo_mean_wQuantileLoss,0.244855,0.227269,0.213868,0.24052,0.245907,0.234484
level_4_geo_mean_wQuantileLoss,0.289583,0.265078,0.255499,0.274442,0.276106,0.272142
level_2_trav_mean_wQuantileLoss,0.228114,0.20556,0.205644,0.232047,0.227354,0.219744
level_3_trav_mean_wQuantileLoss,0.265111,0.235937,0.243823,0.254208,0.254735,0.250763
level_4_trav_mean_wQuantileLoss,0.343102,0.298055,0.30351,0.303811,0.309204,0.311536
level_5_trav_mean_wQuantileLoss,0.430322,0.38168,0.382699,0.375672,0.388574,0.391789


optimal_epochs for this configuration: 50

optimal_context_len_factor for this configuration: 4


Method DeepVARPlus on dataset wiki:

epochs_25_context_len_factor_15:

mean_loss: 0.2377701548073529

loss_std_error: 0.013063235740950275

Results across 5 runs for this hyperparameter configuration:



Unnamed: 0,run_0,run_1,run_2,run_3,run_4,mean_of_5_runs
mean_wQuantileLoss,0.224226,0.251645,0.251481,0.220984,0.240516,0.23777
level_1_mean_wQuantileLoss,0.118369,0.147537,0.130533,0.118306,0.145122,0.131974
level_2_mean_wQuantileLoss,0.161302,0.212805,0.185371,0.166594,0.192328,0.18368
level_3_mean_wQuantileLoss,0.216701,0.23541,0.254482,0.199255,0.229402,0.22705
level_4_mean_wQuantileLoss,0.247812,0.271763,0.286035,0.231376,0.257135,0.258824
level_5_mean_wQuantileLoss,0.376945,0.390707,0.400981,0.389389,0.378593,0.387323





Results across 5 runs for the configuration we found to be the best:



Unnamed: 0,run_0,run_1,run_2,run_3,run_4,mean_of_5_runs
mean_wQuantileLoss,0.165791,0.179253,0.15962,0.140375,0.168302,0.162668
level_1_mean_wQuantileLoss,0.073595,0.070772,0.069625,0.055193,0.062647,0.066367
level_2_mean_wQuantileLoss,0.123042,0.141984,0.111266,0.092725,0.116686,0.11714
level_3_mean_wQuantileLoss,0.158032,0.173699,0.154496,0.125709,0.157759,0.153939
level_4_mean_wQuantileLoss,0.179037,0.196572,0.174403,0.146643,0.183262,0.175983
level_5_mean_wQuantileLoss,0.295251,0.31324,0.28831,0.281605,0.321156,0.299912


optimal_epochs for this configuration: 10

optimal_context_len_factor for this configuration: 15




## Table 4 from the Reproducibility Report

In [9]:
# Build the method-by-dataset summary table, then render it in the notebook.
# NOTE(review): to_latex=False presumably returns the table object rather than
# LaTeX source -- confirm against the definition of create_paper_table_1.
paper_table = create_paper_table_1(to_latex=False)
display(paper_table)

Adding results for method ARIMA_NaiveBU on dataset labour..

Adding results for method ARIMA_NaiveBU on dataset traffic..

Adding results for method ARIMA_NaiveBU on dataset tourism..

Adding results for method ARIMA_NaiveBU on dataset tourismlarge..

Adding results for method ARIMA_NaiveBU on dataset wiki..

Adding results for method ETS_NaiveBU on dataset labour..

Adding results for method ETS_NaiveBU on dataset traffic..

Adding results for method ETS_NaiveBU on dataset tourism..

Adding results for method ETS_NaiveBU on dataset tourismlarge..

Adding results for method ETS_NaiveBU on dataset wiki..

Adding results for method ARIMA_MINT_shr on dataset labour..

Adding results for method ARIMA_MINT_shr on dataset traffic..

Adding results for method ARIMA_MINT_shr on dataset tourism..

Adding results for method ARIMA_MINT_shr on dataset tourismlarge..

Adding results for method ARIMA_MINT_shr on dataset wiki..

Adding results for method ARIMA_MINT_ols on dataset labour..

Adding res

Unnamed: 0,labour,traffic,tourism,tourismlarge,wiki
ARIMA_NaiveBU,0.0453 +/- 0.0,0.0753 +/- 0.0,0.1138 +/- 0.0,0.1752 +/- 0.0,0.3776 +/- 0.0
ETS_NaiveBU,0.0432 +/- 0.0,0.0665 +/- 0.0,0.1008 +/- 0.0,0.169 +/- 0.0,0.4673 +/- 0.0
ARIMA_MINT_shr,0.0467 +/- 0.0,0.0775 +/- 0.0,0.1171 +/- 0.0,0.1615 +/- 0.0,0.2466 +/- 0.0
ARIMA_MINT_ols,0.0463 +/- 0.0,0.1123 +/- 0.0,0.1195 +/- 0.0,0.1731 +/- 0.0,0.2782 +/- 0.0
ETS_MINT_shr,0.0455 +/- 0.0,0.0963 +/- 0.0,0.1013 +/- 0.0,0.1627 +/- 0.0,0.3622 +/- 0.0
ETS_MINT_ols,0.0459 +/- 0.0,0.111 +/- 0.0,0.1002 +/- 0.0,0.1668 +/- 0.0,0.2702 +/- 0.0
ARIMA_ERM,0.0399 +/- 0.0,0.0466 +/- 0.0,0.5885 +/- 0.0,0.5668 +/- 0.0,0.2195 +/- 0.0
ETS_ERM,0.0456 +/- 0.0,0.1027 +/- 0.0,2.3742 +/- 0.0,0.508 +/- 0.0,0.2217 +/- 0.0
PERMBU_MINT,0.0393 +/- 0.0002,0.0679 +/- 0.0047,0.0763 +/- 0.0003,,0.279 +/- 0.02
HierE2E,0.0335 +/- 0.0064,0.0359 +/- 0.0114,0.0916 +/- 0.0082,0.1688 +/- 0.0036,0.1629 +/- 0.0056


## Table 5 from the Reproducibility Report (split into per-dataset subtables)

In [10]:
# Emit one sub-table per dataset, each preceded by a printed caption.
# The caption is printed before the table is built so any output produced
# while building it appears after the caption.
subtable_datasets = ('tourism', 'tourismlarge', 'labour', 'traffic', 'wiki')
for dataset in subtable_datasets:
    caption = f'\n\nThis is a table displaying the CRPS score across levels\non the dataset {dataset} for the optimal configuration of\nHierE2E, DeepVAR, DeepVAR+ and the best classical machine learning approach: \n'
    print(caption)
    subtable = get_supplementary_subtable(dataset, to_latex=False)
    display(subtable)



This is a table displaying the CRPS score across levels
on the dataset tourism for the optimal configuration of
HierE2E, DeepVAR, DeepVAR+ and the best classical machine learning approach: 



Unnamed: 0,PERMBU_MINT,HierE2E,DeepVAR,DeepVARPlus
mean_CRPS,0.0763 +/- 0.0003,0.0916 +/- 0.0091,0.0953 +/- 0.0062,0.0956 +/- 0.018
level_1_mean_CRPS,0.0464 +/- 0.0017,0.051 +/- 0.0099,0.0531 +/- 0.012,0.0509 +/- 0.019
level_2_mean_CRPS,0.0592 +/- 0.0008,0.0765 +/- 0.0113,0.0827 +/- 0.0091,0.0776 +/- 0.0216
level_3_mean_CRPS,0.0899 +/- 0.0011,0.1104 +/- 0.008,0.112 +/- 0.0086,0.1148 +/- 0.018
level_4_mean_CRPS,0.1097 +/- 0.0009,0.1286 +/- 0.0079,0.1333 +/- 0.0062,0.139 +/- 0.0152




This is a table displaying the CRPS score across levels
on the dataset tourismlarge for the optimal configuration of
HierE2E, DeepVAR, DeepVAR+ and the best classical machine learning approach: 



Unnamed: 0,ARIMA_MINT_shr,HierE2E,DeepVAR,DeepVARPlus
mean_CRPS,0.1615 +/- 0.0,0.1688 +/- 0.004,0.1394 +/- 0.0021,0.1979 +/- 0.0294
level_1_mean_CRPS,0.0443 +/- 0.0,0.0959 +/- 0.0105,0.0634 +/- 0.005,0.1234 +/- 0.043
level_2_geo_mean_CRPS,0.0826 +/- 0.0,0.1161 +/- 0.0063,0.0814 +/- 0.0029,0.1417 +/- 0.0351
level_3_geo_mean_CRPS,0.1439 +/- 0.0,0.1503 +/- 0.0053,0.1216 +/- 0.003,0.1775 +/- 0.0304
level_4_geo_mean_CRPS,0.2042 +/- 0.0,0.1901 +/- 0.0045,0.1629 +/- 0.0017,0.218 +/- 0.0263
level_2_trav_mean_CRPS,0.0834 +/- 0.0,0.1209 +/- 0.0039,0.0891 +/- 0.0087,0.1464 +/- 0.0331
level_3_trav_mean_CRPS,0.1485 +/- 0.0,0.1619 +/- 0.0044,0.1302 +/- 0.004,0.1895 +/- 0.0259
level_4_trav_mean_CRPS,0.244 +/- 0.0,0.2242 +/- 0.0044,0.1979 +/- 0.0012,0.2556 +/- 0.0234
level_5_trav_mean_CRPS,0.3413 +/- 0.0,0.2913 +/- 0.0053,0.2684 +/- 0.0026,0.3314 +/- 0.0245




This is a table displaying the CRPS score across levels
on the dataset labour for the optimal configuration of
HierE2E, DeepVAR, DeepVAR+ and the best classical machine learning approach: 



Unnamed: 0,PERMBU_MINT,HierE2E,DeepVAR,DeepVARPlus
mean_CRPS,0.0393 +/- 0.0003,0.0335 +/- 0.0072,0.0367 +/- 0.0055,0.0457 +/- 0.013
level_1_mean_CRPS,0.0406 +/- 0.0004,0.0302 +/- 0.0093,0.0342 +/- 0.005,0.0445 +/- 0.016
level_2_mean_CRPS,0.0388 +/- 0.0003,0.0342 +/- 0.0071,0.0362 +/- 0.0059,0.0461 +/- 0.013
level_3_mean_CRPS,0.0382 +/- 0.0002,0.0335 +/- 0.0066,0.0362 +/- 0.0056,0.0456 +/- 0.0125
level_4_mean_CRPS,0.0396 +/- 0.0003,0.0361 +/- 0.0058,0.0403 +/- 0.0067,0.0466 +/- 0.0106




This is a table displaying the CRPS score across levels
on the dataset traffic for the optimal configuration of
HierE2E, DeepVAR, DeepVAR+ and the best classical machine learning approach: 



Unnamed: 0,ARIMA_ERM,HierE2E,DeepVAR,DeepVARPlus
mean_CRPS,0.0466 +/- 0.0,0.0359 +/- 0.0127,0.0334 +/- 0.0036,0.0366 +/- 0.0088
level_1_mean_CRPS,0.0089 +/- 0.0,0.0166 +/- 0.017,0.0131 +/- 0.0058,0.013 +/- 0.0081
level_2_mean_CRPS,0.0113 +/- 0.0,0.0178 +/- 0.0159,0.0174 +/- 0.0121,0.0158 +/- 0.008
level_3_mean_CRPS,0.0254 +/- 0.0,0.0186 +/- 0.0154,0.0198 +/- 0.0086,0.0209 +/- 0.0124
level_4_mean_CRPS,0.1408 +/- 0.0,0.0905 +/- 0.0061,0.0835 +/- 0.0027,0.0969 +/- 0.0096




This is a table displaying the CRPS score across levels
on the dataset wiki for the optimal configuration of
HierE2E, DeepVAR, DeepVAR+ and the best classical machine learning approach: 



Unnamed: 0,ETS_ERM,HierE2E,DeepVAR,DeepVARPlus
mean_CRPS,0.2217 +/- 0.0,0.1629 +/- 0.0063,0.2081 +/- 0.0067,0.2053 +/- 0.0146
level_1_mean_CRPS,0.1558 +/- 0.0,0.0668 +/- 0.0056,0.0751 +/- 0.0153,0.0523 +/- 0.0158
level_2_mean_CRPS,0.1614 +/- 0.0,0.1184 +/- 0.0062,0.1199 +/- 0.0143,0.1053 +/- 0.009
level_3_mean_CRPS,0.201 +/- 0.0,0.1536 +/- 0.0082,0.2238 +/- 0.0074,0.2076 +/- 0.0187
level_4_mean_CRPS,0.2399 +/- 0.0,0.1711 +/- 0.0067,0.2555 +/- 0.0109,0.2567 +/- 0.0205
level_5_mean_CRPS,0.3506 +/- 0.0,0.3047 +/- 0.0076,0.3663 +/- 0.0047,0.4047 +/- 0.0223


## Supplementary material tables

In [11]:
# Emit one per-level CRPS table per dataset, each preceded by a printed caption.
# The caption is printed before the table is built so any output produced
# while building it appears after the caption.
supplementary_datasets = ('tourism', 'tourismlarge', 'labour', 'traffic', 'wiki')
for dataset in supplementary_datasets:
    caption = f'\n\nThis is a table displaying the CRPS score across levels \non the dataset {dataset} for the optimal configuration of each method: \n'
    print(caption)
    supplementary_table = get_supplementary_table(dataset, to_latex=False)
    display(supplementary_table)



This is a table displaying the CRPS score across levels 
on the dataset tourism for the optimal configuration of each method: 



Unnamed: 0,mean_CRPS,level_1_mean_CRPS,level_2_mean_CRPS,level_3_mean_CRPS,level_4_mean_CRPS
ARIMA_NaiveBU,0.1138 +/- 0.0,0.0588 +/- 0.0,0.0945 +/- 0.0,0.1366 +/- 0.0,0.1653 +/- 0.0
ETS_NaiveBU,0.1008 +/- 0.0,0.0545 +/- 0.0,0.0809 +/- 0.0,0.1194 +/- 0.0,0.1483 +/- 0.0
ARIMA_MINT_shr,0.1171 +/- 0.0,0.0625 +/- 0.0,0.0989 +/- 0.0,0.1395 +/- 0.0,0.1677 +/- 0.0
ARIMA_MINT_ols,0.1195 +/- 0.0,0.0619 +/- 0.0,0.1018 +/- 0.0,0.1419 +/- 0.0,0.1723 +/- 0.0
ETS_MINT_shr,0.1013 +/- 0.0,0.0592 +/- 0.0,0.0793 +/- 0.0,0.1202 +/- 0.0,0.1467 +/- 0.0
ETS_MINT_ols,0.1002 +/- 0.0,0.0597 +/- 0.0,0.0749 +/- 0.0,0.1201 +/- 0.0,0.1462 +/- 0.0
ARIMA_ERM,0.5885 +/- 0.0,0.2196 +/- 0.0,0.3903 +/- 0.0,0.812 +/- 0.0,0.9322 +/- 0.0
ETS_ERM,2.3742 +/- 0.0,1.4383 +/- 0.0,1.9934 +/- 0.0,2.8479 +/- 0.0,3.2173 +/- 0.0
PERMBU_MINT,0.0763 +/- 0.0003,0.0464 +/- 0.0017,0.0592 +/- 0.0008,0.0899 +/- 0.0011,0.1097 +/- 0.0009
HierE2E,0.0916 +/- 0.0091,0.051 +/- 0.0099,0.0765 +/- 0.0113,0.1104 +/- 0.008,0.1286 +/- 0.0079




This is a table displaying the CRPS score across levels 
on the dataset tourismlarge for the optimal configuration of each method: 



Unnamed: 0,mean_CRPS,level_1_mean_CRPS,level_2_geo_mean_CRPS,level_3_geo_mean_CRPS,level_4_geo_mean_CRPS,level_2_trav_mean_CRPS,level_3_trav_mean_CRPS,level_4_trav_mean_CRPS,level_5_trav_mean_CRPS
ARIMA_NaiveBU,0.1752 +/- 0.0,0.0827 +/- 0.0,0.1035 +/- 0.0,0.1586 +/- 0.0,0.2131 +/- 0.0,0.1003 +/- 0.0,0.1567 +/- 0.0,0.2489 +/- 0.0,0.3379 +/- 0.0
ETS_NaiveBU,0.169 +/- 0.0,0.0802 +/- 0.0,0.0989 +/- 0.0,0.1561 +/- 0.0,0.2058 +/- 0.0,0.0927 +/- 0.0,0.1484 +/- 0.0,0.2408 +/- 0.0,0.3291 +/- 0.0
ARIMA_MINT_shr,0.1615 +/- 0.0,0.0443 +/- 0.0,0.0826 +/- 0.0,0.1439 +/- 0.0,0.2042 +/- 0.0,0.0834 +/- 0.0,0.1485 +/- 0.0,0.244 +/- 0.0,0.3413 +/- 0.0
ARIMA_MINT_ols,0.1731 +/- 0.0,0.0394 +/- 0.0,0.083 +/- 0.0,0.1501 +/- 0.0,0.2169 +/- 0.0,0.1056 +/- 0.0,0.1646 +/- 0.0,0.261 +/- 0.0,0.3643 +/- 0.0
ETS_MINT_shr,0.1627 +/- 0.0,0.0505 +/- 0.0,0.0902 +/- 0.0,0.1501 +/- 0.0,0.2024 +/- 0.0,0.089 +/- 0.0,0.1439 +/- 0.0,0.2415 +/- 0.0,0.3343 +/- 0.0
ETS_MINT_ols,0.1668 +/- 0.0,0.0484 +/- 0.0,0.0897 +/- 0.0,0.1542 +/- 0.0,0.2102 +/- 0.0,0.0891 +/- 0.0,0.1455 +/- 0.0,0.2499 +/- 0.0,0.3473 +/- 0.0
ARIMA_ERM,0.5668 +/- 0.0,0.2577 +/- 0.0,0.3791 +/- 0.0,0.4974 +/- 0.0,0.638 +/- 0.0,0.366 +/- 0.0,0.5402 +/- 0.0,0.8013 +/- 0.0,1.0551 +/- 0.0
ETS_ERM,0.508 +/- 0.0,0.1161 +/- 0.0,0.3231 +/- 0.0,0.4684 +/- 0.0,0.6143 +/- 0.0,0.2622 +/- 0.0,0.4853 +/- 0.0,0.7741 +/- 0.0,1.0209 +/- 0.0
PERMBU_MINT,,,,,,,,,
HierE2E,0.1688 +/- 0.004,0.0959 +/- 0.0105,0.1161 +/- 0.0063,0.1503 +/- 0.0053,0.1901 +/- 0.0045,0.1209 +/- 0.0039,0.1619 +/- 0.0044,0.2242 +/- 0.0044,0.2913 +/- 0.0053




This is a table displaying the CRPS score across levels 
on the dataset labour for the optimal configuration of each method: 



Unnamed: 0,mean_CRPS,level_1_mean_CRPS,level_2_mean_CRPS,level_3_mean_CRPS,level_4_mean_CRPS
ARIMA_NaiveBU,0.0453 +/- 0.0,0.0437 +/- 0.0,0.0441 +/- 0.0,0.0447 +/- 0.0,0.0489 +/- 0.0
ETS_NaiveBU,0.0432 +/- 0.0,0.0416 +/- 0.0,0.0418 +/- 0.0,0.0421 +/- 0.0,0.0471 +/- 0.0
ARIMA_MINT_shr,0.0467 +/- 0.0,0.0454 +/- 0.0,0.0455 +/- 0.0,0.0459 +/- 0.0,0.0499 +/- 0.0
ARIMA_MINT_ols,0.0463 +/- 0.0,0.0448 +/- 0.0,0.045 +/- 0.0,0.0455 +/- 0.0,0.0499 +/- 0.0
ETS_MINT_shr,0.0455 +/- 0.0,0.044 +/- 0.0,0.0442 +/- 0.0,0.0444 +/- 0.0,0.0492 +/- 0.0
ETS_MINT_ols,0.0459 +/- 0.0,0.0445 +/- 0.0,0.0447 +/- 0.0,0.0448 +/- 0.0,0.0495 +/- 0.0
ARIMA_ERM,0.0399 +/- 0.0,0.0365 +/- 0.0,0.0379 +/- 0.0,0.0391 +/- 0.0,0.0459 +/- 0.0
ETS_ERM,0.0456 +/- 0.0,0.0409 +/- 0.0,0.0437 +/- 0.0,0.0452 +/- 0.0,0.0525 +/- 0.0
PERMBU_MINT,0.0393 +/- 0.0003,0.0406 +/- 0.0004,0.0388 +/- 0.0003,0.0382 +/- 0.0002,0.0396 +/- 0.0003
HierE2E,0.0335 +/- 0.0072,0.0302 +/- 0.0093,0.0342 +/- 0.0071,0.0335 +/- 0.0066,0.0361 +/- 0.0058




This is a table displaying the CRPS score across levels 
on the dataset traffic for the optimal configuration of each method: 



Unnamed: 0,mean_CRPS,level_1_mean_CRPS,level_2_mean_CRPS,level_3_mean_CRPS,level_4_mean_CRPS
ARIMA_NaiveBU,0.0753 +/- 0.0,0.0364 +/- 0.0,0.0364 +/- 0.0,0.0453 +/- 0.0,0.1832 +/- 0.0
ETS_NaiveBU,0.0665 +/- 0.0,0.0128 +/- 0.0,0.0128 +/- 0.0,0.0351 +/- 0.0,0.2053 +/- 0.0
ARIMA_MINT_shr,0.0775 +/- 0.0,0.0467 +/- 0.0,0.0467 +/- 0.0,0.0467 +/- 0.0,0.1701 +/- 0.0
ARIMA_MINT_ols,0.1123 +/- 0.0,0.0853 +/- 0.0,0.0853 +/- 0.0,0.0853 +/- 0.0,0.1934 +/- 0.0
ETS_MINT_shr,0.0963 +/- 0.0,0.0601 +/- 0.0,0.0601 +/- 0.0,0.0601 +/- 0.0,0.205 +/- 0.0
ETS_MINT_ols,0.111 +/- 0.0,0.0765 +/- 0.0,0.0765 +/- 0.0,0.0765 +/- 0.0,0.2145 +/- 0.0
ARIMA_ERM,0.0466 +/- 0.0,0.0089 +/- 0.0,0.0113 +/- 0.0,0.0254 +/- 0.0,0.1408 +/- 0.0
ETS_ERM,0.1027 +/- 0.0,0.0828 +/- 0.0,0.0828 +/- 0.0,0.0828 +/- 0.0,0.1624 +/- 0.0
PERMBU_MINT,0.0679 +/- 0.0053,0.0346 +/- 0.0072,0.0354 +/- 0.0058,0.0419 +/- 0.0044,0.1598 +/- 0.0042
HierE2E,0.0359 +/- 0.0127,0.0166 +/- 0.017,0.0178 +/- 0.0159,0.0186 +/- 0.0154,0.0905 +/- 0.0061




This is a table displaying the CRPS score across levels 
on the dataset wiki for the optimal configuration of each method: 



Unnamed: 0,mean_CRPS,level_1_mean_CRPS,level_2_mean_CRPS,level_3_mean_CRPS,level_4_mean_CRPS,level_5_mean_CRPS
ARIMA_NaiveBU,0.3776 +/- 0.0,0.1904 +/- 0.0,0.2797 +/- 0.0,0.4118 +/- 0.0,0.4124 +/- 0.0,0.5936 +/- 0.0
ETS_NaiveBU,0.4673 +/- 0.0,0.341 +/- 0.0,0.3863 +/- 0.0,0.4631 +/- 0.0,0.5051 +/- 0.0,0.641 +/- 0.0
ARIMA_MINT_shr,0.2466 +/- 0.0,0.08 +/- 0.0,0.1382 +/- 0.0,0.2559 +/- 0.0,0.2953 +/- 0.0,0.4638 +/- 0.0
ARIMA_MINT_ols,0.2782 +/- 0.0,0.1079 +/- 0.0,0.1743 +/- 0.0,0.2857 +/- 0.0,0.3253 +/- 0.0,0.4977 +/- 0.0
ETS_MINT_shr,0.3622 +/- 0.0,0.218 +/- 0.0,0.2666 +/- 0.0,0.3451 +/- 0.0,0.388 +/- 0.0,0.5936 +/- 0.0
ETS_MINT_ols,0.2702 +/- 0.0,0.0234 +/- 0.0,0.1456 +/- 0.0,0.2616 +/- 0.0,0.3138 +/- 0.0,0.6065 +/- 0.0
ARIMA_ERM,0.2195 +/- 0.0,0.0776 +/- 0.0,0.1213 +/- 0.0,0.2325 +/- 0.0,0.2746 +/- 0.0,0.3913 +/- 0.0
ETS_ERM,0.2217 +/- 0.0,0.1558 +/- 0.0,0.1614 +/- 0.0,0.201 +/- 0.0,0.2399 +/- 0.0,0.3506 +/- 0.0
PERMBU_MINT,0.279 +/- 0.0223,0.094 +/- 0.0394,0.1599 +/- 0.0248,0.2689 +/- 0.0293,0.3056 +/- 0.0305,0.5666 +/- 0.0589
HierE2E,0.1629 +/- 0.0063,0.0668 +/- 0.0056,0.1184 +/- 0.0062,0.1536 +/- 0.0082,0.1711 +/- 0.0067,0.3047 +/- 0.0076


## Calculation of average runtime of optimal configurations

In [18]:
# Average runtime per method and dataset for the optimal configurations.
# NOTE(review): with minutes=True and calc_std_error=True the rendered cells
# look like "<minutes> +/- <std error>" -- confirm against
# calculate_average_runtimes.
runtime_df = calculate_average_runtimes(
    to_latex=False,
    calc_std_error=True,
    minutes=True,
)
# Bare trailing expression so the frame is rendered as the cell's output.
runtime_df

Unnamed: 0,labour,traffic,tourism,tourismlarge,wiki
ARIMA_NaiveBU,2.61 +/- 0.23,9.43 +/- 0.14,0.35 +/- 0.01,10.57 +/- 0.63,2.26 +/- 0.16
ETS_NaiveBU,1.73 +/- 0.38,3.42 +/- 0.17,0.32 +/- 0.01,4.11 +/- 0.05,2.11 +/- 0.14
ARIMA_MINT_shr,3.07 +/- 0.35,13.18 +/- 5.18,0.45 +/- 0.01,29.48 +/- 0.21,2.47 +/- 0.02
ARIMA_MINT_ols,2.69 +/- 0.35,9.71 +/- 0.05,0.47 +/- 0.01,29.51 +/- 0.23,2.46 +/- 0.02
ETS_MINT_shr,2.28 +/- 0.11,3.42 +/- 0.01,0.43 +/- 0.02,8.44 +/- 0.19,2.54 +/- 0.07
ETS_MINT_ols,2.55 +/- 0.02,3.38 +/- 0.01,0.42 +/- 0.03,8.21 +/- 0.01,2.46 +/- 0.04
ARIMA_ERM,24.8 +/- 0.06,13.32 +/- 0.44,6.97 +/- 0.69,435.63 +/- 39.84,3.54 +/- 0.1
ETS_ERM,16.25 +/- 0.15,4.27 +/- 0.22,5.91 +/- 0.12,142.75 +/- 17.88,2.98 +/- 0.13
PERMBU_MINT,3.15 +/- 0.09,7.4 +/- 0.4,1.99 +/- 0.12,,6.8 +/- 0.08
HierE2E,18.95 +/- 0.01,55.05 +/- 0.4,5.88 +/- 0.03,38.35 +/- 0.61,21.81 +/- 0.02


## Calculation of total experiment time

In [19]:
# Total experiment time per method; the rendered column is labelled
# "total_time [h]".
total_time_df = calculate_total_time()
display(total_time_df)

Unnamed: 0,total_time [h]
ARIMA_NaiveBU,2.1
ETS_NaiveBU,0.97
ARIMA_MINT_shr,4.05
ARIMA_MINT_ols,3.74
ETS_MINT_shr,1.43
ETS_MINT_ols,1.42
ARIMA_ERM,40.36
ETS_ERM,14.35
PERMBU_MINT,1.61
HierE2E,112.28
