In [53]:
# Disabled scratch check (kept for reference, nothing in this cell executes):
# loads versions 0.4, 0.5 and 0.6 of the cm_features CSV from data/ and
# prints the shape of each frame so the versions can be compared.
# import pandas as pd
# cm_features_v4 = pd.read_csv(f'data/cm_features_v0.4.csv')
# cm_features_v5 = pd.read_csv(f'data/cm_features_v0.5.csv')
# cm_features_v6 = pd.read_csv(f'data/cm_features_v0.6.csv')
# print(cm_features_v4.shape)
# print(cm_features_v5.shape)
# print(cm_features_v6.shape)

In [54]:
import os
import glob
import pandas as pd
import matplotlib.pyplot as plt

# Path to the submission folder
base_path = "./submission"

# List of metrics
metrics = ['crps', 'ign', 'mis']


def parse_submission_path(file, base_path):
    """Return ``(submission_name, year)`` for one metric CSV path.

    The submission name is the first directory below ``base_path`` with
    underscores replaced by spaces and title-cased. The year is the text
    after ``=`` in the first ``window=...`` path component (e.g. ``'Y2018'``).

    The path is split relative to ``base_path`` and the ``window=`` component
    is located by name rather than by a fixed position in ``file.split('/')``,
    so the result does not depend on how deep ``base_path`` is, whether it is
    absolute, or which path separator the OS uses.

    Args:
        file: Path of a metric CSV located somewhere below ``base_path``.
        base_path: Root folder that contains one directory per submission.

    Returns:
        Tuple ``(submission_name, year)`` of strings.

    Raises:
        ValueError: If no ``window=...`` component exists in the path.
    """
    relative_parts = os.path.relpath(file, base_path).split(os.sep)
    submission_name = relative_parts[0].replace('_', ' ').title()
    window_part = next(
        (part for part in relative_parts if part.startswith('window=')), None
    )
    if window_part is None:
        raise ValueError(f"No 'window=<year>' directory found in path: {file}")
    return submission_name, window_part.split('=')[1]


def collect_metric_means(base_path, metrics):
    """Collect the mean of the ``value`` column of every ``<metric>.csv``.

    Args:
        base_path: Root folder that is searched recursively.
        metrics: Metric names; each one is looked up as ``<metric>.csv``.

    Returns:
        Nested dict ``{submission_name: {year: {metric: mean_value}}}``.
        Exactly one file is expected per (submission, year, metric). If
        several files map to the same key, only the first one in sorted path
        order is used, which makes the choice deterministic instead of
        depending on the order in which ``glob`` lists the files.
    """
    means = {}
    for metric in metrics:
        # Find all CSV files within the nested directories for this metric
        metric_files = sorted(
            glob.glob(f"{base_path}/**/{metric}.csv", recursive=True)
        )
        for file in metric_files:
            submission_name, year = parse_submission_path(file, base_path)
            year_metrics = means.setdefault(submission_name, {}).setdefault(year, {})
            if metric in year_metrics:
                # Duplicate key: keep the first file, skip reading the rest
                continue
            year_metrics[metric] = pd.read_csv(file)['value'].mean()
    return means


# Dictionary to hold all data: {submission: {year: {metric: mean}}}
all_data = collect_metric_means(base_path, metrics)

# Flatten to {(submission, year): {metric: mean}} so each pair becomes one row
data_frames = {
    (submission, year): metrics_data
    for submission, years_data in all_data.items()
    for year, metrics_data in years_data.items()
}

# Create a multi-index DataFrame
result_df = pd.DataFrame.from_dict(data_frames, orient='index')
result_df.index = pd.MultiIndex.from_tuples(result_df.index, names=['Submission', 'Year'])

# Display the DataFrame
# result_df

In [55]:
# Take the rows of the Y2018 window and order them by the crps column
# (ascending, the sort_values default).
submissions_2018 = (
    result_df
    .xs('Y2018', level='Year')
    .sort_values(by='crps')
)
submissions_2018

Unnamed: 0_level_0,crps,ign,mis
Submission,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Ng Boost Cm V0.7 Normal D 20 N 300 S Crpscore C T M T Bsd 5 Mbf 0.5,15.217186,0.875905,174.348102
Ng Boost Cm V0.6 Normal D 20 N 300 S Crpscore C T M T Bsd 5 Mbf 0.5,15.28931,0.897854,173.132112
Ng Boost Cm V0.7 Normal D 20 N 300 S Crpscore C F M T Bsd 5 Mbf 0.5,15.341661,0.954039,175.065183
Ng Boost Normal D 10 N 300,16.01328,0.867593,201.772709
Ng Boost Cm V0.7 Normal D 80 N 300 S Crpscore C F M T Bsd 5 Mbf 0.5,16.135763,1.579458,154.988853
Ng Boost Cm V0.7 Normal D 80 N 300 S Crpscore C T M T Bsd 5 Mbf 0.5,16.191804,1.562017,155.744655
Ng Boost Cm V0.6 Pw 14 Normal D 80 N 300 S Crpscore C T M T Bsd 5 Mbf 0.5,16.241464,1.709124,157.852269
Ng Boost Cm V0.6 Normal D 80 N 300 S Crpscore C F M T Bsd 5 Mbf 0.5,16.296694,1.6422,159.663045
Ng Boost Normal D 80 N 400 S Crpscore C T M T Bsd 5 Mbf 0.3,16.318201,1.805858,156.495419
Ng Boost Cm V0.6 Normal D 80 N 300 S Crpscore C T M T Bsd 5 Mbf 0.5,16.323118,1.634026,158.954167


In [56]:
# Take the rows of the Y2019 window and order them by the crps column
# (ascending, the sort_values default).
submissions_2019 = (
    result_df
    .xs('Y2019', level='Year')
    .sort_values(by='crps')
)
submissions_2019

Unnamed: 0_level_0,crps,ign,mis
Submission,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Benchmark Last Historical Poisson,9.640593,1.037815,175.834795
Ng Boost Normal D 80 N 300 S Crpscore C T M T Bsd 5,13.981267,1.486443,119.107984
Ng Boost Cm V0.7 Normal D 80 N 300 S Crpscore C T M T Bsd 5 Mbf 0.5,14.124318,1.533439,124.204647
Ng Boost Normal D 80 N 400 S Crpscore C T M T Bsd 5 Mbf 0.3,14.18061,1.629977,122.335929
Ng Boost Normal D 80 N 300 S Crpscore C T M T Bsd 5 Mbf 0.3,14.256492,1.631406,122.353949
Ng Boost Cm V0.6 Pw 14 Normal D 80 N 300 S Crpscore C T M T Bsd 5 Mbf 0.5,14.272918,1.57288,127.444743
Ng Boost Cm V0.6 Pw 14 Normal D 20 N 300 S Crpscore C T M T Bsd 5 Mbf 0.5,14.463482,1.034683,149.727051
Ng Boost Cm V0.7 Normal D 80 N 300 S Crpscore C F M T Bsd 5 Mbf 0.5,14.499576,1.613607,128.662936
Ng Boost Cm V0.7 Normal D 20 N 300 S Crpscore C F M T Bsd 5 Mbf 0.5,14.803755,0.968469,164.565554
Ng Boost Cm V0.7 Normal D 20 N 300 S Crpscore C T M T Bsd 5 Mbf 0.5,14.879782,0.973503,166.313656


In [57]:
# Take the rows of the Y2020 window and order them by the crps column
# (ascending, the sort_values default).
submissions_2020 = (
    result_df
    .xs('Y2020', level='Year')
    .sort_values(by='crps')
)
submissions_2020

Unnamed: 0_level_0,crps,ign,mis
Submission,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Ng Boost Normal D 80 N 300 S Crpscore C T M T Bsd 5,13.221716,1.448148,148.709184
Ng Boost Cm V0.6 Normal D 80 N 400 S Crpscore C T M T Bsd 5 Mbf 0.5,13.282012,1.532086,151.275807
Ng Boost Normal D 80 N 400 S Crpscore C T M T Bsd 5 Mbf 0.3,13.338849,1.557054,150.608944
Ng Boost Cm V0.7 Normal D 80 N 300 S Crpscore C F M T Bsd 5 Mbf 0.5,13.542423,1.674001,152.679385
Benchmark Last Historical Poisson,13.6982,1.084218,256.184097
Benchmark Boostrap,21.269923,1.088269,402.353163
Xg Boost,22.536649,3.071479,450.732984


In [58]:
# Take the rows of the Y2021 window and order them by the crps column
# (ascending, the sort_values default).
submissions_2021 = (
    result_df
    .xs('Y2021', level='Year')
    .sort_values(by='crps')
)
submissions_2021

Unnamed: 0_level_0,crps,ign,mis
Submission,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Benchmark Boostrap,36.045139,1.129538,694.494786
Benchmark Last Historical Poisson,37.129253,1.227684,722.673604


In [59]:
PLOT_FIGURES = False


def submission_and_window(file, base_path):
    """Return ``(submission_name, year)`` for one metric CSV path.

    The submission name is the first directory below ``base_path`` with
    underscores replaced by spaces and title-cased. The year is the text
    after ``=`` in the first ``window=...`` path component (e.g. ``'Y2018'``).
    Splitting relative to ``base_path`` avoids fixed indices into
    ``file.split('/')``, which only work for one specific directory depth
    and path separator.

    Raises:
        ValueError: If no ``window=...`` component exists in the path.
    """
    relative_parts = os.path.relpath(file, base_path).split(os.sep)
    submission_name = relative_parts[0].replace('_', ' ').title()
    window_part = next(
        (part for part in relative_parts if part.startswith('window=')), None
    )
    if window_part is None:
        raise ValueError(f"No 'window=<year>' directory found in path: {file}")
    return submission_name, window_part.split('=')[1]


def subplot_grid_shape(num_panels, num_cols=2, min_rows=2):
    """Return ``(rows, cols)`` of a subplot grid that fits ``num_panels``.

    The grid never shrinks below ``min_rows`` rows, so up to four panels keep
    a 2x2 layout; more panels add rows instead of overflowing the grid.
    """
    needed_rows = -(-num_panels // num_cols)  # ceiling division
    return max(min_rows, needed_rows), num_cols


if PLOT_FIGURES:

    import os
    import glob
    import pandas as pd
    import matplotlib.pyplot as plt

    # Path to the submission folder
    base_path = "./submission"

    metrics = ['crps', 'ign', 'mis']

    for metric in metrics:
        # Find all CSV files within the nested directories
        submission_files = glob.glob(f"{base_path}/**/{metric}.csv", recursive=True)

        # Dataframes grouped by submission name and year; if several files map
        # to the same (submission, year), the last one read is kept
        submissions = {}
        for file in submission_files:
            submission_name, year = submission_and_window(file, base_path)
            submissions.setdefault(submission_name, {})[year] = pd.read_csv(file)

        # Create a figure for each submission
        for submission_name, years_data in submissions.items():
            num_years = len(years_data)
            num_rows, num_cols = subplot_grid_shape(num_years)
            # 2.5 inches per row keeps the 10x5 figure for the 2x2 grid
            fig = plt.figure(figsize=(10, 2.5 * num_rows))
            fig.suptitle(f'{metric.upper()} Histograms for {submission_name}')

            # One subplot per year, in ascending year order
            for index, (year, df) in enumerate(sorted(years_data.items()), start=1):
                ax = fig.add_subplot(num_rows, num_cols, index)
                ax.hist(df['value'], bins=100, alpha=0.75, label=f'Year: {year}')
                mean_value = df['value'].mean()
                mean_line = ax.axvline(
                    mean_value,
                    color='r',
                    linestyle='dashed',
                    linewidth=1,
                    label=f'Mean: {mean_value:.1f}',
                )
                ax.set_title(f'Year: {year[1:]}')
                ax.set_xlabel(metric.upper())
                ax.set_ylabel('Frequency')
                # Log-scaled counts; ax.set_xscale('log') can be added if needed
                ax.set_yscale('log')
                # Pass the handle explicitly so the "Mean" entry is drawn with
                # the dashed line rather than the first histogram patch
                ax.legend(handles=[mean_line])

            fig.tight_layout(rect=[0, 0.03, 1, 0.95])  # Adjust layout to not overlap with the suptitle
            plt.show()
            plt.close(fig)  # release the figure when many submissions are plotted
