In [1]:
# import pandas as pd
# cm_features_v4 = pd.read_csv(f'data/cm_features_v0.4.csv')
# cm_features_v5 = pd.read_csv(f'data/cm_features_v0.5.csv')
# cm_features_v6 = pd.read_csv(f'data/cm_features_v0.6.csv')
# print(cm_features_v4.shape)
# print(cm_features_v5.shape)
# print(cm_features_v6.shape)

In [2]:
import os
import glob
import pandas as pd
import matplotlib.pyplot as plt

# Path to the submission folder
base_path = "./submission"

# List of metrics
metrics = ['crps', 'ign', 'mis']

# Dictionary to hold all data
all_data = {}

# Process each metric
for metric in metrics:
    # Find all CSV files within the nested directories for each metric
    submission_files = glob.glob(f"{base_path}/**/{metric}.csv", recursive=True)

    # Dictionary to hold the dataframes grouped by submission name and year
    submissions = {}

    # Read each CSV file and store the dataframe in the dictionary grouped by submission name and year
    for file in submission_files:
        df = pd.read_csv(file)
        parts = file.split('/')
        submission_name = parts[2].replace('_', ' ').title()  # Adjust index based on your file path structure
        year = parts[5].split('=')[1]  # Extract year from the 'window=YYear' part
        if submission_name not in submissions:
            submissions[submission_name] = {}
        if year not in submissions[submission_name]:
            submissions[submission_name][year] = []
        submissions[submission_name][year].append(df['value'].mean())  # Store mean values

    # Merge the metric means into the all_data dictionary
    for submission_name, years_data in submissions.items():
        if submission_name not in all_data:
            all_data[submission_name] = {}
        for year, mean_values in years_data.items():
            if year not in all_data[submission_name]:
                all_data[submission_name][year] = {}
            all_data[submission_name][year][metric] = mean_values[0]  # There should be exactly one mean per metric

# Convert all_data to DataFrame
data_frames = {}
for submission, years_data in all_data.items():
    for year, metrics_data in years_data.items():
        index = (submission, year)
        data_frames[index] = metrics_data

# Create a multi-index DataFrame
result_df = pd.DataFrame.from_dict(data_frames, orient='index')
result_df.index = pd.MultiIndex.from_tuples(result_df.index, names=['Submission', 'Year'])

# Display the DataFrame
# result_df

In [3]:
submissions_2018 = result_df.xs('Y2018', level='Year').sort_values(by='crps')
submissions_2018

Unnamed: 0_level_0,crps,ign,mis
Submission,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Ng Boost Cm V1.0 Pw 14 Normal D 50 N 300 S Crpscore C T M T Bsd 5 Mbf 0.5,15.552562,1.012841,169.086998
Ng Boost Cm V1.0 Pw 14 Normal D 0 N 300 S Crpscore C T M T Bsd 5 Mbf 0.5,15.693962,0.879032,187.23894
Ng Boost Cm V1.0 Pw 14 Normal D 20 N 300 S Crpscore C T M T Bsd 5 Mbf 0.5,15.725545,0.892318,181.643412
Ng Boost Cm V1.0 Pw 14 Normal D 80 N 300 S Crpscore C T M T Bsd 5 Mbf 0.5,16.354103,1.559404,157.041056
Ng Boost Cm V1.0 Pw 14 Normal D 60 N 300 S Crpscore C F M F Bsd 5 Mbf 0.5,16.367978,1.247685,169.92452
Benchmark Last Historical Poisson,20.041059,1.190211,378.076614
Xg Boost,20.652705,3.088874,413.054101
Benchmark Boostrap,23.49413,1.109974,453.287544
Ng Boost Cm V1.0 Pw 14 Normal D 99 N 300 S Crpscore C T M T Bsd 5 Mbf 0.5,29.463046,3.822982,263.0613
Ng Boost Cm V1.0 Pw 14 Normal D 100 N 300 S Crpscore C T M T Bsd 5 Mbf 0.5,50.992464,5.222295,429.390118


In [4]:
submissions_2019 = result_df.xs('Y2019', level='Year').sort_values(by='crps')
submissions_2019

Unnamed: 0_level_0,crps,ign,mis
Submission,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Benchmark Last Historical Poisson,9.640593,1.037815,175.834795
Ng Boost Cm V1.0 Pw 14 Normal D 80 N 300 S Crpscore C T M T Bsd 5 Mbf 0.5,13.719319,1.409716,107.887282
Ng Boost Cm V1.0 Pw 14 Normal D 50 N 300 S Crpscore C T M T Bsd 5 Mbf 0.5,14.038923,1.174697,133.578861
Ng Boost Cm V1.0 Pw 14 Normal D 60 N 300 S Crpscore C F M F Bsd 5 Mbf 0.5 Dli 35,14.045514,1.329153,122.233813
Ng Boost Cm V1.0 Pw 14 Normal D 60 N 300 S Crpscore C F M F Bsd 5 Mbf 0.5,14.203654,1.307128,129.50373
Ng Boost Cm V1.0 Pw 14 Normal D 20 N 300 S Crpscore C T M T Bsd 5 Mbf 0.5,14.711141,1.083588,152.698102
Ng Boost Cm V1.0 Pw 14 Normal D 0 N 300 S Crpscore C F M F Bsd 5 Mbf 0.5,14.909387,0.993027,170.919481
Ng Boost Cm V1.0 Pw 14 Normal D 0 N 300 S Crpscore C T M T Bsd 5 Mbf 0.5,14.957631,1.054029,172.914965
Xg Boost,17.625218,3.093222,352.504363
Benchmark Boostrap,22.074857,1.090364,419.315532


In [5]:
submissions_2020 = result_df.xs('Y2020', level='Year').sort_values(by='crps')
submissions_2020

Unnamed: 0_level_0,crps,ign,mis
Submission,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Ng Boost Cm V1.0 Pw 14 Normal D 20 N 300 S Crpscore C T M T Bsd 5 Mbf 0.5,12.816155,1.070696,155.546051
Ng Boost Cm V1.0 Pw 14 Normal D 50 N 300 S Crpscore C T M T Bsd 5 Mbf 0.5,13.02217,1.200449,151.988416
Ng Boost Cm V1.0 Pw 14 Normal D 80 N 300 S Crpscore C T M T Bsd 5 Mbf 0.5,13.107986,1.466948,149.729079
Ng Boost Cm V1.0 Pw 14 Normal D 0 N 300 S Crpscore C T M T Bsd 5 Mbf 0.5,13.238099,1.04006,162.281894
Benchmark Last Historical Poisson,13.6982,1.084218,256.184097
Benchmark Boostrap,21.269923,1.088269,402.353163
Xg Boost,22.536649,3.071479,450.732984
Ng Boost Cm V1.0 Pw 14 Normal D 99 N 300 S Crpscore C T M T Bsd 5 Mbf 0.5,24.013127,3.663109,226.390096
Ng Boost Cm V1.0 Pw 14 Normal D 100 N 300 S Crpscore C T M T Bsd 5 Mbf 0.5,41.805392,4.842497,399.683093


In [6]:
submissions_2021 = result_df.xs('Y2021', level='Year').sort_values(by='crps')
submissions_2021

Unnamed: 0_level_0,crps,ign,mis
Submission,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Ng Boost Cm V1.0 Pw 14 Normal D 20 N 300 S Crpscore C T M T Bsd 5 Mbf 0.5,29.209069,0.952732,461.70264
Ng Boost Cm V1.0 Pw 14 Normal D 50 N 300 S Crpscore C T M T Bsd 5 Mbf 0.5,29.360006,1.019948,456.725502
Ng Boost Cm V1.0 Pw 14 Normal D 0 N 300 S Crpscore C T M T Bsd 5 Mbf 0.5,29.415297,0.893066,472.497818
Ng Boost Cm V1.0 Pw 14 Normal D 60 N 300 S Crpscore C F M F Bsd 5 Mbf 0.5,29.489194,1.141157,454.500393
Ng Boost Cm V1.0 Pw 14 Normal D 80 N 300 S Crpscore C T M T Bsd 5 Mbf 0.5,29.688606,1.505556,444.113133
Benchmark Boostrap,36.045139,1.129538,694.494786
Benchmark Last Historical Poisson,37.129253,1.227684,722.673604
Ng Boost Cm V1.0 Pw 14 Normal D 99 N 300 S Crpscore C T M T Bsd 5 Mbf 0.5,40.356251,3.622447,525.082046
Ng Boost Cm V1.0 Pw 14 Normal D 100 N 300 S Crpscore C T M T Bsd 5 Mbf 0.5,58.355464,4.857692,703.790532


In [7]:
submissions_2022 = result_df.xs('Y2022', level='Year').sort_values(by='crps')
submissions_2022

Unnamed: 0_level_0,crps,ign,mis
Submission,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Ng Boost Cm V1.0 Pw 14 Normal D 0 N 300 S Crpscore C T M T Bsd 5 Mbf 0.5,71.946043,1.080393,1332.974978
Ng Boost Cm V1.0 Pw 14 Normal D 20 N 300 S Crpscore C T M T Bsd 5 Mbf 0.5,72.338727,1.060777,1332.438089
Ng Boost Cm V1.0 Pw 14 Normal D 50 N 300 S Crpscore C T M T Bsd 5 Mbf 0.5,72.972526,1.335668,1332.136649
Ng Boost Cm V1.0 Pw 14 Normal D 60 N 300 S Crpscore C F M F Bsd 5 Mbf 0.5 Dli 35,73.843485,1.414822,1337.859337
Ng Boost Cm V1.0 Pw 14 Normal D 60 N 300 S Crpscore C F M F Bsd 5 Mbf 0.5,73.935973,1.456656,1338.145877
Ng Boost Cm V1.0 Pw 14 Normal D 80 N 300 S Crpscore C T M T Bsd 5 Mbf 0.5,74.913168,1.63391,1342.22079
Ng Boost Cm V1.0 Pw 14 Normal D 99 N 300 S Crpscore C T M T Bsd 5 Mbf 0.5,87.02098,3.682881,1420.948408
Ng Boost Cm V1.0 Pw 14 Normal D 100 N 300 S Crpscore C T M T Bsd 5 Mbf 0.5,102.273055,4.885009,1577.162631


In [8]:
# find submissions that span over all 5 years
submissions_all_years = result_df.groupby('Submission').count()
submissions_all_years

Unnamed: 0_level_0,crps,ign,mis
Submission,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Benchmark Boostrap,4,4,4
Benchmark Last Historical Poisson,4,4,4
Ng Boost Cm V1.0 Pw 14 Normal D 0 N 300 S Crpscore C F M F Bsd 5 Mbf 0.5,1,1,1
Ng Boost Cm V1.0 Pw 14 Normal D 0 N 300 S Crpscore C T M T Bsd 5 Mbf 0.5,5,5,5
Ng Boost Cm V1.0 Pw 14 Normal D 100 N 300 S Crpscore C T M T Bsd 5 Mbf 0.5,5,5,5
Ng Boost Cm V1.0 Pw 14 Normal D 20 N 300 S Crpscore C T M T Bsd 5 Mbf 0.5,5,5,5
Ng Boost Cm V1.0 Pw 14 Normal D 50 N 300 S Crpscore C T M T Bsd 5 Mbf 0.5,5,5,5
Ng Boost Cm V1.0 Pw 14 Normal D 60 N 300 S Crpscore C F M F Bsd 5 Mbf 0.5,4,4,4
Ng Boost Cm V1.0 Pw 14 Normal D 60 N 300 S Crpscore C F M F Bsd 5 Mbf 0.5 Dli 35,2,2,2
Ng Boost Cm V1.0 Pw 14 Normal D 80 N 300 S Crpscore C T M T Bsd 5 Mbf 0.5,5,5,5


In [9]:
submissions_all_years = submissions_all_years[submissions_all_years['crps'] == 5]
# calculate average crps, ign, mis over all years
submissions_all_years['crps_avg'] = result_df.groupby('Submission')['crps'].mean()
submissions_all_years['ign_avg'] = result_df.groupby('Submission')['ign'].mean()
submissions_all_years['mis_avg'] = result_df.groupby('Submission')['mis'].mean()
submissions_all_years = submissions_all_years.sort_values(by='crps_avg')
submissions_all_years


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  submissions_all_years['crps_avg'] = result_df.groupby('Submission')['crps'].mean()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  submissions_all_years['ign_avg'] = result_df.groupby('Submission')['ign'].mean()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  submissions_all_years['mis_avg'] = result

Unnamed: 0_level_0,crps,ign,mis,crps_avg,ign_avg,mis_avg
Submission,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Ng Boost Cm V1.0 Pw 14 Normal D 20 N 300 S Crpscore C T M T Bsd 5 Mbf 0.5,5,5,5,28.960127,1.012022,456.805659
Ng Boost Cm V1.0 Pw 14 Normal D 50 N 300 S Crpscore C T M T Bsd 5 Mbf 0.5,5,5,5,28.989238,1.14872,448.703285
Ng Boost Cm V1.0 Pw 14 Normal D 0 N 300 S Crpscore C T M T Bsd 5 Mbf 0.5,5,5,5,29.050207,0.989316,465.581719
Ng Boost Cm V1.0 Pw 14 Normal D 80 N 300 S Crpscore C T M T Bsd 5 Mbf 0.5,5,5,5,29.556636,1.515107,440.198268
Ng Boost Cm V1.0 Pw 14 Normal D 99 N 300 S Crpscore C T M T Bsd 5 Mbf 0.5,5,5,5,40.621469,3.664012,520.868455
Ng Boost Cm V1.0 Pw 14 Normal D 100 N 300 S Crpscore C T M T Bsd 5 Mbf 0.5,5,5,5,59.284242,4.958381,691.846191


In [10]:
PLOT_FIGURES = False
if PLOT_FIGURES:

    import os
    import glob
    import pandas as pd
    import matplotlib.pyplot as plt

    # Path to the submission folder
    base_path = "../submission"

    metrics = ['crps', 'ign', 'mis']

    for metric in metrics:
        # Find all CSV files within the nested directories
        submission_files = glob.glob(f"{base_path}/**/{metric}.csv", recursive=True)
        print(submission_files)

        # Dictionary to hold the dataframes grouped by submission name and year
        submissions = {}

        # Read each CSV file and store the dataframe in the dictionary grouped by submission name and year
        for file in submission_files:
            df = pd.read_csv(file)
            parts = file.split('/')
            submission_name = parts[2].replace('_', ' ').title()  # Adjust index based on your file path structure
            year = parts[5].split('=')[1]  # Extract year from the 'window=YYear' part
            if submission_name not in submissions:
                submissions[submission_name] = {}
            submissions[submission_name][year] = df

        # Create a figure for each submission
        for submission_name, years_data in submissions.items():
            num_years = len(years_data)
            plt.figure(figsize=(10, 5))
            plt.suptitle(f'{metric.upper()} Histograms for {submission_name}')

            # Create subplots for each year in a 2x2 grid
            for index, (year, df) in enumerate(sorted(years_data.items()), start=1):
                plt.subplot(2, 2, index)
                plt.hist(df['value'], bins=100, alpha=0.75, label=f'Year: {year}')
                mean_value = df['value'].mean()
                plt.axvline(mean_value, color='r', linestyle='dashed', linewidth=1)
                plt.title(f'Year: {year[1:]}')
                plt.xlabel(metric.upper())
                plt.ylabel('Frequency')
                # log
                plt.yscale('log')
                # plt.xscale('log')
                # mean_value = sub['value'].mean()
                # plt.axvline(mean_value, color='r', linestyle='dashed', linewidth=1)
                plt.legend([f'Mean: {mean_value:.1f}'])

            plt.tight_layout(rect=[0, 0.03, 1, 0.95])  # Adjust layout to not overlap with the suptitle
            plt.show()
