In [12]:
# import pandas as pd
# cm_features_v4 = pd.read_csv(f'data/cm_features_v0.4.csv')
# cm_features_v5 = pd.read_csv(f'data/cm_features_v0.5.csv')
# cm_features_v6 = pd.read_csv(f'data/cm_features_v0.6.csv')
# print(cm_features_v4.shape)
# print(cm_features_v5.shape)
# print(cm_features_v6.shape)

In [13]:
import os
import glob
import pandas as pd
import matplotlib.pyplot as plt

# Path to the submission folder
base_path = "../submission"

# List of metrics
metrics = ['crps', 'ign', 'mis']

# Dictionary to hold all data
all_data = {}

# Process each metric
for metric in metrics:
    # Find all CSV files within the nested directories for each metric
    submission_files = glob.glob(f"{base_path}/**/{metric}.csv", recursive=True)

    # Dictionary to hold the dataframes grouped by submission name and year
    submissions = {}

    # Read each CSV file and store the dataframe in the dictionary grouped by submission name and year
    for file in submission_files:
        df = pd.read_csv(file)
        parts = file.split('/')
        submission_name = parts[2].replace('_', ' ').title()  # Adjust index based on your file path structure
        year = parts[5].split('=')[1]  # Extract year from the 'window=YYear' part
        if submission_name not in submissions:
            submissions[submission_name] = {}
        if year not in submissions[submission_name]:
            submissions[submission_name][year] = []
        submissions[submission_name][year].append(df['value'].mean())  # Store mean values

    # Merge the metric means into the all_data dictionary
    for submission_name, years_data in submissions.items():
        if submission_name not in all_data:
            all_data[submission_name] = {}
        for year, mean_values in years_data.items():
            if year not in all_data[submission_name]:
                all_data[submission_name][year] = {}
            all_data[submission_name][year][metric] = mean_values[0]  # There should be exactly one mean per metric

# Convert all_data to DataFrame
data_frames = {}
for submission, years_data in all_data.items():
    for year, metrics_data in years_data.items():
        index = (submission, year)
        data_frames[index] = metrics_data

# Create a multi-index DataFrame
result_df = pd.DataFrame.from_dict(data_frames, orient='index')
result_df.index = pd.MultiIndex.from_tuples(result_df.index, names=['Submission', 'Year'])

# Display the DataFrame
result_df

Unnamed: 0_level_0,Unnamed: 1_level_0,crps,ign,mis
Submission,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Bm Last Historical Poisson,Y2022,131.017135,1.124429,2599.278338
Bm Last Historical Poisson,Y2023,678.959777,1.124699,13523.463264
Bm Last Historical Poisson,Y2019,9.480041,1.045585,172.686169
Bm Last Historical Poisson,Y2021,85.605461,1.227781,1690.710864
Bm Last Historical Poisson,Y2020,23.698109,1.110316,455.806457
Bm Last Historical Poisson,Y2018,20.173457,1.198439,380.623037
Bm Boot 240,Y2022,120.249169,1.154555,2380.743565
Bm Boot 240,Y2023,52.722147,1.154135,1030.986889
Bm Boot 240,Y2019,22.457582,1.111029,426.005912
Bm Boot 240,Y2021,86.626316,1.152036,1708.304188


In [14]:
# You may need to comment this out if you have only one year, as otherwise KeyError for non-existing year is thrown
try:
    submissions_2018 = result_df.xs('Y2018', level='Year').sort_values(by='crps')
except KeyError:
    print('No 2018 data')
    submissions_2018 = None
submissions_2018

Unnamed: 0_level_0,crps,ign,mis
Submission,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Bm Conflictology,14.482875,0.640281,186.553578
Ng Boost Cm V2.4 Pw 14 Normal D 20 N 300 S Crpscore C T M T Bsd 5 Mbf 0.5 Dli 35 Log F,15.56342,0.897932,180.352269
Ng Boost Cm V2.4 Pw 14 Normal D 20 N 300 S Crpscore C T M T Bsd 5 Mbf 0.5 Dli 35 Log T,17.60779,0.855409,108.887435
Bm Last Historical Poisson,20.173457,1.198439,380.623037
Bm Boot 240,23.577316,1.123216,454.089616
Bm Zero,24.130454,1.55813,482.609075


In [15]:
try:
    submissions_2019 = result_df.xs('Y2019', level='Year').sort_values(by='crps')
except KeyError:
    print('No 2019 data')
    submissions_2019 = None
submissions_2019

Unnamed: 0_level_0,crps,ign,mis
Submission,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Bm Conflictology,9.146306,0.610132,89.057941
Bm Last Historical Poisson,9.480041,1.045585,172.686169
Ng Boost Cm V2.4 Pw 14 Normal D 20 N 300 S Crpscore C T M T Bsd 5 Mbf 0.5 Dli 35 Log F,14.767184,1.080652,158.910951
Ng Boost Cm V2.4 Pw 14 Normal D 20 N 300 S Crpscore C T M T Bsd 5 Mbf 0.5 Dli 35 Log T,17.439474,0.953414,109.675982
Bm Boot 240,22.457582,1.111029,426.005912
Bm Zero,23.018761,1.55813,460.375218


In [16]:
try:
    submissions_2020 = result_df.xs('Y2020', level='Year').sort_values(by='crps')
except KeyError:
    print('No 2020 data')
    submissions_2020 = None
submissions_2020

Unnamed: 0_level_0,crps,ign,mis
Submission,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Bm Conflictology,21.339332,0.566535,344.964311
Ng Boost Cm V2.4 Pw 14 Normal D 20 N 300 S Crpscore C T M T Bsd 5 Mbf 0.5 Dli 35 Log F,21.931049,1.095663,335.626134
Bm Last Historical Poisson,23.698109,1.110316,455.806457
Ng Boost Cm V2.4 Pw 14 Normal D 20 N 300 S Crpscore C T M T Bsd 5 Mbf 0.5 Dli 35 Log T,26.776637,0.907719,359.580781
Bm Boot 240,31.417437,1.115448,606.002792
Bm Zero,32.040576,1.549433,640.811518


In [17]:
try:
    submissions_2021 = result_df.xs('Y2021', level='Year').sort_values(by='crps')
except KeyError:
    print('No 2021 data')
    submissions_2021 = None
submissions_2021

Unnamed: 0_level_0,crps,ign,mis
Submission,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Bm Conflictology,76.849476,0.685623,1435.554625
Ng Boost Cm V2.4 Pw 14 Normal D 20 N 300 S Crpscore C T M T Bsd 5 Mbf 0.5 Dli 35 Log F,79.532981,1.067882,1464.791841
Ng Boost Cm V2.4 Pw 14 Normal D 20 N 300 S Crpscore C T M T Bsd 5 Mbf 0.5 Dli 35 Log T,81.240706,0.918617,1402.012718
Bm Last Historical Poisson,85.605461,1.227781,1690.710864
Bm Boot 240,86.626316,1.152036,1708.304188
Bm Zero,87.339005,1.614664,1746.780105


In [18]:
try:
    submissions_2022 = result_df.xs('Y2022', level='Year').sort_values(by='crps')
except KeyError:
    print('No 2022 data')
    submissions_2022 = None
submissions_2022

Unnamed: 0_level_0,crps,ign,mis
Submission,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Ng Boost Cm V2.4 Pw 14 Normal D 20 N 300 S Crpscore C T M T Bsd 5 Mbf 0.5 Dli 35 Log T,117.36843,0.942212,2286.247251
Ng Boost Cm V2.4 Pw 14 Normal D 20 N 300 S Crpscore C T M T Bsd 5 Mbf 0.5 Dli 35 Log F,119.384168,1.006404,2266.96839
Bm Boot 240,120.249169,1.154555,2380.743565
Bm Zero,120.96815,1.632058,2419.363002
Bm Conflictology,123.995164,0.694711,2142.128098
Bm Last Historical Poisson,131.017135,1.124429,2599.278338


In [19]:
try:
    submissions_2023 = result_df.xs('Y2023', level='Year').sort_values(by='crps')
except KeyError:
    print('No 2023 data')
    submissions_2023 = None
submissions_2023

Unnamed: 0_level_0,crps,ign,mis
Submission,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Ng Boost Cm V2.4 Pw 14 Normal D 20 N 300 S Crpscore C T M T Bsd 5 Mbf 0.5 Dli 35 Log F,48.883914,1.120273,830.872426
Ng Boost Cm V2.4 Pw 14 Normal D 20 N 300 S Crpscore C T M T Bsd 5 Mbf 0.5 Dli 35 Log T,49.889089,0.893542,904.784664
Bm Conflictology,50.356712,0.682261,1042.916143
Bm Boot 240,52.722147,1.154135,1030.986889
Bm Zero,53.543194,1.614664,1070.863874
Bm Last Historical Poisson,678.959777,1.124699,13523.463264


In [20]:
# find submissions that span over all 5 years
submissions_all_years = result_df.groupby('Submission').count()
submissions_all_years

Unnamed: 0_level_0,crps,ign,mis
Submission,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Bm Boot 240,6,6,6
Bm Conflictology,6,6,6
Bm Last Historical Poisson,6,6,6
Bm Zero,6,6,6
Ng Boost Cm V2.4 Pw 14 Normal D 20 N 300 S Crpscore C T M T Bsd 5 Mbf 0.5 Dli 35 Log F,6,6,6
Ng Boost Cm V2.4 Pw 14 Normal D 20 N 300 S Crpscore C T M T Bsd 5 Mbf 0.5 Dli 35 Log T,6,6,6


In [21]:
submissions_all_years = submissions_all_years[submissions_all_years['crps'] == 5]
# calculate average crps, ign, mis over all years
submissions_all_years['crps_avg'] = result_df.groupby('Submission')['crps'].mean()
submissions_all_years['ign_avg'] = result_df.groupby('Submission')['ign'].mean()
submissions_all_years['mis_avg'] = result_df.groupby('Submission')['mis'].mean()
submissions_all_years = submissions_all_years.sort_values(by='crps_avg')

submissions_all_years[['crps_avg', 'ign_avg', 'mis_avg']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  submissions_all_years['crps_avg'] = result_df.groupby('Submission')['crps'].mean()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  submissions_all_years['ign_avg'] = result_df.groupby('Submission')['ign'].mean()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  submissions_all_years['mis_avg'] = result

Unnamed: 0_level_0,crps_avg,ign_avg,mis_avg
Submission,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Bm Conflictology,49.361644,0.646591,873.529116
Ng Boost Cm V2.4 Pw 14 Normal D 20 N 300 S Crpscore C T M T Bsd 5 Mbf 0.5 Dli 35 Log F,50.010453,1.044801,872.920335
Ng Boost Cm V2.4 Pw 14 Normal D 20 N 300 S Crpscore C T M T Bsd 5 Mbf 0.5 Dli 35 Log T,51.720354,0.911819,861.864805
Bm Boot 240,56.174995,1.13507,1101.02216
Bm Zero,56.840023,1.587847,1136.800465
Bm Last Historical Poisson,158.155663,1.138541,3137.094688


In [22]:
PLOT_FIGURES = False
if PLOT_FIGURES:

    import os
    import glob
    import pandas as pd
    import matplotlib.pyplot as plt

    # Path to the submission folder
    base_path = "../submission"

    metrics = ['crps', 'ign', 'mis']

    for metric in metrics:
        # Find all CSV files within the nested directories
        submission_files = glob.glob(f"{base_path}/**/{metric}.csv", recursive=True)
        print(submission_files)

        # Dictionary to hold the dataframes grouped by submission name and year
        submissions = {}

        # Read each CSV file and store the dataframe in the dictionary grouped by submission name and year
        for file in submission_files:
            df = pd.read_csv(file)
            parts = file.split('/')
            submission_name = parts[2].replace('_', ' ').title()  # Adjust index based on your file path structure
            year = parts[5].split('=')[1]  # Extract year from the 'window=YYear' part
            if submission_name not in submissions:
                submissions[submission_name] = {}
            submissions[submission_name][year] = df

        # Create a figure for each submission
        for submission_name, years_data in submissions.items():
            num_years = len(years_data)
            plt.figure(figsize=(10, 5))
            plt.suptitle(f'{metric.upper()} Histograms for {submission_name}')

            # Create subplots for each year in a 2x2 grid
            for index, (year, df) in enumerate(sorted(years_data.items()), start=1):
                plt.subplot(2, 2, index)
                plt.hist(df['value'], bins=100, alpha=0.75, label=f'Year: {year}')
                mean_value = df['value'].mean()
                plt.axvline(mean_value, color='r', linestyle='dashed', linewidth=1)
                plt.title(f'Year: {year[1:]}')
                plt.xlabel(metric.upper())
                plt.ylabel('Frequency')
                # log
                plt.yscale('log')
                # plt.xscale('log')
                # mean_value = sub['value'].mean()
                # plt.axvline(mean_value, color='r', linestyle='dashed', linewidth=1)
                plt.legend([f'Mean: {mean_value:.1f}'])

            plt.tight_layout(rect=[0, 0.03, 1, 0.95])  # Adjust layout to not overlap with the suptitle
            plt.show()
