In [None]:
# import pandas as pd
# cm_features_v4 = pd.read_csv(f'data/cm_features_v0.4.csv')
# cm_features_v5 = pd.read_csv(f'data/cm_features_v0.5.csv')
# cm_features_v6 = pd.read_csv(f'data/cm_features_v0.6.csv')
# print(cm_features_v4.shape)
# print(cm_features_v5.shape)
# print(cm_features_v6.shape)

In [None]:
import os
import glob
import pandas as pd
import matplotlib.pyplot as plt

# Path to the submission folder
base_path = "../submission"

# List of metrics
metrics = ['crps', 'ign', 'mis']


def parse_submission_path(file: str, base_path: str) -> tuple[str, str]:
    """Extract the display name and the evaluation window from a metric file path.

    The path is interpreted relative to ``base_path`` so that the result does not
    depend on how deep ``base_path`` itself is, and it is split on the platform
    separator so that Windows paths work as well.

    Args:
        file: Path of a metric CSV located somewhere below ``base_path``.
        base_path: Folder that contains one sub-folder per submission.

    Returns:
        ``(submission_name, year)`` where ``submission_name`` is the first folder
        below ``base_path`` with underscores replaced by spaces and title-cased,
        and ``year`` is the text after ``=`` in the ``window=...`` folder.

    Raises:
        ValueError: If no folder between the submission folder and the file
            starts with ``window=``.
    """
    parts = os.path.relpath(file, base_path).split(os.sep)
    submission_name = parts[0].replace('_', ' ').title()
    for part in parts[1:-1]:
        if part.startswith('window='):
            return submission_name, part.split('=', 1)[1]
    raise ValueError(f"No 'window=<year>' directory in {file!r}")


# Nested mapping: submission name -> year -> metric -> mean of the 'value' column
all_data = {}

for metric in metrics:
    # glob returns files in filesystem order; sort so that the row order of the
    # result (and the file kept when duplicates exist) is reproducible.
    submission_files = sorted(glob.glob(f"{base_path}/**/{metric}.csv", recursive=True))

    for file in submission_files:
        submission_name, year = parse_submission_path(file, base_path)
        year_metrics = all_data.setdefault(submission_name, {}).setdefault(year, {})

        if metric in year_metrics:
            # There should be exactly one file per submission, year and metric.
            # The first one is kept, but the extra file is reported instead of
            # being dropped silently.
            print(f"Ignoring duplicate {metric} file for {submission_name} / {year}: {file}")
            continue

        year_metrics[metric] = pd.read_csv(file)['value'].mean()

# Flatten to {(submission, year): {metric: mean}} so every pair becomes one row
data_frames = {
    (submission, year): metrics_data
    for submission, years_data in all_data.items()
    for year, metrics_data in years_data.items()
}

# Create a multi-index DataFrame
result_df = pd.DataFrame.from_dict(data_frames, orient='index')
result_df.index = pd.MultiIndex.from_tuples(result_df.index, names=['Submission', 'Year'])

# Display the DataFrame
result_df

In [None]:
# Submissions evaluated for 2018, best (lowest) CRPS first; None when the year
# is absent. The year is tested explicitly instead of catching KeyError so that
# an unrelated KeyError (e.g. a missing 'crps' column) is not reported as
# "no data".
if 'Y2018' in result_df.index.get_level_values('Year'):
    submissions_2018 = result_df.xs('Y2018', level='Year').sort_values(by='crps')
else:
    print('No 2018 data')
    submissions_2018 = None
submissions_2018

In [None]:
# Submissions evaluated for 2019, best (lowest) CRPS first; None when the year
# is absent. An explicit membership test keeps unrelated KeyErrors (e.g. a
# missing 'crps' column) from being reported as "no data".
if 'Y2019' in result_df.index.get_level_values('Year'):
    submissions_2019 = result_df.xs('Y2019', level='Year').sort_values(by='crps')
else:
    print('No 2019 data')
    submissions_2019 = None
submissions_2019

In [None]:
# Submissions evaluated for 2020, best (lowest) CRPS first; None when the year
# is absent. An explicit membership test keeps unrelated KeyErrors (e.g. a
# missing 'crps' column) from being reported as "no data".
if 'Y2020' in result_df.index.get_level_values('Year'):
    submissions_2020 = result_df.xs('Y2020', level='Year').sort_values(by='crps')
else:
    print('No 2020 data')
    submissions_2020 = None
submissions_2020

In [None]:
# Submissions evaluated for 2021, best (lowest) CRPS first; None when the year
# is absent. An explicit membership test keeps unrelated KeyErrors (e.g. a
# missing 'crps' column) from being reported as "no data".
if 'Y2021' in result_df.index.get_level_values('Year'):
    submissions_2021 = result_df.xs('Y2021', level='Year').sort_values(by='crps')
else:
    print('No 2021 data')
    submissions_2021 = None
submissions_2021

In [None]:
# Submissions evaluated for 2022, best (lowest) CRPS first; None when the year
# is absent. An explicit membership test keeps unrelated KeyErrors (e.g. a
# missing 'crps' column) from being reported as "no data".
if 'Y2022' in result_df.index.get_level_values('Year'):
    submissions_2022 = result_df.xs('Y2022', level='Year').sort_values(by='crps')
else:
    print('No 2022 data')
    submissions_2022 = None
submissions_2022

In [None]:
# Submissions evaluated for 2023, best (lowest) CRPS first; None when the year
# is absent. An explicit membership test keeps unrelated KeyErrors (e.g. a
# missing 'crps' column) from being reported as "no data".
if 'Y2023' in result_df.index.get_level_values('Year'):
    submissions_2023 = result_df.xs('Y2023', level='Year').sort_values(by='crps')
else:
    print('No 2023 data')
    submissions_2023 = None
submissions_2023

In [None]:
# Per submission, count how many yearly rows carry a value for each metric;
# the counts are what identifies submissions that cover all 5 years.
submissions_all_years = result_df.groupby(level='Submission').count()
submissions_all_years

In [None]:
# Number of yearly CRPS values a submission must have to be ranked.
N_REQUIRED_YEARS = 5

# Everything is derived from result_df in one chain: no columns are assigned
# onto a filtered frame (which can trigger SettingWithCopyWarning on
# pandas < 3) and the cell does not depend on state left by another cell.
metric_counts = result_df.groupby('Submission').count()

# Average crps, ign and mis over all years of each submission
metric_means = (
    result_df
    .groupby('Submission')[['crps', 'ign', 'mis']]
    .mean()
    .add_suffix('_avg')
)

submissions_all_years = (
    metric_counts[metric_counts['crps'] == N_REQUIRED_YEARS]
    .join(metric_means)  # aligned on the 'Submission' index
    .sort_values(by='crps_avg')
)

submissions_all_years[['crps_avg', 'ign_avg', 'mis_avg']]

In [None]:
def histogram_grid_rows(num_panels: int, n_cols: int = 2) -> int:
    """Return how many subplot rows are needed to show ``num_panels`` histograms.

    Args:
        num_panels: Number of histograms (one per year) to place in the figure.
        n_cols: Number of subplot columns.

    Returns:
        ``ceil(num_panels / n_cols)``, but never fewer than two rows so that up
        to four panels keep a 2x2 layout.
    """
    return max(2, -(-num_panels // n_cols))


PLOT_FIGURES = False
if PLOT_FIGURES:

    import os
    import glob
    import pandas as pd
    import matplotlib.pyplot as plt

    # Path to the submission folder
    base_path = "../submission"

    metrics = ['crps', 'ign', 'mis']

    for metric in metrics:
        # Find all CSV files within the nested directories
        submission_files = glob.glob(f"{base_path}/**/{metric}.csv", recursive=True)
        print(submission_files)

        # Dataframes grouped by submission name, then by year
        submissions = {}

        for file in submission_files:
            df = pd.read_csv(file)

            # Parse relative to base_path and on the platform separator, so the
            # result does not depend on base_path depth or on '/' being used.
            parts = os.path.relpath(file, base_path).split(os.sep)
            submission_name = parts[0].replace('_', ' ').title()
            window_parts = [part for part in parts[1:-1] if part.startswith('window=')]
            if not window_parts:
                raise ValueError(f"No 'window=<year>' directory in {file!r}")
            year = window_parts[0].split('=', 1)[1]

            submissions.setdefault(submission_name, {})[year] = df

        # Create a figure for each submission
        for submission_name, years_data in submissions.items():
            num_years = len(years_data)
            # A fixed 2x2 grid fails as soon as a submission has more than four
            # years, so the number of rows follows the number of years.
            n_rows = histogram_grid_rows(num_years)
            plt.figure(figsize=(10, 2.5 * n_rows))
            plt.suptitle(f'{metric.upper()} Histograms for {submission_name}')

            # One subplot per year, in chronological (sorted) order
            for index, (year, df) in enumerate(sorted(years_data.items()), start=1):
                plt.subplot(n_rows, 2, index)
                plt.hist(df['value'], bins=100, alpha=0.75, label=f'Year: {year}')
                mean_value = df['value'].mean()
                # Dashed red line marks the mean of the distribution
                plt.axvline(mean_value, color='r', linestyle='dashed', linewidth=1)
                plt.title(f'Year: {year[1:]}')
                plt.xlabel(metric.upper())
                plt.ylabel('Frequency')
                # Log-scaled counts
                plt.yscale('log')
                plt.legend([f'Mean: {mean_value:.1f}'])

            plt.tight_layout(rect=[0, 0.03, 1, 0.95])  # Adjust layout to not overlap with the suptitle
            plt.show()


# Best Model

In [None]:
# Evaluation windows 2018 through 2023, labelled like the 'Year' index level ('Y2018', ...).
years: list[str] = [f'Y{calendar_year}' for calendar_year in range(2018, 2024)]

In [None]:
# Split the results by a substring match on the (title-cased) submission name:
# names containing 'Bm' go to `benchmarks`, names containing 'Boost' to
# `predictions`. A boolean mask keeps the rows in result_df order; building
# the label list through a set would shuffle them differently on every run.
submission_labels = result_df.index.get_level_values(0)
benchmarks = result_df.loc[['Bm' in name for name in submission_labels]]
predictions = result_df.loc[['Boost' in name for name in submission_labels]]

In [None]:
# Unique model names among the predictions. Sorted because the iteration order
# of a set of strings changes between interpreter runs, which would reorder
# every listing derived from `models`.
models: list[str] = sorted({index[0] for index in predictions.index})

In [None]:
# Lowest benchmark CRPS per year, in the same order as `years`.
benchmark_year_labels = benchmarks.index.get_level_values(1)
benchmarks_mins: list[float] = [
    benchmarks.loc[benchmark_year_labels == year, 'crps'].min()
    for year in years
]

In [None]:
# Spot check: CRPS of one hard-coded model for the Y2021 window.
inspected_model = 'Ng Boost Cm V2.4 Pw 14 Normal D 20 N 300 S Crpscore C F M T Bsd 5 Mbf 0.5 Dli 35 Log T'
inspected_rows = [
    key
    for key in predictions.index.to_list()
    if key[1] == 'Y2021' and key[0] == inspected_model
]
# First matching value; raises IndexError when the model has no Y2021 row.
predictions.loc[inspected_rows]['crps'].values[0]

In [None]:
# Display the list of model names that are scored in the next cell.
models

In [None]:
# Per model:
#   (number of years in which its CRPS is below the best benchmark CRPS,
#    its CRPS averaged over `years`,
#    `prediction - benchmark` for exactly those winning years, so all negative).
scores: dict[str, tuple[int, float, list[float]]] = {}

# Hoisted out of the loops: the CRPS column and both index levels.
prediction_crps = predictions['crps']
prediction_models = predictions.index.get_level_values(0)
prediction_years = predictions.index.get_level_values(1)

for model in models:
    pred_sum: float = .0
    score: int = 0
    diffs: list[float] = []

    is_model = prediction_models == model

    for i, year in enumerate(years):
        # First CRPS recorded for this model and year; raises IndexError when
        # the model has no row for the year.
        prediction = prediction_crps[is_model & (prediction_years == year)].values[0]

        pred_sum += prediction
        if prediction < benchmarks_mins[i]:
            diffs.append(prediction - benchmarks_mins[i])
            score += 1

    scores[model] = (score, pred_sum / len(years), diffs)

for model, model_score in scores.items():
    print('-' * 32)
    print(model)
    print(model_score)
print('-' * 32)

In [None]:
# Print the (submission, year) index entry of the lowest-CRPS prediction for
# each year. Rows are selected with a mask so they keep their order in
# `predictions`; selecting through a set shuffled them, which made the printed
# entry arbitrary whenever two models tie on the minimum.
winner_year_labels = predictions.index.get_level_values(1)
for year in years:
    predictions_year = predictions.loc[winner_year_labels == year]
    print(predictions_year[predictions_year['crps'] == predictions_year['crps'].min()].index[0])

In [None]:
# Name of the model whose CRPS values are extracted and plotted below.
# NOTE(review): hard-coded; presumably picked by hand from the scores and
# per-year winners printed above — confirm it still holds after re-running.
best_model = 'Ng Boost Cm V2.5 Pw 14 Normal D 20 N 300 S Crpscore C F M T Bsd 5 Mbf 0.5 Dli 0 Log F'

# Plots

In [None]:
# CRPS values of `best_model`, ordered by the sorted (submission, year) index.
best_model_rows = [key for key in predictions.index.to_list() if key[0] == best_model]
best_model_crps = predictions.loc[best_model_rows]['crps']
best_model_preds: list[float] = best_model_crps.sort_index().to_list()
best_model_preds

In [None]:
# Hard-coded CRPS values, one per entry of `years`, drawn in the plot below as
# 'Model Trained On High-Frequency Data'.
# NOTE(review): where these numbers come from is not shown in this notebook —
# record the source / run that produced them.
news = [
    23.4649634966269,
    22.377350533740533,
    32.29989332306505,
    96.76587886250475,
    137.1217659362756,
    58.808471417815895
]

In [None]:
import seaborn as sns

plt.rcParams.update({'font.size': 14})

sns.set_style("whitegrid")

# One entry per plotted line: (legend label, CRPS values in the order of
# `years`, RGB colour).
crps_lines = [
    ('Best Model', best_model_preds, (254 / 256, 33 / 256, 139 / 256)),
    ('Model Trained On High-Frequency Data', news, (254 / 256, 215 / 256, 0 / 256)),
    ('Benchmark', benchmarks_mins, (33 / 256, 176 / 256, 254 / 256)),
]

fig, ax = plt.subplots(figsize=(12, 6))

for line_label, crps_values, line_color in crps_lines:
    # x and y are passed as vectors, so no `data=` frame is required.
    sns.lineplot(
        x=pd.Series(years),
        y=pd.Series(crps_values),
        label=line_label,
        marker='o',
        color=line_color,
        ax=ax,
    )

ax.legend()

ax.set_xlabel('Year')
ax.set_ylabel('CRPS')
ax.set_title('CRPS Over Years')

# savefig does not create missing folders, so make sure the target exists.
figure_path = '../figures/best_model_over_years.png'
os.makedirs(os.path.dirname(figure_path), exist_ok=True)
fig.savefig(figure_path)
plt.show()