In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from properscoring import crps_ensemble

In [2]:
dir = '/Users/ljob/Desktop/Data/'

In [3]:
def format_observed_data(df):
    # Melt the observed data to long format and create the 'year_month' column in a single step
    observed_df = df.melt(id_vars=['Year'], var_name='Month', value_name='Observed')

    # Combine 'Year' and 'Month' into a proper 'year_month' format and convert to datetime in one step
    observed_df['year_month'] = (pd.to_datetime(observed_df['Year'].astype(str) + '-' + observed_df['Month'].str[1:4], format='%Y-%b')).dt.strftime('%Y-%m')
    
    # Sort by 'year_month' for proper chronological order
    observed_df = observed_df.sort_values(by='year_month')
    observed_df = observed_df[['year_month', 'Observed']].reset_index(drop=True)

    return observed_df

In [4]:
def format_forecast_data(df):
    # Step 1: Combine 'year' and 'month' into a datetime column 'first_month'
    df['forecast_date'] = pd.to_datetime(df['year'].astype(str) + '-' + df['month'].astype(str), format='%Y-%m')

    # Step 2: Melt the DataFrame
    melted_df = pd.melt(df, id_vars=['forecast_date'], value_vars=['one', 'two', 'three', 'four', 'five', 'six'], 
                    var_name='month', value_name='nbs')
    
    # Step 3: Define the forecast month mapping
    forecast_map = {
        'one': 0,
        'two': 1,
        'three': 2,
        'four': 3,
        'five': 4,
        'six': 5
    }

    melted_df['year_month'] = melted_df['forecast_date'] + melted_df['month'].map(forecast_map).apply(lambda x: pd.DateOffset(months=x))

    melted_df['year_month'] = pd.to_datetime(melted_df['year_month']).dt.strftime('%Y-%m')

    final_df = melted_df[['forecast_date', 'year_month', 'nbs']].sort_values(by=['forecast_date', 'year_month']).reset_index(drop=True)

    return final_df

In [5]:
def calc_skill_metrics(predictions, observations):

    # Standardizing the observed and predicted values
    scaler = StandardScaler()

    # Fit and transform the data
    pred_scaled = scaler.fit_transform(predictions.values.reshape(-1, 1))
    obs_scaled = scaler.fit_transform(observations.values.reshape(-1, 1))

    # RMSE (Root Mean Squared Error) on standardized data
    rmse = np.sqrt(mean_squared_error(obs_scaled, pred_scaled))
    print(f"RMSE (Standardized): {rmse}")

    # R-squared on standardized data
    r_squared = r2_score(obs_scaled, pred_scaled)
    print(f"R-squared (Standardized): {r_squared}")

    # Bias (Average of prediction - observation) on standardized data
    bias = np.mean(pred_scaled - obs_scaled)
    print(f"Bias (Standardized): {bias}")

    # Variance (Variance of predictions) on standardized data
    variance = np.var(pred_scaled)
    print(f"Variance (Standardized): {variance}")

    # CRPS (Continuous Ranked Probability Score)
    # Assuming predicted is a deterministic point forecast (not distribution)
    # CRPS is more meaningful for probabilistic forecasts. Here we use `crps_ensemble`
    # For simplicity, we use the predicted as the ensemble of one prediction (as a proxy).
    crps = crps_ensemble(obs_scaled, pred_scaled)
    print(f"CRPS (Standardized): {np.mean(crps)}")

In [6]:
# Load the observed data
observed_sup = pd.read_csv(dir + 'GLCC/LakeSuperior_MonthlyNetBasinSupply_1900to2025.csv', skiprows=11)
observed_mih = pd.read_csv(dir + 'GLCC/LakeMichiganHuron_MonthlyNetBasinSupply_1900to2025.csv', skiprows=11)
observed_eri = pd.read_csv(dir + 'GLCC/LakeErie_MonthlyNetBasinSupply_1900to2025.csv', skiprows=11)
observed_ont = pd.read_csv(dir + 'GLCC/LakeOntario_MonthlyNetBasinSupply_1900to2025.csv', skiprows=11)               

In [7]:
df_obs_sup = format_observed_data(observed_sup)
df_obs_mih = format_observed_data(observed_mih)
df_obs_eri = format_observed_data(observed_eri)
df_obs_ont = format_observed_data(observed_ont)

In [8]:
# Load forecast data
glshfs_sup = pd.read_csv(dir + 'NBS_LF/GLSHFSModel/SUP.GLSHFS.csv', skiprows=10)
glshfs_mih = pd.read_csv(dir + 'NBS_LF/GLSHFSModel/MIH.GLSHFS.csv', skiprows=10)
glshfs_eri = pd.read_csv(dir + 'NBS_LF/GLSHFSModel/ERI.GLSHFS.csv', skiprows=10)
glshfs_ont = pd.read_csv(dir + 'NBS_LF/GLSHFSModel/ONT.GLSHFS.csv', skiprows=10)

In [96]:
df_glshfs_sup = format_forecast_data(glshfs_sup)
df_glshfs_mih = format_forecast_data(glshfs_mih)
df_glshfs_eri = format_forecast_data(glshfs_eri)
df_glshfs_ont = format_forecast_data(glshfs_ont)

  melted_df['year_month'] = melted_df['forecast_date'] + melted_df['month'].map(forecast_map).apply(lambda x: pd.DateOffset(months=x))
  melted_df['year_month'] = melted_df['forecast_date'] + melted_df['month'].map(forecast_map).apply(lambda x: pd.DateOffset(months=x))
  melted_df['year_month'] = melted_df['forecast_date'] + melted_df['month'].map(forecast_map).apply(lambda x: pd.DateOffset(months=x))
  melted_df['year_month'] = melted_df['forecast_date'] + melted_df['month'].map(forecast_map).apply(lambda x: pd.DateOffset(months=x))


In [97]:
# Merge forecast and observations
df_merged_sup = pd.merge(df_glshfs_sup, df_obs_sup, on='year_month', how='left')
df_merged_mih = pd.merge(df_glshfs_mih, df_obs_mih, on='year_month', how='left')
df_merged_eri = pd.merge(df_glshfs_eri, df_obs_eri, on='year_month', how='left')
df_merged_ont = pd.merge(df_glshfs_ont, df_obs_ont, on='year_month', how='left')

In [98]:
# Remove any missing data in the observations
df_merged_sup.replace(-99990.0, np.nan, inplace=True)
df_merged_sup = df_merged_sup.dropna()

df_merged_mih.replace(-99990.0, np.nan, inplace=True)
df_merged_mih = df_merged_mih.dropna()

df_merged_eri.replace(-99990.0, np.nan, inplace=True)
df_merged_eri = df_merged_eri.dropna()

df_merged_ont.replace(-99990.0, np.nan, inplace=True)
df_merged_ont = df_merged_ont.dropna()

In [99]:
print(df_merged_sup)

    forecast_date year_month   nbs  Observed
0      2021-01-01    2021-01  -974    -650.0
1      2021-01-01    2021-02  -393     260.0
2      2021-01-01    2021-03  1370    1960.0
3      2021-01-01    2021-04  2769    3800.0
4      2021-01-01    2021-05  4241    2760.0
..            ...        ...   ...       ...
283    2024-12-01    2025-01 -1364    -760.0
284    2024-12-01    2025-02  -739    -460.0
288    2025-01-01    2025-01 -1185    -760.0
289    2025-01-01    2025-02  -748    -460.0
294    2025-02-01    2025-02  -604    -460.0

[285 rows x 4 columns]


In [100]:
# Combine the 4 dataframes into one long dataframe
combined_df = pd.concat([df_merged_sup, df_merged_mih, df_merged_eri, df_merged_ont], ignore_index=True)

In [101]:
calc_skill_metrics(combined_df['nbs'], combined_df['Observed'])

RMSE (Standardized): 0.6432531574837181
R-squared (Standardized): 0.5862253753872269
Bias (Standardized): -1.246566203087895e-16
Variance (Standardized): 0.9999999999999998
CRPS (Standardized): 0.4723780650560114
