In [None]:
import xarray as xr
import numpy as np
import pandas as pd
import os
import geopandas as gpd

In [None]:
# Compute MBE averaged over space (lat and lon) and months for each lead time (i.e. timestep)

In [None]:
# MBE function
def compute_perstep_mbe(truth, pred):
    """Mean bias error (forecast minus truth), averaged over the spatial grid.

    Parameters
    ----------
    truth : reference field (e.g. an xarray DataArray).
    pred : forecast field; must support ``pred - truth``.

    Returns
    -------
    The signed difference ``pred - truth`` reduced with ``.mean`` over the
    "latitude" and "longitude" dimensions. Every other dimension of the
    difference is kept.

    NOTE(review): this is a plain, unweighted mean over grid points. If the
    grid is regular in latitude/longitude, cells are not equal-area, so a
    cos(latitude) weighting may be wanted; confirm the intent.
    """
    spatial_dims = ["latitude", "longitude"]
    bias = pred - truth  # positive where the forecast is above the truth
    return bias.mean(dim=spatial_dims)
    
# Function to subset to subpolar and polar regions
def subset_polar_regions(ds):
    """Subset dataset to both polar caps: latitude >= 60 or latitude <= -60.

    Both bounds are inclusive. Points outside the caps are masked by
    ``ds.where`` and, because of ``drop=True``, coordinate labels that are
    entirely outside the selection are removed from the result.
    """
    # |lat| >= 60 is the same condition as (lat >= 60) | (lat <= -60).
    in_polar_caps = abs(ds.latitude) >= 60
    return ds.where(in_polar_caps, drop=True)

def subset_northern_temperate(ds):
    """Subset dataset to the Northern Temperate Zone: 35°N to 60°N.

    Both bounds are inclusive.
    """
    lat = ds.latitude
    in_band = (lat <= 60) & (lat >= 35)
    # drop=True keeps only the points inside the latitude band and removes
    # the coordinate labels that fall entirely outside of it.
    return ds.where(in_band, drop=True)

def subset_southern_temperate(ds):
    """Subset dataset to the Southern Temperate Zone: 60°S to 35°S.

    Keeps the points whose latitude lies in [-60, -35] (both bounds
    inclusive). Everything else is masked by ``ds.where`` and, because of
    ``drop=True``, coordinate labels entirely outside the band are removed.
    """
    return ds.where((ds.latitude <= -35) & (ds.latitude >= -60), drop=True)
    
def subset_tropics(ds):
    """Subset dataset to the Tropics: -23.5° to 23.5° latitude.

    Both bounds are inclusive; coordinate labels entirely outside the band
    are removed (``drop=True``).
    """
    south_bound, north_bound = -23.5, 23.5
    lat = ds.latitude
    within_tropics = (lat <= north_bound) & (lat >= south_bound)
    return ds.where(within_tropics, drop=True)

def subset_subtropics(ds):
    """
    Subset dataset to the subtropics of both hemispheres:
    - Northern Subtropics: 23.5°N to 35°N
    - Southern Subtropics: 23.5°S to 35°S

    All four bounds are inclusive; coordinate labels entirely outside both
    bands are removed (``drop=True``).
    """
    # The two bands are mirror images of each other, so a single test on the
    # absolute latitude selects exactly the same points as the union of the
    # northern and southern conditions.
    abs_lat = abs(ds.latitude)
    in_subtropics = (abs_lat >= 23.5) & (abs_lat <= 35)
    return ds.where(in_subtropics, drop=True)

def subset_africa(ds, africa_gdf):
    """
    Clip an xarray dataset to the Africa region described by a GeoDataFrame.

    Parameters
    ----------
    ds : dataset exposing the ``.rio`` accessor.
    africa_gdf : GeoDataFrame whose ``geometry`` and ``crs`` define the clip
        region.

    Returns
    -------
    The result of ``rio.clip`` on a CRS-tagged copy of ``ds``.

    NOTE(review): the ``.rio`` accessor is presumably provided by rioxarray,
    which has to be imported somewhere in the session for it to exist. The
    import cell visible in this notebook does not import it; confirm.
    """
    # inplace=False: the CRS is written onto a new object, `ds` is untouched.
    tagged = ds.rio.write_crs("EPSG:4326", inplace=False)
    clipped = tagged.rio.clip(africa_gdf.geometry, africa_gdf.crs, drop=True)
    return clipped

# Africa outline read from a GeoJSON file in the working directory and
# reprojected to EPSG:4326, the same CRS that subset_africa writes onto the
# dataset before clipping.
africa_gdf = (
    gpd.read_file("Africa_outline.geojson")
    .to_crs("EPSG:4326")
)

# Preprocess function for unit conversion
def preprocess(ds):
    """Convert units of selected variables and drop ``tp``.

    - ``t2m``: 273.15 is subtracted and the ``units`` attribute is set to
      "Celsius" (the input is presumably in Kelvin; confirm against the files).
    - ``msl``: divided by 100 and the ``units`` attribute is set to "hPa"
      (the input is presumably in Pa; confirm against the files).
    - ``tp``: removed from the dataset if present.

    Variables that are absent are skipped silently.

    Parameters
    ----------
    ds : dataset to convert. It is NOT modified.

    Returns
    -------
    A new dataset with the conversions applied.
    """
    # Work on a shallow copy: assigning ds["t2m"] / ds["msl"] on the caller's
    # object would convert the units a second time if this function (or the
    # notebook cell calling it) were re-run on the same dataset.
    ds = ds.copy()
    if "t2m" in ds:
        ds["t2m"] = ds["t2m"] - 273.15
        ds["t2m"].attrs["units"] = "Celsius"
    if "msl" in ds:
        ds["msl"] = ds["msl"] / 100.0
        ds["msl"].attrs["units"] = "hPa"
    if "tp" in ds:
        ds = ds.drop_vars("tp")
    return ds

In [None]:
# Compute MBE of IFS-HRES (apply the subset function matching the required geographic region)

In [None]:
# Per-lead-time MBE of the IFS-HRES forecast ("marsfc" files) against ERA5,
# restricted to the southern temperate band, for every month of 2024 plus the
# mean over those months.

# Root directory where all monthly folders are stored
base_dir = "../Surface Variables/"

# Define monthly date range and variables
months = pd.date_range("2024-01-01", "2024-12-01", freq="MS")
variables = ["u10", "v10", "t2m", "msl", "q"]

# Output file; its directory is created up front so that a missing folder
# cannot make the save fail after the whole loop has already run.
output_path = "SouthernTemperate/PerLeadTime/SouthernTemperate_marsfc_MBE_leadtimes_w_annual.nc"
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# Collect monthly MBE datasets
monthly_mbe_datasets = []

for month in months:
    folder_str = month.strftime("%Y%m01")
    forecast_path = os.path.join(base_dir, folder_str, f"{folder_str}_marsfc_sv_q.nc")
    truth_path = os.path.join(base_dir, folder_str, f"{folder_str}_era5_fc_sv_q.nc")
    print(f"starting for {folder_str}")

    # The context managers close both NetCDF files at the end of the
    # iteration instead of leaving one pair of open handles per month.
    with xr.open_dataset(forecast_path) as fc_file, xr.open_dataset(truth_path) as era5_file:
        fc = preprocess(fc_file)
        era5 = preprocess(era5_file)
        era5 = era5.rename({"valid_time": "time"})

        # After the re-indexing below, `pred - truth` aligns the two series
        # on the integer index and keeps only the common steps, so a length
        # mismatch would otherwise go unnoticed.
        if len(fc.time) != len(era5.time):
            print(
                f"WARNING: {folder_str}: forecast has {len(fc.time)} time steps, "
                f"truth has {len(era5.time)}; only the overlapping steps are compared"
            )

        # Replace time with index (0, 1, 2, ...) so forecast and truth are
        # matched by position along the lead-time axis.
        fc = fc.assign_coords(time=np.arange(len(fc.time)).astype("float64"))
        era5 = era5.assign_coords(time=np.arange(len(era5.time)).astype("float64"))

        print(f"preprocessed for {folder_str}")

        # Subset to southern temperate
        fc = subset_southern_temperate(fc)
        era5 = subset_southern_temperate(era5)
        print(f"subset computed for {folder_str}")

        mbe_dict = {var: compute_perstep_mbe(era5[var], fc[var]) for var in variables}
        mbe_ds = xr.Dataset(mbe_dict)
        mbe_ds = mbe_ds.expand_dims(month=[month.month])
        # Make sure the values are in memory before the source files close.
        mbe_ds = mbe_ds.load()

    monthly_mbe_datasets.append(mbe_ds)
    print(f"computed per lead time MBE for {folder_str}")

# Concatenate and save
annual_mbe_ds = xr.concat(monthly_mbe_datasets, dim="month")
# Compute average across all months
mbe_mean = annual_mbe_ds.mean(dim="month")
# Expand and label it as "month 13"
mbe_mean = mbe_mean.expand_dims(month=[13])  # month 13 = annual mean
# Concatenate with original data
annual_mbe_ds_with_mean = xr.concat([annual_mbe_ds, mbe_mean], dim="month")

annual_mbe_ds_with_mean.to_netcdf(output_path)
print(f"Saved to '{output_path}'")

In [None]:
# Compute MBE of AIFS (apply needed subset function depending on required geographic region)

In [None]:
# Per-lead-time MBE of the AIFS forecast ("marsai" files) against ERA5,
# restricted to the southern temperate band, for March to December 2024 plus
# the mean over those months.

# Root directory where all monthly folders are stored
base_dir = "../Surface Variables/"

# Define monthly date range and variables
# (this range starts in March, so only 10 months are processed here)
months = pd.date_range("2024-03-01", "2024-12-01", freq="MS")
variables = ["u10", "v10", "t2m", "msl", "q"]

# Output file; its directory is created up front so that a missing folder
# cannot make the save fail after the whole loop has already run.
output_path = "SouthernTemperate/PerLeadTime/SouthernTemperate_marsai_MBE_leadtimes_w_annual.nc"
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# Collect monthly MBE datasets
monthly_mbe_datasets = []

for month in months:
    folder_str = month.strftime("%Y%m01")
    forecast_path = os.path.join(base_dir, folder_str, f"{folder_str}_marsai_sv_q.nc")
    truth_path = os.path.join(base_dir, folder_str, f"{folder_str}_era5_gcai_sv_q.nc")
    print(f"starting for {folder_str}")

    # The context managers close both NetCDF files at the end of the
    # iteration instead of leaving one pair of open handles per month.
    with xr.open_dataset(forecast_path) as ai_file, xr.open_dataset(truth_path) as era5_file:
        ai = preprocess(ai_file)
        era5 = preprocess(era5_file)
        era5 = era5.rename({"valid_time": "time"})

        # After the re-indexing below, `pred - truth` aligns the two series
        # on the integer index and keeps only the common steps, so a length
        # mismatch would otherwise go unnoticed.
        if len(ai.time) != len(era5.time):
            print(
                f"WARNING: {folder_str}: forecast has {len(ai.time)} time steps, "
                f"truth has {len(era5.time)}; only the overlapping steps are compared"
            )

        # Replace time with index (0, 1, 2, ...) so forecast and truth are
        # matched by position along the lead-time axis.
        ai = ai.assign_coords(time=np.arange(len(ai.time)).astype("float64"))
        era5 = era5.assign_coords(time=np.arange(len(era5.time)).astype("float64"))

        print(f"preprocessed for {folder_str}")

        # Subset to southern temperate
        ai = subset_southern_temperate(ai)
        era5 = subset_southern_temperate(era5)
        print(f"subset computed for {folder_str}")

        mbe_dict = {var: compute_perstep_mbe(era5[var], ai[var]) for var in variables}
        mbe_ds = xr.Dataset(mbe_dict)
        mbe_ds = mbe_ds.expand_dims(month=[month.month])
        # Make sure the values are in memory before the source files close.
        mbe_ds = mbe_ds.load()

    monthly_mbe_datasets.append(mbe_ds)
    print(f"computed per lead time MBE for {folder_str}")

# Concatenate and save
annual_mbe_ds = xr.concat(monthly_mbe_datasets, dim="month")
# Compute average across all processed months (March-December for this model)
mbe_mean = annual_mbe_ds.mean(dim="month")
# Expand and label it as "month 13"
mbe_mean = mbe_mean.expand_dims(month=[13])  # month 13 = mean over the processed months
# Concatenate with original data
annual_mbe_ds_with_mean = xr.concat([annual_mbe_ds, mbe_mean], dim="month")

annual_mbe_ds_with_mean.to_netcdf(output_path)
print(f"Saved to '{output_path}'")

In [None]:
# Compute MBE of GraphCast (apply the subset function matching the required geographic region)

In [None]:
# Per-lead-time MBE of the GraphCast forecast ("gc" files) against ERA5,
# restricted to the southern temperate band, for every month of 2024 plus the
# mean over those months.

# Root directory where all monthly folders are stored
base_dir = "../Surface Variables/"

# Define monthly date range and variables
months = pd.date_range("2024-01-01", "2024-12-01", freq="MS")
variables = ["u10", "v10", "t2m", "msl", "q"]

# Output file; its directory is created up front so that a missing folder
# cannot make the save fail after the whole loop has already run.
output_path = "SouthernTemperate/PerLeadTime/SouthernTemperate_gc_MBE_leadtimes_w_annual.nc"
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# Collect monthly MBE datasets
monthly_mbe_datasets = []

for month in months:
    folder_str = month.strftime("%Y%m01")
    forecast_path = os.path.join(base_dir, folder_str, f"{folder_str}_gc_sv_q.nc")
    truth_path = os.path.join(base_dir, folder_str, f"{folder_str}_era5_gcai_sv_q.nc")
    print(f"starting for {folder_str}")

    # The context managers close both NetCDF files at the end of the
    # iteration instead of leaving one pair of open handles per month.
    with xr.open_dataset(forecast_path) as gc_file, xr.open_dataset(truth_path) as era5_file:
        gc = preprocess(gc_file)
        era5 = preprocess(era5_file)
        era5 = era5.rename({"valid_time": "time"})

        # Drop the existing "time" coordinate to avoid conflicts
        if "time" in gc.coords:
            gc = gc.drop_vars("time")

        # Swap the "step" dimension with "valid_time" and rename it to "time"
        gc = gc.swap_dims({"step": "valid_time"})  # make valid_time a dimension
        gc = gc.rename({"valid_time": "time"})     # rename the dimension to "time"

        # After the re-indexing below, `pred - truth` aligns the two series
        # on the integer index and keeps only the common steps, so a length
        # mismatch would otherwise go unnoticed.
        if len(gc.time) != len(era5.time):
            print(
                f"WARNING: {folder_str}: forecast has {len(gc.time)} time steps, "
                f"truth has {len(era5.time)}; only the overlapping steps are compared"
            )

        # Replace time with index (0, 1, 2, ...) so forecast and truth are
        # matched by position along the lead-time axis.
        gc = gc.assign_coords(time=np.arange(len(gc.time)).astype("float64"))
        era5 = era5.assign_coords(time=np.arange(len(era5.time)).astype("float64"))

        print(f"preprocessed for {folder_str}")

        # Subset to southern temperate
        gc = subset_southern_temperate(gc)
        era5 = subset_southern_temperate(era5)
        print(f"subset computed for {folder_str}")

        mbe_dict = {var: compute_perstep_mbe(era5[var], gc[var]) for var in variables}
        mbe_ds = xr.Dataset(mbe_dict)
        mbe_ds = mbe_ds.expand_dims(month=[month.month])
        # Make sure the values are in memory before the source files close.
        mbe_ds = mbe_ds.load()

    monthly_mbe_datasets.append(mbe_ds)
    print(f"computed per lead time MBE for {folder_str}")

# Concatenate and save
annual_mbe_ds = xr.concat(monthly_mbe_datasets, dim="month")
# Compute average across all months
mbe_mean = annual_mbe_ds.mean(dim="month")
# Expand and label it as "month 13"
mbe_mean = mbe_mean.expand_dims(month=[13])  # month 13 = annual mean
# Concatenate with original data
annual_mbe_ds_with_mean = xr.concat([annual_mbe_ds, mbe_mean], dim="month")

annual_mbe_ds_with_mean.to_netcdf(output_path)
print(f"Saved to '{output_path}'")