In [None]:
import xarray as xr
import numpy as np
import pandas as pd
import os
import json
import geopandas as gpd

In [None]:
# Compute RMSE averaged over space (lat and lon) and months for each lead time (i.e. timestep)

In [None]:
# RMSE function
def compute_perstep_rmse(truth, pred):
    """Return the root-mean-square error between ``truth`` and ``pred``.

    The squared difference is averaged over the ``latitude`` and
    ``longitude`` dimensions only, so any other dimension of the inputs is
    kept in the result.
    """
    squared_error = (truth - pred) ** 2
    mean_squared_error = squared_error.mean(dim=["latitude", "longitude"])
    return np.sqrt(mean_squared_error)

# Function to subset to subpolar and polar regions
def subset_polar_regions(ds):
    """Keep only latitudes at or poleward of 60° in either hemisphere."""
    # |lat| >= 60 is the same test as (lat >= 60) or (lat <= -60)
    in_polar_band = abs(ds.latitude) >= 60
    return ds.where(in_polar_band, drop=True)

def subset_northern_temperate(ds):
    """Subset dataset to the Northern Temperate Zone: 35°N to 60°N (both bounds included)."""
    lat = ds.latitude
    north_of_35n = lat >= 35
    south_of_60n = lat <= 60
    return ds.where(north_of_35n & south_of_60n, drop=True)

def subset_southern_temperate(ds):
    """Subset dataset to the Southern Temperate Zone: 35°S to 60°S (both bounds included)."""
    lat = ds.latitude
    # Southern latitudes are negative, so the band is -60 <= lat <= -35
    return ds.where((lat >= -60) & (lat <= -35), drop=True)

def subset_tropics(ds):
    """Subset dataset to the Tropics: latitudes from -23.5° to 23.5° (both bounds included)."""
    # The band is symmetric about the equator, so test the absolute latitude
    within_tropics = abs(ds.latitude) <= 23.5
    return ds.where(within_tropics, drop=True)

def subset_subtropics(ds):
    """
    Subset dataset to the subtropics (bounds included):
    - Northern Subtropics: 23.5°N to 35°N
    - Southern Subtropics: 23.5°S to 35°S
    """
    # Both bands mirror each other, so a single test on |lat| covers them
    abs_lat = abs(ds.latitude)
    in_subtropics = (abs_lat >= 23.5) & (abs_lat <= 35)
    return ds.where(in_subtropics, drop=True)

def subset_africa(ds, africa_gdf):
    """
    Clip an xarray dataset to the geometries held in ``africa_gdf``.

    The dataset is tagged with the EPSG:4326 CRS (with ``inplace=False``)
    and then clipped with the GeoDataFrame's geometries and CRS, passing
    ``drop=True``.
    """
    # NOTE(review): the ``.rio`` accessor is normally registered by importing
    # rioxarray, and no such import is visible in this file — confirm it is
    # imported before this function is called.
    tagged = ds.rio.write_crs("EPSG:4326", inplace=False)
    shapes, shapes_crs = africa_gdf.geometry, africa_gdf.crs
    return tagged.rio.clip(shapes, shapes_crs, drop=True)

# Read the Africa outline and express it in EPSG:4326
africa_gdf = (
    gpd.read_file("Africa_outline.geojson")
    .to_crs("EPSG:4326")
)

# Preprocess function for unit conversion
def preprocess(ds):
    """Return a unit-converted copy of ``ds``; the input dataset is left untouched.

    Each step applies only when the variable is present:
    - ``t2m``: 273.15 is subtracted and ``units`` is set to "Celsius"
    - ``msl``: divided by 100 and ``units`` is set to "hPa"
    - ``tp``: dropped from the dataset

    Working on a copy makes the function safe to call more than once on the
    same dataset (e.g. when a notebook cell is re-run): the conversion is
    never applied twice to the caller's object.
    """
    # Shallow copy: the variables are replaced below, never edited in place
    converted = ds.copy()
    if "t2m" in converted:
        converted["t2m"] = converted["t2m"] - 273.15
        converted["t2m"].attrs["units"] = "Celsius"
    if "msl" in converted:
        converted["msl"] = converted["msl"] / 100.0
        converted["msl"].attrs["units"] = "hPa"
    if "tp" in converted:
        converted = converted.drop_vars("tp")
    return converted

In [None]:
# Compute RMSE of IFS-HRES (apply needed subset function depending on required geographic region)

In [None]:
# Per-lead-time RMSE of the "marsfc" forecast files against their ERA5 truth
# files, one entry per month, saved as a single NetCDF file.

# Root directory where all monthly folders are stored
base_dir = "../Surface Variables/"

# Define monthly date range and variables
months = pd.date_range("2024-01-01", "2024-12-01", freq="MS")
variables = ["u10", "v10", "t2m", "msl", "q"]

# Collect monthly RMSE datasets
monthly_rmse_datasets = []

for month in months:
    folder_str = month.strftime("%Y%m01")
    forecast_path = os.path.join(base_dir, folder_str, f"{folder_str}_marsfc_sv_q.nc")
    truth_path = os.path.join(base_dir, folder_str, f"{folder_str}_era5_fc_sv_q.nc")
    print(f"starting for {folder_str}")

    # Open both files in a context manager so they are closed at the end of
    # every iteration instead of staying open for the rest of the session.
    with xr.open_dataset(forecast_path) as fc_raw, xr.open_dataset(truth_path) as era5_raw:
        fc = preprocess(fc_raw)
        era5 = preprocess(era5_raw)
        era5 = era5.rename({"valid_time": "time"})

        # assign numbers for timesteps instead of datetime stamps, so forecast
        # and truth are matched by step index rather than by timestamp
        fc = fc.assign_coords(time=np.arange(len(fc.time)).astype("float64"))
        era5 = era5.assign_coords(time=np.arange(len(era5.time)).astype("float64"))

        print(f"preprocessed, renamed in era5 for {folder_str}")

        # Subset to southern temperate
        fc = subset_southern_temperate(fc)
        era5 = subset_southern_temperate(era5)
        print(f"subset computed for {folder_str}")

        rmse_dict = {var: compute_perstep_rmse(era5[var], fc[var]) for var in variables}
        rmse_ds = xr.Dataset(rmse_dict)
        rmse_ds = rmse_ds.expand_dims(month=[month.month])  # assign numbers from 1 to 12

        # Make sure the result is fully in memory before the files are closed
        rmse_ds = rmse_ds.load()

    monthly_rmse_datasets.append(rmse_ds)

    print(f"computed per lead time rmse for {folder_str}")


# Concatenate all monthly datasets
annual_rmse_ds = xr.concat(monthly_rmse_datasets, dim="month")

# Save the result
annual_rmse_ds.to_netcdf("SouthernTemperate_marsfc_RMSE_leadtimes.nc")
print("Saved to 'SouthernTemperate_marsfc_RMSE_leadtimes.nc'")

In [None]:
# compute rmse per lead time over the whole year for IFS-HRES

In [None]:
# Whole-year per-lead-time RMSE of the "marsfc" forecasts: every month's
# subset fields are kept, stacked along "month_data", and the squared error
# is averaged over latitude, longitude and month_data. Stored as month 13.

# For storing full-year truth and forecast data
year_fc = {var: [] for var in variables}
year_era5 = {var: [] for var in variables}

for month in months:
    folder_str = month.strftime("%Y%m01")
    forecast_path = os.path.join(base_dir, folder_str, f"{folder_str}_marsfc_sv_q.nc")
    truth_path = os.path.join(base_dir, folder_str, f"{folder_str}_era5_fc_sv_q.nc")
    print(f"starting for {folder_str}")

    # Context manager: both files are closed once this month's data is stored
    with xr.open_dataset(forecast_path) as fc_raw, xr.open_dataset(truth_path) as era5_raw:
        fc = preprocess(fc_raw)
        era5 = preprocess(era5_raw)
        era5 = era5.rename({"valid_time": "time"})

        # assign numbers for timesteps instead of datetime stamps
        fc = fc.assign_coords(time=np.arange(len(fc.time)).astype("float64"))
        era5 = era5.assign_coords(time=np.arange(len(era5.time)).astype("float64"))

        print(f"preprocessed, renamed in era5 for {folder_str}")

        # Subset to southern temperate
        fc = subset_southern_temperate(fc)
        era5 = subset_southern_temperate(era5)
        print(f"subset computed for {folder_str}")

        # Store raw data for 13th month RMSE; load() guarantees the arrays
        # stay usable after the source files are closed
        for var in variables:
            year_fc[var].append(fc[var].load())
            year_era5[var].append(era5[var].load())

    print(f" stored raw data for {folder_str}")

# Compute "13th month" RMSE over all months
full_year_rmse = {}

print("starting to compute 13th month")

for var in variables:
    # Stack all monthly data along a new dimension
    all_fc = xr.concat(year_fc[var], dim="month_data")
    all_era5 = xr.concat(year_era5[var], dim="month_data")

    # Compute RMSE per timestep over all lat/lon/month_data
    rmse = np.sqrt(((all_fc - all_era5) ** 2).mean(dim=["latitude", "longitude", "month_data"]))
    full_year_rmse[var] = rmse
    print(f"computed annual rmse for {var}")

# The computation is done at this point (the message used to repeat "starting")
print("finished computing 13th month")

# Create dataset for month 13
rmse_13 = xr.Dataset(full_year_rmse)
rmse_13 = rmse_13.expand_dims(month=[13])

rmse_13.to_netcdf("SouthernTemperate_marsfc_RMSE_leadtimes_13.nc")

print("Saved to 'SouthernTemperate_marsfc_RMSE_leadtimes_13.nc'")

In [None]:
# merge "marsfc_RMSE_leadtimes.nc" with "marsfc_RMSE_leadtimes_13.nc"

In [None]:
# Append the whole-year entry (month 13) to the monthly RMSE dataset and
# write the combined result into the per-lead-time output folder.
merged_path = "SouthernTemperate/PerLeadTime/SouthernTemperate_marsfc_RMSE_leadtimes_w_annual.nc"

# The target sits in a nested folder; create it up front so the write does
# not fail on a fresh checkout where the folder is missing.
os.makedirs(os.path.dirname(merged_path), exist_ok=True)

# Context managers close both input files once the merged file is written
with xr.open_dataset("SouthernTemperate_marsfc_RMSE_leadtimes_13.nc") as annual, \
        xr.open_dataset("SouthernTemperate_marsfc_RMSE_leadtimes.nc") as monthly:
    monthly_rmse_ds_with_annual = xr.concat([monthly, annual], dim="month")

    # Save final dataset
    monthly_rmse_ds_with_annual.to_netcdf(merged_path)

In [None]:
# Compute RMSE of AIFS (apply needed subset function depending on required geographic region)

In [None]:
# Per-lead-time RMSE of the "marsai" forecast files against their ERA5 truth
# files, one entry per month, saved as a single NetCDF file.

# Root directory where all monthly folders are stored
base_dir = "../Surface Variables/"

# Define monthly date range and variables (this range starts in March 2024)
months = pd.date_range("2024-03-01", "2024-12-01", freq="MS")
variables = ["u10", "v10", "t2m", "msl", "q"]

# Collect monthly RMSE datasets
monthly_rmse_datasets = []

for month in months:
    folder_str = month.strftime("%Y%m01")
    forecast_path = os.path.join(base_dir, folder_str, f"{folder_str}_marsai_sv_q.nc")
    truth_path = os.path.join(base_dir, folder_str, f"{folder_str}_era5_gcai_sv_q.nc")
    print(f"starting for {folder_str}")

    # Open both files in a context manager so they are closed at the end of
    # every iteration instead of staying open for the rest of the session.
    with xr.open_dataset(forecast_path) as ai_raw, xr.open_dataset(truth_path) as era5_raw:
        ai = preprocess(ai_raw)
        era5 = preprocess(era5_raw)
        era5 = era5.rename({"valid_time": "time"})

        # assign numbers for timesteps instead of datetime stamps, so forecast
        # and truth are matched by step index rather than by timestamp
        ai = ai.assign_coords(time=np.arange(len(ai.time)).astype("float64"))
        era5 = era5.assign_coords(time=np.arange(len(era5.time)).astype("float64"))

        print(f"preprocessed, renamed in era5 for {folder_str}")

        # Subset to southern temperate
        ai = subset_southern_temperate(ai)
        era5 = subset_southern_temperate(era5)
        print(f"subset computed for {folder_str}")

        rmse_dict = {var: compute_perstep_rmse(era5[var], ai[var]) for var in variables}
        rmse_ds = xr.Dataset(rmse_dict)
        rmse_ds = rmse_ds.expand_dims(month=[month.month])  # assign numbers from 1 to 12

        # Make sure the result is fully in memory before the files are closed
        rmse_ds = rmse_ds.load()

    monthly_rmse_datasets.append(rmse_ds)

    print(f"computed per lead time rmse for {folder_str}")


# Concatenate all monthly datasets
annual_rmse_ds = xr.concat(monthly_rmse_datasets, dim="month")

# Save the result
annual_rmse_ds.to_netcdf("SouthernTemperate_marsai_RMSE_leadtimes.nc")
print("Saved to 'SouthernTemperate_marsai_RMSE_leadtimes.nc'")

In [None]:
# compute rmse per lead time over the whole year for AIFS

In [None]:
# Whole-period per-lead-time RMSE of the "marsai" forecasts: every month's
# subset fields are kept, stacked along "month_data", and the squared error
# is averaged over latitude, longitude and month_data. Stored as month 13.

# For storing full-year truth and forecast data
year_ai = {var: [] for var in variables}
year_era5 = {var: [] for var in variables}

for month in months:
    folder_str = month.strftime("%Y%m01")
    forecast_path = os.path.join(base_dir, folder_str, f"{folder_str}_marsai_sv_q.nc")
    truth_path = os.path.join(base_dir, folder_str, f"{folder_str}_era5_gcai_sv_q.nc")
    print(f"starting for {folder_str}")

    # Context manager: both files are closed once this month's data is stored
    with xr.open_dataset(forecast_path) as ai_raw, xr.open_dataset(truth_path) as era5_raw:
        ai = preprocess(ai_raw)
        era5 = preprocess(era5_raw)
        era5 = era5.rename({"valid_time": "time"})

        # assign numbers for timesteps instead of datetime stamps
        ai = ai.assign_coords(time=np.arange(len(ai.time)).astype("float64"))
        era5 = era5.assign_coords(time=np.arange(len(era5.time)).astype("float64"))

        print(f"preprocessed, renamed in era5 for {folder_str}")

        # Subset to southern temperate
        ai = subset_southern_temperate(ai)
        era5 = subset_southern_temperate(era5)
        print(f"subset computed for {folder_str}")

        # Store raw data for 13th month RMSE; load() guarantees the arrays
        # stay usable after the source files are closed
        for var in variables:
            year_ai[var].append(ai[var].load())
            year_era5[var].append(era5[var].load())

    print(f" stored raw data for {folder_str}")

# Compute "13th month" RMSE over all months
full_year_rmse = {}

print("starting to compute 13th month")

for var in variables:
    # Stack all monthly data along a new dimension
    all_ai = xr.concat(year_ai[var], dim="month_data")
    all_era5 = xr.concat(year_era5[var], dim="month_data")

    # Compute RMSE per timestep over all lat/lon/month_data
    rmse = np.sqrt(((all_ai - all_era5) ** 2).mean(dim=["latitude", "longitude", "month_data"]))
    full_year_rmse[var] = rmse
    print(f"computed annual rmse for {var}")

# The computation is done at this point (the message used to repeat "starting")
print("finished computing 13th month")

# Create dataset for month 13
rmse_13 = xr.Dataset(full_year_rmse)
rmse_13 = rmse_13.expand_dims(month=[13])

rmse_13.to_netcdf("SouthernTemperate_marsai_RMSE_leadtimes_13.nc")

print("Saved to 'SouthernTemperate_marsai_RMSE_leadtimes_13.nc'")

In [None]:
# merge "marsai_RMSE_leadtimes.nc" with "marsai_RMSE_leadtimes_13.nc"

In [None]:
# Append the whole-period entry (month 13) to the monthly RMSE dataset and
# write the combined result into the per-lead-time output folder.
merged_path = "SouthernTemperate/PerLeadTime/SouthernTemperate_marsai_RMSE_leadtimes_w_annual.nc"

# The target sits in a nested folder; create it up front so the write does
# not fail on a fresh checkout where the folder is missing.
os.makedirs(os.path.dirname(merged_path), exist_ok=True)

# Context managers close both input files once the merged file is written
with xr.open_dataset("SouthernTemperate_marsai_RMSE_leadtimes_13.nc") as annual, \
        xr.open_dataset("SouthernTemperate_marsai_RMSE_leadtimes.nc") as monthly:
    monthly_rmse_ds_with_annual = xr.concat([monthly, annual], dim="month")

    # Save final dataset
    monthly_rmse_ds_with_annual.to_netcdf(merged_path)

In [None]:
# Compute RMSE of GraphCast (apply needed subset function depending on required geographic region)

In [None]:
# Per-lead-time RMSE of the "gc" forecast files against their ERA5 truth
# files, one entry per month, saved as a single NetCDF file.

# Root directory where all monthly folders are stored
base_dir = "../Surface Variables/"

# Define monthly date range and variables
months = pd.date_range("2024-01-01", "2024-12-01", freq="MS")
variables = ["u10", "v10", "t2m", "msl", "q"]

# Collect monthly RMSE datasets
monthly_rmse_datasets = []

for month in months:
    folder_str = month.strftime("%Y%m01")
    forecast_path = os.path.join(base_dir, folder_str, f"{folder_str}_gc_sv_q.nc")
    truth_path = os.path.join(base_dir, folder_str, f"{folder_str}_era5_gcai_sv_q.nc")
    print(f"starting for {folder_str}")

    # Open both files in a context manager so they are closed at the end of
    # every iteration instead of staying open for the rest of the session.
    with xr.open_dataset(forecast_path) as gc_raw, xr.open_dataset(truth_path) as era5_raw:
        gc = preprocess(gc_raw)
        era5 = preprocess(era5_raw)
        era5 = era5.rename({"valid_time": "time"})

        # Drop the existing "time" coordinate to avoid conflicts
        if "time" in gc.coords:
            gc = gc.drop_vars("time")

        # Swap the "step" dimension with "valid_time" and rename it to "time"
        gc = gc.swap_dims({"step": "valid_time"})  # make valid_time a dimension
        gc = gc.rename({"valid_time": "time"})     # rename the dimension to "time"

        # assign numbers for timesteps instead of datetime stamps, so forecast
        # and truth are matched by step index rather than by timestamp
        gc = gc.assign_coords(time=np.arange(len(gc.time)).astype("float64"))
        era5 = era5.assign_coords(time=np.arange(len(era5.time)).astype("float64"))

        print(f"preprocessed, renamed in era5 for {folder_str}")

        # Subset to southern temperate
        gc = subset_southern_temperate(gc)
        era5 = subset_southern_temperate(era5)
        print(f"subset computed for {folder_str}")

        rmse_dict = {var: compute_perstep_rmse(era5[var], gc[var]) for var in variables}
        rmse_ds = xr.Dataset(rmse_dict)
        rmse_ds = rmse_ds.expand_dims(month=[month.month])  # assign numbers from 1 to 12

        # Make sure the result is fully in memory before the files are closed
        rmse_ds = rmse_ds.load()

    monthly_rmse_datasets.append(rmse_ds)

    print(f"computed per lead time rmse for {folder_str}")


# Concatenate all monthly datasets
annual_rmse_ds = xr.concat(monthly_rmse_datasets, dim="month")

# Save the result
annual_rmse_ds.to_netcdf("SouthernTemperate_gc_RMSE_leadtimes.nc")
print("Saved to 'SouthernTemperate_gc_RMSE_leadtimes.nc'")

In [None]:
# compute rmse across whole year per lead time for graphcast

In [None]:
# Root directory where all monthly folders are stored
base_dir = "../Surface Variables/"

# Variables to score and the month-start dates from January to December 2024
variables = ["u10", "v10", "t2m", "msl", "q"]
months = pd.date_range(start="2024-01-01", end="2024-12-01", freq="MS")

# Same outline file as loaded earlier in the notebook, read again here
africa_gdf = (
    gpd.read_file("Africa_outline.geojson")
    .to_crs("EPSG:4326")
)

# RMSE function
def compute_perstep_rmse(truth, pred):
    """Return the root-mean-square error between ``truth`` and ``pred``.

    The squared difference is averaged over the ``latitude`` and
    ``longitude`` dimensions only, so any other dimension of the inputs is
    kept in the result.

    This repeats the definition given earlier in the notebook; once this
    cell runs, this later definition is the one in effect.
    """
    squared_error = (truth - pred) ** 2
    mean_squared_error = squared_error.mean(dim=["latitude", "longitude"])
    return np.sqrt(mean_squared_error)

# Preprocess function
def preprocess(ds):
    """Return a unit-converted copy of ``ds``; the input dataset is left untouched.

    Each step applies only when the variable is present:
    - ``t2m``: 273.15 is subtracted and ``units`` is set to "Celsius"
    - ``msl``: divided by 100 and ``units`` is set to "hPa"
    - ``tp``: dropped from the dataset

    Working on a copy makes the function safe to call more than once on the
    same dataset (e.g. when a notebook cell is re-run): the conversion is
    never applied twice to the caller's object.
    """
    # Shallow copy: the variables are replaced below, never edited in place
    converted = ds.copy()
    if "t2m" in converted:
        converted["t2m"] = converted["t2m"] - 273.15
        converted["t2m"].attrs["units"] = "Celsius"
    if "msl" in converted:
        converted["msl"] = converted["msl"] / 100.0
        converted["msl"].attrs["units"] = "hPa"
    if "tp" in converted:
        converted = converted.drop_vars("tp")
    return converted

# Whole-year per-lead-time RMSE of the "gc" forecasts: every month's subset
# fields are kept, stacked along "month_data", and the squared error is
# averaged over latitude, longitude and month_data. Stored as month 13.

# For storing full-year truth and forecast data
year_gc = {var: [] for var in variables}
year_era5 = {var: [] for var in variables}

for month in months:
    folder_str = month.strftime("%Y%m01")
    forecast_path = os.path.join(base_dir, folder_str, f"{folder_str}_gc_sv_q.nc")
    truth_path = os.path.join(base_dir, folder_str, f"{folder_str}_era5_gcai_sv_q.nc")
    print(f"starting for {folder_str}")

    # Context manager: both files are closed once this month's data is stored
    with xr.open_dataset(forecast_path) as gc_raw, xr.open_dataset(truth_path) as era5_raw:
        gc = preprocess(gc_raw)
        era5 = preprocess(era5_raw)
        era5 = era5.rename({"valid_time": "time"})

        # Drop the existing "time" coordinate to avoid conflicts
        if "time" in gc.coords:
            gc = gc.drop_vars("time")

        # Swap the "step" dimension with "valid_time" and rename it to "time"
        gc = gc.swap_dims({"step": "valid_time"})  # make valid_time a dimension
        gc = gc.rename({"valid_time": "time"})     # rename the dimension to "time"

        # assign numbers for timesteps instead of datetime stamps
        gc = gc.assign_coords(time=np.arange(len(gc.time)).astype("float64"))
        era5 = era5.assign_coords(time=np.arange(len(era5.time)).astype("float64"))

        print(f"preprocessed, renamed in era5 for {folder_str}")

        # Subset to southern temperate
        gc = subset_southern_temperate(gc)
        era5 = subset_southern_temperate(era5)
        print(f"subset computed for {folder_str}")

        # Store raw data for 13th month RMSE; load() guarantees the arrays
        # stay usable after the source files are closed
        for var in variables:
            year_gc[var].append(gc[var].load())
            year_era5[var].append(era5[var].load())

    print(f" stored raw data for {folder_str}")

# Compute "13th month" RMSE over all months
full_year_rmse = {}

print("starting to compute 13th month")

for var in variables:
    # Stack all monthly data along a new dimension
    all_gc = xr.concat(year_gc[var], dim="month_data")
    all_era5 = xr.concat(year_era5[var], dim="month_data")

    # Compute RMSE per timestep over all lat/lon/month_data
    rmse = np.sqrt(((all_gc - all_era5) ** 2).mean(dim=["latitude", "longitude", "month_data"]))
    full_year_rmse[var] = rmse
    print(f"computed annual rmse for {var}")


# Create dataset for month 13
rmse_13 = xr.Dataset(full_year_rmse)
rmse_13 = rmse_13.expand_dims(month=[13])

rmse_13.to_netcdf("SouthernTemperate_gc_RMSE_leadtimes_13.nc")
print("Saved to 'SouthernTemperate_gc_RMSE_leadtimes_13.nc'")

In [None]:
# merge "gc_RMSE_leadtimes.nc" with "gc_RMSE_leadtimes_13.nc"

In [None]:
# Append the whole-year entry (month 13) to the monthly RMSE dataset and
# write the combined result into the per-lead-time output folder.
merged_path = "SouthernTemperate/PerLeadTime/SouthernTemperate_gc_RMSE_leadtimes_w_annual.nc"

# The target sits in a nested folder; create it up front so the write does
# not fail on a fresh checkout where the folder is missing.
os.makedirs(os.path.dirname(merged_path), exist_ok=True)

# Context managers close both input files once the merged file is written
with xr.open_dataset("SouthernTemperate_gc_RMSE_leadtimes_13.nc") as annual, \
        xr.open_dataset("SouthernTemperate_gc_RMSE_leadtimes.nc") as monthly:
    monthly_rmse_ds_with_annual = xr.concat([monthly, annual], dim="month")

    # Save final dataset
    monthly_rmse_ds_with_annual.to_netcdf(merged_path)

In [None]:
# merge all models into 1 dataset

In [None]:
# Combine the three per-model RMSE files into one dataset with a "model" axis.

# Month labels every dataset must carry: 1 to 13
full_months = np.arange(1, 14)

# Coordinates unrelated to the RMSE structure, removed where present
drop_coords = ["meanSea", "surface", "isobaricInhPa", "number", "expver", "step"]


def _open_aligned(path):
    """Open one RMSE file, reindex it to ``full_months`` (missing months
    become NaN) and drop whichever of ``drop_coords`` it contains."""
    aligned = xr.open_dataset(path).reindex(month=full_months)
    unwanted = [name for name in drop_coords if name in aligned]
    return aligned.drop_vars(unwanted)


# Load datasets
airmse = _open_aligned("SouthernTemperate/PerLeadTime/SouthernTemperate_marsai_RMSE_leadtimes_w_annual.nc")
gcrmse = _open_aligned("SouthernTemperate/PerLeadTime/SouthernTemperate_gc_RMSE_leadtimes_w_annual.nc")
fcrmse = _open_aligned("SouthernTemperate/PerLeadTime/SouthernTemperate_marsfc_RMSE_leadtimes_w_annual.nc")

# Stack them into a new 'model' dimension; labels follow the stacking order
stacked = xr.concat([airmse, gcrmse, fcrmse], dim="model")
combined = stacked.assign_coords(model=["marsai", "gc", "marsfc"])

# Save the merged dataset (optional)
combined.to_netcdf("SouthernTemperate/PerLeadTime/SouthernTemperate_RMSE_leadtimes_allmodels.nc")

# Print confirmation
print("Merged dataset created with shape:", combined.sizes)
print("Models:", combined.model.values)