In [5]:
import xarray as xr
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
from scipy.stats import pearsonr

In [None]:
# Notebook to compute the RMSE of IFS-HRES, AIFS, and GraphCast averaged over all 12 months per grid cell per lead time

In [6]:
# Preprocess function
def preprocess(ds):
    """Rescale selected variables of a dataset and discard ``tp``.

    * ``t2m`` (if present) has 273.15 subtracted and its ``units`` attribute
      set to ``"Celsius"`` (presumably a Kelvin input; verify against the data).
    * ``msl`` (if present) is divided by 100 and its ``units`` attribute set
      to ``"hPa"`` (presumably a Pa input; verify against the data).
    * ``tp`` (if present) is removed through ``ds.drop_vars``.

    The converted variables are assigned back into ``ds`` itself, so the
    object passed in is modified. The return value is ``ds`` when there is no
    ``tp`` variable, otherwise whatever ``ds.drop_vars("tp")`` returns.
    """
    # (variable name, conversion applied to it, unit label stored afterwards)
    conversions = (
        ("t2m", lambda field: field - 273.15, "Celsius"),
        ("msl", lambda field: field / 100.0, "hPa"),
    )

    for name, convert, unit in conversions:
        if name not in ds:
            continue
        ds[name] = convert(ds[name])
        ds[name].attrs["units"] = unit

    return ds.drop_vars("tp") if "tp" in ds else ds

In [None]:
# IFS-HRES - Compute the RMSE per lead time per grid cell across all 12 months

In [4]:
# IFS-HRES: per-grid-cell, per-lead-time RMSE averaged over the monthly forecasts.
#
# Forecast and truth file paths. Every file name starts with its date stamp,
# so sorting both lists keeps the forecast/truth pairs in the same order.
forecast_files = sorted([
    "../../Surface Variables/20240101/20240101_marsfc_sv_q.nc", "../../Surface Variables/20240201/20240201_marsfc_sv_q.nc",
    "../../Surface Variables/20240301/20240301_marsfc_sv_q.nc", "../../Surface Variables/20240401/20240401_marsfc_sv_q.nc",
    "../../Surface Variables/20240501/20240501_marsfc_sv_q.nc", "../../Surface Variables/20240601/20240601_marsfc_sv_q.nc",
    "../../Surface Variables/20240701/20240701_marsfc_sv_q.nc", "../../Surface Variables/20240801/20240801_marsfc_sv_q.nc",
    "../../Surface Variables/20240901/20240901_marsfc_sv_q.nc", "../../Surface Variables/20241001/20241001_marsfc_sv_q.nc",
    "../../Surface Variables/20241101/20241101_marsfc_sv_q.nc", "../../Surface Variables/20241201/20241201_marsfc_sv_q.nc"
])

truth_files = sorted([
    "../../Surface Variables/20240101/20240101_era5_fc_sv_q.nc", "../../Surface Variables/20240201/20240201_era5_fc_sv_q.nc",
    "../../Surface Variables/20240301/20240301_era5_fc_sv_q.nc", "../../Surface Variables/20240401/20240401_era5_fc_sv_q.nc",
    "../../Surface Variables/20240501/20240501_era5_fc_sv_q.nc", "../../Surface Variables/20240601/20240601_era5_fc_sv_q.nc",
    "../../Surface Variables/20240701/20240701_era5_fc_sv_q.nc", "../../Surface Variables/20240801/20240801_era5_fc_sv_q.nc",
    "../../Surface Variables/20240901/20240901_era5_fc_sv_q.nc", "../../Surface Variables/20241001/20241001_era5_fc_sv_q.nc",
    "../../Surface Variables/20241101/20241101_era5_fc_sv_q.nc", "../../Surface Variables/20241201/20241201_era5_fc_sv_q.nc"
])

# zip() below would silently ignore unmatched files, so fail loudly instead.
if len(forecast_files) != len(truth_files):
    raise ValueError(
        f"Got {len(forecast_files)} forecast files but {len(truth_files)} truth files"
    )

# List of variables to process
variables = ['t2m', 'q', 'u10', 'v10', 'msl']

# Dictionary to hold RMSE DataArrays for each variable
rmse_vars = {}

# Variables are handled one at a time so that only the squared errors of a
# single variable are held in memory at once.
for var in variables:
    squared_errors = []
    print(f"starting for {var}")

    for f_path, t_path in zip(forecast_files, truth_files):
        print(f"starting for {f_path}")
        # Open both files as context managers so the file handles are released
        # once the squared error has been computed.
        with xr.open_dataset(f_path) as raw_f, xr.open_dataset(t_path) as raw_t:
            # Keep only the current variable before preprocessing, so the unit
            # conversions are not recomputed for variables unused in this pass.
            ds_f = preprocess(raw_f[[var]])
            ds_t = preprocess(raw_t[[var]])
            print("preprocessing done")

            ds_t = ds_t.rename({"valid_time": "time"})

            # Assign numbers for timesteps instead of datetime stamps, so every
            # month shares the same 0, 1, 2, ... index along "time".
            # NOTE(review): forecast and truth are therefore paired purely by
            # position; this assumes both files list the same valid times in
            # the same order - TODO confirm.
            ds_f = ds_f.assign_coords(time=np.arange(len(ds_f.time)).astype("float64"))
            ds_t = ds_t.assign_coords(time=np.arange(len(ds_t.time)).astype("float64"))

            # Select variable
            forecast = ds_f[var]
            truth = ds_t[var]

            # xr.align defaults to an inner join: only coordinate labels that
            # are present in both arrays are kept.
            forecast, truth = xr.align(forecast, truth)
            if forecast.size == 0:
                raise ValueError(
                    f"No overlapping coordinates between {f_path} and {t_path} for {var}"
                )

            # Compute squared error; load it into memory before the files close
            error_sq = ((forecast - truth) ** 2).load()

        squared_errors.append(error_sq)
        print("squared error computed")

    # Stack over a new 'month' dimension
    stacked_errors = xr.concat(squared_errors, dim='month')
    print("stacked all squared errors")

    # Compute RMSE: sqrt of mean squared error across months
    rmse = np.sqrt(stacked_errors.mean(dim='month'))
    print("computed RMSE")

    # Store the RMSE result in the dictionary
    rmse.name = var
    rmse_vars[var] = rmse

# Combine all RMSE DataArrays into a single Dataset
rmse_dataset = xr.Dataset(rmse_vars)

# Save to NetCDF
rmse_dataset.to_netcdf('Global_marsfc_RMSE_MAP_leadtimes.nc')

print("RMSE computed and saved for variables:", ', '.join(variables))

starting for t2m
starting for ../../Surface Variables/20240101/20240101_marsfc_sv_q.nc
preprocessing done
squared error computed
starting for ../../Surface Variables/20240201/20240201_marsfc_sv_q.nc
preprocessing done
squared error computed
starting for ../../Surface Variables/20240301/20240301_marsfc_sv_q.nc
preprocessing done
squared error computed
starting for ../../Surface Variables/20240401/20240401_marsfc_sv_q.nc
preprocessing done
squared error computed
starting for ../../Surface Variables/20240501/20240501_marsfc_sv_q.nc
preprocessing done
squared error computed
starting for ../../Surface Variables/20240601/20240601_marsfc_sv_q.nc
preprocessing done
squared error computed
starting for ../../Surface Variables/20240701/20240701_marsfc_sv_q.nc
preprocessing done
squared error computed
starting for ../../Surface Variables/20240801/20240801_marsfc_sv_q.nc
preprocessing done
squared error computed
starting for ../../Surface Variables/20240901/20240901_marsfc_sv_q.nc
preprocessing don

In [None]:
# AIFS - Compute the RMSE per lead time per grid cell across the available months (March-December 2024; the January and February files are commented out in the cell below)

In [7]:
# AIFS: per-grid-cell, per-lead-time RMSE averaged over the monthly forecasts.
#
# Forecast and truth file paths. Every file name starts with its date stamp,
# so sorting both lists keeps the forecast/truth pairs in the same order.
# NOTE: the January and February entries are commented out in both lists, so
# this RMSE averages the ten forecasts from March to December 2024, not twelve.
# The reason for the exclusion is not recorded here - TODO confirm.
forecast_files = sorted([
    # "../../Surface Variables/20240101/20240101_marsai_sv_q.nc", "../../Surface Variables/20240201/20240201_marsai_sv_q.nc",
    "../../Surface Variables/20240301/20240301_marsai_sv_q.nc", "../../Surface Variables/20240401/20240401_marsai_sv_q.nc",
    "../../Surface Variables/20240501/20240501_marsai_sv_q.nc", "../../Surface Variables/20240601/20240601_marsai_sv_q.nc",
    "../../Surface Variables/20240701/20240701_marsai_sv_q.nc", "../../Surface Variables/20240801/20240801_marsai_sv_q.nc",
    "../../Surface Variables/20240901/20240901_marsai_sv_q.nc", "../../Surface Variables/20241001/20241001_marsai_sv_q.nc",
    "../../Surface Variables/20241101/20241101_marsai_sv_q.nc", "../../Surface Variables/20241201/20241201_marsai_sv_q.nc"
])

truth_files = sorted([
    # "../../Surface Variables/20240101/20240101_era5_gcai_sv_q.nc", "../../Surface Variables/20240201/20240201_era5_gcai_sv_q.nc",
    "../../Surface Variables/20240301/20240301_era5_gcai_sv_q.nc", "../../Surface Variables/20240401/20240401_era5_gcai_sv_q.nc",
    "../../Surface Variables/20240501/20240501_era5_gcai_sv_q.nc", "../../Surface Variables/20240601/20240601_era5_gcai_sv_q.nc",
    "../../Surface Variables/20240701/20240701_era5_gcai_sv_q.nc", "../../Surface Variables/20240801/20240801_era5_gcai_sv_q.nc",
    "../../Surface Variables/20240901/20240901_era5_gcai_sv_q.nc", "../../Surface Variables/20241001/20241001_era5_gcai_sv_q.nc",
    "../../Surface Variables/20241101/20241101_era5_gcai_sv_q.nc", "../../Surface Variables/20241201/20241201_era5_gcai_sv_q.nc"
])

# zip() below would silently ignore unmatched files, so fail loudly instead.
if len(forecast_files) != len(truth_files):
    raise ValueError(
        f"Got {len(forecast_files)} forecast files but {len(truth_files)} truth files"
    )

# List of variables to process
variables = ['t2m', 'q', 'u10', 'v10', 'msl']

# Dictionary to hold RMSE DataArrays for each variable
rmse_vars = {}

# Variables are handled one at a time so that only the squared errors of a
# single variable are held in memory at once.
for var in variables:
    squared_errors = []
    print(f"starting for {var}")

    for f_path, t_path in zip(forecast_files, truth_files):
        print(f"starting for {f_path}")
        # Open both files as context managers so the file handles are released
        # once the squared error has been computed.
        with xr.open_dataset(f_path) as raw_f, xr.open_dataset(t_path) as raw_t:
            # Keep only the current variable before preprocessing, so the unit
            # conversions are not recomputed for variables unused in this pass.
            ds_f = preprocess(raw_f[[var]])
            ds_t = preprocess(raw_t[[var]])
            print("preprocessing done")

            ds_t = ds_t.rename({"valid_time": "time"})

            # Assign numbers for timesteps instead of datetime stamps, so every
            # month shares the same 0, 1, 2, ... index along "time".
            # NOTE(review): forecast and truth are therefore paired purely by
            # position; this assumes both files list the same valid times in
            # the same order - TODO confirm.
            ds_f = ds_f.assign_coords(time=np.arange(len(ds_f.time)).astype("float64"))
            ds_t = ds_t.assign_coords(time=np.arange(len(ds_t.time)).astype("float64"))

            # Select variable
            forecast = ds_f[var]
            truth = ds_t[var]

            # xr.align defaults to an inner join: only coordinate labels that
            # are present in both arrays are kept.
            forecast, truth = xr.align(forecast, truth)
            if forecast.size == 0:
                raise ValueError(
                    f"No overlapping coordinates between {f_path} and {t_path} for {var}"
                )

            # Compute squared error; load it into memory before the files close
            error_sq = ((forecast - truth) ** 2).load()

        squared_errors.append(error_sq)
        print("squared error computed")

    # Stack over a new 'month' dimension
    stacked_errors = xr.concat(squared_errors, dim='month')
    print("stacked all squared errors")

    # Compute RMSE: sqrt of mean squared error across months
    rmse = np.sqrt(stacked_errors.mean(dim='month'))
    print("computed RMSE")

    # Store the RMSE result in the dictionary
    rmse.name = var
    rmse_vars[var] = rmse

# Combine all RMSE DataArrays into a single Dataset
rmse_dataset = xr.Dataset(rmse_vars)

# Save to NetCDF
rmse_dataset.to_netcdf('Global_marsai_RMSE_MAP_leadtimes.nc')

print("RMSE computed and saved for variables:", ', '.join(variables))

starting for t2m
starting for ../../Surface Variables/20240301/20240301_marsai_sv_q.nc
preprocessing done
squared error computed
starting for ../../Surface Variables/20240401/20240401_marsai_sv_q.nc
preprocessing done
squared error computed
starting for ../../Surface Variables/20240501/20240501_marsai_sv_q.nc
preprocessing done
squared error computed
starting for ../../Surface Variables/20240601/20240601_marsai_sv_q.nc
preprocessing done
squared error computed
starting for ../../Surface Variables/20240701/20240701_marsai_sv_q.nc
preprocessing done
squared error computed
starting for ../../Surface Variables/20240801/20240801_marsai_sv_q.nc
preprocessing done
squared error computed
starting for ../../Surface Variables/20240901/20240901_marsai_sv_q.nc
preprocessing done
squared error computed
starting for ../../Surface Variables/20241001/20241001_marsai_sv_q.nc
preprocessing done
squared error computed
starting for ../../Surface Variables/20241101/20241101_marsai_sv_q.nc
preprocessing don

In [None]:
# GraphCast - Compute the RMSE per lead time per grid cell across all 12 months

In [8]:
# GraphCast: per-grid-cell, per-lead-time RMSE averaged over the monthly forecasts.
#
# Forecast and truth file paths. Every file name starts with its date stamp,
# so sorting both lists keeps the forecast/truth pairs in the same order.
forecast_files = sorted([
    "../../Surface Variables/20240101/20240101_gc_sv_q.nc", "../../Surface Variables/20240201/20240201_gc_sv_q.nc",
    "../../Surface Variables/20240301/20240301_gc_sv_q.nc", "../../Surface Variables/20240401/20240401_gc_sv_q.nc",
    "../../Surface Variables/20240501/20240501_gc_sv_q.nc", "../../Surface Variables/20240601/20240601_gc_sv_q.nc",
    "../../Surface Variables/20240701/20240701_gc_sv_q.nc", "../../Surface Variables/20240801/20240801_gc_sv_q.nc",
    "../../Surface Variables/20240901/20240901_gc_sv_q.nc", "../../Surface Variables/20241001/20241001_gc_sv_q.nc",
    "../../Surface Variables/20241101/20241101_gc_sv_q.nc", "../../Surface Variables/20241201/20241201_gc_sv_q.nc"
])

truth_files = sorted([
    "../../Surface Variables/20240101/20240101_era5_gcai_sv_q.nc", "../../Surface Variables/20240201/20240201_era5_gcai_sv_q.nc",
    "../../Surface Variables/20240301/20240301_era5_gcai_sv_q.nc", "../../Surface Variables/20240401/20240401_era5_gcai_sv_q.nc",
    "../../Surface Variables/20240501/20240501_era5_gcai_sv_q.nc", "../../Surface Variables/20240601/20240601_era5_gcai_sv_q.nc",
    "../../Surface Variables/20240701/20240701_era5_gcai_sv_q.nc", "../../Surface Variables/20240801/20240801_era5_gcai_sv_q.nc",
    "../../Surface Variables/20240901/20240901_era5_gcai_sv_q.nc", "../../Surface Variables/20241001/20241001_era5_gcai_sv_q.nc",
    "../../Surface Variables/20241101/20241101_era5_gcai_sv_q.nc", "../../Surface Variables/20241201/20241201_era5_gcai_sv_q.nc"
])

# zip() below would silently ignore unmatched files, so fail loudly instead.
if len(forecast_files) != len(truth_files):
    raise ValueError(
        f"Got {len(forecast_files)} forecast files but {len(truth_files)} truth files"
    )

# List of variables to process
variables = ['t2m', 'q', 'u10', 'v10', 'msl']

# Dictionary to hold RMSE DataArrays for each variable
rmse_vars = {}

# Variables are handled one at a time so that only the squared errors of a
# single variable are held in memory at once.
for var in variables:
    squared_errors = []
    print(f"starting for {var}")

    for f_path, t_path in zip(forecast_files, truth_files):
        print(f"starting for {f_path}")
        # Open both files as context managers so the file handles are released
        # once the squared error has been computed.
        with xr.open_dataset(f_path) as raw_f, xr.open_dataset(t_path) as raw_t:
            # Keep only the current variable before preprocessing, so the unit
            # conversions are not recomputed for variables unused in this pass.
            ds_f = preprocess(raw_f[[var]])
            ds_t = preprocess(raw_t[[var]])
            print("preprocessing done")

            ds_t = ds_t.rename({"valid_time": "time"})

            # Drop the existing "time" coordinate to avoid conflicts
            if "time" in ds_f.coords:
                ds_f = ds_f.drop_vars("time")

            # Swap the "step" dimension with "valid_time" and rename it to "time"
            ds_f = ds_f.swap_dims({"step": "valid_time"})  # make valid_time a dimension
            ds_f = ds_f.rename({"valid_time": "time"})     # rename the dimension to "time"

            # Assign numbers for timesteps instead of datetime stamps, so every
            # month shares the same 0, 1, 2, ... index along "time".
            # NOTE(review): forecast and truth are therefore paired purely by
            # position; this assumes both files list the same valid times in
            # the same order - TODO confirm.
            ds_f = ds_f.assign_coords(time=np.arange(len(ds_f.time)).astype("float64"))
            ds_t = ds_t.assign_coords(time=np.arange(len(ds_t.time)).astype("float64"))

            # Select variable
            forecast = ds_f[var]
            truth = ds_t[var]

            # xr.align defaults to an inner join: only coordinate labels that
            # are present in both arrays are kept.
            forecast, truth = xr.align(forecast, truth)
            if forecast.size == 0:
                raise ValueError(
                    f"No overlapping coordinates between {f_path} and {t_path} for {var}"
                )

            # Compute squared error; load it into memory before the files close
            error_sq = ((forecast - truth) ** 2).load()

        squared_errors.append(error_sq)
        print("squared error computed")

    # Stack over a new 'month' dimension
    stacked_errors = xr.concat(squared_errors, dim='month')
    print("stacked all squared errors")

    # Compute RMSE: sqrt of mean squared error across months
    rmse = np.sqrt(stacked_errors.mean(dim='month'))
    print("computed RMSE")

    # Store the RMSE result in the dictionary
    rmse.name = var
    rmse_vars[var] = rmse

# Combine all RMSE DataArrays into a single Dataset
rmse_dataset = xr.Dataset(rmse_vars)

# Save to NetCDF
rmse_dataset.to_netcdf('Global_gc_RMSE_MAP_leadtimes.nc')

print("RMSE computed and saved for variables:", ', '.join(variables))

starting for t2m
starting for ../../Surface Variables/20240101/20240101_gc_sv_q.nc
preprocessing done
squared error computed
starting for ../../Surface Variables/20240201/20240201_gc_sv_q.nc
preprocessing done
squared error computed
starting for ../../Surface Variables/20240301/20240301_gc_sv_q.nc
preprocessing done
squared error computed
starting for ../../Surface Variables/20240401/20240401_gc_sv_q.nc
preprocessing done
squared error computed
starting for ../../Surface Variables/20240501/20240501_gc_sv_q.nc
preprocessing done
squared error computed
starting for ../../Surface Variables/20240601/20240601_gc_sv_q.nc
preprocessing done
squared error computed
starting for ../../Surface Variables/20240701/20240701_gc_sv_q.nc
preprocessing done
squared error computed
starting for ../../Surface Variables/20240801/20240801_gc_sv_q.nc
preprocessing done
squared error computed
starting for ../../Surface Variables/20240901/20240901_gc_sv_q.nc
preprocessing done
squared error computed
starting fo