In [None]:
import xarray as xr
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import json
from scipy.stats import pearsonr
import geopandas as gpd
import rioxarray

In [None]:
# Compute the Pearson correlation coefficient (R) between each forecast and ERA5 per month, pooling all time steps, latitudes, and longitudes

In [None]:
def subset_northern_temperate(ds):
    """Keep only the northern temperate band (latitude 35 to 60, inclusive).

    Points outside the band are masked and then dropped via ``ds.where``.
    """
    lat = ds.latitude
    in_band = (lat >= 35) & (lat <= 60)
    return ds.where(in_band, drop=True)

def subset_southern_temperate(ds):
    """Subset dataset to the Southern Temperate Zone: 35°S to 60°S (inclusive).

    Latitudes are compared as signed degrees, i.e. the band is
    -60 <= latitude <= -35. Points outside the band are dropped.
    """
    lat = ds.latitude
    in_band = (lat <= -35) & (lat >= -60)
    return ds.where(in_band, drop=True)

# Subset to the subpolar and polar caps of both hemispheres
def subset_polar_regions(ds):
    """Keep the (sub-)polar zones: latitude >= 60 or latitude <= -60."""
    lat = ds.latitude
    northern_cap = lat >= 60
    southern_cap = lat <= -60
    return ds.where(northern_cap | southern_cap, drop=True)

def subset_tropics(ds):
    """Keep only the tropical band: -23.5 <= latitude <= 23.5 (inclusive)."""
    lat = ds.latitude
    in_tropics = (lat >= -23.5) & (lat <= 23.5)
    return ds.where(in_tropics, drop=True)

def subset_subtropics(ds):
    """
    Keep only the subtropical bands of both hemispheres (bounds inclusive):
    - north: 23.5 <= latitude <= 35
    - south: -35 <= latitude <= -23.5
    """
    lat = ds.latitude
    northern_band = (lat >= 23.5) & (lat <= 35)
    southern_band = (lat <= -23.5) & (lat >= -35)
    return ds.where(northern_band | southern_band, drop=True)

def subset_africa(ds, africa_gdf):
    """
    Clip an xarray dataset to the geometry held in ``africa_gdf``.

    The dataset is first tagged with CRS EPSG:4326 (on a copy, not in place),
    then clipped with the GeoDataFrame's geometry and CRS; cells outside the
    geometry are dropped.
    """
    georeferenced = ds.rio.write_crs("EPSG:4326", inplace=False)
    clipped = georeferenced.rio.clip(
        africa_gdf.geometry,
        africa_gdf.crs,
        drop=True,
    )
    return clipped

# Read the Africa outline from a GeoJSON file (path relative to the working
# directory) and reproject it to EPSG:4326, the same CRS that subset_africa
# writes onto a dataset before clipping.
africa_gdf = gpd.read_file("Africa_outline.geojson").to_crs("EPSG:4326")

# Preprocess function for unit conversion
def preprocess(ds):
    """Return a unit-converted copy of ``ds``.

    - ``t2m`` (if present): 273.15 is subtracted and units are set to "Celsius".
    - ``msl`` (if present): divided by 100 and units are set to "hPa".
    - ``tp`` (if present): dropped.

    The input dataset is left untouched: the conversion is applied to a
    shallow copy, so calling this twice on the same object cannot convert the
    units twice.
    """
    # Shallow copy: new container, underlying arrays are shared until replaced.
    ds = ds.copy()
    if "t2m" in ds:
        ds["t2m"] = ds["t2m"] - 273.15
        ds["t2m"].attrs["units"] = "Celsius"
    if "msl" in ds:
        ds["msl"] = ds["msl"] / 100.0
        ds["msl"].attrs["units"] = "hPa"
    if "tp" in ds:
        ds = ds.drop_vars("tp")
    return ds

In [None]:
# Compute R of IFS-HRES (apply needed subset function depending on required geographic region)

In [None]:
# Root directory where all monthly folders are stored
base_dir = "../Surface Variables/"

# Define monthly date range and variables
months = pd.date_range("2024-01-01", "2024-12-01", freq="MS")
variables = ["u10", "v10", "t2m", "msl", "q"]

# Region to evaluate. To score another region, point `subset_region` at a
# different subset_* function (e.g. subset_polar_regions) and change
# `region_name`, which determines the output folder and file name.
subset_region = subset_southern_temperate
region_name = "SouthernTemperate"
output_path = f"{region_name}/PerMonth/{region_name}_marsfc_PearsonR_monthly.nc"

# Collect monthly correlation datasets
monthly_r_datasets = []

for month in months:
    folder_str = month.strftime("%Y%m01")
    forecast_path = os.path.join(base_dir, folder_str, f"{folder_str}_marsfc_sv_q.nc")
    truth_path = os.path.join(base_dir, folder_str, f"{folder_str}_era5_fc_sv_q.nc")

    print(f" Start for {folder_str}")

    try:
        # Load into memory inside a context manager so each file handle is
        # closed straight away instead of staying open for the whole run.
        with xr.open_dataset(forecast_path) as fc_file:
            fc = preprocess(fc_file).load()
        with xr.open_dataset(truth_path) as era5_file:
            era5 = preprocess(era5_file).load()
    except FileNotFoundError:
        print(f"Missing data for {folder_str}, skipping...")
        continue

    print(f" Preprocessing done for {folder_str}")

    # Give the truth dataset the same time-axis name as the forecast
    era5 = era5.rename({"valid_time": "time"})

    fc = subset_region(fc)
    era5 = subset_region(era5)
    print(f"subset computed for {folder_str}")

    results = []

    for var in variables:
        # Keep only the coordinate labels present in both fields (inner join)
        fc_var, era5_var = xr.align(fc[var], era5[var])

        # Flattening pairs values by position, so both arrays must share the
        # same dimension order; otherwise unrelated points would be correlated.
        if set(fc_var.dims) == set(era5_var.dims):
            era5_var = era5_var.transpose(*fc_var.dims)

        fc_flat = fc_var.values.ravel()
        era5_flat = era5_var.values.ravel()
        valid_mask = np.isfinite(fc_flat) & np.isfinite(era5_flat)

        # pearsonr raises for fewer than two samples, so require at least two
        if np.count_nonzero(valid_mask) >= 2:
            r, _ = pearsonr(fc_flat[valid_mask], era5_flat[valid_mask])
        else:
            r = np.nan

        results.append((var, r))

    # Create dataset for this month
    month_ds = xr.Dataset(
        {var: (["month"], [r]) for var, r in results},
        coords={"month": [month.month]} # sets coordinate value to 1, 2, ..., 12
    )

    monthly_r_datasets.append(month_ds)
    print(f" Computed Pearson R for {folder_str}")

# Fail with a clear message instead of a cryptic concat error when every
# month was skipped.
if not monthly_r_datasets:
    raise FileNotFoundError(f"No monthly input files were found under {base_dir!r}")

# Concatenate all months
annual_r_ds = xr.concat(monthly_r_datasets, dim="month")

# Compute average across months
r_mean = annual_r_ds.mean(dim="month", skipna=True)
r_mean = r_mean.expand_dims(month=[13])  # 13th month is average

# Append mean to dataset
annual_r_ds_with_mean = xr.concat([annual_r_ds, r_mean], dim="month")

# Save the final dataset (create the output folder on first use)
os.makedirs(os.path.dirname(output_path), exist_ok=True)
annual_r_ds_with_mean.to_netcdf(output_path)
print(f"Pearson R (with 13th-month average) saved to '{output_path}'")

In [None]:
# Compute R of AIFS (apply needed subset function depending on required geographic region)

In [None]:
# Root directory where all monthly folders are stored
base_dir = "../Surface Variables/"

# Define monthly date range and variables
# NOTE(review): this range starts in March, presumably because no AIFS files
# exist for January/February -- confirm. The month-13 mean below therefore
# averages fewer months than the cells that start in January.
months = pd.date_range("2024-03-01", "2024-12-01", freq="MS")
variables = ["u10", "v10", "t2m", "msl", "q"]

# Region to evaluate. To score another region, point `subset_region` at a
# different subset_* function (e.g. subset_polar_regions) and change
# `region_name`, which determines the output folder and file name.
subset_region = subset_southern_temperate
region_name = "SouthernTemperate"
output_path = f"{region_name}/PerMonth/{region_name}_marsai_PearsonR_monthly.nc"

# Collect monthly correlation datasets
monthly_r_datasets = []

for month in months:
    folder_str = month.strftime("%Y%m01")
    forecast_path = os.path.join(base_dir, folder_str, f"{folder_str}_marsai_sv_q.nc")
    truth_path = os.path.join(base_dir, folder_str, f"{folder_str}_era5_gcai_sv_q.nc")

    print(f" Start for {folder_str}")

    try:
        # Load into memory inside a context manager so each file handle is
        # closed straight away instead of staying open for the whole run.
        with xr.open_dataset(forecast_path) as ai_file:
            ai = preprocess(ai_file).load()
        with xr.open_dataset(truth_path) as era5_file:
            era5 = preprocess(era5_file).load()
    except FileNotFoundError:
        print(f"Missing data for {folder_str}, skipping...")
        continue

    print(f" Preprocessing done for {folder_str}")

    # Give the truth dataset the same time-axis name as the forecast
    era5 = era5.rename({"valid_time": "time"})

    ai = subset_region(ai)
    era5 = subset_region(era5)
    print(f"subset computed for {folder_str}")

    results = []

    for var in variables:
        # Keep only the coordinate labels present in both fields (inner join)
        ai_var, era5_var = xr.align(ai[var], era5[var])

        # Flattening pairs values by position, so both arrays must share the
        # same dimension order; otherwise unrelated points would be correlated.
        if set(ai_var.dims) == set(era5_var.dims):
            era5_var = era5_var.transpose(*ai_var.dims)

        ai_flat = ai_var.values.ravel()
        era5_flat = era5_var.values.ravel()
        valid_mask = np.isfinite(ai_flat) & np.isfinite(era5_flat)

        # pearsonr raises for fewer than two samples, so require at least two
        if np.count_nonzero(valid_mask) >= 2:
            r, _ = pearsonr(ai_flat[valid_mask], era5_flat[valid_mask])
        else:
            r = np.nan

        results.append((var, r))

    # Create dataset for this month
    month_ds = xr.Dataset(
        {var: (["month"], [r]) for var, r in results},
        coords={"month": [month.month]} # sets coordinate value to 1, 2, ..., 12
    )

    monthly_r_datasets.append(month_ds)
    print(f" Computed Pearson R for {folder_str}")

# Fail with a clear message instead of a cryptic concat error when every
# month was skipped.
if not monthly_r_datasets:
    raise FileNotFoundError(f"No monthly input files were found under {base_dir!r}")

# Concatenate all months
annual_r_ds = xr.concat(monthly_r_datasets, dim="month")

# Compute average across months
r_mean = annual_r_ds.mean(dim="month", skipna=True)
r_mean = r_mean.expand_dims(month=[13])  # 13th month is average

# Append mean to dataset
annual_r_ds_with_mean = xr.concat([annual_r_ds, r_mean], dim="month")

# Save the final dataset (create the output folder on first use)
os.makedirs(os.path.dirname(output_path), exist_ok=True)
annual_r_ds_with_mean.to_netcdf(output_path)
print(f"Pearson R (with 13th-month average) saved to '{output_path}'")

In [None]:
# Compute R of GraphCast (apply needed subset function depending on required geographic region)

In [None]:
# Root directory where all monthly folders are stored
base_dir = "../Surface Variables/"

# Define monthly date range and variables
months = pd.date_range("2024-01-01", "2024-12-01", freq="MS")
variables = ["u10", "v10", "t2m", "msl", "q"]

# Region to evaluate. To score another region, point `subset_region` at a
# different subset_* function (e.g. subset_polar_regions) and change
# `region_name`, which determines the output folder and file name.
subset_region = subset_southern_temperate
region_name = "SouthernTemperate"
output_path = f"{region_name}/PerMonth/{region_name}_gc_PearsonR_monthly.nc"

# Collect monthly correlation datasets
monthly_r_datasets = []

for month in months:
    folder_str = month.strftime("%Y%m01")
    forecast_path = os.path.join(base_dir, folder_str, f"{folder_str}_gc_sv_q.nc")
    truth_path = os.path.join(base_dir, folder_str, f"{folder_str}_era5_gcai_sv_q.nc")

    print(f" Start for {folder_str}")

    try:
        # Load into memory inside a context manager so each file handle is
        # closed straight away instead of staying open for the whole run.
        with xr.open_dataset(forecast_path) as gc_file:
            gc = preprocess(gc_file).load()
        with xr.open_dataset(truth_path) as era5_file:
            era5 = preprocess(era5_file).load()
    except FileNotFoundError:
        print(f"Missing data for {folder_str}, skipping...")
        continue

    # Give the truth dataset the same time-axis name the forecast ends up with
    era5 = era5.rename({"valid_time": "time"})

    # Drop the existing "time" coordinate to avoid conflicts with the rename below
    if "time" in gc.coords:
        gc = gc.drop_vars("time")

    # Index the forecast by "valid_time" instead of "step", then call that
    # dimension "time" so it lines up with the truth dataset.
    gc = gc.swap_dims({"step": "valid_time"})  # make valid_time a dimension
    gc = gc.rename({"valid_time": "time"})     # rename the dimension to "time"

    print(f" Preprocessing done for {folder_str}")

    gc = subset_region(gc)
    era5 = subset_region(era5)
    print(f"subset computed for {folder_str}")

    results = []

    for var in variables:
        # Keep only the coordinate labels present in both fields (inner join)
        gc_var, era5_var = xr.align(gc[var], era5[var])

        # Flattening pairs values by position, so both arrays must share the
        # same dimension order; otherwise unrelated points would be correlated.
        if set(gc_var.dims) == set(era5_var.dims):
            era5_var = era5_var.transpose(*gc_var.dims)

        gc_flat = gc_var.values.ravel()
        era5_flat = era5_var.values.ravel()
        valid_mask = np.isfinite(gc_flat) & np.isfinite(era5_flat)

        # pearsonr raises for fewer than two samples, so require at least two
        if np.count_nonzero(valid_mask) >= 2:
            r, _ = pearsonr(gc_flat[valid_mask], era5_flat[valid_mask])
        else:
            r = np.nan

        results.append((var, r))

    # Create dataset for this month
    month_ds = xr.Dataset(
        {var: (["month"], [r]) for var, r in results},
        coords={"month": [month.month]} # sets coordinate value to 1, 2, ..., 12
    )

    monthly_r_datasets.append(month_ds)
    print(f" Computed Pearson R for {folder_str}")

# Fail with a clear message instead of a cryptic concat error when every
# month was skipped.
if not monthly_r_datasets:
    raise FileNotFoundError(f"No monthly input files were found under {base_dir!r}")

# Concatenate all months
annual_r_ds = xr.concat(monthly_r_datasets, dim="month")

# Compute average across months
r_mean = annual_r_ds.mean(dim="month", skipna=True)
r_mean = r_mean.expand_dims(month=[13])  # 13th month is average

# Append mean to dataset
annual_r_ds_with_mean = xr.concat([annual_r_ds, r_mean], dim="month")

# Save the final dataset (create the output folder on first use)
os.makedirs(os.path.dirname(output_path), exist_ok=True)
annual_r_ds_with_mean.to_netcdf(output_path)
print(f"Pearson R (with 13th-month average) saved to '{output_path}'")

In [None]:
# Merge the per-model Pearson R results into a single dataset with a "model" dimension

In [None]:
# Load datasets
# xr.load_dataset reads everything into memory and closes the file, so no
# handle is left open that could stop an earlier cell from overwriting these
# files when the notebook is re-run in the same kernel.
gcr = xr.load_dataset("SouthernTemperate/PerMonth/SouthernTemperate_gc_PearsonR_monthly.nc")
fcr = xr.load_dataset("SouthernTemperate/PerMonth/SouthernTemperate_marsfc_PearsonR_monthly.nc")
air = xr.load_dataset("SouthernTemperate/PerMonth/SouthernTemperate_marsai_PearsonR_monthly.nc")

# Ensure all datasets have the full month range 1 to 13
full_months = np.arange(1, 14)

# Reindex to include all months, filling missing with NaN
air = air.reindex(month=full_months)
gcr = gcr.reindex(month=full_months)
fcr = fcr.reindex(month=full_months)

# Drop any unrelated extra coordinates so all three datasets share one structure
drop_coords = ["meanSea", "surface", "isobaricInhPa", "number"]
air = air.drop_vars([c for c in drop_coords if c in air])
gcr = gcr.drop_vars([c for c in drop_coords if c in gcr])
fcr = fcr.drop_vars([c for c in drop_coords if c in fcr])

# Keep each label next to its dataset so the "model" coordinate cannot drift
# out of step with the concatenation order.
model_results = {"marsai": air, "gc": gcr, "marsfc": fcr}

# Stack them into a new 'model' dimension
combined = xr.concat(list(model_results.values()), dim="model")

# Add model labels
combined = combined.assign_coords(model=list(model_results.keys()))

# Save the merged dataset (optional)
combined.to_netcdf("SouthernTemperate/PerMonth/SouthernTemperate_PearsonR_monthly_allmodels.nc")

# Print confirmation
print("Merged dataset created with shape:", combined.sizes)
print("Models:", combined.model.values)