In [3]:
import xarray as xr
import geopandas as gpd
from shapely.geometry import mapping
import rioxarray
import numpy as np
import numcodecs
import os

In [4]:
# ---- Constants ----
FILL_VALUE = 9.969209968386869e36
INPUT_FOLDER = "MinimapsZarrFiles"
INPUT_PREFIX = "Global_"
OUTPUT_PREFIX = "Africa_"

# Dry run: print each global store found in INPUT_FOLDER together with the
# name its clipped copy would get. Nothing is opened or written here.
for filename in os.listdir(INPUT_FOLDER):
    if filename.startswith(INPUT_PREFIX) and filename.endswith(".zarr"):
        input_path = os.path.join(INPUT_FOLDER, filename)
        # Swap only the leading prefix; str.replace would also rewrite any
        # later "Global_" that happens to appear inside the name.
        output_filename = OUTPUT_PREFIX + filename[len(INPUT_PREFIX):]
        output_path = os.path.join(INPUT_FOLDER, output_filename)

        print(f"Processing {filename} -> {output_filename}")

Processing Global_marsfc_RMSE_MAP_leadtimes.zarr -> Africa_marsfc_RMSE_MAP_leadtimes.zarr
Processing Global_marsai_RMSE_MAP_leadtimes.zarr -> Africa_marsai_RMSE_MAP_leadtimes.zarr
Processing Global_gc_MBE_MAP_leadtimes.zarr -> Africa_gc_MBE_MAP_leadtimes.zarr
Processing Global_gc_MAE_MAP_leadtimes.zarr -> Africa_gc_MAE_MAP_leadtimes.zarr
Processing Global_marsfc_MAE_MAP_leadtimes.zarr -> Africa_marsfc_MAE_MAP_leadtimes.zarr
Processing Global_marsfc_MBE_MAP_leadtimes.zarr -> Africa_marsfc_MBE_MAP_leadtimes.zarr
Processing Global_marsai_MBE_MAP_leadtimes.zarr -> Africa_marsai_MBE_MAP_leadtimes.zarr
Processing Global_marsai_MAE_MAP_leadtimes.zarr -> Africa_marsai_MAE_MAP_leadtimes.zarr
Processing Global_gc_RMSE_MAP_leadtimes.zarr -> Africa_gc_RMSE_MAP_leadtimes.zarr


In [5]:
# ---- Constants ----
FILL_VALUE = 9.969209968386869e36
INPUT_FOLDER = "MinimapsZarrFiles"
INPUT_PREFIX = "Global_"
OUTPUT_PREFIX = "Africa_"

# ---- Load Africa GeoJSON ----
gdf = gpd.read_file("Africa_outline.geojson").to_crs("EPSG:4326")

# GeoJSON-like geometry dicts for rio.clip. They are the same for every file
# and every variable, so build them once instead of inside the inner loop.
africa_shapes = gdf.geometry.apply(mapping)

# ---- Loop through all matching Zarr files ----
# Each "Global_*.zarr" store in INPUT_FOLDER is clipped to the Africa outline
# and written next to it under the OUTPUT_PREFIX name (overwriting any
# existing store of that name, because of mode="w").
for filename in os.listdir(INPUT_FOLDER):
    if not (filename.startswith(INPUT_PREFIX) and filename.endswith(".zarr")):
        continue

    input_path = os.path.join(INPUT_FOLDER, filename)
    # Swap only the leading prefix; str.replace would also rewrite any later
    # "Global_" that happens to appear inside the name.
    output_filename = OUTPUT_PREFIX + filename[len(INPUT_PREFIX):]
    output_path = os.path.join(INPUT_FOLDER, output_filename)

    print(f"Processing {filename} -> {output_filename}")

    # ---- Load Zarr File ----
    # Only the root node of the tree is used.
    zarr = xr.open_datatree(input_path, engine='zarr', consolidated=True)
    ds = zarr["/"].to_dataset()

    # Ensure spatial coordinates are CRS-aware
    for var in ds.data_vars:
        ds[var] = ds[var].rio.write_crs("EPSG:4326")

    # ---- Clip Each Variable and Fill Outside with FILL_VALUE ----
    clipped_vars = {}
    for var in ds.data_vars:
        print(f"Clipping {var}...")
        data = ds[var]

        # Clip with geometry (values outside will be NaN); drop=False keeps
        # the full grid instead of cropping to the outline's extent.
        clipped = data.rio.clip(africa_shapes, gdf.crs, drop=False)

        # Replace NaN with fill value. Note this replaces every NaN, including
        # any that were already present inside the outline.
        clipped_filled = clipped.fillna(FILL_VALUE)

        clipped_vars[var] = clipped_filled

    # ---- Create final dataset ----
    clipped_ds = xr.Dataset(clipped_vars, coords=ds.coords, attrs=ds.attrs)

    # ---- Rechunk: one time step per chunk, each covering the whole grid ----
    # -1 means "the full dimension", which is 721 x 1440 for the current
    # files and stays correct if a store uses a different grid size.
    clipped_ds = clipped_ds.chunk({'time': 1, 'y': -1, 'x': -1})

    # ---- Define compression ----
    encoding = {
        var: {'compressor': numcodecs.Zlib(level=1)}
        for var in clipped_ds.data_vars
    }

    # ---- Save to Zarr ----
    clipped_ds.to_zarr(output_path, mode="w", encoding=encoding)

    print(f"Saved clipped file to {output_path}\n")

Processing Global_marsfc_RMSE_MAP_leadtimes.zarr -> Africa_marsfc_RMSE_MAP_leadtimes.zarr
Clipping msl...
Clipping q...
Clipping t2m...
Clipping u10...
Clipping v10...
Saved clipped file to MinimapsZarrFiles/Africa_marsfc_RMSE_MAP_leadtimes.zarr

Processing Global_marsai_RMSE_MAP_leadtimes.zarr -> Africa_marsai_RMSE_MAP_leadtimes.zarr
Clipping msl...
Clipping q...
Clipping t2m...
Clipping u10...
Clipping v10...
Saved clipped file to MinimapsZarrFiles/Africa_marsai_RMSE_MAP_leadtimes.zarr

Processing Global_gc_MBE_MAP_leadtimes.zarr -> Africa_gc_MBE_MAP_leadtimes.zarr
Clipping msl...
Clipping q...
Clipping t2m...
Clipping u10...
Clipping v10...
Saved clipped file to MinimapsZarrFiles/Africa_gc_MBE_MAP_leadtimes.zarr

Processing Global_gc_MAE_MAP_leadtimes.zarr -> Africa_gc_MAE_MAP_leadtimes.zarr
Clipping msl...
Clipping q...
Clipping t2m...
Clipping u10...
Clipping v10...
Saved clipped file to MinimapsZarrFiles/Africa_gc_MAE_MAP_leadtimes.zarr

Processing Global_marsfc_MAE_MAP_leadtimes

In [6]:
# ---- Constants ----
FILL_VALUE = 9.969209968386869e36

# ---- Open the global store and take its root node as a Dataset ----
zarr = xr.open_datatree(
    'MinimapsZarrFiles/Global_marsai_MAE_MAP_leadtimes.zarr',
    engine='zarr',
    consolidated=True,
)
ds = zarr["/"].to_dataset()

# Attach EPSG:4326 to every data variable so rioxarray can clip it
for var in ds.data_vars:
    ds[var] = ds[var].rio.write_crs("EPSG:4326")

# ---- Africa outline, converted to GeoJSON-like dicts for rio.clip ----
gdf = gpd.read_file("Africa_outline.geojson").to_crs("EPSG:4326")
africa_shapes = gdf.geometry.apply(mapping)

# ---- Mask each variable to the outline, then swap NaN for FILL_VALUE ----
clipped_vars = {}
for var in ds.data_vars:
    print(f"Clipping {var}...")
    clipped_vars[var] = (
        ds[var]
        .rio.clip(africa_shapes, gdf.crs, drop=False)  # drop=False keeps the full grid
        .fillna(FILL_VALUE)
    )

# ---- Reassemble with the source coordinates/attributes and rechunk ----
# One time step per chunk, each spanning the whole 721 x 1440 grid
# (the original note gives the source shape as time=41, y=721, x=1440).
clipped_ds = xr.Dataset(
    clipped_vars,
    coords=ds.coords,
    attrs=ds.attrs,
).chunk({'time': 1, 'y': 721, 'x': 1440})

# ---- zlib (level 1) compression for every data variable ----
encoding = {}
for var in clipped_ds.data_vars:
    encoding[var] = {'compressor': numcodecs.Zlib(level=1)}

# ---- Save to Zarr (written to the current working directory) ----
clipped_ds.to_zarr('Africa_marsai_MAE_MAP_leadtimes.zarr', mode="w", encoding=encoding)

Clipping msl...
Clipping q...
Clipping t2m...
Clipping u10...
Clipping v10...


<xarray.backends.zarr.ZarrStore at 0x12b8e2c40>