In [1]:
import xarray as xr
import pandas as pd
import numpy as np
from pathlib import Path
import zarr
import dask

In [2]:
# Access the full dataset
DATASET_PATH = Path("data/IberFire.nc")
ds = xr.open_dataset(DATASET_PATH)

In [3]:
# Basic dataset overview
print("=== BASIC DATASET INFO ===")
print(f"Dataset dimensions: {dict(ds.sizes)}")
print(f"Dataset size: {ds.nbytes / 1e9:.2f} GB")
print(f"Number of variables: {len(ds.data_vars)}")
print(f"Coordinates: {list(ds.coords)}")
print(ds)

=== BASIC DATASET INFO ===
Dataset dimensions: {'y': 920, 'x': 1188, 'time': 6241}
Dataset size: 730.86 GB
Number of variables: 261
Coordinates: ['x', 'y', 'time']
<xarray.Dataset> Size: 731GB
Dimensions:                                        (y: 920, x: 1188, time: 6241)
Coordinates:
  * x                                              (x) float64 10kB 2.675e+06...
  * y                                              (y) float64 7kB 2.492e+06 ...
  * time                                           (time) datetime64[ns] 50kB ...
Data variables: (12/261)
    x_index                                        (y, x) uint16 2MB ...
    y_index                                        (y, x) uint16 2MB ...
    is_spain                                       (y, x) uint16 2MB ...
    is_fire                                        (time, y, x) uint8 7GB ...
    is_near_fire                                   (time, y, x) uint8 7GB ...
    x_coordinate                                   (y, x) float32 4MB

In [4]:
from dask.distributed import Client
client = Client()  # dashboard URL will print

In [5]:
import xarray as xr
import zarr
from numcodecs import Blosc

# compressor: blosc + zstd is a great default
compressor = Blosc(cname="zstd", clevel=5, shuffle=Blosc.BITSHUFFLE)

# Open lazily with dask-backed chunks (don’t load)
ds = xr.open_dataset(
    "data/IberFire.nc",
    engine="h5netcdf",          # robust HDF5 reader
    chunks={},                  # let us choose chunks next
    decode_cf=True,
    mask_and_scale=True,
    decode_times=True,
    lock=False,                 # better parallel reads on local SSD
)

# Choose chunking (start simple; adjust to your dims)
# Replace 'lat','lon','time' with your actual names
ds = ds.chunk({"time": 64, "y": 256, "x": 256})

# (Optional) Downcast to float32 and standardize fillvalues
for v in ds.data_vars:
    if ds[v].dtype == "float64":
        ds[v] = ds[v].astype("float32")
    # ensure missing values are consistent
    ds[v].encoding["_FillValue"] = None  # Zarr uses NaN for float; keep clean encodings

# Write to Zarr (directory store). Use consolidated metadata for fast opens.
# Tip: write each variable with its compressor
encoding = {v: {"compressor": compressor} for v in ds.data_vars}

ds.to_zarr(
    "data/IberFire.zarr",
    mode="w",
    compute=True,     # triggers Dask graph; watch the dashboard
)

This may cause some slowdown.
Consider loading the data with Dask directly
 or using futures or delayed objects to embed the data into the graph without repetition.
See also https://docs.dask.org/en/stable/best-practices.html#load-data-with-dask for more information.
2025-10-24 14:01:49,580 - distributed.worker - ERROR - Compute Failed
Key:       ('open_dataset-surface_pressure_range-634ce179d1f2ffc896345f85528c3a23', 11, 8, 9)
State:     executing
Task:  <Task ('open_dataset-surface_pressure_range-634ce179d1f2ffc896345f85528c3a23', 11, 8, 9) getter(...)>
Exception: "KeyError('Unable to synchronously open object (invalid identifier type to function)')"
Traceback: '  File "/Users/vladimir/catalonia-wildfire-prediction/catalonia-wildfire-prediction/.venv/lib/python3.13/site-packages/dask/array/core.py", line 141, in getter\n    c = np.asarray(c)\n  File "/Users/vladimir/catalonia-wildfire-prediction/catalonia-wildfire-prediction/.venv/lib/python3.13/site-packages/xarray/core/indexing.py"

KeyError: 'Unable to synchronously open object (invalid identifier type to function)'