In [1]:
!pip install google-cloud-storage



In [1]:
from google.cloud import storage

In [2]:
def ensure_google_cloud_storage():
    """Install google-cloud-storage on the worker if it's missing."""
    import importlib
    import subprocess
    import sys

    try:
        importlib.import_module("google.cloud.storage")
    except ImportError:
        subprocess.check_call(
            [sys.executable, "-m", "pip", "install", "google-cloud-storage"]
        )
ensure_google_cloud_storage()

In [1]:
"""
Parallel daily CHLA(z) production using Dask-Gateway.

- Searches PACE L3M Rrs DAY granules via earthaccess
- For each granule/day:
    * downloads Rrs
    * runs BRT CHLA(z) prediction
    * computes integrated/peak metrics
    * writes a daily NetCDF locally
    * uploads to GCS
- Skips days that already exist in GCS unless FORCE_RERUN=True
"""

import os
from pathlib import Path
import tempfile

import numpy as np
import pandas as pd
import xarray as xr
import earthaccess
from google.cloud import storage
from dask_gateway import Gateway
from dask.distributed import Client

# --------------------------------------------------------------------------------------
# CONFIG
# --------------------------------------------------------------------------------------

# Path to your saved ML bundle (zip) – adjust as needed
BUNDLE_PATH = "models/brt_chla_profiles_bundle.zip"
BUNDLE_FILENAME = Path(BUNDLE_PATH).name  # "brt_chla_profiles_bundle.zip"

# GCS target
BUCKET_NAME = "nmfs_odp_nwfsc"
DESTINATION_PREFIX = "CB/fish-pace-datasets/chla-z/netcdf"

# Dask-Gateway settings
MIN_WORKERS = 4
MAX_WORKERS = 12
WORKER_CORES = 4
WORKER_MEMORY = "32GiB"

# Spatial chunking for NetCDF output
LAT_CHUNK = 100
LON_CHUNK = 100

# Rerun control: if False, skip days that already exist in GCS
FORCE_RERUN = False

# Optional date filtering for rrs_results (None = no filter)
START_DATE = None  # e.g. "2024-03-01"
END_DATE   = None  # e.g. "2024-04-30"

#START_DATE = "2024-04-01"
#END_DATE   = "2024-04-02"

import netrc
import json

netrc_path = os.path.expanduser("~/.netrc")
auth = netrc.netrc(netrc_path)
login, account, password = auth.authenticators("urs.earthdata.nasa.gov")
ED_USER = login
ED_PASS = password
with open("/home/jovyan/.config/gcloud/application_default_credentials.json") as f:
    GCP_SA_JSON = f.read()


# --------------------------------------------------------------------------------------
# Helper: load ML bundle and build CHLA profile dataset
# --------------------------------------------------------------------------------------

# Ensure ml_utils is available
if not os.path.exists("ml_utils.py"):
    import subprocess
    subprocess.run(
        [
            "wget",
            "-q",
            "https://raw.githubusercontent.com/fish-pace/chla-z-modeling/main/ml_utils.py",
        ],
        check=True,
    )

import ml_utils as mu  # noqa: E402

# Load the bundle once on the client side; workers will receive it via pickling
# DELETE
# bundle = mu.load_ml_bundle(BUNDLE_PATH)

#######################
# - Helper
#######################
def ensure_google_cloud_storage():
    """Install google-cloud-storage on the worker if it's missing."""
    import importlib
    import subprocess
    import sys

    try:
        importlib.import_module("google.cloud.storage")
    except ImportError:
        subprocess.check_call(
            [sys.executable, "-m", "pip", "install", "google-cloud-storage"]
        )


def build_chla_profile_dataset(CHLA: xr.DataArray) -> xr.Dataset:
    """
    Given CHLA(time, z, lat, lon), compute derived metrics and
    return an xr.Dataset suitable for writing to Zarr/NetCDF.
    """
    # Start from CHLA's own dataset so its coords (including z_start/z_end) win
    ds = CHLA.to_dataset(name="CHLA")

    # ---- Layer thickness (z dimension) ----
    z_start = CHLA.coords.get("z_start", None)
    z_end   = CHLA.coords.get("z_end", None)

    if (z_start is not None) and (z_end is not None):
        z_thick = (z_end - z_start).rename("z_thickness")   # (z)
    else:
        # fallback: uniform layer thickness, e.g. 10 m
        z_thick = xr.full_like(CHLA["z"], 10.0).rename("z_thickness")

    z_center = CHLA["z"]

    # total CHLA in column (used for validity + center-of-mass)
    col_total = CHLA.sum("z")          # (time, lat, lon)
    valid = col_total > 0              # True where there is some CHLA

    # ---- Integrated CHLA (nominal 0–200 m; actual range = z extent) ----
    CHLA_int = (CHLA * z_thick).sum("z")
    CHLA_int = CHLA_int.where(valid)
    CHLA_int.name = "CHLA_int_0_200"

    # ---- Peak value and depth (NaN-safe) ----
    CHLA_filled = CHLA.fillna(-np.inf)
    peak_idx = CHLA_filled.argmax("z")       # (time, lat, lon) integer indices

    CHLA_peak = CHLA.isel(z=peak_idx).where(valid)
    CHLA_peak.name = "CHLA_peak"

    CHLA_peak_depth = z_center.isel(z=peak_idx).where(valid)
    CHLA_peak_depth.name = "CHLA_peak_depth"

    # ---- Depth-weighted mean depth (center of mass) ----
    num = (CHLA * z_center).sum("z")
    den = col_total
    depth_cm = (num / den).where(valid)
    depth_cm.name = "CHLA_depth_center_of_mass"

    # ---- Attach derived fields to the dataset ----
    ds["CHLA_int_0_200"] = CHLA_int
    ds["CHLA_peak"] = CHLA_peak
    ds["CHLA_peak_depth"] = CHLA_peak_depth
    ds["CHLA_depth_center_of_mass"] = depth_cm
    ds["z_thickness"] = z_thick

    # ---- Variable attributes ----
    ds["CHLA"].attrs.setdefault("units", "mg m-3")
    ds["CHLA"].attrs.setdefault("long_name", "Chlorophyll-a concentration")
    ds["CHLA"].attrs.setdefault(
        "description",
        "BRT-derived chlorophyll-a profiles from PACE hyperspectral Rrs",
    )

    ds["CHLA_int_0_200"].attrs.update(
        units="mg m-2",
        long_name="Depth-integrated chlorophyll-a",
        description=(
            "Vertical integral of CHLA over the available depth bins "
            "(nominally 0–200 m; actual range defined by z_start/z_end)."
        ),
    )

    ds["CHLA_peak"].attrs.update(
        units="mg m-3",
        long_name="Peak chlorophyll-a concentration in the water column",
        description="Maximum CHLA value over depth at each (time, lat, lon).",
    )

    ds["CHLA_peak_depth"].attrs.update(
        units="m",
        long_name="Depth of peak chlorophyll-a",
        positive="down",
        description=(
            "Depth (bin center) where CHLA is maximal in the water column "
            "at each (time, lat, lon)."
        ),
    )

    ds["CHLA_depth_center_of_mass"].attrs.update(
        units="m",
        long_name="Chlorophyll-a depth center of mass",
        positive="down",
        description=(
            "Depth of the chlorophyll-a center of mass, computed as "
            "sum_z(CHLA * z) / sum_z(CHLA)."
        ),
    )

    ds["z_thickness"].attrs.update(
        units="m",
        long_name="Layer thickness",
        description=(
            "Thickness of each vertical bin used for depth integration. "
            "Derived from z_end - z_start when available; otherwise set to a "
            "uniform nominal thickness."
        ),
    )

    return ds


# --------------------------------------------------------------------------------------
# Worker-side function: process ONE granule/day
# --------------------------------------------------------------------------------------

from functools import partial

def process_one_granule(
    res,
    lat_chunk=LAT_CHUNK,
    lon_chunk=LON_CHUNK,
    bucket_name=BUCKET_NAME,
    destination_prefix=DESTINATION_PREFIX,
    force_rerun=FORCE_RERUN,
    ed_username=ED_USER,
    ed_password=ED_PASS,
    gcp_sa_json=GCP_SA_JSON,
    bundle_filename=BUNDLE_FILENAME,
):
    import os
    import tempfile
    import earthaccess
    import xarray as xr
    import pandas as pd
    from pathlib import Path
    import ml_utils as mu  # <- now workers can import this

    # --- ensure google-cloud-storage is available on THIS worker ---
    import importlib
    import subprocess
    import sys
    try:
        importlib.import_module("google.cloud.storage")
    except ImportError:
        subprocess.check_call(
            [sys.executable, "-m", "pip", "install", "google-cloud-storage"]
        )

    from google.cloud import storage  # now this should succeed

    # --- locate the bundle file next to ml_utils.py ---
    bundle_path = Path(mu.__file__).with_name(bundle_filename)
    # just to be extra defensive:
    if not bundle_path.exists():
        raise FileNotFoundError(f"Bundle not found at {bundle_path}")

    bundle = mu.load_ml_bundle(str(bundle_path))
    
    # Load bundle on the worker from the uploaded zip file
    #bundle = mu.load_ml_bundle(bundle_filename)

    # --- EARTHACCESS AUTH VIA ENV VARS (inside worker) ---
    if ed_username is not None and ed_password is not None:
        os.environ["EARTHDATA_USERNAME"] = ed_username
        os.environ["EARTHDATA_PASSWORD"] = ed_password

    auth = earthaccess.login(strategy="environment", persist=False)

    # --- GCP AUTH VIA JSON TEXT (inside worker) ---
    import uuid

    cred_path = None
    if gcp_sa_json:
        cred_path = os.path.join(tempfile.gettempdir(), f"gcp_sa_worker_{uuid.uuid4().hex}.json")
        with open(cred_path, "w") as f:
            f.write(gcp_sa_json)
        os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = cred_path

    # -------------------------------
    #  Normal per-day pipeline below
    # -------------------------------
    day_iso = res["umm"]["TemporalExtent"]["RangeDateTime"]["BeginningDateTime"]
    day = pd.to_datetime(day_iso)
    day_str = day.strftime("%Y%m%d")

    storage_client = storage.Client(project="noaa-gcs-public-data")
    bucket = storage_client.bucket(bucket_name)
    blob_path = f"{destination_prefix}/chla_z_{day_str}.nc"
    blob = bucket.blob(blob_path)

    if blob.exists() and not force_rerun:
        msg = f"[{day_str}] SKIP (exists at gs://{bucket_name}/{blob_path})"
        print(msg)
        return msg

    files = earthaccess.open([res], pqdm_kwargs={"disable": True})
    rrs_ds = xr.open_dataset(files[0])
    # debug
    # rrs_ds = rrs_ds.sel(lat=slice(40, 20), lon=slice(-70, -60) )

    try:
        if "time" in rrs_ds.dims:
            R = rrs_ds["Rrs"].sel(time=day).squeeze("time")
        else:
            R = rrs_ds["Rrs"]
        R = R.transpose("lat", "lon", "wavelength")

        pred = bundle.predict(
            R,
            brt_models=bundle.model,
            feature_cols=bundle.meta["feature_cols"],
            consts={"solar_hour": 0, "type": 1},
            chunk_size_lat=100,
            time=day.to_datetime64(),
            z_name="z",
            silent=True,
        )

        ds_day = build_chla_profile_dataset(pred)

        tmp_dir = Path(tempfile.gettempdir())
        local_path = tmp_dir / f"chla_z_{day_str}.nc"

        encoding = {
            "CHLA": {
                "dtype": "float32",
                "zlib": True,
                "complevel": 4,
                "chunksizes": (1, ds_day.sizes["z"], lat_chunk, lon_chunk),
            }
        }

        ds_day.to_netcdf(local_path, engine="h5netcdf", encoding=encoding)
        blob.upload_from_filename(str(local_path))
        local_path.unlink(missing_ok=True)

        gcs_url = f"gs://{bucket_name}/{blob_path}"
        msg = f"[{day_str}] WROTE {gcs_url}"
        print(msg)
        return msg

    finally:
        rrs_ds.close()
        # optional: clean up the creds file
        if cred_path is not None:
            try:
                os.remove(cred_path)
            except FileNotFoundError:
                pass

# --------------------------------------------------------------------------------------
# DRIVER: search granules, filter, and dispatch via Dask-Gateway
# --------------------------------------------------------------------------------------

def main():
    # 1. Earthaccess login on client
    auth = earthaccess.login(strategy="netrc", persist=True)
    if not auth.authenticated:
        raise RuntimeError("earthaccess login failed")

    # 2. Search PACE L3M Rrs daily granules
    rrs_results = earthaccess.search_data(
        short_name="PACE_OCI_L3M_RRS",
        granule_name="*.DAY.*.4km.nc",
        temporal=(START_DATE, END_DATE),
    )

    print(f"Found {len(rrs_results)} DAY granules after date filter.")
    if not rrs_results:
        print("Nothing to do.")
        return

    # 4. Dask-Gateway cluster setup
    gateway = Gateway()
    options = gateway.cluster_options()
    setattr(options, "worker_resource_allocation", '4CPU, 30.2Gi')
    
    cluster = gateway.new_cluster(options)
    cluster.adapt(minimum=MIN_WORKERS, maximum=MAX_WORKERS)

    client = cluster.get_client()
    print(cluster)
    print(client)

    # Dashboard link (copy/paste into a browser tab)
    print("Dask dashboard:", client.dashboard_link)

    # Make sure workers have needed files
    client.upload_file("ml_utils.py")
    client.upload_file(BUNDLE_PATH)

    # ensure google-cloud-storage is installed on every worker
    client.run(ensure_google_cloud_storage)

    # 5. Dispatch one task per granule
    futures = client.map(process_one_granule, rrs_results)

    # 6. Stream results as they complete (instead of blocking on gather)
    from dask.distributed import as_completed

    n = len(futures)
    done = 0
    errors = 0

    try:
        for fut in as_completed(futures):
            try:
                msg = fut.result()
                done += 1
                print(f"[{done}/{n}] {msg}")
            except Exception as e:
                errors += 1
                done += 1
                print(f"[{done}/{n}] ERROR: {repr(e)}")
                # If you want to stop on first error, uncomment:
                # raise
    finally:
        print(f"Finished. Success={done - errors}, Errors={errors}")
        client.close()
        cluster.close()


In [None]:
# took 10 hours for 560 files
if __name__ == "__main__": main()

In [2]:
# need to rerun a few days
START_DATE = '20250209'
END_DATE   = '20250211'
if __name__ == "__main__": main()

Found 3 DAY granules after date filter.
GatewayCluster<prod.0d2bdc79e25a43f3b9a184b8bb2c7020, status=running>
<Client: 'tls://192.168.35.98:8786' processes=0 threads=0, memory=0 B>
Dask dashboard: /services/dask-gateway/clusters/prod.0d2bdc79e25a43f3b9a184b8bb2c7020/status
[1/3] [20250211] WROTE gs://nmfs_odp_nwfsc/CB/fish-pace-datasets/chla-z/netcdf/chla_z_20250211.nc
[2/3] [20250210] WROTE gs://nmfs_odp_nwfsc/CB/fish-pace-datasets/chla-z/netcdf/chla_z_20250210.nc
[3/3] [20250209] WROTE gs://nmfs_odp_nwfsc/CB/fish-pace-datasets/chla-z/netcdf/chla_z_20250209.nc
Finished. Success=3, Errors=0


# Process Zarr

Should have baked this into the first pipeline. Alas.

In [1]:
from __future__ import annotations

from pathlib import Path
import os
import re
import uuid
import tempfile

import pandas as pd
import xarray as xr
import gcsfs
import zarr

from dask_gateway import Gateway
from dask.distributed import as_completed


# -----------------------
# CONFIG
# -----------------------
TOKEN_PATH = "/home/jovyan/.config/gcloud/application_default_credentials.json"

BUCKET = "nmfs_odp_nwfsc"
NETCDF_PREFIX = "CB/fish-pace-datasets/chla-z/netcdf"
NETCDF_PATTERN = f"{BUCKET}/{NETCDF_PREFIX}/chla_z_*.nc"

# You said you deleted this prefix already. Keep it Zarr v2 only.
ZARR_PATH = f"gcs://{BUCKET}/CB/fish-pace-datasets/chla-z/zarr_v2"

LAT_CHUNK = 128
LON_CHUNK = 128

MIN_WORKERS = 4
MAX_WORKERS = 12
WORKER_RESOURCE = "4CPU, 30.2Gi"


# -----------------------
# HELPERS
# -----------------------
_date_re = re.compile(r"chla_z_(\d{8})\.nc$")


def date_from_url(gcs_url: str) -> pd.Timestamp:
    m = _date_re.search(gcs_url)
    if not m:
        raise ValueError(f"Could not parse date from: {gcs_url}")
    return pd.to_datetime(m.group(1), format="%Y%m%d")


def list_netcdf_urls(fs: gcsfs.GCSFileSystem) -> list[str]:
    paths = sorted(fs.glob(NETCDF_PATTERN))
    urls = ["gcs://" + p for p in paths]
    urls = sorted(urls, key=date_from_url)
    return urls


def build_time_index(urls: list[str]) -> pd.DatetimeIndex:
    times = pd.to_datetime([date_from_url(u) for u in urls]).astype("datetime64[ns]")
    return pd.DatetimeIndex(times, name="time")


def _chunk_spec(ds: xr.Dataset, lat_chunk: int, lon_chunk: int) -> dict:
    spec: dict[str, int] = {}
    if "time" in ds.dims:
        spec["time"] = 1
    if "z" in ds.dims:
        spec["z"] = ds.sizes["z"]  # keep full z together
    if "lat" in ds.dims:
        spec["lat"] = lat_chunk
    if "lon" in ds.dims:
        spec["lon"] = lon_chunk
    return spec


def read_sa_json(token_path: str = TOKEN_PATH) -> str:
    with open(token_path, "r") as f:
        return f.read()


def setup_gcp_on_worker(gcp_sa_json: str) -> str:
    """
    Write SA json to a unique temp file and point GOOGLE_APPLICATION_CREDENTIALS at it.
    Returns the path so caller can optionally delete it.
    """
    cred_path = os.path.join(
        tempfile.gettempdir(), f"gcp_sa_worker_{uuid.uuid4().hex}.json"
    )
    with open(cred_path, "w") as f:
        f.write(gcp_sa_json)
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = cred_path
    return cred_path


def make_gcsfs_with_sa_json(gcp_sa_json: str) -> tuple[gcsfs.GCSFileSystem, str]:
    """
    Create a GCSFileSystem using a temp SA json file on this machine (client or worker).
    Uses token='google_default' so gcsfs reads GOOGLE_APPLICATION_CREDENTIALS.
    """
    cred_path = setup_gcp_on_worker(gcp_sa_json)
    fs = gcsfs.GCSFileSystem(token="google_default")
    return fs, cred_path


def create_zarr_template_by_write_then_resize(
    gcp_sa_json: str,
    first_url: str,
    times: pd.DatetimeIndex,
    zarr_path: str,
    lat_chunk: int,
    lon_chunk: int,
) -> None:
    """
    Create Zarr v2 store safely by:
      1) downloading/opening first NetCDF locally
      2) writing it to Zarr (mode="w") -> creates arrays/metadata (v2)
      3) resizing arrays along 'time' to full length (metadata-only)
      4) writing full time coord values
    """
    fs, cred_path = make_gcsfs_with_sa_json(gcp_sa_json)

    tmp_dir = Path("/tmp/chla_zarr_template")
    tmp_dir.mkdir(parents=True, exist_ok=True)

    local_nc = tmp_dir / Path(first_url).name
    fs.get(first_url, str(local_nc))

    ds0 = xr.open_dataset(local_nc, engine="h5netcdf")
    try:
        if "time" in ds0.coords:
            ds0 = ds0.assign_coords(time=ds0["time"].astype("datetime64[ns]"))

        ds0 = ds0.chunk(_chunk_spec(ds0, lat_chunk, lon_chunk))

        mapper = fs.get_mapper(zarr_path)

        # 1) Write first day (creates the store cleanly)
        ds0.to_zarr(
            mapper,
            mode="w",
            consolidated=False,
            zarr_format=2,
        )

        # 2) Resize time on all arrays that have a 'time' dimension
        full_len = len(times)
        root = zarr.open_group(mapper, mode="r+")

        for name, arr in root.arrays():
            dims = arr.attrs.get("_ARRAY_DIMENSIONS", None)
            if not dims or "time" not in dims:
                continue
            t_axis = list(dims).index("time")
            newshape = list(arr.shape)
            newshape[t_axis] = full_len
            if tuple(newshape) != arr.shape:
                arr.resize(tuple(newshape))

        # 3) Fill the time coordinate values (store as int64 ns)
        time_int = times.values.astype("datetime64[ns]").astype("int64")
        if "time" in root:
            root["time"][:] = time_int
            root["time"].attrs["_ARRAY_DIMENSIONS"] = ["time"]

    finally:
        ds0.close()
        local_nc.unlink(missing_ok=True)
        try:
            os.remove(cred_path)
        except FileNotFoundError:
            pass


def write_one_day_region(
    gcs_url: str,
    time_index: int,
    zarr_path: str,
    gcp_sa_json: str,
    lat_chunk: int,
    lon_chunk: int,
) -> str:
    """
    Worker task:
    - write SA json to /tmp on worker and set GOOGLE_APPLICATION_CREDENTIALS
    - download NC to local /tmp (no streaming)
    - open local
    - rechunk
    - write into correct time slice via region writes
    """
    from pathlib import Path
    import os
    import xarray as xr
    import gcsfs

    cred_path = setup_gcp_on_worker(gcp_sa_json)
    fs = gcsfs.GCSFileSystem(token="google_default")

    tmp_dir = Path("/tmp/chla_nc_to_zarr_workers")
    tmp_dir.mkdir(parents=True, exist_ok=True)
    local_nc = tmp_dir / Path(gcs_url).name

    fs.get(gcs_url, str(local_nc))

    ds = xr.open_dataset(local_nc, engine="h5netcdf")
    try:
        if "time" in ds.coords:
            ds = ds.assign_coords(time=ds["time"].astype("datetime64[ns]"))

        ds = ds.chunk(_chunk_spec(ds, lat_chunk, lon_chunk))

        region = {"time": slice(time_index, time_index + 1)}

        ds.to_zarr(
            fs.get_mapper(zarr_path),
            mode="r+",
            region=region,
            consolidated=False,
            zarr_format=2,
        )

        return f"OK {Path(gcs_url).name} -> time_index={time_index}"

    finally:
        ds.close()
        local_nc.unlink(missing_ok=True)
        try:
            os.remove(cred_path)
        except FileNotFoundError:
            pass


def consolidate_zarr(zarr_path: str, gcp_sa_json: str) -> None:
    fs, cred_path = make_gcsfs_with_sa_json(gcp_sa_json)
    try:
        mapper = fs.get_mapper(zarr_path)
        zarr.consolidate_metadata(mapper)
    finally:
        try:
            os.remove(cred_path)
        except FileNotFoundError:
            pass


# -----------------------
# MAIN
# -----------------------
def zarr_main():
    # Read SA JSON once on the client and pass the TEXT to workers
    gcp_sa_json = read_sa_json(TOKEN_PATH)

    # Use SA JSON on client too (don’t rely on file path existing in workers)
    fs, cred_path = make_gcsfs_with_sa_json(gcp_sa_json)
    try:
        urls = list_netcdf_urls(fs)
        if not urls:
            raise RuntimeError("No NetCDF files found with pattern: " + NETCDF_PATTERN)

        print("nfiles:", len(urls), "first:", urls[0])

        times = build_time_index(urls)

        print("Creating Zarr template (write first day, then resize time)…")
        create_zarr_template_by_write_then_resize(
            gcp_sa_json=gcp_sa_json,
            first_url=urls[0],
            times=times,
            zarr_path=ZARR_PATH,
            lat_chunk=LAT_CHUNK,
            lon_chunk=LON_CHUNK,
        )
        print("Template created:", ZARR_PATH)

        gateway = Gateway()
        options = gateway.cluster_options()
        setattr(options, "worker_resource_allocation", WORKER_RESOURCE)

        cluster = gateway.new_cluster(options)
        cluster.adapt(minimum=MIN_WORKERS, maximum=MAX_WORKERS)
        client = cluster.get_client()

        print(cluster)
        print(client)
        print("Dask dashboard:", client.dashboard_link)

        # Optional quick auth sanity check on workers (fails fast if creds not working)
        def _worker_ls(smoke_prefix: str, gcp_sa_json: str) -> int:
            import gcsfs, os, tempfile, uuid
            cred = os.path.join(tempfile.gettempdir(), f"gcp_sa_worker_{uuid.uuid4().hex}.json")
            with open(cred, "w") as f:
                f.write(gcp_sa_json)
            os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = cred
            fsw = gcsfs.GCSFileSystem(token="google_default")
            out = fsw.ls(smoke_prefix)
            try:
                os.remove(cred)
            except FileNotFoundError:
                pass
            return len(out)

        try:
            nls = client.run(_worker_ls, f"{BUCKET}/CB/fish-pace-datasets/chla-z", gcp_sa_json)
            print("Worker auth smoke test (ls counts):", nls)
        except Exception as e:
            print("Worker auth smoke test FAILED:", repr(e))
            raise

        futures = []
        for idx, url in enumerate(urls):
            fut = client.submit(
                write_one_day_region,
                url,
                idx,
                ZARR_PATH,
                gcp_sa_json,   # <-- JSON TEXT to worker
                LAT_CHUNK,
                LON_CHUNK,
                pure=False,
            )
            futures.append(fut)

        n = len(futures)
        done = 0
        errors = 0

        try:
            for fut in as_completed(futures):
                done += 1
                try:
                    msg = fut.result()
                    print(f"[{done}/{n}] {msg}")
                except Exception as e:
                    errors += 1
                    print(f"[{done}/{n}] ERROR: {repr(e)}")

            print(f"Finished writing. Success={n - errors}, Errors={errors}")

            print("Consolidating Zarr metadata…")
            consolidate_zarr(ZARR_PATH, gcp_sa_json)
            print("Done + consolidated.")

        finally:
            client.close()
            cluster.close()

    finally:
        try:
            os.remove(cred_path)
        except FileNotFoundError:
            pass



In [None]:
if __name__ == "__main__":
    zarr_main()




nfiles: 560 first: gcs://nmfs_odp_nwfsc/CB/fish-pace-datasets/chla-z/netcdf/chla_z_20240305.nc
Creating Zarr template (write first day, then resize time)…




Template created: gcs://nmfs_odp_nwfsc/CB/fish-pace-datasets/chla-z/zarr_v2
GatewayCluster<prod.e6cdcf3cb77d41a49e9f7486bbcbe91a, status=running>
<Client: 'tls://192.168.63.220:8786' processes=0 threads=0, memory=0 B>
Dask dashboard: /services/dask-gateway/clusters/prod.e6cdcf3cb77d41a49e9f7486bbcbe91a/status
Worker auth smoke test (ls counts): {}
