## Lat/Lon Matchups for the Chlorophyll Rrs Data
### HABs Group - West Coast
* Keshav Dubedi
* Bikas Gupta
* Deborah Kutner
* Mathieu Richaud
* Dale Robinson
##### PACE Hack Week, January 2026

## Import libraries and set up environment

In [None]:
import logging
from pathlib import Path
from typing import Dict

import numpy as np
import pandas as pd
import xarray as xr
import tqdm.auto
# Kill all tqdm progress bars
# NOTE(review): this assigns a class-level attribute on tqdm. If tqdm sets
# `disable` on each instance in its constructor, the assignment below is
# shadowed and has no effect — confirm the bars are really suppressed
# (the TQDM_DISABLE environment variable or an explicit disable=True
# argument may be needed instead).
tqdm.auto.tqdm.disable = True
# Silence earthaccess logs: the "earthaccess" logger only emits records at
# ERROR level and above from here on.
logging.getLogger("earthaccess").setLevel(logging.ERROR)
import earthaccess
import os

## Set up logging

In [None]:
# Logger used by every cell below for progress and error messages.
logger = logging.getLogger(__name__)
# Configure the root logger so INFO-level records are emitted.
logging.basicConfig(level=logging.INFO)

## Define functions

In [None]:

def pace_file_finder(
    start_time: str,
    end_time: str,
    short_id: str = "PACE_OCI_L3M_CHL",
    granule_pattern: str = "*.DAY.CHL.V3_1.chlor_a.4km.nc",  # *.8D.CHL.V3_1.chlor_a.4km.nc
):
    """
    Search Earthdata for PACE OCI chlorophyll granules and open them.

    Args:
        start_time: Start of the temporal search window, ISO date string
            (YYYY-MM-DD).
        end_time: End of the temporal search window, ISO date string
            (YYYY-MM-DD).
        short_id: Earthdata collection short name to search.
        granule_pattern: Granule filename pattern; the default selects the
            daily 4 km chlor_a product (the trailing comment on the default
            shows the 8-day alternative).

    Returns:
        The result of ``earthaccess.open`` for the matching granules. The
        caller in this notebook passes the first element to
        ``xr.open_dataset`` and treats a falsy result as "no data".
    """
    search_kwargs = {
        "short_name": short_id,
        "temporal": (start_time, end_time),
        "granule_name": granule_pattern,
    }
    granules = earthaccess.search_data(**search_kwargs)

    # Hand the search hits straight to earthaccess to obtain openable handles.
    opened = earthaccess.open(granules)
    return opened


def extract_mean_chl(
    ds: xr.Dataset,
    lat: float,
    lon: float,
    location_code: str,
) -> float:
    """
    Extract a directionally averaged chlorophyll value.

    The grid cell nearest to (lat, lon) is located, then ``chlor_a`` is
    averaged over a small window of cells that starts at that cell and
    extends in a direction chosen by ``location_code``. Windows are clipped
    at the grid edges.

    Args:
        ds: Open PACE chlorophyll dataset with ``lat``/``lon`` coordinates
            and a ``chlor_a`` variable.
        lat: Target latitude.
        lon: Target longitude.
        location_code: Station identifier; selects the averaging window.
            Codes not listed below fall through to the "Southwest" window.

    Returns:
        Mean chlorophyll value, or NaN when the coordinates are not finite
        or every cell in the window is missing.
    """
    # A NaN/inf coordinate makes every distance NaN, and argmin would then
    # silently pick index 0 (a grid corner). Report "no data" instead of
    # returning chlorophyll from an unrelated location.
    if not (np.isfinite(lat) and np.isfinite(lon)):
        return float("nan")

    # Nearest-neighbor index lookup along each axis.
    lat_idx = int(np.abs(ds["lat"].values - lat).argmin())
    lon_idx = int(np.abs(ds["lon"].values - lon).argmin())

    ny = ds.sizes["lat"]
    nx = ds.sizes["lon"]

    def clip_slice(start, stop, maxval):
        """Return slice(start, stop) clamped to the valid range [0, maxval]."""
        return slice(max(start, 0), min(stop, maxval))

    # NOTE(review): the direction labels assume latitude is stored
    # north-to-south (increasing index = further south) and longitude
    # west-to-east — confirm against the product's coordinate ordering.
    if location_code in {"SIO", "NP"}:  # West: station cell + 8 cells westward
        lat_slice = lat_idx
        lon_slice = clip_slice(lon_idx - 8, lon_idx + 1, nx)

    elif location_code in {"CPP", "HAB_SCW", "SW", "TP"}:  # South: 9 cells
        lat_slice = clip_slice(lat_idx, lat_idx + 9, ny)
        lon_slice = lon_idx

    elif location_code == "MB":  # North: station cell + 8 cells northward
        lat_slice = clip_slice(lat_idx - 8, lat_idx + 1, ny)
        lon_slice = lon_idx

    else:  # Southwest: 3 x 3 box south and west of the station cell
        lat_slice = clip_slice(lat_idx, lat_idx + 3, ny)
        lon_slice = clip_slice(lon_idx - 2, lon_idx + 1, nx)

    window = ds["chlor_a"].isel(lat=lat_slice, lon=lon_slice)
    # skipna=True ignores missing cells inside the window.
    return window.mean(skipna=True).item()



## Set up for matchup run
* Set input and output file paths
* Log in to earthaccess
* Load in-situ data
* Create placeholder variables

In [None]:
# File locations: in-situ observations in, matched-up table out.
input_csv = "./calhabs_data.csv"
output_csv = "./cal_habs_pace_chl_new_DAY1.csv"

# Authenticate with Earthdata before any granule search is attempted.
earthaccess.login()

# Load the in-situ table and keep only rows dated on or after 2024-03-05.
# NOTE(review): the cutoff is a string comparison, so it relies on the
# "date" column holding ISO-formatted (YYYY-MM-DD) text — confirm.
df_insitu = pd.read_csv(input_csv)
df = df_insitu.loc[df_insitu["date"].ge("2024-03-05")].copy()

# Placeholder column; rows without a successful matchup remain NaN.
df["chlor_a"] = np.nan

## Main download and processing loop

In [None]:
# Process once per unique date: the granule for a date is searched for and
# opened a single time, then shared by every station sampled on that date.
for date, group in df.groupby("date"):
    logger.info("Processing date %s", date)

    files = pace_file_finder(date, date)
    if not files:
        logger.warning("No PACE data for %s", date)
        continue

    # Only the first matching granule is used for the date.
    with xr.open_dataset(files[0]) as ds:
        for idx, row in group.iterrows():
            try:
                chl = extract_mean_chl(
                    ds,
                    row["latitude"],
                    row["longitude"],
                    row["Location_Code"],
                )
            except Exception as e:
                # Best effort: one bad station must not abort the run. The
                # row keeps its NaN placeholder and the traceback is logged.
                logger.exception(
                    "Failed at %s (%s): %s",
                    row["Location_Code"],
                    date,
                    e,
                )
            else:
                df.at[idx, "chlor_a"] = chl


## Save results

In [None]:

# Write the matched-up table (without the DataFrame index) and log the path.
df.to_csv(path_or_buf=output_csv, index=False)
logger.info("Saved output to %s", output_csv)