In [None]:
| Product Name       | Variable | URL                                                                                     |
|-------------------|----------|-----------------------------------------------------------------------------------------|
| PACE_OCI_L2_AOP    | Rrs      | [Link](https://www.earthdata.nasa.gov/data/catalog/ob-cloud-pace-oci-l2-aop-3.1#overview) |
| PACE_OCI_L2_BGC    | chlor_a  | [Link](https://www.earthdata.nasa.gov/data/catalog/ob-cloud-pace-oci-l2-bgc-3.1)        |
| PACE_OCI_L2_IOP    | bbp      | [Link](https://www.earthdata.nasa.gov/data/catalog/ob-cloud-pace-oci-l2-iop-3.1)        |


In [2]:
import earthaccess
import xarray as xr
import pandas as pd
import numpy as np


# Pipeline that lines up several Level-2 PACE products on a shared time axis for later analysis.

# Search / subsetting box in decimal degrees ("Hopkins bounds", still flagged as needing an update).
# NOTE(review): the box is 2.002 deg tall in latitude but only 1.004 deg wide in longitude --
# confirm that asymmetry is intended when the bounds are revised.
BOUNDS = dict(min_lon=-121.904, max_lon=-120.900, min_lat=35.621, max_lat=37.623)

# (start, end) dates handed to the granule search as its `temporal` filter.
DATE_RANGE = "2024-04-01", "2024-04-02"

# Product short name -> variable pulled from that product's `geophysical_data` group.
PRODUCTS = dict(
    PACE_OCI_L2_AOP="Rrs",  # not band specific, here just for now
    PACE_OCI_L2_BGC="chlor_a",
    PACE_OCI_L2_IOP="bbp",
)

# Log in to Earthdata before any search / open call is made.
auth = earthaccess.login()

# Per-product results, keyed by product short name; filled in by the processing loop.
final_data = dict()

def _summarize_granule(granule_file, var_name, bounds):
    """Reduce one granule to the mean of ``var_name`` inside ``bounds``.

    Parameters
    ----------
    granule_file
        File-like object for a single granule, as yielded by ``earthaccess.open``.
    var_name : str
        Variable to read from the granule's ``geophysical_data`` group.
    bounds : dict
        Mapping with ``min_lon``, ``max_lon``, ``min_lat`` and ``max_lat`` keys.

    Returns
    -------
    xarray.DataArray or None
        The in-bounds mean with a length-1 ``time`` dimension taken from the
        granule's ``time_coverage_start`` attribute, or ``None`` when the
        granule has no pixel inside ``bounds`` or no non-NaN value there.

    Raises
    ------
    KeyError
        If ``var_name`` is not present in the ``geophysical_data`` group.
    """
    # Every dataset is opened through a context manager so its handle is released
    # once the granule has been summarised, instead of accumulating across the loop.
    with xr.open_dataset(granule_file, group="navigation_data", engine="h5netcdf") as ds_nav:
        # Load the coordinates eagerly: the mask must be concrete for `drop=True`.
        lat = ds_nav.latitude.compute()
        lon = ds_nav.longitude.compute()
        in_bounds = (
            (lat >= bounds['min_lat']) & (lat <= bounds['max_lat']) &
            (lon >= bounds['min_lon']) & (lon <= bounds['max_lon'])
        )

        with xr.open_dataset(
            granule_file, group="geophysical_data", chunks={}, engine="h5netcdf"
        ) as ds_geophys:
            if var_name not in ds_geophys:
                # Fail with the list of real names rather than a bare KeyError('bbp').
                raise KeyError(
                    f"{var_name} not found in geophysical_data; "
                    f"available: {list(ds_geophys.data_vars)}"
                )
            subset = ds_geophys[var_name].where(in_bounds, drop=True).compute()

            # Nothing inside the box, or only fill/NaN pixels there: averaging would
            # yield NaN, which must not be reported as a valid observation.
            if subset.size == 0 or int(subset.count()) == 0:
                return None

            # The acquisition time lives in the root group's global attributes.
            # NOTE(review): the value is parsed as-is; if it carries a UTC offset the
            # resulting time coordinate is timezone-aware -- confirm downstream use.
            with xr.open_dataset(granule_file, engine="h5netcdf") as ds_meta:
                time_val = pd.to_datetime(ds_meta.attrs['time_coverage_start'])

    # Collapsed to one value per granule to avoid alignment errors between granules.
    # NOTE(review): every dimension not named exactly 'wavelength', 'bins' or 'time'
    # is averaged away; a spectral axis with any other name is collapsed too --
    # confirm the products' dimension names if per-band values are wanted.
    spatial_dims = [d for d in subset.dims if d not in ['wavelength', 'bins', 'time']]
    # A plain mean might misrepresent scenes with uneven water coverage.
    return subset.mean(dim=spatial_dims).expand_dims(time=[time_val])


# For each product: search, open every granule, reduce it to one in-bounds mean,
# then stack the per-granule means into a time series.
for prod_name, var_name in PRODUCTS.items():
    print(f" --- {prod_name} ---")
    query = earthaccess.search_data(
        short_name=prod_name,
        bounding_box=(BOUNDS['min_lon'], BOUNDS['min_lat'], BOUNDS['max_lon'], BOUNDS['max_lat']),
        temporal=DATE_RANGE
    )
    if not query:
        print(f"No granules found for {prod_name}")
        continue
    files = earthaccess.open(query)
    granule_list = []

    for f in files:
        try:
            granule_summary = _summarize_granule(f, var_name, BOUNDS)
        except Exception as e:
            # Best effort: one unreadable granule must not abort the whole product.
            # repr() keeps the exception type visible (str(KeyError) is just the key).
            print(f"skipping granule: {e!r}")
            continue
        if granule_summary is not None:
            granule_list.append(granule_summary)

    if not granule_list:
        # Make the empty case visible instead of ending the product silently.
        print(f"No valid observations for {var_name}")
        continue

    # Concatenate along the time dimension, in chronological order.
    full_ds = xr.concat(granule_list, dim="time").sortby("time")

    summary_df = full_ds.to_dataframe(name=var_name)

    final_data[prod_name] = {
        "dataset": full_ds,
        "csv_ready": summary_df
    }
    print(f"{len(granule_list)} - valid observations for {var_name}")

# Merge every product's per-granule table into one wide table; pd.concat(axis=1)
# places the products side by side as columns, aligned on the time index.
if final_data:
    master_table = pd.concat([v['csv_ready'] for v in final_data.values()], axis=1)
    print(master_table.head())
else:
    # Always bind master_table: otherwise a re-run that finds no data would leave a
    # stale table from an earlier execution (or no name at all) for later cells.
    master_table = pd.DataFrame()
    print("No data retrieved for any product; master_table is empty")

 --- PACE_OCI_L2_AOP ---


QUEUEING TASKS | :   0%|          | 0/3 [00:00<?, ?it/s]

PROCESSING TASKS | :   0%|          | 0/3 [00:00<?, ?it/s]

COLLECTING RESULTS | :   0%|          | 0/3 [00:00<?, ?it/s]

3 - valid observations for Rrs
 --- PACE_OCI_L2_BGC ---


QUEUEING TASKS | :   0%|          | 0/3 [00:00<?, ?it/s]

PROCESSING TASKS | :   0%|          | 0/3 [00:00<?, ?it/s]

COLLECTING RESULTS | :   0%|          | 0/3 [00:00<?, ?it/s]

3 - valid observations for chlor_a
                                       Rrs   chlor_a
time                                                
2024-04-01 20:46:28.184000+00:00  0.001547  1.359915
2024-04-02 19:43:11.165000+00:00 -0.000698  7.534161
2024-04-02 21:21:31.077000+00:00  0.000736  2.817072
