In [None]:
import calendar
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import xarray as xr
from pyhdf.SD import SD, SDC
from tqdm import tqdm

## Download CHL Data

In [None]:
# Download the daily mapped chlorophyll HDF files for every year in `years`
# into `save_dir`. Files that are already on disk are skipped, so the cell
# can be re-run to resume an interrupted download.
years = [1998, 1999]
save_dir = "chl_data"
os.makedirs(save_dir, exist_ok=True)

# A single session lets all requests reuse the connection to the server.
with requests.Session() as session:
    for year in years:
        print(year)
        # Day-of-year runs 1..365, or 1..366 in a leap year.
        days_in_year = 366 if calendar.isleap(year) else 365
        for day in tqdm(range(1, days_in_year + 1)):
            day_str = f"{day:03d}"
            file_name = f"S{year}{day_str}_chl_mapped.hdf"
            url = f"https://spg-satdata.ucsd.edu/{year}/S{year}_chl_day/{file_name}"
            local_path = os.path.join(save_dir, file_name)

            if os.path.exists(local_path):
                continue  # skip if already downloaded

            # Write under a temporary name and rename only once the file is
            # complete; otherwise an interrupted write would leave a truncated
            # .hdf that the exists-check above would skip on every re-run.
            part_path = local_path + ".part"
            try:
                r = session.get(url, timeout=30)
                if r.status_code == 200:
                    with open(part_path, "wb") as f:
                        f.write(r.content)
                    os.replace(part_path, local_path)
                else:
                    # Not every failure is a 404, so report the actual status.
                    print(f"File not available (HTTP {r.status_code}): {url}")
            except (requests.RequestException, OSError) as e:
                # Best effort: report the failure and carry on with the next day.
                print(f"Error downloading {url}: {e}")
                if os.path.exists(part_path):
                    os.remove(part_path)

In [None]:
# Fetch the daily composite files for the years 2000-2007 with wget.
# The `!` line is an IPython shell escape; `{y}` is filled in from the Python
# loop variable before the command runs.
# wget options used:
#   -e robots=off   ignore the server's robots.txt
#   -nd             do not recreate the remote directory tree locally
#   -r -np -l1      recurse one level below the given URL, never into the parent
#   -A <pattern>    keep only files whose name matches the pattern
#   -P chl_data     save everything under the chl_data directory
for y in range(2000, 2008):
    !wget -e robots=off -nd -r -np -l1 \
          -A "C{y}*_chl_comp.hdf" \
          https://spg-satdata.ucsd.edu/{y}/C{y}_chl_day/ \
          -P chl_data

In [None]:
# Example file: open one daily composite and list the datasets it contains.
# The path is relative to the notebook's working directory (the same
# ../data layout the other cells in this notebook use) rather than a
# machine-specific absolute path, so the cell runs on any checkout.
file_path = "../data/chl_data/C2000001_chl_comp.hdf"

# Open the HDF file read-only
hdf = SD(file_path, SDC.READ)

# List all datasets
print("Datasets in HDF file:", hdf.datasets())

In [None]:
# Read the full 'chl_2000001' dataset from the open file into an array,
# then report its dimensions and raw (unscaled) value range.
chl_sds = hdf.select('chl_2000001')
chl_data = chl_sds[:]  # shape (rows, cols)
chl_min, chl_max = chl_data.min(), chl_data.max()
print("Shape of CHL array:", chl_data.shape)
print("Min/max CHL values:", chl_min, chl_max)

In [None]:
# Show the metadata attached to the 'chl_2000001' dataset.
sds = hdf.select('chl_2000001')
sds_attributes = sds.attributes()
print(sds_attributes)

In [None]:
# Display the attributes attached to the open HDF file itself (as opposed to
# those of a single dataset); the value is shown as the cell's output.
hdf.attributes()

In [None]:
# Read one chlorophyll HDF file, convert the stored values to physical
# units, and draw them on an approximate latitude/longitude grid.

# --- Open your HDF file ---
# Path is relative to the notebook's working directory (the same ../data
# layout the other cells in this notebook use), not a machine-specific
# absolute path.
chl_dir = "../data/chl_data"
# Alternative example file: "C2000001_chl_comp.hdf"
hdf_file = os.path.join(chl_dir, "S1999351_chl_mapped.hdf")
hdf = SD(hdf_file, SDC.READ)

# --- Select dataset ---
# Take the first dataset in the file, whatever it is called.
sds_name = list(hdf.datasets().keys())[0]
sds = hdf.select(sds_name)
sds_attrs = sds.attributes()  # read once, reused for fill value and scaling
chl_data = sds[:]  # 2-D array (rows, cols)

# --- Handle fill values ---
fill_value = sds_attrs['_FillValue']
# Cast to float first so that NaN can be stored in the array.
chl_data = chl_data.astype(np.float32)
chl_data[chl_data == fill_value] = np.nan  # mask missing values

# --- Apply scaling (from attributes) ---
slope = sds_attrs['Slope']
intercept = sds_attrs['Intercept']
base = sds_attrs['Base']

# Original equation: Base ** (Slope * data + Intercept)
chl_data_scaled = base ** (slope * chl_data + intercept)

# --- Approximate lat/lon grid ---
# Evenly spaced coordinates over a fixed bounding box; this is only an
# approximation of the true pixel positions.
nrows, ncols = chl_data.shape
min_lat, max_lat = 16, 45
min_lon, max_lon = -135, -100

# Row 0 is placed at the northern edge, so latitude runs from max to min.
lats = np.linspace(max_lat, min_lat, nrows)
lons = np.linspace(min_lon, max_lon, ncols)
lon_grid, lat_grid = np.meshgrid(lons, lats)

# --- Plot ---
fig, ax = plt.subplots(figsize=(12, 6))
mesh = ax.pcolormesh(lon_grid, lat_grid, chl_data_scaled, shading='auto', cmap='viridis')
fig.colorbar(mesh, ax=ax, label='Chlorophyll (mg m^-3)')
ax.set_xlabel('Longitude')
ax.set_ylabel('Latitude')
# Title is the bare file name. os.path.basename avoids reusing single quotes
# inside a single-quoted f-string, which is a SyntaxError before Python 3.12.
ax.set_title(os.path.basename(hdf_file))
plt.show()

In [None]:
# Build a lookup from calendar date (YYYY-MM-DD) to chlorophyll file name

import os
from datetime import datetime

# Folder holding the .hdf files
directory = '../data/chl_data'
chl_index = {}

for hdf_name in os.listdir(directory):
    # Only HDF files take part in the index
    if not hdf_name.endswith(".hdf"):
        continue
    try:
        # The text between the leading letter and the first underscore is
        # the date as YYYYJJJ, e.g. '1998181' or '2007134'
        prefix = hdf_name.split('_')[0]
        year_doy = prefix[1:]
        # %Y is the 4-digit year, %j the day of the year (001-366);
        # the key is stored as YYYY-MM-DD
        iso_date = datetime.strptime(year_doy, '%Y%j').strftime('%Y-%m-%d')
    except (ValueError, IndexError):
        # Names that do not follow the expected pattern are reported and left out
        print(f"Skipping incompatible file: {hdf_name}")
    else:
        chl_index[iso_date] = hdf_name

# Re-create the dictionary with its keys (dates) in ascending order
chl_index = dict(sorted(chl_index.items()))

In [None]:
# Persist chl_index to disk as indented JSON

import json

index_path = '../data/chl_data/chl_index.json'

# Serialise first, then write the text out in one go
with open(index_path, 'w') as out_file:
    out_file.write(json.dumps(chl_index, indent=4))

In [None]:
import pydeck as pdk
import pandas as pd

# Read the latitude/longitude grids from their own HDF file
hdf_file = "../data/cal_aco_3840_Latitude_Longitude.hdf"
hdf = SD(hdf_file, SDC.READ)

# NOTE(review): assumes the file lists latitude first and longitude second - confirm
dataset_names = list(hdf.datasets().keys())
lat_name = dataset_names[0]
lon_name = dataset_names[1]
lats = hdf.select(lat_name)[:]
lons = hdf.select(lon_name)[:]

# Thin the grid: keep every 10th pixel along both axes
stride = 10
thin = slice(None, None, stride)
lat_sub = lats[thin, thin]
lon_sub = lons[thin, thin]

# One row per retained pixel
df = pd.DataFrame({"lat": lat_sub.ravel(), "lon": lon_sub.ravel()})

# Each retained pixel is drawn as a semi-transparent dot
layer = pdk.Layer(
    "ScatterplotLayer",
    data=df,
    get_position=["lon", "lat"],
    get_radius=4000,              # meters; tweak if needed
    get_fill_color=[0, 120, 200, 80],
    pickable=False,
)

# Centre the initial view on the mean position of the plotted points
center_lat = float(df["lat"].mean())
center_lon = float(df["lon"].mean())
view_state = pdk.ViewState(latitude=center_lat, longitude=center_lon, zoom=3)

deck = pdk.Deck(layers=[layer], initial_view_state=view_state)

# Last expression of the cell: renders the map
deck