In [3]:
from pathlib import Path
import pandas as pd

csv_path = Path("../data/seamap_with_species.csv")
out_path = Path("../data/seamap_with_species.parquet")

# Load the raw export. "Loc_date" is parsed as a datetime column; every other
# column is left to pandas' dtype inference (low_memory=False avoids the
# chunk-wise, possibly mixed-type inference of the default reader mode).
df = pd.read_csv(csv_path, low_memory=False, parse_dates=["Loc_date"])

# Some SEAMAP exports carry DECSLAT/DECSLON etc. next to Latitude/Longitude.
# Only when BOTH preferred columns are present are they forced to numeric;
# entries that cannot be parsed become NaN instead of raising.
if {"Latitude", "Longitude"}.issubset(df.columns):
    for coord in ("Latitude", "Longitude"):
        df[coord] = pd.to_numeric(df[coord], errors="coerce")

# Persist as Parquet using the writer's default compression and without the
# positional row index (it carries no information for this table).
df.to_parquet(out_path, index=False)

print(f"Wrote: {out_path} rows={len(df):,} cols={len(df.columns):,}")

Wrote: ../data/plankton_master_2024_present_with_species.parquet rows=591 cols=3,199


In [4]:
# Display the frame loaded in the previous cell for a quick visual check
# (the notebook renders a truncated view of the rows and columns).
df

Unnamed: 0,STATIONID,CRUISEID,Loc_date,DECSLAT,DECELAT,DECSLON,DECELON,Latitude,Longitude,DEPTH_SSTA,...,BIO_999998605,BIO_999998800,BIO_999998900,BIO_999998910,BIO_999998920,BIO_999999100,BIO_999999205,BIO_999999336,BIO_999999998,BIO_999999999
0,235611,1139,2024-06-13,27.378,27.389,-82.725,-82.750,27.3835,-82.7375,11.0,...,,,,,,,,,,
1,235612,1139,2024-06-14,27.115,27.123,-82.725,-82.700,27.1190,-82.7125,20.1,...,,,,,,,,,,
2,235613,1139,2024-06-14,26.931,26.956,-82.819,-82.815,26.9435,-82.8170,28.9,...,,,,,,,,,,
3,235614,1139,2024-06-14,26.700,26.675,-82.806,-82.807,26.6875,-82.8065,30.9,...,,,,,,,,,,
4,235615,1139,2024-06-14,26.678,26.657,-82.637,-82.654,26.6675,-82.6455,25.2,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
586,236323,1150,2024-06-14,28.669,28.653,-91.198,-91.176,28.6610,-91.1870,10.2,...,,,,,,,,,,
587,236324,1150,2024-06-14,28.897,28.901,-90.681,-90.653,28.8990,-90.6670,56.0,...,,,,,,,,,,
588,236325,1150,2024-06-14,28.882,28.872,-90.535,-90.509,28.8770,-90.5220,91.0,...,,,,,,,,,,
589,236326,1150,2024-06-11,28.513,28.494,-90.154,-90.136,28.5035,-90.1450,31.0,...,,,,,,,,,,


In [6]:
# scripts/make_seamap_plankton_master_item.py
from __future__ import annotations

import json
from pathlib import Path
import pandas as pd

# Filesystem locations used by main(). They are relative paths, so they are
# resolved against the current working directory at run time.
CSV_PATH = Path("../data/seamap_with_species.csv")          # input table read by main()
PARQUET_PATH = Path("../data/seamap_with_species.parquet")  # written by main() only if missing
ITEM_PATH = Path("../items/seamap_with_species.json")       # STAC Item JSON written by main()

# href stored inside the Item JSON for the Parquet asset. It is expressed
# relative to the directory that holds ITEM_PATH, so it resolves to the same
# location as PARQUET_PATH.
ASSET_HREF = "../data/seamap_with_species.parquet"  # relative to items/

def pick_lat_lon_columns(df: pd.DataFrame) -> tuple[str, str]:
    """Choose which pair of columns holds latitude and longitude.

    Candidate pairs are tried in priority order: the explicit
    ``Latitude``/``Longitude`` columns first, then the ``DECSLAT``/``DECSLON``
    fallback seen in some exports. A pair is accepted only when both of its
    columns are present in ``df``.

    Returns:
        ``(lat_column_name, lon_column_name)`` for the first matching pair.

    Raises:
        ValueError: if neither pair is fully present.
    """
    candidate_pairs = (
        ("Latitude", "Longitude"),
        ("DECSLAT", "DECSLON"),
    )
    present = df.columns
    for lat_name, lon_name in candidate_pairs:
        if lat_name in present and lon_name in present:
            return lat_name, lon_name
    raise ValueError("No recognizable lat/lon columns found (expected Latitude/Longitude or DECSLAT/DECSLON).")

def _spatial_fields(lon_min: float, lat_min: float, lon_max: float, lat_max: float) -> dict:
    """Build the Item-level ``geometry`` (and ``bbox``) members from extents.

    Returns ``{"geometry": None}`` with no ``bbox`` when any extent is NaN or
    infinite (e.g. every coordinate failed numeric coercion): such values would
    be serialised as non-standard ``NaN`` tokens, and the STAC Item schema only
    allows ``bbox`` together with a non-null geometry. Otherwise returns a
    GeoJSON geometry covering the extent plus the matching
    ``[lon_min, lat_min, lon_max, lat_max]`` bbox.
    """
    import math  # local import keeps this block self-contained

    bbox = [lon_min, lat_min, lon_max, lat_max]
    if not all(math.isfinite(value) for value in bbox):
        return {"geometry": None}

    if lon_min == lon_max and lat_min == lat_max:
        # All rows share one position: a Point is the honest footprint.
        geometry = {"type": "Point", "coordinates": [lon_min, lat_min]}
    else:
        # Closed exterior ring, counter-clockwise as recommended by RFC 7946.
        ring = [
            [lon_min, lat_min],
            [lon_max, lat_min],
            [lon_max, lat_max],
            [lon_min, lat_max],
            [lon_min, lat_min],
        ]
        geometry = {"type": "Polygon", "coordinates": [ring]}
    return {"geometry": geometry, "bbox": bbox}


def main() -> None:
    """Generate the STAC Item JSON for the SEAMAP table.

    Reads ``CSV_PATH``, derives the temporal extent from ``Loc_date`` and the
    spatial extent from the columns chosen by ``pick_lat_lon_columns``, writes
    the Item to ``ITEM_PATH`` (creating its directory if needed) and, when
    ``PARQUET_PATH`` does not exist yet, also writes the table as Parquet.

    Raises:
        ValueError: propagated from ``pick_lat_lon_columns`` when no usable
            latitude/longitude column pair exists.
    """
    df = pd.read_csv(CSV_PATH, parse_dates=["Loc_date"], low_memory=False)

    lat_col, lon_col = pick_lat_lon_columns(df)
    df[lat_col] = pd.to_numeric(df[lat_col], errors="coerce")
    df[lon_col] = pd.to_numeric(df[lon_col], errors="coerce")

    # Temporal extent. The column is coerced once (unparseable values -> NaT)
    # and reused for both ends of the range.
    # NOTE(review): timestamps are emitted with a "Z" suffix although the
    # column is parsed without timezone information - confirm Loc_date is UTC.
    iso_format = "%Y-%m-%dT%H:%M:%SZ"
    loc_dates = pd.to_datetime(df["Loc_date"], errors="coerce")
    t0 = loc_dates.min()
    t1 = loc_dates.max()
    start_iso = None if pd.isna(t0) else t0.strftime(iso_format)
    end_iso = None if pd.isna(t1) else t1.strftime(iso_format)

    # Spatial extent. float() yields NaN when a column has no valid values;
    # _spatial_fields() handles that case.
    lat_min = float(df[lat_col].min())
    lat_max = float(df[lat_col].max())
    lon_min = float(df[lon_col].min())
    lon_max = float(df[lon_col].max())

    bio_cols = [c for c in df.columns if c.startswith("BIO_")]

    # NOTE(review): the links below include a rel="collection" entry, for which
    # the STAC Item spec also expects a top-level "collection" id. The id is not
    # known from this script, so it is not invented here - confirm and add it.
    item = {
        "type": "Feature",
        "stac_version": "1.0.0",
        "id": "plankton_master_2024_present",
        "properties": {
            "title": "SEAMAP (Gulf of Mexico) survey data (2024–present)",
            "description": (
                "Station-based fish survey and environmental data table derived from SEAMAP Gulf of Mexico surveys. "
                "Includes station metadata, environmental fields (temperature, salinity, winds, Secchi, etc.), "
                "total live biomass (kg) per survey and many BIO_* columns representing species (or taxon) biomass values by code.\n\n"
                "Note: BIO_* code definitions and biomass units depend on the SEAMAP export; see dataset README for mapping/units."
            ),
            # Timestamp.utcnow() is deprecated in recent pandas; now(tz="UTC")
            # produces the same formatted value.
            "created": pd.Timestamp.now(tz="UTC").strftime(iso_format),
            # "datetime" is a required Item property. It is null for a date
            # range (start/end carry the extent) and the instant itself when
            # the table covers a single timestamp.
            "datetime": start_iso if start_iso == end_iso else None,
            "start_datetime": start_iso,
            "end_datetime": end_iso,
            "platform": "SEAMAP (Gulf of Mexico)",
            "variables": [
                "station_metadata",
                "environmental",
                "total_live_biomass",
                "species_biomass_by_taxon_code",
            ],
            "table:primary_key": ["STATIONID", "CRUISEID"],
            "table:time_field": "Loc_date",
            "table:lat_field": lat_col,
            "table:lon_field": lon_col,
            "table:bio_columns_prefix": "BIO_",
            "table:bio_columns_count": len(bio_cols),
        },
        # Expands to "geometry" (+ "bbox" when the extent is finite).
        **_spatial_fields(lon_min, lat_min, lon_max, lat_max),
        "assets": {
            "data": {
                "href": ASSET_HREF,
                "type": "application/x-parquet",
                "roles": ["data"],
                "title": "SEAMAP GOM table (Parquet)"
            },
            "source_csv": {
                "href": "../data/seamap_with_species.csv",
                "type": "text/csv",
                "roles": ["data"],
                "title": "Original CSV export"
            }
        },
        "links": [
            {
                "rel": "collection",
                "href": "../collection.json",
                "type": "application/json"
            }
        ]
    }

    ITEM_PATH.parent.mkdir(parents=True, exist_ok=True)
    ITEM_PATH.write_text(json.dumps(item, indent=2), encoding="utf-8")
    print(f"Wrote STAC Item: {ITEM_PATH}")

    # Optional: ensure parquet exists (nice for pipeline runs). An existing
    # file is left untouched.
    if not PARQUET_PATH.exists():
        PARQUET_PATH.parent.mkdir(parents=True, exist_ok=True)
        df.to_parquet(PARQUET_PATH, index=False)
        print(f"(Also wrote parquet: {PARQUET_PATH})")

# Script-style entry point. A notebook kernel also executes cells with
# __name__ == "__main__", so running this cell calls main() right away.
if __name__ == "__main__":
    main()


Wrote STAC Item: ../items/seamap_with_species.json
