<a href="https://colab.research.google.com/github/dgutiluns/water-anomaly-ca/blob/main/sample_data_with_streamcat.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [14]:
import requests
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

#from google.colab import files

# CKAN API endpoint (datastore_search action) on data.ca.gov
url = "https://data.ca.gov/api/3/action/datastore_search"

# Query arguments for the request: which resource to read and the maximum
# number of rows to ask for (adjust the limit as needed)
params = dict(
    resource_id="e07c5e0b-cace-4b70-9f13-b3e696cd5a99",
    limit=100000,
)

In [15]:
# Download the rows from the CKAN DataStore.
# timeout=(connect, read) seconds keeps the cell from hanging forever on a
# stalled connection, and raise_for_status() reports HTTP failures here
# instead of as a confusing KeyError when the body is unpacked below.
response = requests.get(url, params=params, timeout=(10, 120))
response.raise_for_status()
data = response.json()

# The CKAN action API also flags failures inside the JSON body.
if data.get("success") is False:
    raise RuntimeError(f"CKAN datastore_search failed: {data.get('error')}")

# Extract the records and convert to DataFrame
records = data["result"]["records"]
df = pd.DataFrame.from_records(records)

In [16]:
# Connecting the lab sample results table to the watershed data based on the location of the watershed

# Dedicated names are used for the endpoint and the parsed response: rebinding
# `url` / `data` here would overwrite the CKAN endpoint and response from the
# download cells, so re-running that download afterwards would send the CKAN
# query to the StreamCat service.
streamcat_url = "https://api.epa.gov/StreamCat/streams/waters_streamcat"

# Headers: Accept JSON + COMID
headers = {
    "accept": "application/json",
    "comid": "13293392"
}

# Request; timeout=(connect, read) seconds so a stalled server cannot hang the cell
r = requests.get(streamcat_url, headers=headers, timeout=(5, 60))
r.raise_for_status()

streamcat_data = r.json()           # The API returns a JSON object
items = streamcat_data.get("items", [])

# Load into DataFrame for easy viewing
strmCt = pd.DataFrame(items)

In [17]:
import requests, json
import pyproj

# Coordinate transformer from EPSG:4326 to EPSG:3857 (Web Mercator meters, the
# same projection the tool uses). always_xy=True means it is called as
# _to_3857(lon, lat) and returns (x, y).
_wgs84_to_web_mercator = pyproj.Transformer.from_crs(4326, 3857, always_xy=True)
_to_3857 = _wgs84_to_web_mercator.transform

def get_featureid_streamcat(lat, lon, timeout=20, verbose=True):
    """
    Return NHDPlus catchment FeatureID (same as COMID in StreamCat)
    for a WGS84 lat/lon point using the same ArcGIS query the StreamCat
    map sends from the browser's Network tab.

    Parameters
    ----------
    lat, lon : float
        Latitude / longitude of the point (WGS84).
    timeout : float
        Passed to ``requests.post`` as the request timeout, in seconds.
    verbose : bool
        When True, print the HTTP status and the resolved FeatureID.

    Returns
    -------
    int or None
        FeatureID of the first returned feature. None when the request
        fails, the status is not 200, the body is not usable JSON, or no
        feature is returned. Failures are printed rather than raised so that
        one bad point cannot abort a long batch of lookups.
    """
    x, y = _to_3857(lon, lat)  # should be ~(-1.303e7, 3.901e6) for your example

    url = "https://watersgeo.epa.gov/arcgis/rest/services/NHDPlus/NHDPlus/MapServer/6/query"
    payload = {
        "outFields": "FEATUREID",                 # the tool sends FEATUREID (upper)
        "f": "json",
        "returnGeometry": "true",
        "returnCountOnly": "false",
        "geometry": json.dumps({"x": x, "y": y}), # Web Mercator meters
        "geometryType": "esriGeometryPoint",
        "spatialRel": "esriSpatialRelWithin",
    }
    headers = {"Content-Type": "application/x-www-form-urlencoded; charset=UTF-8"}

    # Timeouts and connection errors are reported like any other failed
    # lookup instead of propagating out of the caller's loop.
    try:
        r = requests.post(url, data=payload, headers=headers, timeout=timeout)
    except requests.exceptions.RequestException as e:
        print(f"Request error for ({lat}, {lon}): {e}")
        return None

    if verbose:
        print("Status:", r.status_code)
    if r.status_code != 200:
        print("Non-200:", r.status_code, r.text[:300])
        return None

    # A 200 response is not guaranteed to carry a JSON body.
    try:
        js = r.json()
    except ValueError:
        print("Bad JSON:", r.text[:300])
        return None

    feats = (js.get("features") if isinstance(js, dict) else None) or []
    if not feats:
        print("No feature found.", js)
        return None

    attrs = feats[0].get("attributes") or {}
    # The layer returns the attribute in *lowercase* ("featureid"); the other
    # spellings are fallbacks. Compare against None so that a falsy but valid
    # value is not skipped.
    fid = next(
        (attrs[key] for key in ("featureid", "FEATUREID", "FeatureID")
         if attrs.get(key) is not None),
        None,
    )
    if verbose:
        print(f"✅ FeatureID for ({lat}, {lon}) → {fid}")
    return int(fid) if fid is not None else None

# Sanity check: looking up (33.04581, -117.12715) should print 20329198
sanity_featureid = get_featureid_streamcat(33.04581, -117.12715)
print(sanity_featureid)

Status: 200
✅ FeatureID for (33.04581, -117.12715) → 20329198
20329198


In [18]:
# Grabbing each location's unique COMID

# --- 1) unique (lat, lon) -> COMID lookup ---
# coerce to numeric, drop rows missing either coord, round to reduce dupes
df["Latitude"]  = pd.to_numeric(df["Latitude"], errors="coerce")
df["Longitude"] = pd.to_numeric(df["Longitude"], errors="coerce")

coords_unique = (
    df.loc[df["Latitude"].notna() & df["Longitude"].notna(), ["Latitude", "Longitude"]]
      .round(6)                      # <- same precision you tested
      .drop_duplicates()
      .reset_index(drop=True)
)

# build a dict { (lat, lon): comid }; a failed lookup is stored as None
coord_to_comid = {}
coords_w_missing_comid = []   # (lat, lon) pairs for which no COMID came back
for lat, lon in coords_unique.to_numpy():
    comid = get_featureid_streamcat(lat, lon, verbose=False)
    # Collect the lookups that FAILED. The previous test, `type(comid) == int`,
    # was inverted and collected the successful coordinates instead.
    if comid is None:
        coords_w_missing_comid.append((lat, lon))

    coord_to_comid[(lat, lon)] = comid
    # optional tiny pause to be polite to the service
    # time.sleep(0.05)

No feature found. {'displayFieldName': 'WBD_HUC12', 'fieldAliases': {'featureid': 'FeatureID'}, 'geometryType': 'esriGeometryPolygon', 'spatialReference': {'wkid': 102100, 'latestWkid': 3857}, 'fields': [{'name': 'featureid', 'type': 'esriFieldTypeInteger', 'alias': 'FeatureID'}], 'features': []}
No feature found. {'displayFieldName': 'WBD_HUC12', 'fieldAliases': {'featureid': 'FeatureID'}, 'geometryType': 'esriGeometryPolygon', 'spatialReference': {'wkid': 102100, 'latestWkid': 3857}, 'fields': [{'name': 'featureid', 'type': 'esriFieldTypeInteger', 'alias': 'FeatureID'}], 'features': []}
No feature found. {'displayFieldName': 'WBD_HUC12', 'fieldAliases': {'featureid': 'FeatureID'}, 'geometryType': 'esriGeometryPolygon', 'spatialReference': {'wkid': 102100, 'latestWkid': 3857}, 'fields': [{'name': 'featureid', 'type': 'esriFieldTypeInteger', 'alias': 'FeatureID'}], 'features': []}
No feature found. {'displayFieldName': 'WBD_HUC12', 'fieldAliases': {'featureid': 'FeatureID'}, 'geometryT

In [19]:
# Attach a COMID to every row of df by looking up its (lat, lon) pair, rounded
# to 6 decimals, in coord_to_comid; pairs that are not in the dict give None.
df_round = df[["Latitude", "Longitude"]].round(6)
row_keys = map(tuple, df_round.to_numpy())
df["COMID"] = list(map(coord_to_comid.get, row_keys))

In [None]:
from urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter
import time, random

STREAMCAT_URL = "https://api.epa.gov/StreamCat/streams/waters_streamcat"

# Retry policy handed to the HTTP adapter below
retry_options = dict(
    total=5,                                      # overall retry budget
    connect=5,                                    # budget for connection errors
    read=5,                                       # budget for read errors
    backoff_factor=0.8,                           # base of the exponential backoff between retries
    status_forcelist=[429, 500, 502, 503, 504],   # HTTP statuses that trigger a retry
    allowed_methods=["GET"],                      # only GET requests are retried
    raise_on_status=False,                        # return the last response instead of raising when retries run out
)
retry = Retry(**retry_options)

# Session whose https:// requests go through the retrying adapter
sess = requests.Session()
sess.mount("https://", HTTPAdapter(max_retries=retry))

def fetch_streamcat_by_comid(comid: int, timeout=(5, 60), verbose=True):
    """
    Retrieve a single StreamCat record for ``comid``.

    The GET request is issued through the module-level session ``sess``
    (configured with retries and backoff) against ``STREAMCAT_URL``, with the
    COMID sent as a request header. ``timeout`` is forwarded unchanged; the
    default is a ``(connect, read)`` pair in seconds.

    Returns the first entry of the response's ``items`` list, or None when
    the request fails, the body cannot be decoded as JSON, or ``items`` is
    missing or empty. With ``verbose`` set, the HTTP status and any error are
    printed.
    """
    request_headers = {"accept": "application/json", "comid": str(int(comid))}

    # Step 1: perform the request; any requests-level failure yields None.
    try:
        resp = sess.get(STREAMCAT_URL, headers=request_headers, timeout=timeout)
        if verbose:
            print(f"HTTP {resp.status_code} for COMID {comid}")
        resp.raise_for_status()
    except requests.exceptions.RequestException as err:
        if verbose:
            print(f"Request error for {comid}: {err}")
        return None

    # Step 2: decode the body; undecodable JSON also yields None.
    try:
        body = resp.json()
    except ValueError:
        if verbose:
            print("Bad JSON:", resp.text[:300])
        return None

    # Step 3: hand back the first item, if there is one.
    records = (body or {}).get("items") or []
    if not records:
        return None
    return records[0]

# --- test with first COMID ---
# one = fetch_streamcat_by_comid(20329198, timeout=(5, 90))  # allow up to 90s read
# if one:
#     print("sections:", list(one.keys())[:10])
# else:
#     print("No item returned.")


In [None]:
def flatten_selected(item: dict, keep: dict) -> dict:
    """
    Flatten the chosen fields of one nested item into a single flat dict.

    ``keep`` maps a section name to the list of field names wanted from it.
    For each section, the first element of ``item[section]`` (when that is a
    non-empty list) supplies the values, stored under ``"<section>.<field>"``.
    Sections that are absent, empty, or not a list still contribute their
    keys, with None values. ``"COMID"`` takes the first truthy ``"comid"``
    seen across the sections, and stays None if there is none.
    """
    flat = {"COMID": None}
    for section, fields in keep.items():
        entries = item.get(section, [])
        if not (isinstance(entries, list) and entries):
            # Section unusable: keep the columns, filled with None.
            flat.update({f"{section}.{name}": None for name in fields})
            continue

        record = entries[0]
        if not flat["COMID"]:
            flat["COMID"] = record.get("comid")
        flat.update({f"{section}.{name}": record.get(name) for name in fields})
    return flat

#pd.DataFrame([flatten_selected(one, KEEP)])


In [None]:
# Which fields to pull out of each section of a StreamCat item
# (section name -> list of field names); consumed by flatten_selected.
KEEP = dict(
    aoi=["catareasqkm", "wsareasqkm"],
    elevation=["elevcat", "elevws"],
    impervious=["pctimp2019cat", "pctimp2019ws"],
    runoff=["runoffcat", "runoffws"],
    soil=["sandws", "clayws", "omws"],
    prism_norm_8110=["precip8110ws", "tmean8110ws"],
    road_dens=["rddenscat", "rddensws"],
    census2010=["popden2010ws", "huden2010ws"],
)

In [None]:
# Nullable integer dtype: rows without a COMID stay <NA> instead of turning the column into floats
df["COMID"] = pd.to_numeric(df["COMID"], errors="coerce").astype("Int64")

unique_comids = [int(c) for c in df["COMID"].dropna().unique()]

# Columns the StreamCat table must always have, even when nothing could be
# fetched (otherwise an empty result has no "COMID" column to join on).
streamcat_columns = ["COMID"] + [
    f"{section}.{field}" for section, fields in KEEP.items() for field in fields
]

cache = {}   # comid -> raw StreamCat item (None when the fetch failed)
rows = []
for comid in unique_comids:
    # unique_comids has no repeats, so every COMID is fetched exactly once
    item = fetch_streamcat_by_comid(comid, timeout=(5, 90), verbose=False)
    cache[comid] = item
    time.sleep(0.05 + random.random()*0.05)   # tiny pause

    row = flatten_selected(item, KEEP) if item else {}   # empty placeholder if missing
    # Key every row by the COMID that was requested. flatten_selected leaves
    # COMID as None when the kept sections carry no "comid", and pandas merge
    # matches null keys against each other, which would attach this row to
    # every sample that has no COMID.
    row["COMID"] = comid
    rows.append(row)

streamcat_df = pd.DataFrame(rows, columns=streamcat_columns)

# ensure same dtype for join
streamcat_df["COMID"] = pd.to_numeric(streamcat_df["COMID"], errors="coerce").astype("Int64")

# validate guards against the join multiplying sample rows: each COMID must
# appear at most once on the StreamCat side.
df_with_streamcat = df.merge(streamcat_df, on="COMID", how="left", validate="many_to_one")
df_with_streamcat.head(3)