In [1]:
# ============================================================
# USGS NWIS IV (Instantaneous Values) Downloader
# Robust, production-safe, NO pygeohydro dependency
# Works for discharge (00060) and stage (00065)
# ============================================================

import requests
import pandas as pd
from pathlib import Path

# ------------------------
# USER INPUTS
# ------------------------
# Site numbers of the gauges to download.
gauge_ids = ["12101500", "12093500"]  # 12101500: example, Puyallup River

# Requested period, sent to the service as startDT / endDT.
start_date, end_date = "2021-01-01", "2021-12-31"

# USGS parameter codes to request:
#   00060 -> discharge (cfs)
#   00065 -> stage (ft)
parameters = ["00060", "00065"]

# Folder that receives the CSV; created if missing, reused otherwise.
out_dir = Path("usgs_iv_output")
out_dir.mkdir(exist_ok=True)

# ------------------------
# FUNCTION: DOWNLOAD IV DATA FOR ONE GAUGE
# ------------------------
def _parse_iv_value(raw, no_data):
    """Convert one raw reading to ``float``; return ``None`` when it is missing.

    A reading counts as missing when it is empty, ``None``, not parseable as a
    number, or equal to the ``no_data`` sentinel (when one is given).
    """
    if raw in ("", None):
        return None
    try:
        number = float(raw)
    except (TypeError, ValueError):
        # One malformed reading should not discard the whole gauge.
        return None
    if no_data is not None and number == no_data:
        return None
    return number


def fetch_usgs_iv(site, start_date, end_date, parameters, timeout=60):
    """Download USGS NWIS instantaneous values for a single gauge.

    Requests the NWIS IV service in JSON format and flattens every returned
    time series into long-format rows (one row per timestamp and parameter).

    Args:
        site: USGS site number, e.g. ``"12101500"``.
        start_date: Start of the period, sent as ``startDT``.
        end_date: End of the period, sent as ``endDT``.
        parameters: Iterable of USGS parameter codes, joined with commas and
            sent as ``parameterCd``.
        timeout: Seconds to wait for the server before ``requests`` gives up.

    Returns:
        A DataFrame with columns ``datetime``, ``site_no``, ``parameter_cd``,
        ``variable``, ``unit`` and ``value``. An empty DataFrame is returned
        when the response contains no observations.

    Raises:
        requests.HTTPError: If the service answers with an error status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
        KeyError: If the payload lacks the expected ``value``/``timeSeries``
            structure.
    """
    # Let requests build and encode the query string instead of concatenating
    # it by hand; the timeout keeps a stalled connection from hanging the run.
    response = requests.get(
        "https://waterservices.usgs.gov/nwis/iv/",
        params={
            "format": "json",
            "sites": site,
            "startDT": start_date,
            "endDT": end_date,
            "parameterCd": ",".join(parameters),
            "siteStatus": "all",
        },
        timeout=timeout,
    )
    response.raise_for_status()
    data = response.json()

    records = []

    for ts in data["value"]["timeSeries"]:
        var_info = ts["variable"]
        variable = var_info["variableName"]
        param_cd = var_info["variableCode"][0]["value"]
        unit = var_info["unit"]["unitCode"]
        # Sentinel the payload uses for "no data", if it declares one; such
        # readings are stored as missing rather than as real measurements.
        # NOTE(review): assumes the key is "noDataValue" - confirm against the
        # service documentation.
        no_data = _parse_iv_value(var_info.get("noDataValue"), None)

        series = ts.get("values") or []
        if not series:
            continue

        # Only the first entry of "values" is read, as before.
        # NOTE(review): confirm whether additional entries can carry data
        # that should be kept.
        for v in series[0]["value"]:
            records.append({
                "datetime": pd.to_datetime(v["dateTime"]),
                "site_no": site,
                "parameter_cd": param_cd,
                "variable": variable,
                "unit": unit,
                "value": _parse_iv_value(v["value"], no_data),
            })

    if not records:
        return pd.DataFrame()

    return pd.DataFrame(records)


# ------------------------
# DOWNLOAD ALL GAUGES
# ------------------------
# Long-format frames, one per gauge that returned observations.
all_data = []

for site in gauge_ids:
    print(f"Downloading IV data for site {site} ...")
    try:
        df_site = fetch_usgs_iv(site, start_date, end_date, parameters)
    except Exception as e:
        # Best effort: report the failure and move on to the next gauge.
        print(f"Failed for site {site}: {e}")
    else:
        # Gauges without observations contribute nothing to the result.
        if not df_site.empty:
            all_data.append(df_site)

# Nothing to pivot or save when every gauge failed or came back empty.
if not all_data:
    raise RuntimeError("No data downloaded for any gauge.")

# Stack the per-gauge frames into one table with a fresh 0..n-1 index.
df_all = pd.concat(all_data, ignore_index=True)

# ------------------------
# PIVOT TO ML-READY FORMAT
# ------------------------
# One column per variable (discharge, stage)
# Wide table: one row per (datetime, site_no), one column per parameter code.
# pivot_table combines duplicate (datetime, site_no, parameter_cd) rows with
# its default aggregation (mean).
df_pivot = (
    df_all
    .pivot_table(
        index=["datetime", "site_no"],
        columns="parameter_cd",
        values="value"
    )
    # Rename parameter codes to human-readable names
    .rename(columns={
        "00060": "discharge_cfs",
        "00065": "stage_ft"
    })
    # pivot_table names the columns axis "parameter_cd"; clear it so the label
    # does not show up as a stray header in front of the real column names.
    .rename_axis(columns=None)
    .reset_index()
)

# ------------------------
# SAVE OUTPUT
# ------------------------
# File name encodes the requested period; the row index is not written.
out_file = out_dir.joinpath(f"usgs_iv_{start_date}_{end_date}.csv")
df_pivot.to_csv(out_file, index=False)

# Short report: where the file went and what the first rows look like.
print("\nDownload complete.")
print("Saved file:", out_file)
print("\nPreview:")
preview = df_pivot.head()
print(preview)

Downloading IV data for site 12101500 ...
Downloading IV data for site 12093500 ...

Download complete.
Saved file: usgs_iv_output/usgs_iv_2021-01-01_2021-12-31.csv

Preview:
parameter_cd                   datetime   site_no  discharge_cfs  stage_ft
0             2021-01-01 00:00:00-08:00  12093500         1390.0      6.34
1             2021-01-01 00:00:00-08:00  12101500         5170.0     13.76
2             2021-01-01 00:15:00-08:00  12093500         1370.0      6.33
3             2021-01-01 00:15:00-08:00  12101500         5170.0     13.76
4             2021-01-01 00:30:00-08:00  12093500         1370.0      6.33


parameter_cd,datetime,site_no,discharge_cfs,stage_ft
0,2021-01-01 00:00:00-08:00,12093500,1390.0,6.34
1,2021-01-01 00:00:00-08:00,12101500,5170.0,13.76
2,2021-01-01 00:15:00-08:00,12093500,1370.0,6.33
3,2021-01-01 00:15:00-08:00,12101500,5170.0,13.76
4,2021-01-01 00:30:00-08:00,12093500,1370.0,6.33
