In [None]:
# Daily AQ notebook — configuration

from __future__ import annotations

import os
from pathlib import Path
from datetime import datetime, timezone

# Load KEY=VALUE pairs from the repo-root .env when it exists.
# Variables already present in the process environment win (setdefault), so a
# real environment variable overrides the file. A missing .env is not an error:
# the key may be supplied purely through the environment.
env_file = Path("..") / ".env"
if env_file.is_file():
    for line in env_file.read_text(encoding="utf-8").splitlines():
        line = line.strip()
        # Skip blanks, comments, and lines that are not assignments.
        if not line or line.startswith("#") or "=" not in line:
            continue
        k, v = line.split("=", 1)
        os.environ.setdefault(k.strip(), v.strip().strip('"').strip("'"))

# Fail with an actionable message rather than a bare KeyError.
OPENAQ_API_KEY = os.environ.get("OPENAQ_API_KEY")
if not OPENAQ_API_KEY:
    raise RuntimeError(
        "Missing environment variable: OPENAQ_API_KEY "
        f"(set it in the environment or in {env_file})"
    )

# BBOX is a performance hint only; NY membership is enforced via the NY GeoJSON boundary.
BBOX = os.getenv("BBOX", "-79.8,40.4,-71.6,45.1")
SAMPLE_SIZE = int(os.getenv("SAMPLE_SIZE", "100"))
STALE_HOURS = int(os.getenv("STALE_HOURS", "12"))

# UTC calendar date, used later for file and directory names.
TODAY_UTC = datetime.now(timezone.utc).strftime("%Y-%m-%d")

print("TODAY_UTC:", TODAY_UTC)
print("SAMPLE_SIZE:", SAMPLE_SIZE)
print("STALE_HOURS:", STALE_HOURS)

In [None]:
# Pull a location catalog for the region. This is used to build the daily sample.

from __future__ import annotations

import time
import requests

OPENAQ_BASE_URL = "https://api.openaq.org/v3"

# A single request returns at most `limit` rows, so a region with more
# locations than that would be silently truncated. Page until a short page
# comes back.
LOCATIONS_PAGE_LIMIT = 1000
LOCATIONS_MAX_PAGES = 20  # hard stop so a misbehaving API cannot loop forever

locations: list[dict] = []
locations_found = None
latency_ms = 0
page = 1

while True:
    t0 = time.time()
    resp = requests.get(
        f"{OPENAQ_BASE_URL}/locations",
        params={
            "bbox": BBOX,
            "iso": "US",
            "limit": LOCATIONS_PAGE_LIMIT,
            "page": page,
        },
        headers={"X-API-Key": OPENAQ_API_KEY},
        timeout=30,
    )
    page_latency_ms = int((time.time() - t0) * 1000)

    resp.raise_for_status()
    locations_payload = resp.json()
    page_results = locations_payload.get("results", []) or []
    locations.extend(page_results)

    if page == 1:
        # Keep the recorded latency comparable across days: first page only.
        latency_ms = page_latency_ms
        locations_found = locations_payload.get("meta", {}).get("found")

    if len(page_results) < LOCATIONS_PAGE_LIMIT:
        break
    if page >= LOCATIONS_MAX_PAGES:
        print("warning: stopped paging at LOCATIONS_MAX_PAGES; catalog may be truncated")
        break
    page += 1

print("status:", resp.status_code)
print("latency_ms:", latency_ms)
print("locations_found:", locations_found)
print("locations_returned:", len(locations))
print("locations_pages:", page)

In [None]:
# Load NY state boundary (GeoJSON Feature)

from __future__ import annotations

import json
from pathlib import Path

# Read the boundary Feature from disk and pull out its geometry parts.
boundary_path = Path("..", "data", "nys_boundary.geojson")
with boundary_path.open("r", encoding="utf-8") as boundary_file:
    ny_feature = json.load(boundary_file)

geom = ny_feature.get("geometry") or {}
geom_type, coords = geom.get("type"), geom.get("coordinates")

# Only area geometries can be used for the point-in-polygon filter.
if geom_type in {"Polygon", "MultiPolygon"}:
    print("geometry:", geom_type)
else:
    raise ValueError(f"Unsupported geometry type: {geom_type}")

In [None]:
# Filter locations to points inside the NY boundary polygon.

from __future__ import annotations

from typing import Iterable

def point_in_ring(lon: float, lat: float, ring: list[list[float]]) -> bool:
    """Return True when (lon, lat) lies inside the closed ring (ray casting).

    Args:
        lon: Longitude (x) of the point to test.
        lat: Latitude (y) of the point to test.
        ring: Sequence of positions. Each position starts with [lon, lat];
            any further elements (for example the optional GeoJSON altitude)
            are ignored.

    Returns:
        True if a horizontal ray from the point crosses the ring an odd
        number of times; False otherwise, including for an empty ring.
    """
    inside = False
    j = len(ring) - 1
    for i in range(len(ring)):
        # Index instead of unpacking so [lon, lat, alt] positions do not raise.
        xi, yi = ring[i][0], ring[i][1]
        xj, yj = ring[j][0], ring[j][1]
        # The edge straddles the ray only when its endpoints are on opposite
        # sides of `lat`; in that case yi != yj, so the division is safe and
        # the `or 1e-15` guard is purely defensive.
        intersects = ((yi > lat) != (yj > lat)) and (
            lon < (xj - xi) * (lat - yi) / ((yj - yi) or 1e-15) + xi
        )
        if intersects:
            inside = not inside
        j = i
    return inside

def point_in_polygon(lon: float, lat: float, polygon: list[list[list[float]]]) -> bool:
    """Test a point against a polygon given as [outer_ring, hole1, hole2, ...].

    The point counts as inside when it is within the outer ring and not
    within any of the hole rings. An empty polygon contains nothing.
    """
    if not polygon:
        return False
    if point_in_ring(lon, lat, polygon[0]):
        # Inside the shell: the answer now depends only on the holes.
        return not any(point_in_ring(lon, lat, hole) for hole in polygon[1:])
    return False

def point_in_multipolygon(lon: float, lat: float, multipolygon: list) -> bool:
    """Return True if the point falls inside at least one member polygon."""
    return any(point_in_polygon(lon, lat, member) for member in multipolygon)

def loc_lon_lat(loc: dict) -> tuple[float | None, float | None]:
    """Extract (longitude, latitude) as floats from a location record.

    Reads `loc["coordinates"]["longitude"]` and `["latitude"]`. Returns
    (None, None) when the coordinates are absent or cannot be converted to
    float.
    """
    position = loc.get("coordinates") or {}
    try:
        lon = float(position.get("longitude"))
        lat = float(position.get("latitude"))
    except (TypeError, ValueError):
        return None, None
    return lon, lat

geom = ny_feature["geometry"]
geom_type, coords = geom["type"], geom["coordinates"]

# Pick the membership test once; anything that is not a Polygon is treated as
# a MultiPolygon.
contains = point_in_polygon if geom_type == "Polygon" else point_in_multipolygon

ny_locations: list[dict] = []
missing_coords = 0

for loc in locations:
    lon, lat = loc_lon_lat(loc)
    if lon is None or lat is None:
        # Cannot place this location on the map; count it and move on.
        missing_coords += 1
    elif contains(lon, lat, coords):
        ny_locations.append(loc)

print("locations_total:", len(locations))
print("missing_coords:", missing_coords)
print("ny_locations:", len(ny_locations))
print("first_5_names:", [entry.get("name") for entry in ny_locations[:5]])

In [None]:
# Stable daily sample (deterministic ordering) from NY-only locations.

from __future__ import annotations

import hashlib

def stable_rank(location: dict) -> int:
    """Deterministic sort key for a location.

    Hashes "<id>|<BBOX>" with SHA-256 and returns the digest as an integer,
    so the ordering is the same on every run for a given BBOX.
    """
    key_text = f'{location.get("id", "")}|{BBOX}'
    digest = hashlib.sha256(key_text.encode("utf-8")).digest()
    # Big-endian bytes -> int is the same number as parsing the hex digest.
    return int.from_bytes(digest, "big")

# Order NY locations by their hash rank and keep the leading SAMPLE_SIZE.
ny_sorted = sorted(ny_locations, key=stable_rank)
# Slicing already stops at the end of the list, so no explicit min() is needed.
sampled_locations = ny_sorted[:SAMPLE_SIZE]

sample_ids = [picked["id"] for picked in sampled_locations if "id" in picked]
sample_names = [picked.get("name") for picked in sampled_locations[:5]]

print("ny_locations:", len(ny_locations))
print("sample_size:", len(sample_ids))
print("first_5_names:", sample_names)
print("first_5_ids:", sample_ids[:5])

In [None]:
# Fetch latest measurements for the sampled locations.

from __future__ import annotations

import time
import requests

def fetch_latest_for_location(location_id: int) -> dict:
    """GET /locations/{location_id}/latest and return the decoded JSON body.

    Raises whatever `requests` raises for transport errors, and the error
    from `raise_for_status()` for non-success HTTP statuses.
    """
    response = requests.get(
        f"{OPENAQ_BASE_URL}/locations/{location_id}/latest",
        headers={"X-API-Key": OPENAQ_API_KEY},
        timeout=30,
    )
    response.raise_for_status()
    return response.json()

t0 = time.time()

latest_results: list[dict] = []
errors: list[tuple[int, str]] = []

# One request per sampled location; failures are recorded, not raised, so a
# later cell can retry them.
for loc_id in sample_ids:
    try:
        latest_payload = fetch_latest_for_location(loc_id)
    except Exception as e:
        errors.append((loc_id, str(e)))
    else:
        latest_results.append(latest_payload)

elapsed_s = time.time() - t0

print("locations_requested:", len(sample_ids))
print("latest_ok:", len(latest_results))
print("latest_errors:", len(errors))
print("elapsed_s:", round(elapsed_s, 2))

if len(errors) > 0:
    print("first_error:", errors[0])

In [None]:
# Retry rate-limited requests (HTTP 429) with backoff and Retry-After support.

from __future__ import annotations

import time
import requests

# Shared session so the API key header is sent on every retry request.
session = requests.Session()
session.headers.update({"X-API-Key": OPENAQ_API_KEY})

def fetch_latest_with_retries(location_id: int, max_attempts: int = 5) -> dict:
    """GET /locations/{location_id}/latest, retrying on HTTP 429.

    Args:
        location_id: OpenAQ location id.
        max_attempts: Total number of requests to make before giving up.

    Returns:
        The decoded JSON body of the first non-429 successful response.

    Raises:
        RuntimeError: every attempt was answered with 429.
        Exception: whatever `raise_for_status()` / `requests` raise for other
            failures (these are not retried).
    """
    url = f"{OPENAQ_BASE_URL}/locations/{location_id}/latest"
    delay_s = 1.0

    for attempt in range(1, max_attempts + 1):
        resp = session.get(url, timeout=30)

        if resp.status_code != 429:
            resp.raise_for_status()
            return resp.json()

        # No point sleeping when there is no further attempt to wait for.
        if attempt == max_attempts:
            break

        retry_after = resp.headers.get("Retry-After")
        if retry_after:
            try:
                delay_s = max(delay_s, float(retry_after))
            except ValueError:
                # Retry-After may also be an HTTP-date; fall back to backoff.
                pass
        time.sleep(delay_s)
        delay_s = min(delay_s * 2, 30.0)

    raise RuntimeError(f"Rate-limited after {max_attempts} attempts")

# Second pass over the ids that failed in the previous cell.
retry_ids = [failed_id for failed_id, _message in errors]

retried_ok: list[dict] = []
retried_errors: list[tuple[int, str]] = []

t0 = time.time()
for loc_id in retry_ids:
    try:
        retried_payload = fetch_latest_with_retries(loc_id)
    except Exception as e:
        retried_errors.append((loc_id, str(e)))
    else:
        retried_ok.append(retried_payload)

elapsed_s = time.time() - t0

# Fold recovered payloads into the main list (in place) and keep only the
# still-failing ids as the outstanding error list.
latest_results += retried_ok
errors = retried_errors

print("retried_requested:", len(retry_ids))
print("retried_ok:", len(retried_ok))
print("remaining_errors:", len(errors))
print("retry_elapsed_s:", round(elapsed_s, 2))

In [None]:
# Normalize latest payloads and compute daily metrics.
# This version works with the OpenAQ v3 /locations/{id}/latest schema we observed.

from __future__ import annotations

from datetime import datetime, timezone

def parse_utc_datetime(dt_obj) -> datetime | None:
    """Parse the "utc" field of a datetime object into an aware UTC datetime.

    Args:
        dt_obj: Expected to be a dict carrying an ISO-8601 string under "utc".

    Returns:
        A timezone-aware datetime in UTC, or None when `dt_obj` is not a dict,
        the "utc" value is missing/empty/not a string, or it cannot be parsed.
    """
    if not isinstance(dt_obj, dict):
        return None
    s = dt_obj.get("utc")
    if not isinstance(s, str) or not s:
        return None
    # Older fromisoformat() implementations do not accept the "Z" suffix.
    if s.endswith("Z"):
        s = s[:-1] + "+00:00"
    try:
        parsed = datetime.fromisoformat(s)
        if parsed.tzinfo is None:
            # The field is named "utc": a value without an offset is UTC, not
            # local time (which is what astimezone() would assume).
            return parsed.replace(tzinfo=timezone.utc)
        return parsed.astimezone(timezone.utc)
    except (ValueError, OverflowError):
        return None

now_utc = datetime.now(timezone.utc)
# Epoch-seconds threshold: anything older than this is considered stale.
stale_cutoff = now_utc.timestamp() - (STALE_HOURS * 3600)

rows: list[dict] = []
missing_datetime = 0

for payload in latest_results:
    for m in payload.get("results", []) or []:
        dt = parse_utc_datetime(m.get("datetime"))
        if dt is None:
            missing_datetime += 1
        dt_ts = dt.timestamp() if dt else None

        # A measurement without a usable timestamp is treated as stale.
        stale = (dt_ts is None) or (dt_ts < stale_cutoff)
        # Deliberately NOT named `coords`: that notebook-level name holds the
        # NY boundary coordinates and must not be overwritten by this loop.
        m_coords = m.get("coordinates") or {}
        rows.append(
            {
                "locationsId": m.get("locationsId"),
                "sensorsId": m.get("sensorsId"),
                "latitude": m_coords.get("latitude"),
                "longitude": m_coords.get("longitude"),
                "value": m.get("value"),
                "datetime_utc": dt.isoformat().replace("+00:00", "Z") if dt else None,
                "stale": stale,
            }
        )

total = len(rows)
stale_count = sum(1 for r in rows if r["stale"])
stale_fraction = (stale_count / total) if total else 0.0

# Useful "top values" view (across all sensors, so units may differ)
numeric_rows = [r for r in rows if isinstance(r.get("value"), (int, float))]
top_values = sorted(numeric_rows, key=lambda r: r["value"], reverse=True)[:5]

print("locations_sampled:", len(sample_ids))
print("locations_with_latest:", len(latest_results))
print("measurements_total:", total)
print("missing_datetime:", missing_datetime)
print("stale_fraction:", round(stale_fraction, 3))
print("top_5_values:", [(r["locationsId"], r["value"], r["datetime_utc"]) for r in top_values])

In [None]:
# Enrich measurements with sensor metadata via /locations/{id}/sensors (throttled)

from __future__ import annotations

import time
import requests

# Session with the API key header applied to every request made below.
session = requests.Session()
session.headers.update({"X-API-Key": OPENAQ_API_KEY})

def get_with_backoff(url: str, *, max_attempts: int = 10, timeout_s: int = 30) -> requests.Response:
    """GET `url`, retrying with growing delays while the server answers 429.

    Args:
        url: Absolute URL to request.
        max_attempts: Total number of requests to make before giving up.
        timeout_s: Per-request timeout passed to `session.get`.

    Returns:
        The first response that is not a 429 and passes `raise_for_status()`.

    Raises:
        RuntimeError: every attempt was answered with 429.
        Exception: whatever `raise_for_status()` / `requests` raise for other
            failures (these are not retried).
    """
    delay_s = 1.0
    for attempt in range(1, max_attempts + 1):
        r = session.get(url, timeout=timeout_s)

        if r.status_code != 429:
            r.raise_for_status()
            return r

        # Skip the wait when no further attempt will follow it.
        if attempt == max_attempts:
            break

        retry_after = r.headers.get("Retry-After")
        if retry_after:
            try:
                delay_s = max(delay_s, float(retry_after))
            except ValueError:
                # Retry-After may also be an HTTP-date; fall back to backoff.
                pass
        time.sleep(delay_s)
        delay_s = min(delay_s * 1.5, 30.0)

    raise RuntimeError(f"Rate-limited too long: {url}")

# Sensor lookups are limited to the sampled (NY-only) locations.
location_ids = sorted({loc["id"] for loc in sampled_locations if "id" in loc})

sensor_meta: dict[int, dict] = {}
errors_sensors: list[tuple[int, str]] = []

t0 = time.time()
for loc_id in location_ids:
    url = f"{OPENAQ_BASE_URL}/locations/{loc_id}/sensors"
    try:
        payload = get_with_backoff(url).json()
        for sensor in payload.get("results", []) or []:
            sid = sensor.get("id")
            if not isinstance(sid, int):
                continue
            param = sensor.get("parameter") or {}
            sensor_meta[sid] = {
                "parameter_name": param.get("name"),
                "units": param.get("units"),
            }
    except Exception as e:
        errors_sensors.append((loc_id, str(e)))

    # Fixed pause between locations to stay under the rate limit.
    time.sleep(0.25)

elapsed_s = time.time() - t0

# Attach parameter name/units to each measurement row (mutates rows in place).
missing_sensor_meta = 0
for row in rows:
    meta = sensor_meta.get(row.get("sensorsId"))
    if meta:
        row.update(meta)
    else:
        missing_sensor_meta += 1

# Count rows per parameter; rows without metadata fall under "unknown".
param_counts: dict[str, int] = {}
for row in rows:
    name = row.get("parameter_name") or "unknown"
    param_counts.setdefault(name, 0)
    param_counts[name] += 1

# Most frequent first, ties broken alphabetically.
top_params = sorted(param_counts.items(), key=lambda item: (-item[1], item[0]))[:10]

print("locations_queried_for_sensors:", len(location_ids))
print("unique_sensors_mapped:", len(sensor_meta))
print("rows_missing_sensor_meta:", missing_sensor_meta)
print("sensor_errors:", len(errors_sensors))
print("top_parameters:", top_params)
print("sensor_lookup_elapsed_s:", round(elapsed_s, 2))

if len(errors_sensors) > 0:
    print("first_sensor_error:", errors_sensors[0])

In [None]:
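# Persist sensor metadata to a JSON cache. NOTE: the sensor lookup cell above does not read this cache yet, so on its own it does not make daily runs faster.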

from __future__ import annotations

import json
from pathlib import Path
from datetime import datetime, timezone, timedelta

# Absolute location of the on-disk cache and how long an entry stays valid.
CACHE_PATH = Path("..", "data", "sensor_meta_cache.json").resolve()
CACHE_MAX_AGE_DAYS = 7

def utc_now() -> datetime:
    """Return the current time as a timezone-aware UTC datetime."""
    return datetime.now(tz=timezone.utc)

def load_cache(path: Path) -> dict[int, dict] | None:
    """Read the sensor-metadata cache, or return None if it is unusable.

    None is returned when the file does not exist, cannot be read or parsed,
    lacks a string "saved_at_utc", is older than CACHE_MAX_AGE_DAYS, or its
    "sensor_meta" entry is not a dict. Keys are converted back to int.
    """
    if not path.exists():
        return None
    try:
        payload = json.loads(path.read_text(encoding="utf-8"))
        saved_at = payload.get("saved_at_utc")
        data = payload.get("sensor_meta")
        if not isinstance(saved_at, str):
            return None
        age = utc_now() - datetime.fromisoformat(saved_at.replace("Z", "+00:00"))
        expired = age > timedelta(days=CACHE_MAX_AGE_DAYS)
        if expired or not isinstance(data, dict):
            return None
        # JSON object keys are always strings; restore the integer sensor ids.
        return {int(sensor_id): meta for sensor_id, meta in data.items()}
    except Exception:
        # Best effort: any problem with the cache means "no cache".
        return None

def save_cache(path: Path, data: dict[int, dict]) -> None:
    """Write `data` to `path` as JSON together with a UTC save timestamp.

    The parent directory is created if needed. The payload is written to a
    sibling temporary file and then moved over `path`, so an interrupted run
    cannot leave a truncated cache file behind.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    payload = {
        "saved_at_utc": utc_now().isoformat().replace("+00:00", "Z"),
        # JSON object keys must be strings.
        "sensor_meta": {str(k): v for k, v in data.items()},
    }
    tmp_path = path.with_name(path.name + ".tmp")
    tmp_path.write_text(json.dumps(payload, indent=2, sort_keys=True), encoding="utf-8")
    tmp_path.replace(path)

# Report a valid existing cache, otherwise write the freshly fetched metadata.
# NOTE(review): `cached` is only counted here; nothing in this cell feeds it
# back into `sensor_meta`.
cached = load_cache(CACHE_PATH)
if cached is not None:
    print("sensor cache: loaded", len(cached), "entries from", CACHE_PATH)
else:
    save_cache(CACHE_PATH, sensor_meta)
    print("sensor cache: saved", len(sensor_meta), "entries to", CACHE_PATH)


In [None]:
# Daily metrics (parameter-aware) for notes + CSV

from __future__ import annotations

from datetime import datetime, timezone, timedelta

def parse_utc(s: str | None) -> datetime | None:
    """Parse an ISO-8601 string into a timezone-aware UTC datetime.

    Args:
        s: Timestamp text such as "2024-05-01T12:00:00Z"; may be None.

    Returns:
        The instant converted to UTC, or None when `s` is empty, not a string,
        or not parseable. A value without an offset is taken to be UTC.
    """
    if not isinstance(s, str) or not s:
        return None
    # Older fromisoformat() implementations do not accept the "Z" suffix.
    if s.endswith("Z"):
        s = s[:-1] + "+00:00"
    try:
        parsed = datetime.fromisoformat(s)
        if parsed.tzinfo is None:
            # astimezone() would read a naive value as local time; the rows
            # store UTC text, so attach UTC instead of converting.
            return parsed.replace(tzinfo=timezone.utc)
        return parsed.astimezone(timezone.utc)
    except (ValueError, OverflowError):
        return None

now_utc = datetime.now(timezone.utc)
stale_cutoff = now_utc - timedelta(hours=STALE_HOURS)
very_old_cutoff = now_utc - timedelta(days=30)

total_rows = len(rows)
stale_rows = 0
very_old_rows = 0

param_counts: dict[str, int] = {}
param_stale_counts: dict[str, int] = {}
param_units: dict[str, str] = {}

latest_dt_seen: datetime | None = None
oldest_dt_seen: datetime | None = None

# Single pass over the rows: per-parameter counts, first-seen units,
# timestamp range, and the stale / very-old tallies.
for row in rows:
    name = row.get("parameter_name") or "unknown"
    param_counts[name] = 1 + param_counts.get(name, 0)

    row_units = row.get("units")
    if row_units and name not in param_units:
        param_units[name] = str(row_units)

    seen_at = parse_utc(row.get("datetime_utc"))
    if seen_at is None:
        # No usable timestamp counts as very old.
        very_old_rows += 1
    else:
        latest_dt_seen = seen_at if latest_dt_seen is None else max(latest_dt_seen, seen_at)
        oldest_dt_seen = seen_at if oldest_dt_seen is None else min(oldest_dt_seen, seen_at)
        if seen_at < very_old_cutoff:
            very_old_rows += 1

    # Staleness is taken from the flag set when the row was normalized.
    if row.get("stale"):
        stale_rows += 1
        param_stale_counts[name] = 1 + param_stale_counts.get(name, 0)

stale_fraction = (stale_rows / total_rows) if total_rows else 0.0
very_old_fraction = (very_old_rows / total_rows) if total_rows else 0.0

# Most frequent first, ties broken alphabetically.
top_params = sorted(param_counts.items(), key=lambda item: (-item[1], item[0]))[:10]

def top_values_for_parameter(param: str, n: int = 5) -> list[dict]:
    """Return up to `n` rows for `param`, highest numeric value first.

    Rows whose "parameter_name" is missing are matched by "unknown". Rows
    with a non-numeric "value" are ignored.
    """
    def is_candidate(row: dict) -> bool:
        same_param = (row.get("parameter_name") or "unknown") == param
        return same_param and isinstance(row.get("value"), (int, float))

    ranked = sorted(filter(is_candidate, rows), key=lambda row: row["value"], reverse=True)
    return ranked[:n]

# The primary parameter is the most frequently reported one, if any.
primary_param = next((name for name, _count in top_params), "unknown")
primary_top = top_values_for_parameter(primary_param, n=5)

print("locations_sampled:", len(sample_ids))
print("locations_with_latest:", len(latest_results))
print("measurements_total:", total_rows)
print("stale_fraction:", round(stale_fraction, 3))
print("very_old_fraction:", round(very_old_fraction, 3))
print("datetime_range_utc:", (oldest_dt_seen, latest_dt_seen))
print("top_parameters:", top_params)
primary_top_view = [
    (top.get("locationsId"), top.get("value"), top.get("units"), top.get("datetime_utc"))
    for top in primary_top
]
print("primary_param_top5:", primary_top_view)

In [None]:
# Write today's artifacts in an idempotent way (safe to rerun).

from __future__ import annotations

import csv
from pathlib import Path
from datetime import datetime, timezone

# Output locations, resolved relative to the notebook's working directory.
REPO_ROOT = Path("..").resolve()
DATA_DIR, NOTES_DIR = REPO_ROOT / "data", REPO_ROOT / "notes"

for out_dir in (DATA_DIR, NOTES_DIR):
    out_dir.mkdir(parents=True, exist_ok=True)

daily_csv_path = DATA_DIR / "daily.csv"
note_path = NOTES_DIR / f"{TODAY_UTC}.md"


def dt_to_z(dt: datetime | None) -> str:
    """Format a datetime as ISO-8601 in UTC with a trailing "Z".

    Returns an empty string when `dt` is None (or otherwise falsy).
    """
    if dt:
        utc_text = dt.astimezone(timezone.utc).isoformat()
        return utc_text.replace("+00:00", "Z")
    return ""


# One summary row per UTC day. The keyword order below is the CSV column order.
today_row = dict(
    date_utc=TODAY_UTC,
    bbox=BBOX,
    sample_size=len(sample_ids),
    locations_in_ny_boundary=len(ny_locations),
    locations_with_latest=len(latest_results),
    measurements_total=len(rows),
    api_latency_ms_locations=latency_ms,
    stale_hours=STALE_HOURS,
    stale_fraction=f"{stale_fraction:.4f}",
    very_old_fraction=f"{very_old_fraction:.4f}",
    oldest_datetime_utc=dt_to_z(oldest_dt_seen),
    latest_datetime_utc=dt_to_z(latest_dt_seen),
    top_parameters=";".join(f"{p}:{c}" for p, c in top_params),
)

# dicts keep insertion order, so the header is simply the row's key list.
csv_fields = list(today_row)

# Replace any row(s) already stored for today, then rewrite the whole file.
existing: list[dict] = []
if daily_csv_path.exists():
    with daily_csv_path.open("r", newline="", encoding="utf-8") as f:
        # Rows without a date cannot be keyed and are dropped.
        existing = [record for record in csv.DictReader(f) if record.get("date_utc")]

kept: list[dict] = [record for record in existing if record.get("date_utc") != TODAY_UTC]
kept.append(today_row)
kept.sort(key=lambda record: record["date_utc"])

# Write to a temporary sibling first and swap it in, so a failed run never
# leaves a half-written daily.csv.
tmp_path = daily_csv_path.with_suffix(".csv.tmp")
with tmp_path.open("w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=csv_fields)
    writer.writeheader()
    writer.writerows({k: record.get(k, "") for k in csv_fields} for record in kept)

tmp_path.replace(daily_csv_path)

# Build the Markdown note for today; the file is overwritten on every run.
lines: list[str] = [
    f"# NYS air quality daily summary — {TODAY_UTC} (UTC)",
    "",
    "## Technical summary",
    f"- Locations in NY boundary: {len(ny_locations)}",
    f"- Locations sampled: {len(sample_ids)}",
    f"- Locations with latest data: {len(latest_results)}",
    f"- Measurements normalized: {len(rows)}",
    f"- API latency (locations catalog): {latency_ms} ms",
    "",
    "### Data quality checks",
    f"- Stale threshold: {STALE_HOURS} hours",
    f"- Stale fraction: {stale_fraction:.3f}",
    f"- Very old fraction (>30 days): {very_old_fraction:.3f}",
    f"- Timestamp range (UTC): {dt_to_z(oldest_dt_seen)} → {dt_to_z(latest_dt_seen)}",
    "",
    "### Parameter coverage (top)",
]

# One bullet per parameter, with its unit in parentheses when known.
for param_name, count in top_params:
    param_unit = param_units.get(param_name, "")
    suffix = f" ({param_unit})" if param_unit else ""
    lines.append(f"- {param_name}{suffix}: {count}")

lines += ["", f"### Highest values for {primary_param}"]
if primary_top:
    for top_row in primary_top:
        unit_text = top_row.get("units") or ""
        when = top_row.get("datetime_utc") or ""
        lines.append(
            f'- location_id={top_row.get("locationsId")}: {top_row.get("value")} {unit_text} at {when}'
        )
else:
    lines.append("- No numeric values available.")

lines += [
    "",
    "## Plain-language summary",
    (
        "This report tracks a stable set of monitoring locations within New York State and records the latest readings "
        "available from OpenAQ. Some readings are old or missing; those are called out in the quality section so daily "
        "changes are not misread as real shifts in air quality."
    ),
    "",
    f"Today, the most commonly reported parameter in the sample was '{primary_param}'.",
]
note_path.write_text("\n".join(lines) + "\n", encoding="utf-8")

print("Updated:", daily_csv_path)
print("Wrote:", note_path)


In [None]:
# SVG chart: parameter coverage

from __future__ import annotations

from pathlib import Path
from xml.sax.saxutils import escape

# Per-day report directory and the chart file inside it.
REPO_ROOT = Path("..").resolve()
report_dir = REPO_ROOT / "reports" / TODAY_UTC
report_dir.mkdir(parents=True, exist_ok=True)

chart_path = report_dir / "parameter_coverage.svg"

# Bar labels and heights come straight from the parameter counts.
labels = [name for name, _count in top_params]
values = [count for _name, count in top_params]

# Canvas size and margins (pixels); the plot area is what is left inside them.
width, height = 900, 360
pad_left, pad_right, pad_top, pad_bottom = 60, 20, 30, 80
plot_w = width - pad_left - pad_right
plot_h = height - pad_top - pad_bottom

# Fall back to 1 so an empty chart does not divide by zero.
max_v = max(values, default=1)
bar_w = plot_w / (len(values) or 1)

def y_for(v: float) -> float:
    """Map a value to its SVG y coordinate (max_v at the top of the plot area)."""
    fraction = v / max_v
    return pad_top + (1 - fraction) * plot_h

# y of the x-axis and x of its right end, reused by axes, bars and labels.
baseline_y = pad_top + plot_h
axis_right = pad_left + plot_w

# Canvas, title and the two axes.
parts: list[str] = [
    f'<svg xmlns="http://www.w3.org/2000/svg" width="{width}" height="{height}">',
    f'<rect x="0" y="0" width="{width}" height="{height}" fill="white"/>',
    f'<text x="{pad_left}" y="20" font-family="Arial, sans-serif" font-size="16">Parameter coverage (NY sample)</text>',
    f'<line x1="{pad_left}" y1="{pad_top}" x2="{pad_left}" y2="{baseline_y}" stroke="black"/>',
    f'<line x1="{pad_left}" y1="{baseline_y}" x2="{axis_right}" y2="{baseline_y}" stroke="black"/>',
]

for i, (lab, v) in enumerate(zip(labels, values)):
    slot_x = pad_left + i * bar_w

    # The bar fills the middle 70% of its slot.
    x = slot_x + bar_w * 0.15
    w = bar_w * 0.7
    y = y_for(v)
    h = baseline_y - y
    parts.append(f'<rect x="{x:.2f}" y="{y:.2f}" width="{w:.2f}" height="{h:.2f}" fill="gray"/>')

    # Label starts under the slot centre, rotated so long names do not collide.
    lx = slot_x + bar_w * 0.5
    ly = baseline_y + 10
    parts.append(
        f'<text x="{lx:.2f}" y="{ly:.2f}" font-family="Arial, sans-serif" font-size="11" '
        f'transform="rotate(45 {lx:.2f} {ly:.2f})" text-anchor="start">{escape(lab)}</text>'
    )

# Vertical axis caption, centred on the plot area.
mid_y = pad_top + plot_h / 2
parts.append(
    f'<text x="15" y="{mid_y:.2f}" font-family="Arial, sans-serif" font-size="12" '
    f'transform="rotate(-90 15 {mid_y:.2f})" text-anchor="middle">measurement count</text>'
)

parts.append("</svg>")

chart_path.write_text("\n".join(parts) + "\n", encoding="utf-8")
print("Wrote:", chart_path)

In [None]:
# SVG map: NY outline + sample points, colored by primary parameter, radius scaled by value

from __future__ import annotations

from pathlib import Path
from xml.sax.saxutils import escape

# Per-day report directory and the map file inside it.
REPO_ROOT = Path("..").resolve()
report_dir = REPO_ROOT.joinpath("reports", TODAY_UTC)
report_dir.mkdir(parents=True, exist_ok=True)
map_path = report_dir.joinpath("map.svg")

# The map is drawn for the most frequently reported parameter.
primary = primary_param
unit = param_units.get(primary, "")

# When True, the colour/size scale is derived from non-stale readings only
# (falling back to all readings if there are none).
USE_ONLY_FRESH_FOR_SCALE = True

def pick_latest_value_by_location(parameter: str) -> dict[int, tuple[float, str, bool]]:
    """Pick, per location, the newest numeric reading of `parameter`.

    Returns a mapping of location id -> (value, datetime_utc text, stale flag).
    Rows with a non-int location id or a non-numeric value are skipped.
    "Newest" is decided by comparing the datetime_utc strings.
    """
    newest: dict[int, tuple[float, str, bool]] = {}
    for row in rows:
        if (row.get("parameter_name") or "unknown") != parameter:
            continue
        loc_id, val = row.get("locationsId"), row.get("value")
        if not (isinstance(loc_id, int) and isinstance(val, (int, float))):
            continue
        stamp = row.get("datetime_utc") or ""
        current = newest.get(loc_id)
        if current is None or stamp > current[1]:
            newest[loc_id] = (float(val), stamp, bool(row.get("stale")))
    return newest

loc_value = pick_latest_value_by_location(primary)

# One (lon, lat, value, stale) tuple per NY location with usable coordinates.
# Locations without a reading for the primary parameter get value None and
# are flagged stale.
points: list[tuple[float, float, float | None, bool]] = []
for loc in ny_locations:
    position = loc.get("coordinates") or {}
    try:
        lon = float(position.get("longitude"))
        lat = float(position.get("latitude"))
    except (TypeError, ValueError):
        continue

    loc_id = loc.get("id")
    v = loc_value.get(loc_id) if isinstance(loc_id, int) else None
    reading, is_stale = (v[0], v[2]) if v else (None, True)
    points.append((lon, lat, reading, is_stale))

def percentile(values: list[float], p: float) -> float:
    """Linearly interpolated percentile of `values` at fraction `p` (0..1).

    Returns 0.0 for an empty list.
    """
    if not values:
        return 0.0
    ordered = sorted(values)
    last = len(ordered) - 1
    position = last * p
    lower = int(position)
    upper = min(lower + 1, last)
    if upper == lower:
        return ordered[lower]
    weight = position - lower
    return ordered[lower] + (ordered[upper] - ordered[lower]) * weight

def clamp01(x: float) -> float:
    """Limit `x` to the closed interval [0.0, 1.0]."""
    capped = min(1.0, x)
    return max(0.0, capped)

def lerp(a: float, b: float, t: float) -> float:
    """Linear interpolation: `a` at t=0, `b` at t=1."""
    span = b - a
    return a + span * t

def color_ramp(t: float) -> str:
    """Map t in [0, 1] to an "rgb(r,g,b)" string on a three-stop ramp.

    The stops are (52,152,219) at 0, (243,156,18) at 0.5 and (231,76,60) at 1;
    `t` is clamped first and channels are truncated to int.
    """
    t = clamp01(t)
    low_stop, mid_stop, high_stop = (52, 152, 219), (243, 156, 18), (231, 76, 60)
    if t < 0.5:
        start, end, local_t = low_stop, mid_stop, t / 0.5
    else:
        start, end, local_t = mid_stop, high_stop, (t - 0.5) / 0.5
    r, g, b = (int(lerp(lo, hi, local_t)) for lo, hi in zip(start, end))
    return f"rgb({r},{g},{b})"

# Numeric readings: all of them, and the non-stale subset.
vals_all = [pt[2] for pt in points if isinstance(pt[2], (int, float))]
vals_fresh = [pt[2] for pt in points if isinstance(pt[2], (int, float)) and (not pt[3])]
scale_vals = vals_fresh if (USE_ONLY_FRESH_FOR_SCALE and vals_fresh) else vals_all

# Scale endpoints are the 5th and 95th percentiles.
vmin, vmax = (percentile(scale_vals, q) for q in (0.05, 0.95))
if vmax <= vmin:
    # Degenerate range: widen it so later division is well defined.
    vmax = vmin + 1.0

def value_to_t(v: float) -> float:
    """Normalise `v` to [0, 1] relative to the vmin..vmax scale (clamped)."""
    scale_span = vmax - vmin
    return clamp01((v - vmin) / scale_span)

def color_for(v: float | None, stale: bool) -> str:
    """Fill colour for a point: light gray without a value, gray when stale,
    otherwise the ramp colour for the value."""
    if v is None:
        return "lightgray"
    return "rgb(180,180,180)" if stale else color_ramp(value_to_t(v))

def radius_for(v: float | None, stale: bool) -> float:
    """Circle radius for a point: 3.5 for missing or stale readings,
    otherwise 3 to 9 in proportion to the scaled value."""
    if v is None or stale:
        return 3.5
    return 3.0 + value_to_t(v) * 6.0  # 3 -> 9

# NY polygon bounds for projection
geom = ny_feature["geometry"]
coords = geom["coordinates"]

# Earlier cells accept both "Polygon" and "MultiPolygon" boundaries. Normalise
# to a list of polygons here so either type can be drawn; iterating a bare
# Polygon as if it were a MultiPolygon would fail.
polygons = [coords] if geom.get("type") == "Polygon" else coords

lons, lats = [], []
for poly in polygons:
    if not poly:
        continue
    # Only the outer ring matters for the bounding box. Positions are indexed
    # rather than unpacked so an optional altitude element is tolerated.
    for pos in poly[0]:
        lons.append(float(pos[0]))
        lats.append(float(pos[1]))

min_lon, max_lon = min(lons), max(lons)
min_lat, max_lat = min(lats), max(lats)

width, height = 900, 520
pad = 20

# Guard against a degenerate (zero-extent) boundary.
lon_span = (max_lon - min_lon) or 1.0
lat_span = (max_lat - min_lat) or 1.0

def project(lon: float, lat: float) -> tuple[float, float]:
    """Linearly map (lon, lat) into the padded SVG canvas; north is up."""
    x = pad + (lon - min_lon) / lon_span * (width - 2 * pad)
    y = pad + (max_lat - lat) / lat_span * (height - 2 * pad)
    return x, y

parts: list[str] = []
parts.append(f'<svg xmlns="http://www.w3.org/2000/svg" width="{width}" height="{height}">')
parts.append(f'<rect x="0" y="0" width="{width}" height="{height}" fill="white"/>')

title = f"NY sample map — {primary}"
parts.append(f'<text x="{pad}" y="18" font-family="Arial, sans-serif" font-size="16">{escape(title)}</text>')

# Outline: one closed path per polygon, outer ring only.
for poly in polygons:
    if not poly:
        continue
    outer = poly[0]
    d = []
    for i, pos in enumerate(outer):
        x, y = project(float(pos[0]), float(pos[1]))
        d.append(("M" if i == 0 else "L") + f"{x:.2f},{y:.2f}")
    d.append("Z")
    parts.append(f'<path d="{" ".join(d)}" fill="none" stroke="black" stroke-width="1"/>')

# Points
for lon, lat, v, stale in points:
    x, y = project(lon, lat)
    fill = color_for(v, stale)
    r = radius_for(v, stale)
    parts.append(f'<circle cx="{x:.2f}" cy="{y:.2f}" r="{r:.2f}" fill="{fill}" stroke="black" stroke-width="0.6"/>')

# Legend
legend_y = height - 44
legend_x = pad
sw = 18

parts.append(f'<text x="{legend_x}" y="{legend_y}" font-family="Arial, sans-serif" font-size="12">Scale (p5–p95):</text>')
for i, (label, t) in enumerate([("low", 0.0), ("mid", 0.5), ("high", 1.0)]):
    x = legend_x + 100 + i * 120
    parts.append(f'<rect x="{x}" y="{legend_y - 12}" width="{sw}" height="{sw}" fill="{color_ramp(t)}" stroke="black" stroke-width="0.6"/>')
    parts.append(f'<text x="{x + 26}" y="{legend_y + 2}" font-family="Arial, sans-serif" font-size="12">{escape(label)}</text>')

parts.append(
    f'<text x="{pad}" y="{height - 18}" font-family="Arial, sans-serif" font-size="12">'
    f"p5={vmin:.2f}, p95={vmax:.2f} {escape(unit)}; stale points shown in gray"
    f"</text>"
)

parts.append("</svg>")

map_path.write_text("\n".join(parts) + "\n", encoding="utf-8")
print("Wrote:", map_path)


In [None]:
# Daily AQ notebook — baseline config + sanity checks.

from __future__ import annotations

import os
from pathlib import Path
from datetime import datetime, timezone

# Assumption: the notebook runs from ./notebooks, so ".." is the repo root.
REPO_ROOT = Path("..").resolve()
DATA_DIR, NOTES_DIR, REPORTS_DIR = (REPO_ROOT / sub for sub in ("data", "notes", "reports"))

# The API key must come from the environment; it is never stored in the repo.
API_KEY_ENV = "OPENAQ_API_KEY"
OPENAQ_API_KEY = os.environ.get(API_KEY_ENV)
if not OPENAQ_API_KEY:
    raise RuntimeError(f"Missing environment variable: {API_KEY_ENV}")

# Rough NYS bounding box as "minLon,minLat,maxLon,maxLat".
BBOX = os.environ.get("BBOX", "-79.8,40.4,-71.6,45.1")

# Number of locations in the stable sample; kept modest so daily runs stay quick.
SAMPLE_SIZE = int(os.environ.get("SAMPLE_SIZE", "100"))

# Readings older than this many hours are considered stale.
STALE_HOURS = int(os.environ.get("STALE_HOURS", "12"))

# UTC calendar date used in output file paths.
TODAY_UTC = datetime.now(timezone.utc).strftime("%Y-%m-%d")

for label, shown in (
    ("Repo root:", REPO_ROOT),
    ("BBOX:", BBOX),
    ("SAMPLE_SIZE:", SAMPLE_SIZE),
    ("STALE_HOURS:", STALE_HOURS),
    ("TODAY_UTC:", TODAY_UTC),
    ("API key loaded:", "yes"),
):
    print(label, shown)

In [None]:
from nys_aq.daily import run_daily

# Call the packaged entry point and show its return value as the cell output.
# NOTE(review): presumably `run_daily` wraps the steps prototyped in the cells
# above — confirm against nys_aq.daily.
cfg = run_daily()
cfg