# In [None]:  (Jupyter notebook cell marker, kept as a comment so the file parses as Python)
# --- requirements: requests, pandas, numpy, scipy, scikit-learn ---
# pip install requests pandas numpy scipy scikit-learn
# (optional) pip install tqdm

import sys, math, time, itertools, datetime as dt, json
import pandas as pd
import numpy as np
from collections import defaultdict
from scipy.stats import linregress
import requests
from typing import Dict, Any

# Optional pretty progress
try:
    from tqdm import tqdm
    HAS_TQDM = True
except Exception:
    HAS_TQDM = False

# -----------------------
# CONFIG
# -----------------------
# Endpoint that every GraphQL request in this script is POSTed to (see gql()).
GRAPHQL_URL = "https://app.birdweather.com/graphql"

# Duval County, FL rough bounding box (WGS84); tweak if desired.
# "ne" / "sw" are the north-east and south-west corners; they are passed
# unchanged as the `ne` / `sw` variables of the stations query.
BBOX = {
    "ne": {"lat": 30.60, "lon": -81.30},
    "sw": {"lat": 30.10, "lon": -82.10},
}

# Date window (inclusive start, up to today by default).
# Both values are ISO "YYYY-MM-DD" strings sent as the `from` / `to` fields of
# the daily-counts `period` variable. END_DATE is evaluated once, at import time.
START_DATE = "2017-01-01"
END_DATE = dt.date.today().isoformat()

# Number of station IDs sent in a single dailyDetectionCounts request.
STATION_CHUNK = 50

# -----------------------
# Minimal progress helper
# -----------------------
class Progress:
    """Progress reporting: tqdm bars when enabled, plain ``print`` lines otherwise."""

    class _PrintBar:
        """Fallback bar: prints "desc (done/total)" when created and on every update."""

        def __init__(self, total, desc):
            self.total = total
            self.n = 0
            self.desc = desc
            self._report()

        def _report(self):
            # One line per state change; there is no in-place redraw without tqdm.
            print(f"{self.desc} ({self.n}/{self.total})")

        def update(self, k=1):
            self.n += k
            self._report()

        def set_postfix(self, *args, **kwargs):
            """Accepted so callers can treat this like a tqdm bar; does nothing."""

        def close(self):
            """Accepted so callers can treat this like a tqdm bar; does nothing."""

    def __init__(self, use_tqdm: bool = HAS_TQDM):
        self.use_tqdm = use_tqdm

    def bar(self, total: int, desc: str):
        """Return an object exposing ``update``, ``set_postfix`` and ``close``."""
        if not self.use_tqdm:
            return self._PrintBar(total, desc)
        return tqdm(total=total, desc=desc, unit="step", leave=True)

    def note(self, msg: str):
        """Print ``msg`` prefixed with the current local time as [HH:MM:SS]."""
        print(f"[{time.strftime('%H:%M:%S')}] {msg}")


progress = Progress()

# -----------------------
# Requests session with retries
# -----------------------
def make_session():
    """Build the ``requests.Session`` shared by all API calls.

    When urllib3's ``Retry`` is available, POST requests are retried up to
    5 times (``backoff_factor=0.5``) on HTTP 429/500/502/503/504 for both
    ``https://`` and ``http://`` URLs. If retries cannot be configured, a
    plain session without retry behaviour is returned instead of failing.

    Returns:
        requests.Session: the configured session.
    """
    from requests.adapters import HTTPAdapter

    session = requests.Session()

    try:
        # Retry is in urllib3; import defensively.
        from urllib3.util.retry import Retry
    except ImportError:
        return session

    retry_options = {
        "total": 5,
        "backoff_factor": 0.5,
        "status_forcelist": (429, 500, 502, 503, 504),
    }
    # POST is not retried by default, so it has to be allowed explicitly.
    retry_methods = frozenset(["POST"])

    try:
        retries = Retry(allowed_methods=retry_methods, **retry_options)
    except TypeError:
        # urllib3 < 1.26 only knows the older keyword name; without this
        # fallback such installs would silently lose all retry behaviour.
        try:
            retries = Retry(method_whitelist=retry_methods, **retry_options)
        except TypeError:
            return session

    session.mount("https://", HTTPAdapter(max_retries=retries))
    session.mount("http://", HTTPAdapter(max_retries=retries))
    return session


SESSION = make_session()

# -----------------------
# GraphQL helper with basic error surfacing
# -----------------------
def gql(query: str, variables: Dict[str, Any]) -> Dict[str, Any]:
    """POST a GraphQL query and return the ``data`` member of the JSON reply.

    Raises the ``requests`` HTTP error for non-2xx responses, and
    ``RuntimeError`` (carrying the pretty-printed error list) when the reply
    contains a non-empty ``errors`` entry.
    """
    payload = {"query": query, "variables": variables}
    response = SESSION.post(GRAPHQL_URL, json=payload, timeout=60)
    response.raise_for_status()

    body = response.json()
    reported_errors = body["errors"] if "errors" in body else None
    if reported_errors:
        raise RuntimeError(json.dumps(reported_errors, indent=2))
    return body["data"]

# -----------------------
# Queries
# -----------------------
# Cursor-paged station listing restricted to the rectangle given by the
# `ne` / `sw` corners. `first` is the page size and `after` the cursor of the
# previous page; `pageInfo` and `totalCount` drive paging and progress output
# in fetch_station_ids().
STATIONS_QUERY = """
query stations($first:Int,$after:String,$ne:InputLocation,$sw:InputLocation){
  stations(first:$first, after:$after, ne:$ne, sw:$sw){
    pageInfo{ endCursor hasNextPage }
    nodes{
      id
      name
      country
      state
      coords { lat lon }
      type
      timezone
      locationPrivacy
    }
    totalCount
  }
}
"""

# Per-day detection counts for a set of stations over a period. Each returned
# entry has a `date`, a `total`, and a `counts` list of {key, count} pairs;
# fetch_daily_counts() stores `key` in its "speciesId" column.
DAILY_COUNTS_QUERY = """
query dailyDetectionCounts($period: InputDuration, $stationIds: [ID!]) {
  dailyDetectionCounts(period: $period, stationIds: $stationIds){
    date
    total
    counts { key count }
  }
}
"""

# -----------------------
# Utils
# -----------------------
def chunks(lst, n):
    """Lazily yield consecutive slices of ``lst`` holding at most ``n`` items each."""
    yield from (lst[start:start + n] for start in range(0, len(lst), n))

# -----------------------
# 1) Discover stations in bbox (with paging progress)
# -----------------------
def fetch_station_ids(ne, sw, page_size=100):
    """Page through the stations query for the rectangle spanned by ``ne`` and ``sw``.

    A timestamped progress note is printed after every page.

    Returns:
        tuple: ``(ids, frame)`` where ``ids`` lists the station IDs in the
        order received and ``frame`` is a DataFrame of the raw station nodes.
    """
    progress.note("Discovering public BirdWeather stations in bbox...")
    started = time.time()

    station_ids = []
    station_nodes = []
    cursor = None
    page_no = 0
    more_pages = True

    # The number of pages is not known up front; a percentage is shown only
    # when the reply includes a positive totalCount.
    while more_pages:
        page_no += 1
        request_vars = {"first": page_size, "after": cursor, "ne": ne, "sw": sw}
        result = gql(STATIONS_QUERY, request_vars)

        page_nodes = result["stations"]["nodes"]
        station_nodes += page_nodes
        station_ids += [node["id"] for node in page_nodes]

        page_info = result["stations"]["pageInfo"]
        reported_total = result["stations"].get("totalCount", None)
        seen = len(station_ids)

        if reported_total is None or not reported_total > 0:
            progress.note(f"Stations page {page_no}: accumulated {seen} (totalCount unavailable).")
        else:
            pct = min(100, int(100 * seen / reported_total))
            progress.note(f"Stations page {page_no}: {seen}/{reported_total} (~{pct}%).")

        more_pages = bool(page_info["hasNextPage"])
        if more_pages:
            cursor = page_info["endCursor"]

    elapsed = time.time() - started
    progress.note(f"Station discovery complete: {len(station_ids)} stations in {elapsed:.1f}s.")
    return station_ids, pd.DataFrame(station_nodes)

# -----------------------
# 2) Pull daily counts with progress + ETA
# -----------------------
def fetch_daily_counts(station_ids, start_date, end_date, chunk_size=STATION_CHUNK):
    """Download per-day species counts for ``station_ids`` and merge them into one table.

    Stations are queried in groups of ``chunk_size``. Every reply entry holds a
    date, that group's ``total`` for the date and a list of per-species counts.
    Because a reply only describes the stations of its own group, the groups
    are combined here: counts are summed per (date, species) and totals are
    summed per date, so ``total`` covers all requested stations rather than a
    single group.

    Args:
        station_ids: list of station IDs to query.
        start_date: first day of the period, sent as ``period.from``.
        end_date: last day of the period, sent as ``period.to``.
        chunk_size: number of station IDs per request.

    Returns:
        pandas.DataFrame: one row per (date, speciesId) with columns ``date``,
        ``speciesId``, ``count`` and ``total`` (``total`` is repeated on every
        row of a date). Dates whose replies list no species produce no rows.
        An empty DataFrame is returned when ``station_ids`` is empty or
        nothing was received.

    Raises:
        Whatever ``gql`` raises for a failed request; the progress bar is
        closed before the exception propagates.
    """
    chunk_list = list(chunks(station_ids, chunk_size))
    if not chunk_list:
        return pd.DataFrame()

    n_chunks = len(chunk_list)
    period = {"from": start_date, "to": end_date}

    # Accumulators shared by all chunks; missing values count as 0.
    totals_by_date = defaultdict(int)
    counts_by_date_species = defaultdict(int)

    bar = progress.bar(total=n_chunks, desc="Pulling dailyDetectionCounts")
    records_so_far = 0  # raw (date, species) entries received, for progress output
    t0 = time.time()

    try:
        for i, chunk in enumerate(chunk_list, start=1):
            t_chunk_start = time.time()

            data = gql(DAILY_COUNTS_QUERY, {"period": period, "stationIds": chunk})

            local_count = 0
            for row in data["dailyDetectionCounts"]:
                date = row["date"]
                totals_by_date[date] += row["total"] or 0
                for c in row["counts"]:
                    counts_by_date_species[(date, c["key"])] += c["count"] or 0
                    local_count += 1
            records_so_far += local_count

            # Progress/ETA: average time per finished chunk times chunks left.
            elapsed = time.time() - t0
            eta_sec = (n_chunks - i) * (elapsed / i)

            bar.update(1)
            # Branch on the same flag that decided which kind of bar was created.
            if progress.use_tqdm:
                bar.set_postfix({
                    "chunk": f"{i}/{n_chunks}",
                    "rows": records_so_far,
                    "eta": f"{int(eta_sec)}s"
                })
            else:
                progress.note(f"Chunk {i}/{n_chunks} (+{local_count} rows, {records_so_far} total). "
                              f"ETA ~{int(eta_sec)}s")

            # Gentle pacing to be kind to the API (tweak or remove if needed)
            dt_chunk = time.time() - t_chunk_start
            if dt_chunk < 0.2:
                time.sleep(0.2 - dt_chunk)
    finally:
        bar.close()

    records = [
        {
            "date": date,
            "speciesId": species_id,
            "count": count,
            "total": totals_by_date[date],
        }
        for (date, species_id), count in counts_by_date_species.items()
    ]
    progress.note(f"Daily counts download complete: {len(records)} rows in {time.time() - t0:.1f}s.")
    return pd.DataFrame(records, columns=["date", "speciesId", "count", "total"])

# -----------------------
# 3) Compute metrics and save
# -----------------------
def compute_and_save_metrics(daily: pd.DataFrame):
    """Derive daily, weekly and yearly metrics from the counts table and write them to CSV.

    Args:
        daily: table with columns ``date``, ``speciesId``, ``count`` and
            ``total``. It is copied; the caller's frame is left untouched.

    Returns:
        tuple: ``(metrics, beta_df, trend, cv_daily)`` where

        * ``metrics`` has one row per date with ``total``, ``richness``
          (number of distinct species) and ``shannon_H`` (natural-log Shannon index);
        * ``beta_df`` has, for every week after the first one present, the
          Sørensen dissimilarity to the preceding week present in the data;
        * ``trend`` has one row per calendar year with more than 5 days of
          data: slope, r and p of a linear fit of ``total`` against day offset;
        * ``cv_daily`` is the coefficient of variation of the daily totals
          (NaN with fewer than two days).

    Side effects:
        Writes ``duval_daily_metrics.csv``, ``duval_weekly_beta_diversity.csv``
        and ``duval_trends_by_year.csv`` into the current working directory.
    """
    progress.note("Computing daily/weekly metrics…")

    # Work on a copy so the helper columns below do not leak into the caller's frame.
    daily = daily.copy()
    daily["date"] = pd.to_datetime(daily["date"])
    daily["week"] = daily["date"].dt.to_period("W").apply(lambda wk: wk.start_time.date())

    # County-level totals by day. `total` is repeated on every species row of a
    # date, so max() just collapses the repeats to a single value.
    # NOTE(review): this assumes one `total` value per date in the input — confirm
    # against whatever produced `daily`.
    totals = daily.groupby("date", as_index=False)["total"].max().sort_values("date")

    # CV of daily detections
    cv_daily = (totals["total"].std(ddof=1) / totals["total"].mean()) if len(totals) > 1 else np.nan

    # Richness and Shannon H' per day
    richness = daily.groupby("date")["speciesId"].nunique().rename("richness")
    day_species = daily.pivot_table(index="date", columns="speciesId",
                                    values="count", aggfunc="sum", fill_value=0)
    # Zero shares become NaN so log(0) never enters the sum (sum() skips NaN).
    proportions = day_species.div(day_species.sum(axis=1), axis=0).replace(0, np.nan)
    shannon = (-(proportions * np.log(proportions)).sum(axis=1)).rename("shannon_H")

    metrics = pd.concat([totals.set_index("date"), richness, shannon], axis=1).reset_index()

    # Weekly beta diversity: Sørensen dissimilarity between adjacent weeks
    weekly_presence = (daily.groupby(["week", "speciesId"])["count"]
                       .sum().unstack(fill_value=0) > 0).astype(int)
    sorensen = []
    weeks_sorted = sorted(weekly_presence.index)
    for w_prev, w_curr in zip(weeks_sorted[:-1], weeks_sorted[1:]):
        prev_row = weekly_presence.loc[w_prev]
        curr_row = weekly_presence.loc[w_curr]
        shared = (prev_row & curr_row).sum()
        denom = prev_row.sum() + curr_row.sum()
        d_s = np.nan if denom == 0 else 1 - (2 * shared / denom)
        sorensen.append({"week": w_curr, "sorensen_dissimilarity": d_s})
    # Explicit columns keep the CSV header (and column access) valid when there
    # are fewer than two weeks of data.
    beta_df = pd.DataFrame(sorensen, columns=["week", "sorensen_dissimilarity"])

    # Simple trend per year on daily totals
    trend_rows = []
    for yr, sub in totals.groupby(totals["date"].dt.year):
        if len(sub) > 5:
            day_offset = (sub["date"] - sub["date"].min()).dt.days.values
            fit = linregress(day_offset, sub["total"].values)
            trend_rows.append({
                "year": yr,
                "slope_det_per_day": fit.slope,
                "r": fit.rvalue,
                "p": fit.pvalue
            })
    trend = pd.DataFrame(trend_rows, columns=["year", "slope_det_per_day", "r", "p"])

    progress.note("Writing CSV outputs…")
    metrics.to_csv("duval_daily_metrics.csv", index=False)
    beta_df.to_csv("duval_weekly_beta_diversity.csv", index=False)
    trend.to_csv("duval_trends_by_year.csv", index=False)

    # The summary line is informational only: report a failure, never abort on it.
    try:
        n_days = metrics["date"].nunique()
        n_weeks = beta_df["week"].nunique() if not beta_df.empty else 0
        n_species = daily["speciesId"].nunique()
        progress.note(
            f"SUMMARY → Days: {n_days} | Weeks: {n_weeks} | Species: {n_species} | Daily CV: {cv_daily:.3f}"
        )
    except Exception as exc:
        progress.note(f"Summary unavailable: {exc}")

    return metrics, beta_df, trend, cv_daily

# -----------------------
# MAIN
# -----------------------
def main():
    """Run the pipeline: discover stations, download daily counts, compute and save metrics.

    Stops early, with a printed hint, when no stations or no daily data are found.
    """
    # 1) Stations
    station_ids, stations_df = fetch_station_ids(BBOX["ne"], BBOX["sw"])
    nothing_found = (not station_ids) or stations_df.empty
    if nothing_found:
        print("No public stations found in bbox. Try expanding the bbox or checking dates.")
        return
    stations_df.to_csv("duval_stations.csv", index=False)
    progress.note("Saved station manifest → duval_stations.csv")

    # 2) Daily counts
    daily = fetch_daily_counts(station_ids, START_DATE, END_DATE, STATION_CHUNK)
    if daily.empty:
        print("No daily data returned for this bbox/period. Try adjusting dates or bbox.")
        return

    # 3) Metrics & outputs
    compute_and_save_metrics(daily)

    progress.note("All files saved:")
    written_files = (
        "- duval_stations.csv",
        "- duval_daily_metrics.csv",
        "- duval_weekly_beta_diversity.csv",
        "- duval_trends_by_year.csv",
    )
    for line in written_files:
        print(line)


if __name__ == "__main__":
    main()
