In [1]:
import sys
import time
from datetime import date
from dateutil.relativedelta import relativedelta
from pathlib import Path

import requests
import pandas as pd
import data.data_source as data_source
from utils.artifact_saver import get_artifact_path

API_BASE   = "https://api.fiscaldata.treasury.gov/services/api/fiscal_service/v1/accounting/od/auctions_query"
PAGE_SIZE  = 10000   # plenty for a single year
BATCH_SIZE = 1000     # rows per upsert batch

ds = data_source.get_data_source()

def fetch_all_auctions_last_y(years_to_backfill) -> list[dict]:
    rows = []
    start_year = (date.today() - relativedelta(years=years_to_backfill)).year
    end_year   = date.today().year

    for yr in range(start_year, end_year + 1):
        print('year', yr)
        start = date(yr, 1, 1).isoformat()
        end   = date(yr, 12, 31).isoformat()
        params = [
            ("filter", f"record_date:gte:{start}"),
            ("filter", f"record_date:lte:{end}"),
            ("page[size]", str(PAGE_SIZE)),
            ("sort", "record_date"),
        ]
        resp = requests.get(API_BASE, params=params)
        resp.raise_for_status()
        data = resp.json().get("data", [])
        print(f"  {yr}: fetched {len(data)} rows")
        rows.extend(data)
    return rows

def quote_value(val):
    if val is None or val == "null":
        return "NULL"
    try:
        float(val)
        return str(val)
    except:
        return "'" + str(val).replace("'", "''") + "'"

def chunks(lst, n):
    for i in range(0, len(lst), n):
        yield lst[i : i + n]

def dedupe(records: list[dict]) -> list[dict]:
    seen = set()
    deduped = []
    for r in records:
        key = (r.get("record_date"), r.get("cusip"))
        if key in seen:
            continue
        seen.add(key)
        deduped.append(r)
    return deduped

def upsert_batch(records: list[dict], batch_size: int = BATCH_SIZE):
    if not records:
        return
    cols = list(records[0].keys())
    col_list = ", ".join(cols)
    total = len(records)
    print(f"Upserting {total} rows in batches of {batch_size}...")
    for idx, chunk in enumerate(chunks(records, batch_size), start=1):
        print(f"  Batch {idx}: {len(chunk)} rows")
        values_sql = []
        for r in chunk:
            vals = [quote_value(r.get(c)) for c in cols]
            values_sql.append("(" + ", ".join(vals) + ")")
        values_str = ",\n".join(values_sql)
        set_list = ", ".join([f"{c} = EXCLUDED.{c}" for c in cols])
        sql = f"""
        INSERT INTO tsy_auction_results ({col_list})
        VALUES
        {values_str}
        ON CONFLICT (record_date, cusip)
        DO UPDATE SET
          {set_list};
        """
        ds.query(sql)

def main(years_to_backfill):
    t0 = time.time()

    data = fetch_all_auctions_last_y(years_to_backfill)
    data = dedupe(data)
    upsert_batch(data)

    duration = time.time() - t0
    print(f"✅ Loaded {len(data)} rows in {duration:.1f}s")

    def find_mount_root(start: Path, target: str = "mnt") -> Path:
        """Climb up until we find the given folder name."""
        current = start.resolve()
        while current.name != target:
            if current.parent == current:
                raise FileNotFoundError(f"Could not find folder named '{target}' in parent paths.")
            current = current.parent
        return current
    
    df = pd.DataFrame(data)
    fn = get_artifact_path("tsy_auction_results.csv")
    df.to_csv(fn, index=False)

if __name__ == "__main__":
    main(years_to_backfill = 1)


'data_env' not found in environment. Defaulting to 'sandbox' env.
setting env to sandbox data
getting data source for sandbox
year 2024
  2024: fetched 10000 rows
year 2025
  2025: fetched 10000 rows
Upserting 10000 rows in batches of 1000...
  Batch 1: 1000 rows
  Batch 2: 1000 rows
  Batch 3: 1000 rows
  Batch 4: 1000 rows
  Batch 5: 1000 rows
  Batch 6: 1000 rows
  Batch 7: 1000 rows
  Batch 8: 1000 rows
  Batch 9: 1000 rows
  Batch 10: 1000 rows
✅ Loaded 10000 rows in 40.3s
