In [2]:
import sys
import time
from datetime import date
from dateutil.relativedelta import relativedelta
from pathlib import Path

import requests
import pandas as pd
import mlflow
from domino.data_sources import DataSourceClient

API_BASE   = "https://api.fiscaldata.treasury.gov/services/api/fiscal_service/v1/accounting/od/auctions_query"
PAGE_SIZE  = 10000   # plenty for a single year
BATCH_SIZE = 1000     # rows per upsert batch

mlflow.set_experiment("Populate DB (treasury_auction_results)")
ds = DataSourceClient().get_datasource("market_data")

def fetch_all_auctions_last_y(years_to_backfill) -> list[dict]:
    rows = []
    start_year = (date.today() - relativedelta(years=years_to_backfill)).year
    end_year   = date.today().year

    for yr in range(start_year, end_year + 1):
        print('year', yr)
        start = date(yr, 1, 1).isoformat()
        end   = date(yr, 12, 31).isoformat()
        params = [
            ("filter", f"record_date:gte:{start}"),
            ("filter", f"record_date:lte:{end}"),
            ("page[size]", str(PAGE_SIZE)),
            ("sort", "record_date"),
        ]
        resp = requests.get(API_BASE, params=params)
        resp.raise_for_status()
        data = resp.json().get("data", [])
        print(f"  {yr}: fetched {len(data)} rows")
        rows.extend(data)
    return rows

def quote_value(val):
    if val is None or val == "null":
        return "NULL"
    try:
        float(val)
        return str(val)
    except:
        return "'" + str(val).replace("'", "''") + "'"

def chunks(lst, n):
    for i in range(0, len(lst), n):
        yield lst[i : i + n]

def dedupe(records: list[dict]) -> list[dict]:
    seen = set()
    deduped = []
    for r in records:
        key = (r.get("record_date"), r.get("cusip"))
        if key in seen:
            continue
        seen.add(key)
        deduped.append(r)
    return deduped

def upsert_batch(records: list[dict], batch_size: int = BATCH_SIZE):
    if not records:
        return
    cols = list(records[0].keys())
    col_list = ", ".join(cols)
    total = len(records)
    print(f"Upserting {total} rows in batches of {batch_size}...")
    for idx, chunk in enumerate(chunks(records, batch_size), start=1):
        print(f"  Batch {idx}: {len(chunk)} rows")
        values_sql = []
        for r in chunk:
            vals = [quote_value(r.get(c)) for c in cols]
            values_sql.append("(" + ", ".join(vals) + ")")
        values_str = ",\n".join(values_sql)
        set_list = ", ".join([f"{c} = EXCLUDED.{c}" for c in cols])
        sql = f"""
        INSERT INTO treasury_auction_results ({col_list})
        VALUES
        {values_str}
        ON CONFLICT (record_date, cusip)
        DO UPDATE SET
          {set_list};
        """
        ds.query(sql)

def main(years_to_backfill):
    with mlflow.start_run(run_name="Load 30yr Auction Results"):
        t0 = time.time()

        data = fetch_all_auctions_last_y(years_to_backfill)
        mlflow.log_metric("rows_fetched", len(data))
        data = dedupe(data)
        upsert_batch(data)

        duration = time.time() - t0
        mlflow.log_metric("duration_seconds", duration)
        print(f"✅ Loaded {len(data)} rows in {duration:.1f}s")

        def find_mount_root(start: Path, target: str = "mnt") -> Path:
            """Climb up until we find the given folder name."""
            current = start.resolve()
            while current.name != target:
                if current.parent == current:
                    raise FileNotFoundError(f"Could not find folder named '{target}' in parent paths.")
                current = current.parent
            return current
        
        df = pd.DataFrame(data)
        mnt_root = find_mount_root(Path.cwd())
        out = mnt_root / "artifacts" / "results" / "treasury_auction_results.csv"
        out.parent.mkdir(parents=True, exist_ok=True)
        df.to_csv(out, index=False)
        mlflow.log_artifact(str(out), artifact_path="treasury_auction_results")

if __name__ == "__main__":
    main(years_to_backfill = 3)


year 2022
  2022: fetched 1437 rows
year 2023
  2023: fetched 1057 rows
year 2024
  2024: fetched 629 rows
year 2025
  2025: fetched 185 rows
Upserting 1437 rows in batches of 1000...
  Batch 1: 1000 rows
  Batch 2: 437 rows
✅ Loaded 1437 rows in 6.2s
🏃 View run Load 30yr Auction Results at: http://127.0.0.1:8768/#/experiments/1447/runs/73f1bfe35ea240559855eeb87f4d4d1b
🧪 View experiment at: http://127.0.0.1:8768/#/experiments/1447
