In [2]:
import os
import sys
from datetime import date
from typing import List, Dict

import mlflow
import pandas as pd
import requests

import data.data_source as data_source
from config import env
from utils.artifact_saver import get_artifact_path


# The experiment name embeds the `env` value imported from config, so each
# environment gets its own MLflow experiment.
experiment_name = f"Populate Reference Rates test [{env}]"
# Make it the active experiment for every run started later in this script.
mlflow.set_experiment(experiment_name)

# ─── CONFIG ─────────────────────────────────────────────────────────────────

# Root of the New York Fed markets site; API paths are appended to it.
BASE_URL = 'https://markets.newyorkfed.org'

# Upper bound on the record count a single fetch may request.
MAX_RECORDS = 900

# category -> [(ticker, display name), ...]
# The category and ticker are both used as URL path segments when fetching.
REFERENCE_RATE_MAPPINGS = dict(
    secured=[
        ('sofr', 'Secured Overnight Financing Rate'),
        ('bgcr', 'Broad General Collateral Rate'),
        ('tgcr', 'Tri-Party General Collateral Rate'),
    ],
    unsecured=[
        ('effr', 'Effective Fed Funds Rate'),
        ('obfr', 'Overnight Bank Funding Rate'),
    ],
)

# ─── DATASOURCE ─────────────────────────────────────────────────────────────

# Shared data-source handle; populate_reference_rates() sends its INSERT
# statements through ds.query().
ds = data_source.get_data_source()

# ─── FETCHER ────────────────────────────────────────────────────────────────

def fetch_reference_rates(
    ticker: str,
    category: str,
    limit: int = MAX_RECORDS,
    timeout: float = 30.0,
) -> List[Dict]:
    """Fetch the latest observations of one reference rate from the NY Fed API.

    Args:
        ticker: Rate ticker used as a URL path segment (e.g. ``'sofr'``).
        category: Rate category used as a URL path segment
            (``'secured'`` or ``'unsecured'`` in REFERENCE_RATE_MAPPINGS).
        limit: Number of most recent records to request; may not exceed
            MAX_RECORDS.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` would wait indefinitely on a stalled
            connection.

    Returns:
        The list stored under ``'refRates'`` in the JSON response, or an
        empty list when that key is absent or null.

    Raises:
        ValueError: If ``limit`` is greater than MAX_RECORDS.
        requests.RequestException: On connection errors, timeouts, or a
            non-2xx HTTP status (via ``raise_for_status``).
    """
    if limit > MAX_RECORDS:
        raise ValueError(f"Can't fetch more than {MAX_RECORDS}; you asked for {limit}")
    url = f"{BASE_URL}/api/rates/{category}/{ticker}/last/{limit}.json"
    resp = requests.get(url, timeout=timeout)
    resp.raise_for_status()
    # `or []` also covers an explicit JSON null, so callers always get a list.
    return resp.json().get('refRates') or []


# ─── POPULATOR ─────────────────────────────────────────────────────────────

def _sql_text(value) -> str:
    """Render *value* as a single-quoted SQL string literal (NULL for None)."""
    if value is None:
        return "NULL"
    # Double embedded single quotes so the literal cannot terminate early.
    return "'" + str(value).replace("'", "''") + "'"


def _sql_number(value) -> str:
    """Render *value* as a bare SQL numeric literal (NULL for None)."""
    return "NULL" if value is None else str(value)


def _row_to_sql_values(row) -> str:
    """Render one 10-field row tuple as a parenthesised SQL VALUES entry."""
    ticker, rate_type, rate_date, *numbers, revision = row
    fields = [_sql_text(ticker), _sql_text(rate_type), _sql_text(rate_date)]
    fields.extend(_sql_number(n) for n in numbers)
    fields.append(_sql_text(revision))
    return "(" + ",".join(fields) + ")"


def populate_reference_rates(category: str, limit: int = MAX_RECORDS, batch_size: int = 500):
    """Fetch every rate in one category and upsert it into ``reference_rates``.

    For each (ticker, display name) pair in REFERENCE_RATE_MAPPINGS[category]
    the latest ``limit`` records are fetched; a ticker whose fetch fails is
    reported and skipped. The collected rows are then written in chunks of
    ``batch_size`` with an ``INSERT ... ON CONFLICT ... DO UPDATE`` statement
    executed through ``ds.query``.

    Args:
        category: Key of REFERENCE_RATE_MAPPINGS to load.
        limit: Number of records to request per ticker.
        batch_size: Maximum number of rows per INSERT statement.

    Returns:
        List of row tuples in column order (rate_ticker, rate_type,
        rate_date, rate, volume_in_billions, percentile_1, percentile_25,
        percentile_75, percentile_99, revision_indicator).

    Raises:
        KeyError: If ``category`` is unknown, or a fetched entry lacks
            ``effectiveDate``, ``percentRate`` or ``volumeInBillions``.
    """
    rows = []
    for ticker, display_name in REFERENCE_RATE_MAPPINGS[category]:
        try:
            data = fetch_reference_rates(ticker, category, limit)
            print(f"{category.upper()} {ticker.upper()}: {len(data)} rows")
        except Exception as e:
            # Best effort: one failing ticker must not block the others.
            print(f"Skipping {category}/{ticker}: {e}")
            continue

        for entry in data:
            rows.append((
                ticker,
                display_name,
                entry['effectiveDate'],
                entry['percentRate'],
                entry['volumeInBillions'],
                entry.get('percentPercentile1'),
                entry.get('percentPercentile25'),
                entry.get('percentPercentile75'),
                entry.get('percentPercentile99'),
                entry.get('revisionIndicator') or ''
            ))

    # batch-insert in chunks
    # NOTE(review): values are inlined as SQL literals because the signature
    # of ds.query is not visible here; if it accepts bound parameters, prefer
    # those over string building.
    for i in range(0, len(rows), batch_size):
        batch = rows[i : i + batch_size]
        # Missing percentiles become NULL instead of the invalid token `None`.
        vals = ",\n".join(_row_to_sql_values(r) for r in batch)
        sql = f"""
        INSERT INTO reference_rates
          (rate_ticker, rate_type, rate_date, rate, volume_in_billions,
           percentile_1, percentile_25, percentile_75, percentile_99, revision_indicator)
        VALUES
          {vals}
        ON CONFLICT (rate_ticker, rate_type, rate_date) DO UPDATE SET
          rate               = EXCLUDED.rate,
          volume_in_billions = EXCLUDED.volume_in_billions,
          percentile_1       = EXCLUDED.percentile_1,
          percentile_25      = EXCLUDED.percentile_25,
          percentile_75      = EXCLUDED.percentile_75,
          percentile_99      = EXCLUDED.percentile_99,
          revision_indicator = EXCLUDED.revision_indicator;
        """
        ds.query(sql)

    print(f"✅ Loaded {category} rates ({len(rows)} rows).")
    return rows


def populate(days, batch_size=500):
    """Load secured and unsecured reference rates inside one MLflow run.

    Logs the run parameters, upserts both rate categories through
    populate_reference_rates, records row-count metrics, and attaches a CSV
    of all loaded rows as an MLflow artifact.

    Args:
        days: Requested number of days to backdate. It is logged as the
            ``days_requested`` parameter.
            NOTE(review): it does not limit the fetch, which always asks
            for MAX_RECORDS records per ticker — confirm whether ``days``
            was meant to drive ``limit``.
        batch_size: Maximum rows per INSERT statement, forwarded to
            populate_reference_rates.
    """
    with mlflow.start_run():
        mlflow.log_param("days_requested", days)
        # Do not abort the whole load just because the Domino variable is
        # missing (e.g. when run outside Domino).
        mlflow.log_param(
            "starting_domino_user",
            os.environ.get("DOMINO_STARTING_USERNAME", "unknown"),
        )
        mlflow.log_param("batch_size", batch_size)

        # Forward the caller's batch_size so the logged parameter matches
        # what is actually used.
        secured_rows = populate_reference_rates(
            'secured', limit=MAX_RECORDS, batch_size=batch_size
        )
        unsecured_rows = populate_reference_rates(
            'unsecured', limit=MAX_RECORDS, batch_size=batch_size
        )
        mlflow.log_metric("rows_loaded", len(secured_rows) + len(unsecured_rows))
        mlflow.log_metric("rows_loaded_secured_only", len(secured_rows))
        mlflow.log_metric("rows_loaded_unsecured_only", len(unsecured_rows))

        # Column order mirrors the row tuples built by populate_reference_rates.
        columns = [
            "rate_ticker",
            "rate_type",
            "rate_date",
            "rate",
            "volume_in_billions",
            "percentile_1",
            "percentile_25",
            "percentile_75",
            "percentile_99",
            "revision_indicator",
        ]
        df_all = pd.DataFrame(secured_rows + unsecured_rows, columns=columns)
        csv_path = get_artifact_path("reference_rates_loaded.csv")
        df_all.to_csv(csv_path, index=False)
        mlflow.log_artifact(csv_path, artifact_path="reference_rates")


# ─── MAIN ───────────────────────────────────────────────────────────────────
# arg1 is the number of days to backdate.
# 1 => yesterday's rates, 100 => last 100 days.
default_backdated_days = 10

# NOTE(review): the command-line argument is only read when this module is
# NOT executed as __main__; when it is __main__ the default is always used.
# Confirm this split is intentional.
if __name__ == '__main__':
    print('name was main')
    d = default_backdated_days
else:
    print('name was not main')
    try:
        days_to_backdate = sys.argv[1]
        d = int(days_to_backdate)
    except (IndexError, ValueError):
        # No argument supplied, or it is not an integer: use the default.
        d = default_backdated_days

populate(days=d)

2025/06/23 20:18:37 INFO mlflow.tracking.fluent: Experiment with name 'Populate Reference Rates test [sandbox]' does not exist. Creating a new experiment.


getting data source for sandbox
name was main
SECURED SOFR: 900 rows
SECURED BGCR: 900 rows
SECURED TGCR: 900 rows
✅ Loaded secured rates (2700 rows).
UNSECURED EFFR: 900 rows
UNSECURED OBFR: 900 rows
✅ Loaded unsecured rates (1800 rows).
🏃 View run gentle-cow-831 at: http://127.0.0.1:8768/#/experiments/1550/runs/cc15c7c6a4c74487a22f3b935eedf6ba
🧪 View experiment at: http://127.0.0.1:8768/#/experiments/1550
