In [16]:
## scripts/run_valuation.py
#!/usr/bin/env python3
import sys, ast
from pathlib import Path
sys.path.append(str(Path.cwd().parent))

import os
import time
import pandas as pd
import json
import numpy as np
import mlflow
from datetime import date, timedelta, datetime
from dateutil.relativedelta import relativedelta
from concurrent.futures import ThreadPoolExecutor, as_completed

from data.data_source import get_data_source
from data.treasury_curve import get_yield_curve, bump_curve, shocks
from models.pricing_models.bond_model import Bond
from config import env

# MLflow experiment under which runs started by this script are recorded.
# NOTE(review): the name says "PCA Training" although this file is the
# valuation script -- confirm this is the intended experiment.
experiment_name = f"PCA Training [{env}]"
mlflow.set_experiment(experiment_name)

# Module-level data source shared by the functions below: it is used for the
# inventory and PCA reads, passed to get_yield_curve, and used for the upsert.
ds = get_data_source()

def parse_pg_array(val):
    """Convert a Postgres array value into a float ndarray.

    Parameters
    ----------
    val : str, bytes, or array-like
        Either the Postgres text form of a numeric array (for example
        ``'{0.8,0.15,0.05}'``, optionally as UTF-8 bytes) or something
        ``np.array`` can already convert (list, tuple, ndarray).

    Returns
    -------
    numpy.ndarray
        Float array. The text form ``'{}'`` yields an empty array, and
        ``NULL`` elements become ``nan`` (the same result ``np.array`` gives
        for ``None`` on the non-text path).

    Raises
    ------
    ValueError
        If an element of the text form is not a valid float literal.
    """
    if isinstance(val, bytes):
        val = val.decode('utf-8')

    if isinstance(val, str):
        # Drop surrounding whitespace first so the braces are really the
        # outermost characters, then drop the braces themselves.
        body = val.strip().strip('{}').strip()
        # ''.split(',') returns [''] and float('') raises, so an empty
        # Postgres array has to be handled before splitting.
        if not body:
            return np.array([], dtype=float)
        return np.array(
            [
                float('nan') if tok.strip().upper() == 'NULL' else float(tok)
                for tok in body.split(',')
            ],
            dtype=float,
        )

    return np.array(val, dtype=float)

# Key-rate-duration columns, in the column order of the KRD matrix returned by
# Bond.price_batch_with_sensitivities (column i of the matrix -> _KRD_COLUMNS[i]).
_KRD_COLUMNS = ('krd1y', 'krd2y', 'krd3y', 'krd5y', 'krd7y', 'krd10y', 'krd20y', 'krd30y')

# Shock magnitudes (basis points) for which scenario prices are stored.
_SCENARIO_SIZES_BP = (25, 100, 200)

# Tenor grid (years) the PCA loadings are checked against.
_PCA_TENORS = (0.25, 0.5, 1.0, 2.0, 3.0, 5.0, 7.0, 10.0, 20.0, 30.0)

# Parallel-shock price columns that are persisted: u25, d25, u100, d100, u200, d200.
_PARALLEL_PRICE_COLUMNS = tuple(
    f'price_closedform_{side}{size}bps'
    for size in _SCENARIO_SIZES_BP
    for side in ('u', 'd')
)

# (result column, 0-based component index, signed shift in bp) for every
# PCA scenario: for each size, up/down along PC1, PC2 and PC3.
_PCA_SCENARIOS = tuple(
    (f'price_closedform_pca{pc}_{side}{size}bps', pc - 1, sign * size)
    for size in _SCENARIO_SIZES_BP
    for pc in (1, 2, 3)
    for side, sign in (('u', 1), ('d', -1))
)

# (tsy_valuations column, results-frame column) pairs. This single table
# drives the INSERT column list, the VALUES tuples and the ON CONFLICT
# assignments, so the three can no longer drift apart.
_PERSISTED_COLUMNS = (
    ('cusip', 'cusip'),
    ('valuation_date', 'valuation_date'),
    ('entry_price', 'price_per100'),
    ('coupon', 'coupon'),
    ('maturity_date', 'maturity_date'),
    ('time_to_maturity', 'time_to_maturity'),
    ('dv01', 'dv01'),
    *((c, c) for c in _KRD_COLUMNS),
    ('price_closedform', 'price_closedform'),
    *((c, c) for c in _PARALLEL_PRICE_COLUMNS),
    *((c, c) for c, _, _ in _PCA_SCENARIOS),
    ('pca1_dv01', 'pca1_dv01'),
    ('pca2_dv01', 'pca2_dv01'),
    ('pca3_dv01', 'pca3_dv01'),
    ('quantity', 'quantity'),
    ('clean_price_closedform', 'clean_price_closedform'),
    ('accrued_interest_closedform', 'accrued_interest_closedform'),
)

# Columns written as quoted SQL literals; everything else is written bare.
_QUOTED_COLUMNS = frozenset({'cusip', 'valuation_date', 'maturity_date'})

# Conflict target of the upsert; these columns are not re-assigned on update.
_CONFLICT_KEY = ('cusip', 'valuation_date')


def _sql_literal(value, quoted):
    """Render one cell as a SQL literal; missing values become NULL."""
    # None / NaN / NaT would otherwise be interpolated as 'None', 'nan' or
    # 'NaT', none of which is a valid literal for the target column.
    if pd.isna(value):
        return 'NULL'
    if quoted:
        # Double embedded single quotes so the literal cannot terminate early.
        return "'" + str(value).replace("'", "''") + "'"
    return f"{value}"


def _make_pca_bumped_curve(base_rates, tenors, loading, shift_bp):
    """Return a curve callable: base rates shifted along one PCA loading.

    The shift added at each tenor is ``loading * shift_bp / 100``.
    NOTE(review): dividing bp by 100 presumably converts to percent, i.e. the
    base curve is quoted in percent -- confirm against get_yield_curve.
    """
    bumped = base_rates + loading * (shift_bp / 100.0)

    def curve(ttm_arr):
        # Linear interpolation on the tenor grid, flat beyond both ends.
        flat = np.interp(ttm_arr.ravel(), tenors, bumped, left=bumped[0], right=bumped[-1])
        return flat.reshape(ttm_arr.shape)

    return curve


def run_valuation(asof_str):
    """Value the inventory for one date and upsert the rows into tsy_valuations.

    Steps: load the inventory for the date, load the base yield curve, price
    every bond (dirty/clean price, accrued interest, DV01, key-rate
    durations), reprice under PCA-shaped and parallel curve shocks, drop
    bonds with essentially no time left to maturity, and upsert the result.

    Parameters
    ----------
    asof_str : str
        Valuation date in any form ``pd.to_datetime`` accepts.

    Returns
    -------
    None
        Also returns early (after printing a message) when there is no
        inventory, when ``get_yield_curve`` returns ``None``, or when no bond
        survives the maturity filter.

    Raises
    ------
    ValueError
        If the date parses to a missing value.
    RuntimeError
        If no PCA row exists for the date, if it holds fewer than three
        components, or if the loadings do not match the tenor grid.
    """
    # 0) Parse / validate date
    asof = pd.to_datetime(asof_str)
    if pd.isna(asof):
        raise ValueError(f"Could not parse date '{asof_str}'")
    asof_date = asof.date()

    # 1) Pull inventory (the date literal comes from a parsed Timestamp, not
    #    from the raw input string).
    inv_sql = f"""
    SELECT DISTINCT ON(cusip)
        cusip,
        int_rate,
        issue_date,
        maturity_date,
        price_per100,
        quantity,
        int_payment_frequency
    FROM tsy_inventory
    WHERE inventory_date = '{asof_date}'
    ORDER BY cusip, inventory_date DESC;
    """
    inv = ds.query(inv_sql).to_pandas()
    if inv.empty:
        print(f"No inventory on {asof_date}")
        return

    # 2) Load base yield curve
    base_yc = get_yield_curve(asof, ds)
    if base_yc is None:
        print(f"No yield curve for {asof_date}")
        return

    # 3) Build Bond objects
    bonds = [
        Bond(r.cusip, r.issue_date, r.maturity_date, r.int_rate, r.int_payment_frequency, r.quantity)
        for r in inv.itertuples()
    ]

    # 4) Price base curve and sensitivities
    pvs_dirty, accrued_arr, pvs_clean, dv01s, krds_mat = Bond.price_batch_with_sensitivities(bonds, asof, base_yc)

    # 5) Prepare results DataFrame (row i corresponds to bonds[i])
    results = inv.copy().reset_index(drop=True)
    results['price_closedform'] = pvs_dirty
    results['clean_price_closedform'] = pvs_clean
    results['accrued_interest_closedform'] = accrued_arr
    results['dv01'] = dv01s
    for i, col in enumerate(_KRD_COLUMNS):
        results[col] = krds_mat[:, i]

    # 6) Fetch PCA components
    pca_sql = f"""
    SELECT components, explained_variance_ratios
    FROM pca_results
    WHERE curve_type = 'US Treasury Par'
      AND curve_date = '{asof_date}'
      AND n_components >= 3
    LIMIT 1;
    """
    pca_df = ds.query(pca_sql).to_pandas()
    if pca_df.empty:
        raise RuntimeError(f"No PCA results for {asof_date}")

    comps = np.array(json.loads(pca_df.loc[0, 'components']), dtype=float)
    explained_var = parse_pg_array(pca_df.loc[0, 'explained_variance_ratios'])
    # Fail with a clear message instead of an IndexError / unpacking error.
    if comps.ndim != 2 or comps.shape[0] < 3 or explained_var.shape[0] < 3:
        raise RuntimeError(f"PCA results for {asof_date} hold fewer than 3 components.")

    # 7) Tenor grid
    tenors = np.array(_PCA_TENORS)
    if comps.shape[1] != tenors.shape[0]:
        raise RuntimeError("Mismatch PCA length vs tenor grid.")

    # 8) + 9) PCA-shocked dirty prices. The base rates on the tenor grid are
    #    the same for every scenario, so they are evaluated once.
    base_rates = base_yc(tenors)
    for col_label, pc_idx, bp in _PCA_SCENARIOS:
        yc_bumped = _make_pca_bumped_curve(base_rates, tenors, comps[pc_idx], bp)
        pvs_bump, _, _, _, _ = Bond.price_batch_with_sensitivities(bonds, asof, yc_bumped)
        results[col_label] = pvs_bump

    # 10) PCA DV01s: the total DV01 scaled by each component's explained
    #     variance ratio.
    # NOTE(review): an earlier revision also projected the key-rate durations
    # onto the loadings but never stored the result; that unused computation
    # was removed. Confirm whether the projection was the intended definition.
    for k, weight in enumerate(explained_var[:3], start=1):
        results[f'pca{k}_dv01'] = weight * results['dv01']

    # 11) Parallel shocks (label -> bp mapping imported from treasury_curve)
    for lab, bp in shocks.items():
        yc_b = bump_curve(base_yc, bp)
        pvs_b, _, _, _, _ = Bond.price_batch_with_sensitivities(bonds, asof, yc_b)
        results[f'price_closedform_{lab}bps'] = pvs_b

    # 12) Housekeeping + drop near-maturity
    results['valuation_date'] = asof_date
    results['time_to_maturity'] = (pd.to_datetime(results['maturity_date']) - asof).dt.days / 365.25
    results['coupon'] = results['int_rate'].fillna(0.0)
    alive = results['time_to_maturity'] > 1e-4
    if not alive.all():
        dropped = results.loc[~alive, 'cusip'].tolist()
        print(f"⚠️ Dropping mature bonds: {dropped}")
    results = results[alive].reset_index(drop=True)
    if results.empty:
        # An INSERT with an empty VALUES list is not valid SQL.
        print(f"No live bonds to store on {asof_date}")
        return

    # 13) Batch upsert data
    frame_cols = [src for _, src in _PERSISTED_COLUMNS]
    quote_flags = [dst in _QUOTED_COLUMNS for dst, _ in _PERSISTED_COLUMNS]
    vals = ",".join(
        "(" + ",".join(_sql_literal(v, q) for v, q in zip(row, quote_flags)) + ")"
        for row in results[frame_cols].itertuples(index=False, name=None)
    )
    insert_cols = ",\n      ".join(dst for dst, _ in _PERSISTED_COLUMNS)
    conflict_cols = ",".join(_CONFLICT_KEY)
    update_set = ",\n      ".join(
        f"{dst}=EXCLUDED.{dst}"
        for dst, _ in _PERSISTED_COLUMNS
        if dst not in _CONFLICT_KEY
    )
    upsert_sql = f"""
    INSERT INTO tsy_valuations (
      {insert_cols}
    ) VALUES {vals}
    ON CONFLICT({conflict_cols}) DO UPDATE SET
      {update_set},
      updated_at=CURRENT_TIMESTAMP;
    """
    ds.query(upsert_sql)
    # Report the rows actually written (matured bonds were dropped above).
    print(f"✅ Valued {len(results)} bonds on {asof_date}.")



def populate(
    days: int,
    max_workers: int = 4,
    years_back: int = 0
):
    """
    Backfill bond valuations for every calendar day in a trailing window.

    The window ends today and starts `days` days plus `years_back` years
    earlier, but never before 2010-01-01. Each date is valued by
    `run_valuation` on a thread pool of `max_workers` workers, inside a
    single MLflow run that records the request parameters and the counts of
    processed and failed dates. A failing date does not stop the backfill;
    its error message is collected and printed at the end.
    """
    earliest_allowed = date(2010, 1, 1)
    end_date = date.today()
    # Clamp the window start to the earliest supported date.
    start_date = max(end_date - relativedelta(days=days, years=years_back), earliest_allowed)

    all_dates = pd.date_range(start=start_date, end=end_date, freq='D').date
    print(f"Populating {len(all_dates)} days from {start_date} to {end_date}...")

    def value_one(day):
        # Returns None on success, or (day, message) so one bad date
        # cannot abort the whole pool.
        try:
            run_valuation(str(day))
        except Exception as exc:
            return (day, str(exc))
        else:
            return None

    with mlflow.start_run():
        mlflow.log_param("days_requested", days)
        mlflow.log_param("start_date", str(start_date))
        mlflow.log_param("end_date", str(end_date))

        with ThreadPoolExecutor(max_workers=max_workers) as pool:
            pending = [pool.submit(value_one, day) for day in all_dates]
            outcomes = (done.result() for done in as_completed(pending))
            # Failures are kept in completion order.
            failures = [outcome for outcome in outcomes if outcome is not None]

        mlflow.log_metric("dates_processed", len(all_dates) - len(failures))
        mlflow.log_metric("errors", len(failures))

        if failures:
            print(f"⚠️  {len(failures)} dates failed:")
            for failed_day, message in failures:
                print(f"  • {failed_day}: {message}")

    print("✅ Backfill complete.")


# ─── MAIN ───────────────────────────────────────────────────────────────────
if __name__ == '__main__':
    # Optional first CLI argument: number of trailing days to backfill.
    # Anything missing or non-numeric falls back to 10 days.
    # str.isdecimal() is used instead of str.isdigit() because isdigit() also
    # accepts characters such as '²' that int() cannot convert.
    cli_args = sys.argv[1:]
    d = int(cli_args[0]) if cli_args and cli_args[0].isdecimal() else 10
    populate(days=d, years_back=0)


getting data source for sandbox
Populating 11 days from 2025-05-26 to 2025-06-05...
✅ Valued 285 bonds on 2025-05-27.
✅ Valued 284 bonds on 2025-05-29.
✅ Valued 281 bonds on 2025-05-30.
✅ Valued 284 bonds on 2025-05-28.
✅ Valued 281 bonds on 2025-06-02.
✅ Valued 282 bonds on 2025-06-03.
⚠️  5 dates failed:
  • 2025-05-26: No yield curve data found for 2025-05-26
  • 2025-05-31: No yield curve data found for 2025-05-31
  • 2025-06-01: No yield curve data found for 2025-06-01
  • 2025-06-05: No yield curve data found for 2025-06-05
  • 2025-06-04: No PCA results for 2025-06-04
🏃 View run nervous-fawn-94 at: http://127.0.0.1:8768/#/experiments/1462/runs/aa039723a48e432cac2446c080e24503
🧪 View experiment at: http://127.0.0.1:8768/#/experiments/1462
✅ Backfill complete.
