In [12]:
import sys, os
from pathlib import Path
sys.path.append(str(Path.cwd().parent))

import pandas as pd
import numpy as np
from datetime import datetime
from dateutil.relativedelta import relativedelta
import matplotlib.pyplot as plt
from concurrent.futures import ThreadPoolExecutor, as_completed

import mlflow
from mlflow.tracking import MlflowClient
from mlflow.models.signature import infer_signature
import mlflow.sklearn

from data.data_source import get_data_source
from data.treasury_curve import get_yield_curve
from models.empirical_covariance import EmpiricalCovarianceModel
from config import env

# ─── CONFIG ─────────────────────────────────────────────────────────────────
# Value matched against rate_curves.curve_type and written to rate_cones.
CURVE_TYPE = "US Treasury Par"

# Tenor points (in years) selected from rate_curves for every as-of date.
TENORS = [0.25, 0.5, 1, 2, 3, 5, 7, 10, 20, 30]

# Number of Monte Carlo paths drawn per as-of date.
N_SIMS = 1000

# MLflow experiment that receives the parent run and the per-date runs.
MLFLOW_EXPERIMENT = "IR Cone Monte Carlo Optimized5"

# Shared data-source handle used for all SQL reads and inserts in this file.
ds = get_data_source()

def batch_insert_rate_cones(records, batch_size=200):
    """Insert rate-cone rows into ``rate_cones`` in multi-row batches.

    One ``INSERT ... ON CONFLICT DO NOTHING`` statement is sent through the
    module-level ``ds`` for every slice of ``batch_size`` records.

    Args:
        records: Sliceable sequence (e.g. a list) of mappings with the keys
            ``curve_type``, ``cone_type``, ``cone_date``, ``tenor_str``,
            ``rate`` and ``tenor_num``.
        batch_size: Maximum number of rows per INSERT statement; must be >= 1.

    Returns:
        The number of rows submitted. Because of ``ON CONFLICT DO NOTHING``
        the database may have stored fewer rows than this.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        # A zero step makes range() raise and a negative one silently inserts
        # nothing; reject both explicitly.
        raise ValueError("batch_size must be a positive integer")

    def sql_text(value):
        # Double embedded single quotes so the value cannot terminate the
        # SQL string literal early (e.g. a curve type containing "'").
        return "'" + str(value).replace("'", "''") + "'"

    total_inserted = 0
    for start in range(0, len(records), batch_size):
        batch = records[start:start + batch_size]
        values_clause = ",\n".join(
            f"({sql_text(r['curve_type'])}, {sql_text(r['cone_type'])}, "
            f"{sql_text(r['cone_date'])}, {sql_text(r['tenor_str'])}, "
            f"{r['rate']:.8f}, {r['tenor_num']:.8f})"
            for r in batch
        )
        insert_sql = f"""
        INSERT INTO rate_cones (curve_type, cone_type, cone_date, tenor_str, rate, tenor_num)
        VALUES {values_clause}
        ON CONFLICT DO NOTHING;
        """
        ds.query(insert_sql)
        total_inserted += len(batch)
    return total_inserted

# ─── UTILS ───────────────────────────────────────────────────────────────────
def plot_ir_cones_matplotlib(base_curve: pd.Series, ir_cone_df: pd.DataFrame, title: str = ""):
    """Render simulated curves plus the base curve and save the chart as PNG.

    Up to 100 randomly chosen simulations are drawn as faint gray lines, with
    the base curve on top in crimson.

    The figure is built with a standalone ``Figure``/``Axes`` instead of the
    ``pyplot`` state machine: this function is called from worker threads,
    and pyplot's global "current figure" is shared between threads.

    Args:
        base_curve: Series indexed by tenor holding the base rates; its
            ``name`` is used in the default title and in the file name.
        ir_cone_df: Long-format frame with the columns ``sim_id``,
            ``tenor_num`` and ``rate_simulated``.
        title: Optional chart title; a default is generated when empty.

    Returns:
        The path of the written PNG file.
    """
    # Local import: the file only imports pyplot at module level.
    from matplotlib.figure import Figure

    sim_ids = ir_cone_df["sim_id"].unique()
    sample_ids = np.random.choice(
        sim_ids,
        size=min(100, len(sim_ids)),
        replace=False
    )
    sample_df = ir_cone_df[ir_cone_df["sim_id"].isin(sample_ids)]

    fig = Figure(figsize=(10, 6))
    ax = fig.subplots()

    # One pass over the sampled rows; sort=False keeps first-appearance order.
    for _, sim_data in sample_df.groupby("sim_id", sort=False):
        ax.plot(sim_data["tenor_num"], sim_data["rate_simulated"], color="gray", alpha=0.1)

    ax.plot(base_curve.index, base_curve.values,
            color="crimson", linewidth=2.5, label="Base Curve")
    ax.set_xlabel("Tenor (years)")
    ax.set_ylabel("Yield (%)")
    # Report the number of simulations actually present in the frame rather
    # than the module default, which may differ from what the caller used.
    ax.set_title(title or f"Monte Carlo IR Cones ({len(sim_ids)} sims) on {base_curve.name}")
    ax.grid(True, linestyle="--", alpha=0.3)
    ax.legend()
    fig.tight_layout()

    filename = f"../../artifacts/results/tsy_cones_{base_curve.name}.png"
    # savefig does not create missing directories.
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    fig.savefig(filename, dpi=150)
    return filename

def generate_ir_cone(base_curve: pd.Series,
                     cov_model: EmpiricalCovarianceModel,
                     n_sims: int = N_SIMS) -> pd.DataFrame:
    """Simulate shocked curves around ``base_curve``.

    Draws ``n_sims`` zero-mean multivariate-normal shocks using
    ``cov_model.covariance_`` and adds them to the base curve.

    Args:
        base_curve: Series of base rates indexed by tenor. The covariance
            matrix is assumed to be ordered like this index (not checked
            here).
        cov_model: Fitted model exposing a square ``covariance_`` matrix with
            one row/column per tenor.
        n_sims: Number of simulated curves to draw.

    Returns:
        Long-format DataFrame with one row per (simulation, tenor) and the
        columns ``sim_id``, ``tenor_num`` and ``rate_simulated``; rows are
        ordered by simulation, then by tenor in ``base_curve`` order.
    """
    tenor_order = base_curve.index.to_numpy()
    n_tenors = len(tenor_order)

    rand_deltas = np.random.multivariate_normal(
        mean=np.zeros(n_tenors),
        cov=cov_model.covariance_,
        size=n_sims
    )
    # Broadcast the (1, n_tenors) base row over the (n_sims, n_tenors) shocks.
    sims = base_curve.values.reshape((1, -1)) + rand_deltas

    # Row-major flattening matches the repeat/tile layout below, so every
    # rate lines up with its (sim_id, tenor_num) pair.
    return pd.DataFrame({
        "sim_id": np.repeat(np.arange(n_sims, dtype=np.int64), n_tenors),
        "tenor_num": np.tile(tenor_order, n_sims),
        "rate_simulated": sims.ravel(),
    })

# ─── MAIN JOB ────────────────────────────────────────────────────────────────
def populate_ir_cones(days: int, years_back: int = 0, max_workers: int = 4):
    """Build, store and log interest-rate cones for a range of as-of dates.

    For every calendar day from ``today - (days, years_back)`` (never earlier
    than 2010-01-01) through today, a worker thread:

    1. loads the curve history up to that day and pivots it by tenor,
    2. fits an ``EmpiricalCovarianceModel`` on the day-over-day changes,
    3. simulates ``N_SIMS`` curves and saves a chart,
    4. inserts the percentile curves into ``rate_cones``,
    5. logs parameters, metrics, the model and the chart to a per-date
       MLflow run tagged with the parent run id.

    Failures are collected per date and printed at the end; they do not stop
    the other dates from being processed.

    Args:
        days: Number of days to look back from today.
        years_back: Additional whole years to look back.
        max_workers: Size of the thread pool that processes the dates.
    """
    end_date = datetime.today().date()
    start_date = max(
        end_date - relativedelta(days=days, years=years_back),
        datetime(2010, 1, 1).date()
    )
    all_dates = pd.date_range(start=start_date, end=end_date, freq='D').date

    percentiles = [1, 5, 10, 50, 90, 95, 99]

    mlflow.set_experiment(MLFLOW_EXPERIMENT)

    # Parent run
    with mlflow.start_run(run_name=f"populate_ir_cones_{end_date}") as parent:
        parent_id = parent.info.run_id
        mlflow.log_params({
            "as_of_date": start_date,
            "days_requested": days,
            "curve_type": CURVE_TYPE,
            "n_sims": N_SIMS,
        })

        # accumulate for parent-level summary
        total_vars, trace_covs, total_obs = [], [], []
        errors = []

        def task(asof_date):
            """Process one as-of date; return None on success or (date, message)."""
            try:
                # 1) Fetch & pivot
                sql = f"""
                SELECT curve_date, tenor_num, rate FROM rate_curves
                WHERE curve_type = '{CURVE_TYPE}'
                  AND curve_date <= '{asof_date}'
                  AND tenor_num IN ({', '.join(map(str, TENORS))})
                ORDER BY curve_date DESC, tenor_num;
                """
                df = ds.query(sql).to_pandas()
                if df.empty:
                    return (asof_date, "No curve data")

                pivot = (
                    df.pivot(index="curve_date", columns="tenor_num", values="rate")
                      .sort_index()
                      .interpolate(method="linear", axis=0)
                      .dropna()
                )

                # Use the requested date when a curve exists for it, otherwise
                # fall back to the latest earlier curve.
                if asof_date in pivot.index:
                    asof_actual = asof_date
                else:
                    earlier = pivot.index[pivot.index <= asof_date]
                    if len(earlier) == 0:
                        # dropna() can remove every row; report it clearly
                        # instead of failing on a NaN label lookup.
                        return (asof_date, "No complete curve on or before as-of date")
                    asof_actual = earlier.max()
                base_curve = pivot.loc[asof_actual]
                deltas = pivot.diff().dropna()
                if deltas.empty:
                    return (asof_date, "Not enough curve history to estimate a covariance")

                # 2) Fit & simulate
                model = EmpiricalCovarianceModel().fit(deltas.values)
                cone_df = generate_ir_cone(base_curve, model, n_sims=N_SIMS)
                chart_fp = plot_ir_cones_matplotlib(base_curve, cone_df)

                percentile_curves = (
                    cone_df
                    .groupby("tenor_num")["rate_simulated"]
                    .quantile([p / 100 for p in percentiles])
                    .unstack(level=1)
                    .reset_index()
                    .melt(id_vars="tenor_num", var_name="percentile", value_name="rate")
                )

                # Convert percentile column back to float
                percentile_curves["percentile"] = percentile_curves["percentile"].astype(float)

                # Add required fields
                percentile_curves["curve_type"] = CURVE_TYPE
                percentile_curves["cone_date"] = asof_date
                percentile_curves["tenor_str"] = percentile_curves["tenor_num"].apply(
                    lambda x: f"{int(x)}Y" if x.is_integer() else f"{x}Y"
                )
                # Round before converting: p * 100 can land just below the
                # integer (e.g. 0.29 * 100 == 28.999...), which int() truncates.
                percentile_curves["cone_type"] = percentile_curves["percentile"].apply(
                    lambda p: f"{int(round(p * 100))}%"
                )

                records = percentile_curves[[
                    "curve_type", "cone_type", "cone_date", "tenor_str", "rate", "tenor_num"
                ]].to_dict(orient="records")

                n_inserted = batch_insert_rate_cones(records, batch_size=200)
                print(f"✅ Inserted {n_inserted} rows into rate_cones for ASOF={asof_date}.")

                # summary stats
                n_obs = len(deltas)
                total_var = float(np.var(deltas.values))
                trace_cov = float(np.trace(model.covariance_))

                # collect for parent summary
                total_obs.append(n_obs)
                total_vars.append(total_var)
                trace_covs.append(trace_cov)

                # 3) Log & register in a nested run
                #    Give MLflow an input_example so it auto-infers a signature
                input_example = deltas.values[:1]  # one row of deltas
                with mlflow.start_run(
                    run_name=f"IR_{asof_date}",
                    nested=True,
                    tags={"mlflow.parentRunId": parent_id}
                ):
                    mlflow.log_param("as_of_date", str(asof_actual))
                    mlflow.log_param("days_requested", days)
                    mlflow.log_param("curve_type", CURVE_TYPE)
                    mlflow.log_param("n_sims", N_SIMS)
                    mlflow.log_metric("n_obs", n_obs)
                    mlflow.log_metric("total_var", total_var)
                    mlflow.log_metric("trace_cov", trace_cov)
                    mlflow.log_metric("dates_processed", 1)

                    # Log+register
                    mlflow.sklearn.log_model(
                        sk_model=model,
                        artifact_path="model",
                        registered_model_name="EmpiricalCovarianceModel",
                        input_example=input_example
                    )

                    # Chart
                    mlflow.log_artifact(chart_fp, artifact_path="charts")

                return None

            except Exception as e:
                # Best-effort per date: report the failure and keep going.
                # Include the exception type because str(e) alone can be
                # uninformative (e.g. a KeyError shows only the key).
                return (asof_date, f"{type(e).__name__}: {e}")

        # dispatch
        with ThreadPoolExecutor(max_workers=max_workers) as exe:
            futures = [exe.submit(task, d) for d in all_dates]
            for fut in as_completed(futures):
                if (err := fut.result()):
                    errors.append(err)

        # Parent summary metrics
        mlflow.log_metric("dates_processed", len(all_dates) - len(errors))
        mlflow.log_metric("n_errors", len(errors))
        mlflow.log_metric("n_obs", sum(total_obs))
        if total_vars:
            # np.mean([]) is NaN (with a RuntimeWarning); skip the averages
            # when no date completed its statistics.
            mlflow.log_metric("total_var", float(np.mean(total_vars)))
            mlflow.log_metric("trace_cov", float(np.mean(trace_covs)))

        # report
        if errors:
            print(f"⚠️  {len(errors)} errors:")
            for d, msg in errors:
                print(f"  • {d}: {msg}")
        else:
            print("✅ All cones processed and logged.")

if __name__ == "__main__":
    # Script entry point: rebuild the cones for the most recent five days.
    days = 5
    populate_ir_cones(years_back=0, days=days)


getting data source for sandbox
✅ Inserted 70 rows into rate_cones for ASOF=2025-06-02.
✅ Inserted 70 rows into rate_cones for ASOF=2025-06-04.
✅ Inserted 70 rows into rate_cones for ASOF=2025-06-03.
✅ Inserted 70 rows into rate_cones for ASOF=2025-06-01.


Registered model 'EmpiricalCovarianceModel' already exists. Creating a new version of this model...
2025/06/06 23:14:07 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: EmpiricalCovarianceModel, version 19
Created version '19' of model 'EmpiricalCovarianceModel'.


🏃 View run IR_2025-06-02 at: http://127.0.0.1:8768/#/experiments/1510/runs/56b81f4a0eb6482c89701527df12cc4f
🧪 View experiment at: http://127.0.0.1:8768/#/experiments/1510


Registered model 'EmpiricalCovarianceModel' already exists. Creating a new version of this model...
Registered model 'EmpiricalCovarianceModel' already exists. Creating a new version of this model...
Registered model 'EmpiricalCovarianceModel' already exists. Creating a new version of this model...
2025/06/06 23:14:08 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: EmpiricalCovarianceModel, version 20
Created version '20' of model 'EmpiricalCovarianceModel'.
2025/06/06 23:14:08 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: EmpiricalCovarianceModel, version 21
Created version '21' of model 'EmpiricalCovarianceModel'.
2025/06/06 23:14:08 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: EmpiricalCovarianceModel, version 22
Created version '22' of model 'EmpiricalCovarianceModel'.

🏃 View run IR_2025-06-01 at: http://127.0.0.1:8768/#/experiments/1510/runs/d68b1c5ce53a439d95824bdce165bb30
🧪 View experiment at: http://127.0.0.1:8768/#/experiments/1510
🏃 View run IR_2025-06-04 at: http://127.0.0.1:8768/#/experiments/1510/runs/40b2556b67ab4042888a89e95f8326f5
🧪 View experiment at: http://127.0.0.1:8768/#/experiments/1510
🏃 View run IR_2025-06-03 at: http://127.0.0.1:8768/#/experiments/1510/runs/a81ca10ff663406bbe08e76b14089d02
🧪 View experiment at: http://127.0.0.1:8768/#/experiments/1510
✅ Inserted 70 rows into rate_cones for ASOF=2025-06-05.
✅ Inserted 70 rows into rate_cones for ASOF=2025-06-06.


Registered model 'EmpiricalCovarianceModel' already exists. Creating a new version of this model...
2025/06/06 23:14:16 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: EmpiricalCovarianceModel, version 23
Created version '23' of model 'EmpiricalCovarianceModel'.


🏃 View run IR_2025-06-05 at: http://127.0.0.1:8768/#/experiments/1510/runs/6eb43e288cd7405fb8c03408dcf2944d
🧪 View experiment at: http://127.0.0.1:8768/#/experiments/1510


Registered model 'EmpiricalCovarianceModel' already exists. Creating a new version of this model...
2025/06/06 23:14:17 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: EmpiricalCovarianceModel, version 24
Created version '24' of model 'EmpiricalCovarianceModel'.


🏃 View run IR_2025-06-06 at: http://127.0.0.1:8768/#/experiments/1510/runs/7c6f7b6bae274086a82e25f6043a3052
🧪 View experiment at: http://127.0.0.1:8768/#/experiments/1510
✅ All cones processed and logged.
🏃 View run populate_ir_cones_2025-06-06 at: http://127.0.0.1:8768/#/experiments/1510/runs/90a8cd6d48bf40e2ad12a2b1e6486809
🧪 View experiment at: http://127.0.0.1:8768/#/experiments/1510
