In [21]:
import sys, os
from pathlib import Path
sys.path.append(str(Path.cwd().parent))

import pandas as pd
import numpy as np
from datetime import datetime
from dateutil.relativedelta import relativedelta
import matplotlib.pyplot as plt
from concurrent.futures import ThreadPoolExecutor, as_completed

import mlflow
from mlflow.tracking import MlflowClient

from data.data_source import get_data_source
from data.treasury_curve import get_yield_curve
from models.empirical_covariance import EmpiricalCovarianceModel
from config import env

# ─── CONFIG ─────────────────────────────────────────────────────────────────
CURVE_TYPE = "US Treasury Par"
TENORS = [0.25, 0.5, 1, 2, 3, 5, 7, 10, 20, 30]
N_SIMS = 1000
MLFLOW_EXPERIMENT = "IR Cone Monte Carlo Optimized3"
ds = get_data_source()

# ─── UTILS ───────────────────────────────────────────────────────────────────
def plot_ir_cones_matplotlib(base_curve: pd.Series, ir_cone_df: pd.DataFrame, title: str = ""):
    plt.figure(figsize=(10, 6))
    sample_ids = np.random.choice(ir_cone_df["sim_id"].unique(), size=min(100, len(ir_cone_df["sim_id"].unique())), replace=False)
    sample_df = ir_cone_df[ir_cone_df["sim_id"].isin(sample_ids)]
    for sim_id in sample_df["sim_id"].unique():
        sim_data = sample_df[sample_df["sim_id"] == sim_id]
        plt.plot(sim_data["tenor_num"], sim_data["rate_simulated"], color="gray", alpha=0.1)
    plt.plot(base_curve.index, base_curve.values, color="crimson", linewidth=2.5, label="Base Curve")
    plt.xlabel("Tenor (years)")
    plt.ylabel("Yield (%)")
    plt.title(title or f"Monte Carlo IR Cones ({N_SIMS} sims) on {base_curve.name}")
    plt.grid(True, linestyle="--", alpha=0.3)
    plt.legend()
    plt.tight_layout()
    filename = f"../../artifacts/results/tsy_cones_{base_curve.name}.png"
    plt.savefig(filename, dpi=150)
    plt.close()
    return filename

def generate_ir_cone(base_curve: pd.Series, cov_model: EmpiricalCovarianceModel, n_sims: int = 1000) -> pd.DataFrame:
    tenor_order = list(base_curve.index)
    cov_mat = cov_model.covariance_
    rand_deltas = np.random.multivariate_normal(mean=np.zeros(len(tenor_order)), cov=cov_mat, size=n_sims)
    base_vals = base_curve.values.reshape((1, -1))
    sims = base_vals + rand_deltas
    records = [
        {"sim_id": sim, "tenor_num": tenor_order[i], "rate_simulated": sims[sim, i]}
        for sim in range(n_sims) for i in range(len(tenor_order))
    ]
    return pd.DataFrame.from_records(records)

# ─── MAIN JOB ────────────────────────────────────────────────────────────────
def populate_ir_cones(days: int, years_back: int = 0, max_workers: int = 4):
    end_date = datetime.today().date()
    start_date = max(end_date - relativedelta(days=days, years=years_back), datetime(2010, 1, 1).date())
    all_dates = pd.date_range(start=start_date, end=end_date, freq='D').date

    mlflow.set_experiment(MLFLOW_EXPERIMENT)
    client = MlflowClient()

    with mlflow.start_run(run_name=f"populate_ir_cones_{end_date}") as parent:
        parent_id = parent.info.run_id
        mlflow.log_params({
            "as_of_date": start_date,
            "days_requested": days,
            "curve_type": CURVE_TYPE,
            "n_sims": N_SIMS,
        })

        total_vars, trace_covs, total_obs = [], [], []
        def task(asof_date):
            try:
                sql = f"""
                SELECT curve_date, tenor_num, rate FROM rate_curves
                WHERE curve_type = '{CURVE_TYPE}'
                  AND curve_date <= '{asof_date}'
                  AND tenor_num IN ({', '.join(str(x) for x in TENORS)})
                ORDER BY curve_date DESC, tenor_num;
                """
                df = ds.query(sql).to_pandas()
                if df.empty:
                    return (asof_date, "No curve data")
                pivot = df.pivot(index="curve_date", columns="tenor_num", values="rate").sort_index().interpolate(method="linear", axis=0).dropna()
                asof_actual = asof_date if asof_date in pivot.index else pivot.index[pivot.index <= asof_date].max()
                base_curve = pivot.loc[asof_actual]
                deltas = pivot.diff().dropna()
                model = EmpiricalCovarianceModel().fit(deltas.values)
                cone_df = generate_ir_cone(base_curve=base_curve, cov_model=model, n_sims=N_SIMS)

                filename = plot_ir_cones_matplotlib(base_curve, cone_df)
                run = client.create_run(
                    experiment_id=client.get_experiment_by_name(MLFLOW_EXPERIMENT).experiment_id,
                    tags={"mlflow.parentRunId": parent_id, "mlflow.runName": f"IR_{asof_date}"}
                )
                mlflow.start_run(run_id=run.info.run_id)
                mlflow.pyfunc.log_model(
                    artifact_path="emp_cov_model",
                    python_model=model,
                    registered_model_name="EmpiricalCovarianceModel"
                )
            
                # 4) Close it out
                mlflow.end_run()
                 
                total_var = float(np.var(deltas.values))
                trace_cov = float(np.trace(model.covariance_))
                total_vars.append(total_var)
                trace_covs.append(trace_cov)
                total_obs.append(len(deltas))

                run_id = run.info.run_id
                client.log_param(run_id, "as_of_date", str(asof_actual))
                client.log_param(run_id, "days_requested", days)
                client.log_param(run_id, "n_sims", N_SIMS)
                client.log_param(run_id, "curve_type", CURVE_TYPE)
                client.log_metric(run_id, "n_obs", len(deltas))
                client.log_metric(run_id, "total_var", total_var)
                client.log_metric(run_id, "trace_cov", trace_cov)
                client.log_metric(run_id, "dates_processed", 1)
                client.log_artifact(run_id, filename, artifact_path="charts")
                client.set_terminated(run_id, status="FINISHED")

                return None
            except Exception as e:
                return (asof_date, str(e))

        errors = []
        with ThreadPoolExecutor(max_workers=max_workers) as exe:
            futures = [exe.submit(task, d) for d in all_dates]
            for fut in as_completed(futures):
                res = fut.result()
                if res:
                    errors.append(res)

        mlflow.log_metric("dates_processed", len(all_dates) - len(errors))
        mlflow.log_metric("n_errors", len(errors))
        mlflow.log_metric("n_obs", sum(total_obs))
        mlflow.log_metric("total_var", float(np.mean(total_vars)))
        mlflow.log_metric("trace_cov", float(np.mean(trace_covs)))

        if errors:
            print(f"⚠️  {len(errors)} errors:")
            for d, msg in errors:
                print(f"  • {d}: {msg}")
        else:
            print("✅ All cones processed and logged.")

if __name__ == '__main__':
    days=5
    populate_ir_cones(days=days, years_back=0)


getting data source for sandbox


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


⚠️  6 errors:
  • 2025-06-03: `python_model` must be a PythonModel instance, callable object, or path to a script that uses set_model() to set a PythonModel instance or callable object.
  • 2025-06-01: `python_model` must be a PythonModel instance, callable object, or path to a script that uses set_model() to set a PythonModel instance or callable object.
  • 2025-06-02: `python_model` must be a PythonModel instance, callable object, or path to a script that uses set_model() to set a PythonModel instance or callable object.
  • 2025-06-04: `python_model` must be a PythonModel instance, callable object, or path to a script that uses set_model() to set a PythonModel instance or callable object.
  • 2025-06-06: Run with UUID 445b0c9d2944462f9e75930868097b1a is already active. To start a new run, first end the current run with mlflow.end_run(). To start a nested run, call start_run with nested=True
  • 2025-06-05: Run with UUID 316f4cf9919546799e1bf0ce232bd6b0 is already active. To start a