In [8]:
import sys, os
from pathlib import Path
sys.path.append(str(Path.cwd().parent))

import pandas as pd
import numpy as np
import altair as alt
from datetime import datetime
from dateutil.relativedelta import relativedelta

import mlflow
import mlflow.sklearn
from mlflow.models.signature import infer_signature
from sklearn.base import BaseEstimator


from data.data_source import get_data_source
from data.treasury_curve import get_yield_curve
from models.empirical_covariance import EmpiricalCovarianceModel
from concurrent.futures import ThreadPoolExecutor, as_completed
import matplotlib.pyplot as plt
from config import env

# ─── Configuration ──────────────────────────────────────────────────────────
# Curve family to read from the rate_curves table.
CURVE_TYPE = "US Treasury Par"
# As-of date: yesterday relative to the moment this code runs.
# NOTE(review): `datetime` here is the class imported via `from datetime import datetime`,
# so the annotation `datetime.date` names the instance method, not the `date` type.
# It has no runtime effect but is misleading to type checkers.
ASOF: datetime.date = datetime.today().date() - relativedelta(days=1, years=0)

# Length of the historical window (in years) used to fit the covariance model.
HIST_YEARS = 1
START_DATE = ASOF - relativedelta(years=HIST_YEARS)

# Tenors (in years) that are queried and pivoted into columns.
TENORS = [0.25, 0.5, 1, 2, 3, 5, 7, 10, 20, 30]

# Default number of Monte Carlo scenarios per base curve.
N_SIMS = 1000

MLFLOW_EXPERIMENT = "IR Cone Monte Carlo"

# ─── Load historical curves ─────────────────────────────────────────────────
ds = get_data_source()

# The query text is assembled from the module constants above (no external input).
sql = f"""
SELECT
    curve_date,
    tenor_num,
    rate
FROM rate_curves
WHERE curve_type = '{CURVE_TYPE}'
  AND curve_date BETWEEN '{START_DATE}' AND '{ASOF}'
  AND tenor_num IN ({', '.join(str(x) for x in TENORS)})
ORDER BY curve_date, tenor_num
;"""
historic = ds.query(sql).to_pandas()

# Reshape to one row per curve_date and one column per tenor.
pivot = (
    historic
    .pivot(index="curve_date", columns="tenor_num", values="rate")
    .sort_index()
)

# Fill missing tenor values by linear interpolation down each tenor column
# (axis=0, i.e. between neighbouring dates), not across tenors within a date.
pivot = pivot.interpolate(method="linear", axis=0)

print(f">>> Loaded {pivot.shape[0]} historical daily curves (dates) with {pivot.shape[1]} tenors.")

# Day-over-day changes; rows that still contain a NaN are dropped
# (the first row is always NaN after diff()).
deltas = pivot.diff().dropna(axis=0, how="any")
print(f">>> Computed {deltas.shape[0]} daily change observations for {deltas.shape[1]} tenors.")

# ─── Fit and register the covariance model ──────────────────────────────────
wrapped = EmpiricalCovarianceModel().fit(deltas.values)
# Alias of the same fitted object under a second module-level name.
cov_model = wrapped

# A single observation row is used both as the MLflow input example and to
# infer the model signature from the model's own predict() output.
example_X = deltas.values[:1]
signature = infer_signature(
    example_X,
    wrapped.predict(example_X)
)

mlflow.set_experiment(MLFLOW_EXPERIMENT)
with mlflow.start_run(run_name=f"cov_fit_{ASOF}") as run:
    # Passing registered_model_name also registers the logged model under that name.
    mlflow.sklearn.log_model(
        sk_model=wrapped,
        artifact_path="ir_covariance_model",
        registered_model_name="ir_covariance_model",
        input_example=example_X,
        signature=signature
    )
    mlflow.log_param("cov_fit_window_years", HIST_YEARS)
    mlflow.log_param("n_observations", deltas.shape[0])
    mlflow.log_param("n_tenors", deltas.shape[1])

print(f">>> Wrapped EmpCovariance logged to MLflow run ID {run.info.run_id}.")

def plot_ir_cones_matplotlib(base_curve: pd.Series, ir_cone_df: pd.DataFrame, title: str = ""):
    """
    Plot a sample of simulated curves around a base curve, save it as PNG and log it to MLflow.

    Args:
        base_curve: pd.Series indexed by tenor; its ``name`` is used in the default
                    title and in the output file name.
        ir_cone_df: DataFrame with columns [sim_id, tenor_num, rate_simulated].
        title:      optional chart title; when empty a default title is built from the
                    number of simulations found in ``ir_cone_df`` and ``base_curve.name``.

    Side effects:
        Writes ``../../artifacts/results/tsy_cones_<base_curve.name>.png`` (the directory
        must already exist), logs that file with ``mlflow.log_artifact`` under ``charts``
        and prints the saved path.
    """
    sim_ids = ir_cone_df["sim_id"].unique()
    n_total = len(sim_ids)

    # Draw at most 100 simulations; capping at n_total avoids the ValueError that
    # sampling without replacement raises when fewer than 100 simulations exist.
    sample_ids = np.random.choice(sim_ids, size=min(100, n_total), replace=False)
    sample_df = ir_cone_df[ir_cone_df["sim_id"].isin(sample_ids)]

    # Work on an explicit Figure/Axes pair instead of pyplot's implicit "current figure":
    # this function is called from worker threads, where concurrent plt.plot()/plt.savefig()
    # calls could otherwise draw on or save another thread's figure.
    fig, ax = plt.subplots(figsize=(10, 6))
    try:
        # One faint gray line per sampled simulation.
        for _, sim_data in sample_df.groupby("sim_id", sort=False):
            ax.plot(sim_data["tenor_num"], sim_data["rate_simulated"], color="gray", alpha=0.1)

        # Base curve drawn on top.
        ax.plot(base_curve.index, base_curve.values, color="crimson", linewidth=2.5, label="Base Curve")

        ax.set_xlabel("Tenor (years)")
        ax.set_ylabel("Yield (%)")
        # Report the number of simulations actually supplied rather than a module constant.
        ax.set_title(title or f"Monte Carlo IR Cones ({n_total} sims) on {base_curve.name}")
        ax.grid(True, linestyle="--", alpha=0.3)
        ax.legend()
        fig.tight_layout()

        filename = f"../../artifacts/results/tsy_cones_{base_curve.name}.png"
        fig.savefig(filename, dpi=150)
    finally:
        # Always release the figure, even if drawing or saving failed.
        plt.close(fig)

    # NOTE(review): this relies on an MLflow run being active in the calling thread — confirm
    # that artifacts logged from worker threads land in the intended run.
    mlflow.log_artifact(filename, artifact_path="charts")
    print(f"📈 Saved matplotlib chart: {filename}")


def generate_ir_cone(
    base_curve: pd.Series,
    cov_model: "EmpiricalCovarianceModel",
    n_sims: int = 1000
) -> pd.DataFrame:
    """
    Generate Monte Carlo IR cone simulations for a single base curve.

    Each scenario is the base curve plus one zero-mean multivariate normal draw
    whose covariance is ``cov_model.covariance_``.

    Args:
        base_curve: pd.Series indexed by tenor_num, values = yield rates
        cov_model:  fitted model exposing ``covariance_``; the matrix must be
                    square with one row/column per entry of ``base_curve``
        n_sims:     number of scenarios to draw

    Returns:
        DataFrame in long format with columns [sim_id, tenor_num, rate_simulated],
        ordered by sim_id and then by the tenor order of ``base_curve``.
        - sim_id runs from 0 to n_sims-1
        - tenor_num is one of the tenor keys (e.g. 0.25, 0.5, 1, …)
        - rate_simulated = base_rate + Δr_sampled
        With ``n_sims == 0`` the frame is empty but still has the three columns.

    Raises:
        ValueError: propagated from NumPy when the covariance shape does not
                    match the number of tenors.
    """
    tenor_values = np.asarray(base_curve.index)
    n_tenors = len(tenor_values)

    # Shape (n_sims, n_tenors): one sampled shock vector per scenario.
    rand_deltas = np.random.multivariate_normal(
        mean=np.zeros(n_tenors),
        cov=cov_model.covariance_,
        size=n_sims
    )

    # Broadcast the base curve over every scenario row.
    sims = base_curve.values.reshape((1, -1)) + rand_deltas

    # Flatten row-major so each scenario's tenors stay contiguous: sim_id is repeated
    # n_tenors times while the tenor vector is tiled once per scenario.
    return pd.DataFrame({
        "sim_id": np.repeat(np.arange(n_sims, dtype=np.int64), n_tenors),
        "tenor_num": np.tile(tenor_values, n_sims),
        "rate_simulated": sims.ravel(),
    })

def batch_insert_rate_cones(records, batch_size=200):
    """
    Insert cone records into the rate_cones table in multi-row batches.

    Args:
        records:    sequence of dicts with keys curve_type, cone_type, cone_date,
                    tenor_str (rendered as quoted SQL strings) and rate, tenor_num
                    (rendered as numbers with 8 decimals).
        batch_size: maximum number of rows per INSERT statement; must be positive.

    Returns:
        Number of rows submitted. Because the statement uses ON CONFLICT DO NOTHING,
        this can exceed the number of rows actually stored.

    Raises:
        ValueError: if batch_size is not positive.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    def _quoted(value):
        # Render as a SQL string literal; embedded single quotes are doubled so a value
        # such as "O'Neil" cannot terminate the literal early.
        return "'" + str(value).replace("'", "''") + "'"

    total_inserted = 0
    for start in range(0, len(records), batch_size):
        batch = records[start:start + batch_size]
        values_clause = ",\n".join(
            f"({_quoted(r['curve_type'])}, {_quoted(r['cone_type'])}, {_quoted(r['cone_date'])}, "
            f"{_quoted(r['tenor_str'])}, {r['rate']:.8f}, {r['tenor_num']:.8f})"
            for r in batch
        )
        insert_sql = f"""
        INSERT INTO rate_cones (curve_type, cone_type, cone_date, tenor_str, rate, tenor_num)
        VALUES {values_clause}
        ON CONFLICT DO NOTHING;
        """
        ds.query(insert_sql)
        total_inserted += len(batch)
    return total_inserted

# # 7.1: Sample a subset of simulation IDs to plot (e.g., 100)
# sample_ids = np.random.choice(ir_cone_df["sim_id"].unique(), size=100, replace=False)
# sample_df = ir_cone_df[ir_cone_df["sim_id"].isin(sample_ids)].copy()

# # 7.2: Prepare base‐curve DataFrame for Altair
# base_df = pd.DataFrame({
#     "tenor_num": TENORS,
#     "rate": [base_curve.loc[t] for t in TENORS]
# })

# # 7.3: Build Altair chart: gray simulation lines with smooth interpolation,
# #      y‐axis autoscaled (zero=False ensures it doesn’t force 0 at baseline)
# sim_lines = (
#     alt.Chart(sample_df)
#     .mark_line(color="gray", opacity=0.1, strokeWidth=1, interpolate="monotone")
#     .encode(
#         x=alt.X("tenor_num:Q", title="Tenor (years)"),
#         y=alt.Y(
#             "rate_simulated:Q",
#             title="Yield (%)",
#             scale=alt.Scale(zero=False)
#         ),
#         detail="sim_id:N"
#     )
# )

# # 7.4: Build Altair chart: red base curve, also smooth‐interpolated
# base_line = (
#     alt.Chart(base_df)
#     .mark_line(color="crimson", strokeWidth=3, interpolate="monotone")
#     .encode(
#         x="tenor_num:Q",
#         y=alt.Y(
#             "rate:Q",
#             scale=alt.Scale(zero=False)
#         )
#     )
# )

# # 7.5: Layer them together
# chart = (
#     (sim_lines + base_line)
#     .properties(
#         width=700,
#         height=400,
#         title=f"Monte Carlo IR Cones ({N_SIMS} sims) on {base_curve.name}"
#     )
#     .configure_title(fontSize=16, fontWeight="bold")
#     .configure_axis(
#         labelFontSize=12,
#         titleFontSize=13
#     )
# )

# chart

def populate_ir_cones(
    days: int,
    years_back: int = 0,
    max_workers: int = 4,
    curve_type: str = CURVE_TYPE,
    tenors: list = TENORS,
    n_sims: int = N_SIMS
):
    """
    Populate the rate_cones table with Monte Carlo IR cone percentiles over a date range.

    For every calendar day between ``today - (days, years_back)`` (never earlier than
    2010-01-01) and today, a worker thread:
      1. loads all curves of ``curve_type`` dated on or before that day,
      2. takes the latest complete curve on or before the day as the base curve,
      3. fits an EmpiricalCovarianceModel on the daily changes of that history,
      4. simulates ``n_sims`` curves and reduces them to the 1/5/10/50/90/95/99th
         percentile per tenor,
      5. inserts the percentile curves into rate_cones, and
      6. saves a matplotlib PNG of the cone and logs it to MLflow.

    Everything runs inside one MLflow run that records the parameters and the number
    of processed and failed dates. Per-date failures are collected and printed, not raised.

    Args:
        days:        number of days to go back from today.
        years_back:  additional whole years to go back.
        max_workers: size of the thread pool.
        curve_type:  curve family to read and to store with each cone row.
        tenors:      tenor values (years) to query.
        n_sims:      number of simulated curves per date.
    """
    end_date = datetime.today().date()
    start_date = end_date - relativedelta(days=days, years=years_back)
    min_date = datetime(2010, 1, 1).date()
    if start_date < min_date:
        start_date = min_date

    all_dates = pd.date_range(start=start_date, end=end_date, freq='D').date
    print(f"Populating IR cones for {len(all_dates)} days from {start_date} to {end_date}...")

    os.makedirs("../../artifacts/results/", exist_ok=True)

    # Loop-invariant pieces shared by every worker.
    percentiles = [1, 5, 10, 50, 90, 95, 99]
    quantile_levels = [p / 100 for p in percentiles]
    tenor_list_sql = ', '.join(str(x) for x in tenors)

    def task(asof_date):
        """Process one date; return None on success or (asof_date, message) on failure."""
        try:
            sql = f"""
            SELECT
                curve_date,
                tenor_num,
                rate
            FROM rate_curves
            WHERE curve_type = '{curve_type}'
              AND curve_date <= '{asof_date}'
              AND tenor_num IN ({tenor_list_sql})
            ORDER BY curve_date DESC, tenor_num;
            """
            df = ds.query(sql).to_pandas()
            if df.empty:
                return (asof_date, "No curve data")

            pivot = (
                df.pivot(index="curve_date", columns="tenor_num", values="rate")
                .sort_index()
                .interpolate(method="linear", axis=0)
                .dropna()
            )
            if pivot.empty:
                # Without this guard the lookup below fails with an opaque "nan" KeyError.
                return (asof_date, "No complete curve on or before as-of date")

            # Days without a curve of their own reuse the most recent earlier curve.
            if asof_date not in pivot.index:
                asof_actual = pivot.index[pivot.index <= asof_date].max()
            else:
                asof_actual = asof_date

            base_curve = pivot.loc[asof_actual]
            deltas = pivot.diff().dropna()
            model = EmpiricalCovarianceModel().fit(deltas.values)
            cone_df = generate_ir_cone(base_curve=base_curve, cov_model=model, n_sims=n_sims)

            # Reduce the simulations to one curve per percentile, in long format.
            percentile_curves = (
                cone_df.groupby("tenor_num")["rate_simulated"]
                .quantile(quantile_levels)
                .unstack(level=1)
                .reset_index()
                .melt(id_vars="tenor_num", var_name="percentile", value_name="rate")
            )
            percentile_curves["percentile"] = percentile_curves["percentile"].astype(float)
            percentile_curves["curve_type"] = curve_type
            # Rows are keyed by the requested date even when the base curve is older.
            percentile_curves["cone_date"] = asof_date
            percentile_curves["tenor_str"] = percentile_curves["tenor_num"].apply(
                lambda x: f"{int(x)}Y" if float(x).is_integer() else f"{x}Y"
            )
            # round() instead of int(): p * 100 can land just below the intended integer
            # (e.g. 0.29 * 100 == 28.999999999999996), which int() would truncate.
            percentile_curves["cone_type"] = percentile_curves["percentile"].apply(
                lambda p: f"{round(p * 100)}%"
            )

            # Write to DB
            records = percentile_curves[[
                "curve_type", "cone_type", "cone_date", "tenor_str", "rate", "tenor_num"
            ]].to_dict(orient="records")
            batch_insert_rate_cones(records)

            # Save chart for the simulations generated for this date.
            plot_ir_cones_matplotlib(base_curve, cone_df)

            return None  # success
        except Exception as e:
            # Per-date boundary: one failing date must not abort the whole batch.
            return (asof_date, f"{type(e).__name__}: {e}")

    with mlflow.start_run(run_name=f"populate_ir_cones_{end_date}") as run:
        mlflow.log_param("days", days)
        mlflow.log_param("years_back", years_back)
        mlflow.log_param("curve_type", curve_type)
        mlflow.log_param("n_sims", n_sims)

        errors = []
        with ThreadPoolExecutor(max_workers=max_workers) as exe:
            futures = {exe.submit(task, d): d for d in all_dates}
            for fut in as_completed(futures):
                res = fut.result()
                if res:
                    errors.append(res)

        mlflow.log_metric("dates_processed", len(all_dates) - len(errors))
        mlflow.log_metric("errors", len(errors))

        if errors:
            print(f"⚠️  {len(errors)} errors:")
            for d, msg in errors:
                print(f"  • {d}: {msg}")
        else:
            print("✅ All cones processed and logged.")

# ─── MAIN ───────────────────────────────────────────────────────────────────
# The first command-line argument is the number of days to backdate.
# 1 => yesterday's curve, 100 => last 100 days.
default_backdated_days = 10

if __name__ == '__main__':
    # Running as a script: honour the first command-line argument. A missing or
    # non-numeric argument (for example, argv supplied by a host process rather
    # than by the user) falls back to the default.
    try:
        d = int(sys.argv[1])
    except (IndexError, ValueError):
        d = default_backdated_days
else:
    # Imported as a module: sys.argv belongs to the importing program, so ignore it.
    d = default_backdated_days

populate_ir_cones(days=d, years_back=0)
    

getting data source for sandbox
>>> Loaded 250 historical daily curves (dates) with 10 tenors.
>>> Computed 249 daily change observations for 10 tenors.


Registered model 'ir_covariance_model' already exists. Creating a new version of this model...
2025/06/06 21:26:05 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: ir_covariance_model, version 62
Created version '62' of model 'ir_covariance_model'.


🏃 View run cov_fit_2025-06-05 at: http://127.0.0.1:8768/#/experiments/1501/runs/4260ed4142aa4a48b5657d6552aa7e72
🧪 View experiment at: http://127.0.0.1:8768/#/experiments/1501
>>> Wrapped EmpCovariance logged to MLflow run ID 4260ed4142aa4a48b5657d6552aa7e72.
Populating IR cones for 11 days from 2025-05-27 to 2025-06-06...
📈 Saved matplotlib chart: ../../artifacts/results/tsy_cones_2025-05-27.png
📈 Saved matplotlib chart: ../../artifacts/results/tsy_cones_2025-05-30.png
📈 Saved matplotlib chart: ../../artifacts/results/tsy_cones_2025-05-28.png
📈 Saved matplotlib chart: ../../artifacts/results/tsy_cones_2025-05-29.png
📈 Saved matplotlib chart: ../../artifacts/results/tsy_cones_2025-05-30.png
📈 Saved matplotlib chart: ../../artifacts/results/tsy_cones_2025-05-30.png
📈 Saved matplotlib chart: ../../artifacts/results/tsy_cones_2025-06-02.png
📈 Saved matplotlib chart: ../../artifacts/results/tsy_cones_2025-06-03.png
📈 Saved matplotlib chart: ../../artifacts/results/tsy_cones_2025-06-04.png
