In [4]:
import sys
from pathlib import Path
sys.path.append(str(Path.cwd().parent))

import data.data_source as data_source

import time
import uuid
import json
from datetime import date, timedelta
from functools import lru_cache

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import mlflow
from dateutil.relativedelta import relativedelta
import os
from config import env
from models.pca_model import legacy_pca


# ─── CONFIGURATION ─────────────────────────────────────────────────────────
# Tenors kept as columns of the PCA input matrix, in this order
# (presumably expressed in years — confirm against rate_curves.tenor_num).
TENORS = [0.25, 0.5, 1, 2, 3, 5, 7, 10, 20, 30]
# Length, in years, of the look-back window that ends on each as-of date.
ROLLING_YEARS = 3
# Number of principal components requested from the PCA model.
N_COMPONENTS = 3
# Value matched against rate_curves.curve_type and written to pca_results.
CURVE_TYPE = "US Treasury Par"
# PCA implementation to call; its __name__ is logged to MLflow as "pca_model".
pca_model = legacy_pca

# MLflow experiment: the name embeds `env` (imported from config), and the
# experiment is selected as a side effect of importing/running this module.
experiment_name = f"PCA Training [{env}]"
mlflow.set_experiment(experiment_name)

# ─── DATASOURCE ──────────────────────────────────────────────────────────────
# Shared handle used for both the curve SELECT and the pca_results upsert.
ds = data_source.get_data_source()

# ─── HELPERS ──────────────────────────────────────────────────────────────────
@lru_cache(maxsize=100)
def load_curve_data(start_date: date, end_date: date) -> pd.DataFrame:
    """
    Fetch curve points for CURVE_TYPE with curve_date in [start_date, end_date].

    Returns a frame with columns curve_date (converted to pandas datetimes),
    tenor and rate, ordered by curve_date. Results are memoised per
    (start_date, end_date) pair, so repeated calls hand back the same
    DataFrame object; callers should treat it as read-only.
    """
    # Both bounds are inclusive (SQL BETWEEN); values are interpolated as literals.
    query = f"""
        SELECT curve_date, tenor_num AS tenor, rate
          FROM rate_curves
         WHERE curve_date BETWEEN '{start_date}' AND '{end_date}'
           AND curve_type = '{CURVE_TYPE}'
        ORDER BY curve_date
    """
    curves = ds.query(query).to_pandas()
    return curves.assign(curve_date=pd.to_datetime(curves['curve_date']))

# ─── PCA + DB INSERT + MLflow LOGGING ─────────────────────────────────────────
def run_pca_and_log(as_of_date: date):
    """
    Fit the rolling-window PCA ending on `as_of_date`, then persist and log it.

    The function:
      1. loads the curves for the ROLLING_YEARS-year window ending on
         `as_of_date` and pivots them into a (curve_date x tenor) matrix;
      2. fits `pca_model` with N_COMPONENTS components and measures the
         reconstruction error;
      3. writes dominostats.json, upserts one row into pca_results, and logs
         params, metrics, a CSV and a scree plot to the active MLflow run.

    Args:
        as_of_date: Last day (inclusive) of the rolling window; also the
            curve_date under which the result row is stored.

    Raises:
        ValueError: If no curve data exists inside the window.
        KeyError: If the DOMINO_STARTING_USERNAME environment variable is unset.
    """
    results_dir = "../../artifacts/results"

    # Calculate rolling window dates
    start_date = as_of_date - relativedelta(years=ROLLING_YEARS)
    end_date = as_of_date

    # Start timing
    start_time = time.time()

    # Load data
    df = load_curve_data(start_date, end_date)
    if df.empty:
        # Fail with a clear message instead of an opaque shape/index error later.
        raise ValueError(
            f"No '{CURVE_TYPE}' curve data between {start_date} and {end_date}"
        )

    # Compute the tenors actually present once, not once per configured tenor.
    available_tenors = df['tenor'].unique()
    pivot = (
        df.pivot(index="curve_date", columns="tenor", values="rate")
          .reindex(columns=[t for t in TENORS if t in available_tenors])
    )
    # Fill gaps from the previous date first, then from the next one.
    pivot = pivot.ffill().bfill()
    X = pivot.to_numpy()
    num_obs, num_tenors = X.shape
    means = X.mean(axis=0)
    # Mean squared deviation from the per-tenor means, over every cell of X.
    total_var = ((X - means) ** 2).mean()

    # Fit PCA
    components, explained_ratio, mean_curve, all_scores = pca_model(X, N_COMPONENTS)
    # Last row = most recent curve date in the window (presumably; assumes the
    # model returns one score row per row of X). That date can be earlier than
    # as_of_date when no curve exists for as_of_date itself.
    today_scores = all_scores[-1]

    # Compute reconstruction errors
    X_recon = all_scores @ components + mean_curve
    mse = ((X - X_recon) ** 2).mean()
    r2 = 1 - mse / total_var
    print('mse', mse)
    print('r2', r2)

    total_explained = float(explained_ratio.sum())
    run_duration = time.time() - start_time

    # One "pc<i>_variance" entry per fitted component rather than hard-coded
    # indices 0..2, so fewer (or more) than three components cannot raise here.
    stats = {
        "run_duration": run_duration,
        "reconstruction_mse": float(mse),
        **{
            f"pc{i}_variance": float(ratio)
            for i, ratio in enumerate(explained_ratio, start=1)
        },
        "total_explained": total_explained,
    }
    # Make sure the output directory exists before any artifact is written.
    os.makedirs(results_dir, exist_ok=True)
    with open(f"{results_dir}/dominostats.json", "w") as f:
        json.dump(stats, f)

    # Generate a run_id
    run_id = str(uuid.uuid4())

    # Insert into DB (upsert keyed on curve_type + curve_date).
    # NOTE(review): values are interpolated into the SQL text as literals; all
    # of them originate from module constants or numeric results, not user input.
    insert_sql = f"""
    INSERT INTO pca_results (
      run_id, curve_type, curve_date, n_components,
      total_explained_variance_ratio, explained_variance_ratios,
      mean_curve, components, scores
    ) VALUES (
      '{run_id}', '{CURVE_TYPE}', '{as_of_date}',
      {N_COMPONENTS}, {total_explained},
      ARRAY{explained_ratio.tolist()},
      ARRAY{mean_curve.tolist()},
      '{json.dumps(components.tolist()).replace("'", "''")}',
      ARRAY{today_scores.tolist()}
    )
    ON CONFLICT (curve_type, curve_date)
    DO UPDATE SET
      run_id                        = EXCLUDED.run_id,
      run_timestamp                 = CLOCK_TIMESTAMP(),
      n_components                  = EXCLUDED.n_components,
      total_explained_variance_ratio= EXCLUDED.total_explained_variance_ratio,
      explained_variance_ratios     = EXCLUDED.explained_variance_ratios,
      mean_curve                    = EXCLUDED.mean_curve,
      components                    = EXCLUDED.components,
      scores                        = EXCLUDED.scores;
    """
    ds.query(insert_sql)

    # MLflow logging
    mlflow.log_param("as_of_date", as_of_date)
    # 733773 == date(2010, 1, 1).toordinal(): logs days elapsed since 2010-01-01.
    mlflow.log_param("as_of_date_ordinal", as_of_date.toordinal() - 733773)
    mlflow.log_param("rolling_years", ROLLING_YEARS)
    mlflow.log_param("n_components", N_COMPONENTS)
    mlflow.log_param("curve_type", CURVE_TYPE)
    mlflow.log_param("num_tenors", num_tenors)
    mlflow.log_param("num_observations", num_obs)
    mlflow.log_param("pca_model", pca_model.__name__)
    mlflow.log_param("starting_domino_user", os.environ["DOMINO_STARTING_USERNAME"])

    mlflow.log_metric("reconstruction_mse", float(mse))
    mlflow.log_metric("total_explained_variance", total_explained)
    for i, ratio in enumerate(explained_ratio, start=1):
        mlflow.log_metric(f"explained_variance_ratio_{i}", float(ratio))
    mlflow.log_metric("run_duration_seconds", run_duration)

    # Size the per-component artifacts from what the model actually returned so
    # a length mismatch with N_COMPONENTS cannot break the CSV or the plot.
    n_fitted = len(explained_ratio)

    # Create DataFrame for artifact
    metrics_df = pd.DataFrame({
        'component': list(range(1, n_fitted + 1)),
        'explained_variance_ratio': explained_ratio
    })
    metrics_df['cumulative_variance'] = metrics_df['explained_variance_ratio'].cumsum()

    # Save artifact
    csv_path = f"{results_dir}/rate_curves_loaded.csv"
    metrics_df.to_csv(csv_path, index=False)
    mlflow.log_artifact(csv_path, artifact_path="pca_metrics")

    # 1) draw the scree plot
    plot_path = f"{results_dir}/scree_{as_of_date}.png"
    fig, ax = plt.subplots()
    try:
        ax.plot(
            np.arange(1, n_fitted + 1),
            explained_ratio,
            marker='o',
            linestyle='-',
        )
        ax.set_xlabel("Principal Component")
        ax.set_ylabel("Explained Variance Ratio")
        ax.set_title(f"Scree Plot (as_of={as_of_date})")

        # 2) save locally
        fig.savefig(plot_path, bbox_inches='tight')
    finally:
        # Release the figure even if plotting or saving fails.
        plt.close(fig)

    # 3) log to MLflow
    mlflow.log_artifact(plot_path, artifact_path="scree_plots")

    print(f"✅ PCA run complete (run_id: {run_id}). Logged metrics and artifact.")

def populate(days, starting_date):
    """
    Run the rolling PCA for `days` consecutive calendar days, newest first.

    Opens one parent MLflow run named "Rolling PCA" and, for each date from
    `starting_date` back to `starting_date - (days - 1)`, opens a nested child
    run named "PCA_<date>" and calls run_pca_and_log for that date.

    Args:
        days: Number of as-of dates to process; 0 creates only the parent run.
        starting_date: Most recent as-of date (a datetime.date).

    Raises:
        KeyError: If the DOMINO_STARTING_USERNAME environment variable is unset.
    """
    # NOTE(review): no lower bound is applied to as_of_date — confirm whether
    # dates older than the available curve history should be skipped.
    with mlflow.start_run(run_name="Rolling PCA", nested=False):
        mlflow.log_param("days_requested", days)
        mlflow.log_param("starting_domino_user", os.environ["DOMINO_STARTING_USERNAME"])
        mlflow.log_param("curve_type", CURVE_TYPE)
        mlflow.log_param("rolling_years", ROLLING_YEARS)
        mlflow.log_param("n_components", N_COMPONENTS)
        mlflow.log_param("pca_model", pca_model.__name__)

        for offset in range(days):
            as_of_date = starting_date - relativedelta(days=offset)
            print(f'running for {as_of_date}')
            # The context manager ends the child run itself (FINISHED, or FAILED
            # on an exception). An extra explicit end_run() here is redundant and,
            # on MLflow versions whose __exit__ ends the active run
            # unconditionally, would close the parent run as well.
            with mlflow.start_run(nested=True, run_name=f"PCA_{as_of_date}"):
                run_pca_and_log(as_of_date)

# ─── MAIN ─────────────────────────────────────────────────────────────────────
    
# Defaults used by the script entry point: how many as-of dates to back-fill
# and which date to start from when none is supplied on the command line.
default_backdated_days = 5
default_as_of = date.today()

if __name__ == "__main__":
    # An optional first argument selects the starting as-of date (ISO format).
    cli_args = sys.argv[1:]
    as_of = default_as_of
    if cli_args:
        try:
            as_of = date.fromisoformat(cli_args[0])
        except ValueError:
            # Anything that is not an ISO date falls back to the default
            # (presumably so launcher flags passed by a notebook kernel are ignored).
            as_of = default_as_of

    populate(default_backdated_days, as_of)

2025/05/30 15:29:04 INFO mlflow.tracking.fluent: Experiment with name 'PCA Training [sandbox]' does not exist. Creating a new experiment.


getting data source for sandbox
running for 2025-05-30
mse 0.0011685214452618155
r2 0.996804050339113
✅ PCA run complete (run_id: 0c530424-1f47-4bb2-b1b2-b890a5198ea2). Logged metrics and artifact.
🏃 View run PCA_2025-05-30 at: http://127.0.0.1:8768/#/experiments/1462/runs/2dfdf98854674cc580f6d408243c9eed
🧪 View experiment at: http://127.0.0.1:8768/#/experiments/1462
running for 2025-05-29
mse 0.0011685214452618155
r2 0.996804050339113
✅ PCA run complete (run_id: 55391c5e-d67f-4ab8-bab8-38358ee40571). Logged metrics and artifact.
🏃 View run PCA_2025-05-29 at: http://127.0.0.1:8768/#/experiments/1462/runs/c01b705dee00445daab8a6f67b0e34e0
🧪 View experiment at: http://127.0.0.1:8768/#/experiments/1462
running for 2025-05-28
mse 0.0011575379293504858
r2 0.9968362578333282
✅ PCA run complete (run_id: 56e70a37-5a5b-4f5a-b7a2-44222e7a731b). Logged metrics and artifact.
🏃 View run PCA_2025-05-28 at: http://127.0.0.1:8768/#/experiments/1462/runs/c7937db3bc0d496eaffe560bc4364329
🧪 View experimen