In [1]:
import sys
import time
import uuid
import json
from datetime import date, timedelta
from functools import lru_cache

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
import mlflow
from domino.data_sources import DataSourceClient
from dateutil.relativedelta import relativedelta
import os

# ─── CONFIGURATION ─────────────────────────────────────────────────────────
# Tenors kept (in this order) as columns of the PCA input matrix.
# NOTE(review): values are presumably maturities in years — confirm.
TENORS = [
    0.25, 0.5, 1, 2, 3,
    5, 7, 10, 20, 30,
]
ROLLING_YEARS = 3                # length of the look-back window, in years
N_COMPONENTS = 3                 # number of principal components to fit
CURVE_TYPE = "US Treasury Par"   # value matched against rate_curves.curve_type

# MLflow experiment that all runs from this file are recorded under
EXPERIMENT_NAME = "PCA_Curve_Analysis"
mlflow.set_experiment(EXPERIMENT_NAME)

# ─── DATASOURCE ──────────────────────────────────────────────────────────────
# Shared datasource handle used for both reads and the result upsert.
ds = DataSourceClient().get_datasource("market_data")

# ─── HELPERS ──────────────────────────────────────────────────────────────────
@lru_cache(maxsize=100)
def load_curve_data(start_date: date, end_date: date) -> pd.DataFrame:
    """
    Load treasury curves from the database between start_date and end_date.

    Rows are filtered to CURVE_TYPE and ordered by curve_date; both bounds
    are inclusive (SQL BETWEEN).

    Results are memoised per (start_date, end_date) pair, so repeated calls
    with the same arguments return the *same* DataFrame object. Callers must
    treat it as read-only, otherwise they corrupt the cached copy.

    Args:
        start_date: First curve date to include.
        end_date: Last curve date to include.

    Returns:
        DataFrame with columns ``curve_date`` (converted to datetime),
        ``tenor`` and ``rate``.
    """
    sql = f"""
        SELECT curve_date, tenor_num AS tenor, rate
          FROM rate_curves
         WHERE curve_date BETWEEN '{start_date}' AND '{end_date}'
           AND curve_type = '{CURVE_TYPE}'
        ORDER BY curve_date
    """
    df = ds.query(sql).to_pandas()
    # Normalise to datetime64 so callers can rely on the .dt accessor.
    df['curve_date'] = pd.to_datetime(df['curve_date'])
    return df


# ─── PCA + DB INSERT + MLflow LOGGING ─────────────────────────────────────────
def run_pca_and_log(as_of_date: date):
    """
    Fit a rolling-window PCA on the curve history ending at ``as_of_date``,
    upsert the result into ``curve_pca_results`` and log it through MLflow.

    The window spans ROLLING_YEARS years back from ``as_of_date`` (inclusive).

    Side effects: writes ``dominostats.json`` in the working directory, writes
    a metrics CSV and a scree-plot PNG under ``../../artifacts/results``
    (created if missing), runs one INSERT ... ON CONFLICT statement through
    ``ds``, and logs params, metrics and artifacts via the ``mlflow`` module.

    The run is skipped (a message is printed, nothing is written) when
      * no curve is dated exactly ``as_of_date`` - otherwise the most recent
        earlier curve would be scored and stored under the wrong date; or
      * the window has fewer than N_COMPONENTS observations or tenors, which
        PCA cannot fit.

    Args:
        as_of_date: Curve date to analyse; also the end of the window.

    Returns:
        None.
    """
    # Calculate rolling window dates
    start_date = as_of_date - relativedelta(years=ROLLING_YEARS)
    end_date = as_of_date

    # Start timing
    start_time = time.time()

    # Load data
    df = load_curve_data(start_date, end_date)

    # Require a curve for exactly as_of_date so that pivot.iloc[-1] below
    # really is the as_of_date curve (also covers an empty result set).
    if df.empty or not (df['curve_date'].dt.date == as_of_date).any():
        print(f"No curve data for {as_of_date}, skipping PCA.")
        return

    # Keep only the configured tenors that actually occur, in TENORS order.
    available_tenors = df['tenor'].unique()
    pivot = (
        df.pivot(index="curve_date", columns="tenor", values="rate")
          .reindex(columns=[t for t in TENORS if t in available_tenors])
          .ffill()
          .bfill()
    )
    X = pivot.to_numpy()
    num_obs, num_tenors = X.shape

    # PCA needs n_components <= min(n_samples, n_features).
    if min(num_obs, num_tenors) < N_COMPONENTS:
        print(
            f"Only {num_obs} observations x {num_tenors} tenors for "
            f"{as_of_date}; need at least {N_COMPONENTS} of each, skipping PCA."
        )
        return

    # Fit PCA
    pca = PCA(n_components=N_COMPONENTS)
    X_pca = pca.fit_transform(X)

    # Mean squared error of reconstructing the window from the kept components
    X_recon = pca.inverse_transform(X_pca)
    recon_errors = (X - X_recon) ** 2
    mse = recon_errors.mean()
    mlflow.log_metric("reconstruction_mse", float(mse))

    # Today's scores: the last pivot row is the as_of_date curve (checked above)
    today_curve = pivot.iloc[-1].to_numpy()
    today_scores = pca.transform([today_curve])[0]

    # Compute run metrics
    explained_ratio = pca.explained_variance_ratio_
    total_explained = float(explained_ratio.sum())
    run_duration = time.time() - start_time

    # One pc<i>_variance entry per fitted component, so this keeps working
    # if N_COMPONENTS is changed.
    stats = {
        "run_duration": run_duration,
        "reconstruction_mse": float(mse),
    }
    for i, ratio in enumerate(explained_ratio, start=1):
        stats[f"pc{i}_variance"] = float(ratio)
    stats["total_explained"] = total_explained
    with open("dominostats.json", "w") as f:
        json.dump(stats, f)

    # Generate a run_id
    run_id = str(uuid.uuid4())

    # Components are stored as a JSON string literal; double any single
    # quotes so the literal stays well-formed SQL.
    components_json = json.dumps(pca.components_.tolist()).replace("'", "''")

    # Insert into DB (upsert keyed on curve_type + curve_date)
    insert_sql = f"""
    INSERT INTO curve_pca_results (
      run_id, curve_type, curve_date, n_components,
      total_explained_variance_ratio, explained_variance_ratios,
      mean_curve, components, scores
    ) VALUES (
      '{run_id}', '{CURVE_TYPE}', '{as_of_date}',
      {N_COMPONENTS}, {total_explained},
      ARRAY{explained_ratio.tolist()},
      ARRAY{pca.mean_.tolist()},
      '{components_json}',
      ARRAY{today_scores.tolist()}
    )
    ON CONFLICT (curve_type, curve_date)
    DO UPDATE SET
      run_id                        = EXCLUDED.run_id,
      n_components                  = EXCLUDED.n_components,
      total_explained_variance_ratio= EXCLUDED.total_explained_variance_ratio,
      explained_variance_ratios     = EXCLUDED.explained_variance_ratios,
      mean_curve                    = EXCLUDED.mean_curve,
      components                    = EXCLUDED.components,
      scores                        = EXCLUDED.scores;
    """
    ds.query(insert_sql)

    # MLflow logging
    mlflow.log_param("as_of_date", as_of_date)
    # 733773 == date(2010, 1, 1).toordinal(): logs days elapsed since 2010-01-01
    mlflow.log_param("as_of_date_ordinal", as_of_date.toordinal() - 733773)
    mlflow.log_param("rolling_years", ROLLING_YEARS)
    mlflow.log_param("n_components", N_COMPONENTS)
    mlflow.log_param("curve_type", CURVE_TYPE)
    mlflow.log_param("num_tenors", num_tenors)
    mlflow.log_param("num_observations", num_obs)

    mlflow.log_metric("total_explained_variance", total_explained)
    for i, ratio in enumerate(explained_ratio, start=1):
        mlflow.log_metric(f"explained_variance_ratio_{i}", float(ratio))
    mlflow.log_metric("run_duration_seconds", run_duration)

    # Create DataFrame for artifact
    metrics_df = pd.DataFrame({
        'component': list(range(1, N_COMPONENTS+1)),
        'explained_variance_ratio': explained_ratio
    })
    metrics_df['cumulative_variance'] = metrics_df['explained_variance_ratio'].cumsum()

    # Make sure the local artifact directory exists before writing into it
    os.makedirs("../../artifacts/results", exist_ok=True)

    # Save artifact
    csv_path = "../../artifacts/results/rate_curves_loaded.csv"
    metrics_df.to_csv(csv_path, index=False)
    mlflow.log_artifact(csv_path, artifact_path="pca_metrics")

    # Scree plot: 1) draw, 2) save locally, 3) log to MLflow
    plot_path = f"../../artifacts/results/scree_{as_of_date}.png"
    fig, ax = plt.subplots()
    try:
        ax.plot(
            np.arange(1, N_COMPONENTS + 1),
            pca.explained_variance_ratio_,
            marker='o',
            linestyle='-',
        )
        ax.set_xlabel("Principal Component")
        ax.set_ylabel("Explained Variance Ratio")
        ax.set_title(f"Scree Plot (as_of={as_of_date})")
        fig.savefig(plot_path, bbox_inches='tight')
    finally:
        # Always release the figure, even if drawing or saving fails,
        # so a long backfill does not accumulate open figures.
        plt.close(fig)

    mlflow.log_artifact(plot_path, artifact_path="scree_plots")

    print(f"✅ PCA run complete (run_id: {run_id}). Logged metrics and artifact.")

def populate(days, starting_date):
    """
    Backfill PCA results for up to ``days`` consecutive calendar days,
    walking backwards from ``starting_date``.

    One parent MLflow run ("Rolling PCA") is opened, and every date gets its
    own nested run in which ``run_pca_and_log`` is called.

    Args:
        days: Number of calendar days to process, counting back from and
            including ``starting_date``.
        starting_date: Most recent date to process.

    Returns:
        None.
    """
    # Floor for the backfill. The original code computed this floor but never
    # applied it; it is now enforced in the loop below.
    # NOTE(review): presumably the earliest date with curve data — confirm.
    min_date = date(2010, 3, 15)

    with mlflow.start_run(run_name="Rolling PCA", nested=False):
        mlflow.log_param("days_requested", days)
        # Fall back to a placeholder so the backfill also runs where the
        # Domino environment variable is not set.
        mlflow.log_param(
            "starting_domino_user",
            os.environ.get("DOMINO_STARTING_USERNAME", "unknown"),
        )

        for i in range(days):
            as_of_date = starting_date - relativedelta(days=i)
            if as_of_date < min_date:
                # Dates only get older from here on, so stop entirely.
                print(f'reached {min_date}, stopping backfill')
                break
            print(f'running for {as_of_date}')
            # The context manager ends the nested run on exit; an explicit
            # mlflow.end_run() here would be redundant and risks closing
            # the parent run instead.
            with mlflow.start_run(nested=True, run_name=f"PCA_{as_of_date}"):
                run_pca_and_log(as_of_date)

# ─── MAIN ─────────────────────────────────────────────────────────────────────
    
# Defaults: how many calendar days to backfill, and the date to start from
# when the command line does not supply a usable one.
default_backdated_days = 5000
default_as_of = date.today()

if __name__ == "__main__":
    # Begin with the default and override it only if the first argument
    # parses as an ISO date (YYYY-MM-DD).
    as_of = default_as_of
    if sys.argv[1:]:
        try:
            as_of = date.fromisoformat(sys.argv[1])
        except ValueError:
            # Not a date (e.g. some unrelated flag): stay on the default.
            as_of = default_as_of

    populate(default_backdated_days, as_of)


running for 2025-05-22
✅ PCA run complete (run_id: dfdd7a35-bd40-4522-9022-889f0a1e60d3). Logged metrics and artifact.
🏃 View run PCA_2025-05-22 at: http://127.0.0.1:8768/#/experiments/1443/runs/80efc7b721014121a6b80359f1c0e965
🧪 View experiment at: http://127.0.0.1:8768/#/experiments/1443
running for 2025-05-21
🏃 View run PCA_2025-05-21 at: http://127.0.0.1:8768/#/experiments/1443/runs/f1fe3b8235074267b8b0a3e071940d0b
🧪 View experiment at: http://127.0.0.1:8768/#/experiments/1443
🏃 View run Rolling PCA at: http://127.0.0.1:8768/#/experiments/1443/runs/c29fe24b3a6e468cb7ba915ad03bfd03
🧪 View experiment at: http://127.0.0.1:8768/#/experiments/1443


KeyboardInterrupt: 