# In [3]:  (Jupyter notebook cell prompt, kept as a comment so the file parses as Python)
import sys
from pathlib import Path
sys.path.append(str(Path.cwd().parent))


import data.data_source as data_source

import time
import uuid
import json
from datetime import date
from functools import lru_cache

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import mlflow
from dateutil.relativedelta import relativedelta
import os
from config import env

from models.pca_model import legacy_pca, sklearn_pca

# ─── CONFIGURATION ─────────────────────────────────────────────────────────
# Tenor values kept as columns of the pivoted Date×Tenor matrix; they are
# matched against the `tenor_num` values returned by the curve query.
# (presumably expressed in years — confirm against the rate_curves table)
TENORS = [0.25, 0.5, 1, 2, 3, 5, 7, 10, 20, 30]
# Length, in years, of the trailing window each PCA fit is run on.
ROLLING_YEARS = 3
# Number of components requested from the PCA model.
N_COMPONENTS = 3
# Value used to filter rate_curves.curve_type and written to pca_results.curve_type.
CURVE_TYPE = "US Treasury Par"
# PCA implementation to use. It is called as pca_model(X, N_COMPONENTS) and its
# result is unpacked as (components, explained_ratio, mean_curve, all_scores).
pca_model = sklearn_pca

# MLflow experiment: one experiment per environment, selected at import time.
experiment_name = f"PCA Training [{env}]"
mlflow.set_experiment(experiment_name)

# ─── DATASOURCE ──────────────────────────────────────────────────────────────
# Shared data-source handle, used both to read curves and to upsert PCA results.
ds = data_source.get_data_source()

# ─── ONE‐TIME LOAD & PIVOT ────────────────────────────────────────────────────
def load_and_pivot_all(earliest_date: date, latest_date: date) -> pd.DataFrame:
    """
    Load the configured curve type for a date range and reshape it for PCA.

    A single query fetches every `rate_curves` row of type CURVE_TYPE whose
    `curve_date` lies between `earliest_date` and `latest_date` (SQL BETWEEN,
    so both bounds are included). The rows are pivoted into a matrix indexed
    by curve date with one column per tenor, restricted to (and ordered by)
    TENORS, and gaps are filled forward and then backward over the whole range.

    Args:
        earliest_date: First curve date to load.
        latest_date: Last curve date to load.

    Returns:
        The filled Date×Tenor DataFrame. A tenor in TENORS that never appears
        in the query result stays as an all-NaN column.
    """
    query_text = f"""
        SELECT curve_date, tenor_num AS tenor, rate
          FROM rate_curves
         WHERE curve_date BETWEEN '{earliest_date}'::date AND '{latest_date}'::date
           AND curve_type = '{CURVE_TYPE}'
        ORDER BY curve_date
    """
    raw_rows = ds.query(query_text).to_pandas()
    raw_rows["curve_date"] = pd.to_datetime(raw_rows["curve_date"])

    # Long → wide: one row per date, one column per tenor.
    by_tenor = raw_rows.pivot(index="curve_date", columns="tenor", values="rate")

    # Keep exactly the configured tenors, then fill gaps forward before
    # filling any leading gaps backward.
    return by_tenor.reindex(columns=TENORS).ffill().bfill()

# ─── PCA‐AND‐LOG FOR A SLICE ──────────────────────────────────────────────────
def run_pca_and_log_slice(as_of_date: date, pivot_filled: pd.DataFrame):
    """
    Fit PCA on the trailing window ending at as_of_date, then persist and log it.

    The window is the set of rows of `pivot_filled` whose index lies between
    (as_of_date - ROLLING_YEARS years) and as_of_date, both inclusive. The fit
    is upserted into `pca_results` (one row per curve_type/curve_date) and its
    parameters and metrics are logged to the currently active MLflow run.

    Args:
        as_of_date: Last date of the rolling window; stored as `curve_date`.
        pivot_filled: Date-indexed matrix with one column per tenor, as
            produced by load_and_pivot_all. It is expected to contain no NaNs.

    Returns:
        The per-component explained-variance ratios returned by the PCA model,
        so the caller can build a scree plot across dates.

    Raises:
        ValueError: If `pivot_filled` has no rows inside the window.
    """
    # Time the slice locally so this function does not rely on a global
    # timestamp having been set by the caller.
    started_at = time.perf_counter()

    start_date = as_of_date - relativedelta(years=ROLLING_YEARS)
    end_date = as_of_date

    # Label-based slice: both endpoints are included.
    slice_df = pivot_filled.loc[start_date:end_date]
    if slice_df.empty:
        # Fail with a clear message instead of NaN means and an opaque PCA error.
        raise ValueError(
            f"No curve data between {start_date} and {end_date}; cannot run PCA."
        )

    X = slice_df.to_numpy()
    num_obs, num_tenors = X.shape
    means = X.mean(axis=0)
    # Mean squared deviation from the per-tenor means, over all cells.
    total_var = ((X - means) ** 2).mean()

    # The model is expected to return
    # (components, explained_ratio, mean_curve, all_scores).
    components, explained_ratio, mean_curve, all_scores = pca_model(X, N_COMPONENTS)
    # Scores of the last row in the window, i.e. the most recent date present
    # at or before as_of_date (not necessarily as_of_date itself).
    today_scores = all_scores[-1]

    # Reconstruction error of the truncated model over the whole window.
    X_recon = all_scores @ components + mean_curve
    mse = ((X - X_recon) ** 2).mean()
    # R² is undefined for a window with zero variance; skip it in that case.
    r2 = 1 - mse / total_var if total_var > 0 else None

    total_explained = float(explained_ratio.sum())

    # INSERT/UPSERT into DB. All interpolated values are generated in this
    # module (constants, dates, numeric arrays), not user-supplied text.
    run_id = str(uuid.uuid4())
    insert_sql = f"""
    INSERT INTO pca_results (
      run_id, curve_type, curve_date, n_components,
      total_explained_variance_ratio, explained_variance_ratios,
      mean_curve, components, scores
    ) VALUES (
      '{run_id}', '{CURVE_TYPE}', '{as_of_date}',
      {N_COMPONENTS}, {total_explained},
      ARRAY{explained_ratio.tolist()},
      ARRAY{mean_curve.tolist()},
      '{json.dumps(components.tolist()).replace("'", "''")}',
      ARRAY{today_scores.tolist()}
    )
    ON CONFLICT (curve_type, curve_date)
    DO UPDATE SET
      run_id                        = EXCLUDED.run_id,
      run_timestamp                 = CLOCK_TIMESTAMP(),
      n_components                  = EXCLUDED.n_components,
      total_explained_variance_ratio= EXCLUDED.total_explained_variance_ratio,
      explained_variance_ratios     = EXCLUDED.explained_variance_ratios,
      mean_curve                    = EXCLUDED.mean_curve,
      components                    = EXCLUDED.components,
      scores                        = EXCLUDED.scores;
    """
    ds.query(insert_sql)

    # MLflow logging for this slice
    mlflow.log_param("as_of_date", as_of_date)
    mlflow.log_param("num_tenors", num_tenors)
    mlflow.log_param("num_observations", num_obs)

    mlflow.log_metric("reconstruction_mse", float(mse))
    if r2 is not None:
        mlflow.log_metric("reconstruction_r2", float(r2))
    mlflow.log_metric("total_explained_variance", total_explained)
    for i, ratio in enumerate(explained_ratio, start=1):
        mlflow.log_metric(f"explained_variance_ratio_{i}", float(ratio))
    mlflow.log_metric("run_duration_seconds", time.perf_counter() - started_at)

    return explained_ratio  # returned so the caller can build the scree plot later

# ─── POPULATE LOOP (ONE‐TIME LOAD + SLICE) ───────────────────────────────────
def populate(days: int, as_of: date):
    """
    Run the rolling PCA for `days` consecutive dates ending at `as_of`.

    Steps:
      1) Compute the earliest date any window needs (ROLLING_YEARS plus `days`
         before `as_of`, but never earlier than 2010-03-15).
      2) Load, pivot and fill the curve data once, up to today's date.
      3) For each date as_of, as_of-1, ..., run PCA inside its own nested
         MLflow run under a single "Rolling PCA" parent run.
      4) Write one combined CSV and one combined scree plot of the explained
         variance ratios and log both as artifacts of the parent run.

    Args:
        days: Number of consecutive calendar days to process, counting back
            from `as_of` (inclusive).
        as_of: Most recent date to run PCA for.
    """
    # Module-level timestamp published just before each slice runs, for
    # per-slice duration logging.
    global mlflow_start_time_per_slice

    end_date = date.today()
    earliest_possible = as_of - relativedelta(years=ROLLING_YEARS) - relativedelta(days=days)
    min_date = date(2010, 3, 15)
    if earliest_possible < min_date:
        earliest_possible = min_date

    # 1) ONE-TIME: load & pivot entire range
    print(f"Loading data from {earliest_possible} to {end_date} (one‐time)...")
    pivot_filled = load_and_pivot_all(earliest_possible, end_date)

    # 2) Start MLflow parent run
    with mlflow.start_run(run_name="Rolling PCA", nested=False):
        mlflow.log_param("days_requested", days)
        mlflow.log_param("rolling_years", ROLLING_YEARS)
        mlflow.log_param("n_components", N_COMPONENTS)
        mlflow.log_param("curve_type", CURVE_TYPE)
        mlflow.log_param("pca_model", pca_model.__name__)
        mlflow.log_param("starting_domino_user", os.environ.get("DOMINO_STARTING_USERNAME", ""))

        # Collected (date, explained_variance_ratios) pairs for the final
        # combined CSV and scree plot.
        scree_data = []

        # 3) Loop over each day, most recent first
        for i in range(days):
            as_of_date = as_of - relativedelta(days=i)
            print(f"→ Running PCA for {as_of_date}...")

            # The context manager ends the nested run on exit. Calling
            # mlflow.end_run() here as well is redundant and, on MLflow
            # versions whose __exit__ ends a run unconditionally, would
            # close the parent run after the first slice.
            with mlflow.start_run(nested=True, run_name=f"PCA_{as_of_date}"):
                mlflow_start_time_per_slice = (time.time(),)
                explained_ratio = run_pca_and_log_slice(as_of_date, pivot_filled)
                scree_data.append((as_of_date, explained_ratio))

        # 4) One combined CSV instead of one file per slice.
        all_components = pd.DataFrame(
            {
                "as_of_date": [d for d, _ in scree_data],
                **{
                    f"pc{i+1}_ratio": [ratios[i] for _, ratios in scree_data]
                    for i in range(N_COMPONENTS)
                },
            }
        )
        csv_path = "../../artifacts/results/all_scree_data.csv"
        # Make sure the output directory exists; to_csv/savefig do not create it.
        Path(csv_path).parent.mkdir(parents=True, exist_ok=True)
        all_components.to_csv(csv_path, index=False)
        mlflow.log_artifact(csv_path, artifact_path="pca_metrics")

        # One combined scree plot: one line of markers per date.
        plot_path = "../../artifacts/results/all_scree_over_time.png"
        Path(plot_path).parent.mkdir(parents=True, exist_ok=True)
        fig, ax = plt.subplots(figsize=(8, 5))
        try:
            for as_of_date, ratios in scree_data:
                ax.plot(
                    np.arange(1, N_COMPONENTS + 1),
                    ratios,
                    marker="o",
                    linestyle="-",
                    label=str(as_of_date),
                )
            ax.set_xlabel("Principal Component")
            ax.set_ylabel("Explained Variance Ratio")
            ax.set_title(f"Scree Plot Over Time (last {days} days)")
            ax.legend(fontsize="small", ncol=2, loc="upper right", bbox_to_anchor=(1.2, 1.0))
            fig.savefig(plot_path, bbox_inches="tight")
        finally:
            # Release the figure even if plotting or saving fails.
            plt.close(fig)
        mlflow.log_artifact(plot_path, artifact_path="scree_plots")

    print("✅ All PCA runs complete.")

# ─── MAIN ─────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    default_backdated_days = 5
    default_as_of = date.today()

    # Optional first command-line argument: the as-of date in ISO format.
    # A missing or unparsable argument falls back to today's date.
    try:
        as_of = date.fromisoformat(sys.argv[1])
    except (IndexError, ValueError):
        as_of = default_as_of

    populate(default_backdated_days, as_of)


# ─── CAPTURED NOTEBOOK OUTPUT (kept as comments so the file parses as Python) ──
# getting data source for sandbox
# Loading data from 2022-05-28 to 2025-06-02 (one‐time)...
# → Running PCA for 2025-06-02...
# 🏃 View run PCA_2025-06-02 at: http://127.0.0.1:8768/#/experiments/1462/runs/dc7ed805ae784e2483000989b6c83340
# 🧪 View experiment at: http://127.0.0.1:8768/#/experiments/1462
# 🏃 View run Rolling PCA at: http://127.0.0.1:8768/#/experiments/1462/runs/e487699c842443869fb7d66c27533875
# 🧪 View experiment at: http://127.0.0.1:8768/#/experiments/1462
#
#
# NameError: name 'raw_model' is not defined