In [None]:
# ==============================================================
# MODEL TRAINING PIPELINE (Ridge Regularization Version)
# ==============================================================

import os
import numpy as np
import pandas as pd
from math import sqrt
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, make_scorer

# --------------------------------------------------------------
# LOAD GOLD DATA
# --------------------------------------------------------------
feature_path = "datamart/gold/feature_store.csv"
label_path   = "datamart/gold/label_store.csv"
df_feat = pd.read_csv(feature_path)
df_label = pd.read_csv(label_path)

# Merge on unit + cycle
df = df_feat.merge(df_label, on=["unit", "cycle"], how="inner")
print(f"✅ Loaded {len(df):,} rows across {df['unit'].nunique()} engines")

# --------------------------------------------------------------
# PREPARE FEATURES / LABELS
# --------------------------------------------------------------
drop_cols = ["unit", "cycle", "RUL"]
X = df.drop(columns=drop_cols)
y = df["RUL"]
units = df["unit"]

# --------------------------------------------------------------
# SPLIT ENGINE-WISE (avoid leakage)
# --------------------------------------------------------------
np.random.seed(42)
engine_ids = df["unit"].unique()
np.random.shuffle(engine_ids)

n = len(engine_ids)
train_cut = int(n * 0.7)
val_cut   = int(n * 0.85)

train_units = engine_ids[:train_cut]
val_units   = engine_ids[train_cut:val_cut]
test_units  = engine_ids[val_cut:]

def subset(units_subset):
    mask = df["unit"].isin(units_subset)
    return df.loc[mask, X.columns], df.loc[mask, "RUL"]

X_train, y_train = subset(train_units)
X_val, y_val     = subset(val_units)
X_test, y_test   = subset(test_units)

print(f"🧩 Split by engines → Train: {len(train_units)}, Val: {len(val_units)}, Test: {len(test_units)}")

# --------------------------------------------------------------
# METRIC HELPER
# --------------------------------------------------------------
def evaluate(name, y_true, y_pred):
    mae = mean_absolute_error(y_true, y_pred)
    rmse = sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)
    print(f"📊 {name} → MAE: {mae:.3f}, RMSE: {rmse:.3f}, R²: {r2:.3f}")
    return mae, rmse, r2

# --------------------------------------------------------------
# MODEL 1 — RIDGE REGRESSION (Regularized Linear)
# --------------------------------------------------------------
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

print("\n🔹 MODEL 1: RIDGE REGRESSION (Regularized Linear)")

ridge_pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("ridge", Ridge())
])

param_grid_ridge = {"ridge__alpha": [0.1, 1.0, 10.0, 50.0, 100.0]}
grid_ridge = GridSearchCV(
    ridge_pipe,
    param_grid_ridge,
    scoring="r2",
    cv=3,
    n_jobs=-1
)
grid_ridge.fit(X_train, y_train)

best_ridge = grid_ridge.best_estimator_
print(f"🏆 Best Ridge Alpha: {grid_ridge.best_params_['ridge__alpha']}")

pred_val_ridge = best_ridge.predict(X_val)
mae_ridge, rmse_ridge, r2_ridge = evaluate("Ridge Regression (Val)", y_val, pred_val_ridge)

# --------------------------------------------------------------
# MODEL 2 — RANDOM FOREST (GridSearch)
# --------------------------------------------------------------
print("\n🔹 MODEL 2: RANDOM FOREST (GridSearch)")
rf = RandomForestRegressor(random_state=42, n_jobs=-1)

param_grid = {
    "n_estimators": [100, 200],
    "max_depth": [10, 20, None],
    "min_samples_split": [2, 5],
    "min_samples_leaf": [1, 2],
    "max_features": ["sqrt", "log2"]
}

def rmse_scorer(y_true, y_pred):
    return -sqrt(mean_squared_error(y_true, y_pred))

grid = GridSearchCV(
    rf,
    param_grid,
    scoring=make_scorer(rmse_scorer),
    cv=3,
    n_jobs=-1,
    verbose=1
)
grid.fit(X_train, y_train)

best_rf = grid.best_estimator_
print(f"🏆 Best Params: {grid.best_params_}")

pred_val_rf = best_rf.predict(X_val)
mae_rf, rmse_rf, r2_rf = evaluate("Random Forest (Val)", y_val, pred_val_rf)

# --------------------------------------------------------------
# TEST EVALUATION
# --------------------------------------------------------------
pred_test_rf = best_rf.predict(X_test)
mae_test, rmse_test, r2_test = evaluate("Random Forest (Test)", y_test, pred_test_rf)

# --------------------------------------------------------------
# SUMMARY
# --------------------------------------------------------------
print("\n============================================================")
print("🏁 SUMMARY")
print(f"Ridge Regression Val R²: {r2_ridge:.3f}")
print(f"Random Forest Val R²:     {r2_rf:.3f}")
print(f"Random Forest Test R²:    {r2_test:.3f}")
print("============================================================")


✅ Loaded 105,571 rows across 600 engines
🧩 Split by engines → Train: 420, Val: 90, Test: 90

🔹 MODEL 1: RIDGE REGRESSION (Regularized Linear)
🏆 Best Ridge Alpha: 100.0
📊 Ridge Regression (Val) → MAE: 14.078, RMSE: 18.244, R²: 0.891

🔹 MODEL 2: RANDOM FOREST (GridSearch)
Fitting 3 folds for each of 48 candidates, totalling 144 fits
🏆 Best Params: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 200}
📊 Random Forest (Val) → MAE: 13.634, RMSE: 18.219, R²: 0.892
📊 Random Forest (Test) → MAE: 17.176, RMSE: 24.853, R²: 0.836

🏁 SUMMARY
Ridge Regression Val R²: 0.891
Random Forest Val R²:     0.892
Random Forest Test R²:    0.836


In [8]:
# ==============================================================
# OUT-OF-TIME (OOT) BOOTSTRAP FROM REAL ENGINE TRAJECTORIES
# ==============================================================

# 1️⃣ Identify unique engines in the test set
test_units_unique = df.loc[df["unit"].isin(test_units), "unit"].unique()

# 2️⃣ Randomly sample 90 engines (with replacement)
n_oot = 90
boot_units = np.random.choice(test_units_unique, size=n_oot, replace=True)
oot_units = [f"OOT_{i+1}" for i in range(n_oot)]

oot_list = []

for new_id, base_unit in zip(oot_units, boot_units):
    # Extract that engine's real sequence
    engine_df = df[df["unit"] == base_unit].copy()
    engine_df = engine_df.sort_values("cycle").reset_index(drop=True)
    # Keep only selected features (after VIF drop)
    engine_df = engine_df[["cycle"] + X.columns.tolist() + ["RUL"]]
    engine_df["unit"] = new_id
    oot_list.append(engine_df)

oot_df = pd.concat(oot_list, ignore_index=True)

# 3️⃣ Rename for clarity
oot_X = oot_df[X.columns]
oot_y_true = oot_df["RUL"]

# 4️⃣ Predict using best model
oot_df["RUL_pred"] = best_rf.predict(oot_X)

# 5️⃣ Evaluate
mae_oot, rmse_oot, r2_oot = evaluate("Random Forest (OOT)", oot_y_true, oot_df["RUL_pred"])

# 6️⃣ Save results
oot_df.to_csv("datamart/gold/oot_predictions_bootstrap.csv", index=False)
print("\n✅ OOT results saved to datamart/gold/oot_predictions_bootstrap.csv")


📊 Random Forest (OOT) → MAE: 19.689, RMSE: 29.872, R²: 0.794

✅ OOT results saved to datamart/gold/oot_predictions_bootstrap.csv
