In [9]:
# ==============================================================
# MODEL TRAINING PIPELINE (Ridge Regularization Version)
# ==============================================================

import os
import numpy as np
import pandas as pd
from math import sqrt
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, make_scorer

# --------------------------------------------------------------
# LOAD GOLD DATA (from Parquet)
# --------------------------------------------------------------
import pandas as pd

feature_path = "datamart/gold/feature_store.parquet"
label_path   = "datamart/gold/label_store.parquet"

# Load Parquet files instead of CSVs
df_feat = pd.read_parquet(feature_path)
df_label = pd.read_parquet(label_path)

# Ensure unit & cycle columns are aligned for merge
if not all(col in df_feat.columns for col in ["unit", "cycle"]):
    raise KeyError("Feature store missing required columns ['unit', 'cycle']")
if not all(col in df_label.columns for col in ["unit", "cycle"]):
    raise KeyError("Label store missing required columns ['unit', 'cycle']")

# Merge features and labels on identifiers
df = df_feat.merge(df_label, on=["unit", "cycle"], how="inner")

print(f"‚úÖ Loaded {len(df):,} rows across {df['unit'].nunique()} engines")
print(f"üß© Features: {df_feat.shape[1]-2:,} | Target: 'RUL'")


# --------------------------------------------------------------
# PREPARE FEATURES / LABELS
# --------------------------------------------------------------
drop_cols = ["unit", "cycle", "RUL"]
X = df.drop(columns=drop_cols)
y = df["RUL"]
units = df["unit"]

# --------------------------------------------------------------
# SPLIT ENGINE-WISE (avoid leakage)
# --------------------------------------------------------------
np.random.seed(42)
engine_ids = df["unit"].unique()
np.random.shuffle(engine_ids)

n = len(engine_ids)
train_cut = int(n * 0.7)
val_cut   = int(n * 0.85)

train_units = engine_ids[:train_cut]
val_units   = engine_ids[train_cut:val_cut]
test_units  = engine_ids[val_cut:]

def subset(units_subset):
    mask = df["unit"].isin(units_subset)
    return df.loc[mask, X.columns], df.loc[mask, "RUL"]

X_train, y_train = subset(train_units)
X_val, y_val     = subset(val_units)
X_test, y_test   = subset(test_units)

print(f"üß© Split by engines ‚Üí Train: {len(train_units)}, Val: {len(val_units)}, Test: {len(test_units)}")

# --------------------------------------------------------------
# METRIC HELPER
# --------------------------------------------------------------
def evaluate(name, y_true, y_pred):
    mae = mean_absolute_error(y_true, y_pred)
    rmse = sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)
    print(f"üìä {name} ‚Üí MAE: {mae:.3f}, RMSE: {rmse:.3f}, R¬≤: {r2:.3f}")
    return mae, rmse, r2

# --------------------------------------------------------------
# MODEL 1 ‚Äî RIDGE REGRESSION (Regularized Linear)
# --------------------------------------------------------------
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

print("\nüîπ MODEL 1: RIDGE REGRESSION (Regularized Linear)")

ridge_pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("ridge", Ridge())
])

param_grid_ridge = {"ridge__alpha": [0.1, 1.0, 10.0, 50.0, 100.0]}
grid_ridge = GridSearchCV(
    ridge_pipe,
    param_grid_ridge,
    scoring="r2",
    cv=3,
    n_jobs=-1
)
grid_ridge.fit(X_train, y_train)

best_ridge = grid_ridge.best_estimator_
print(f"üèÜ Best Ridge Alpha: {grid_ridge.best_params_['ridge__alpha']}")

pred_val_ridge = best_ridge.predict(X_val)
mae_ridge, rmse_ridge, r2_ridge = evaluate("Ridge Regression (Val)", y_val, pred_val_ridge)

# --------------------------------------------------------------
# MODEL 2 ‚Äî RANDOM FOREST (GridSearch)
# --------------------------------------------------------------
print("\nüîπ MODEL 2: RANDOM FOREST (GridSearch)")
rf = RandomForestRegressor(random_state=42, n_jobs=-1)

param_grid = {
    "n_estimators": [100, 200],
    "max_depth": [10, 20, None],
    "min_samples_split": [2, 5],
    "min_samples_leaf": [1, 2],
    "max_features": ["sqrt", "log2"]
}

def rmse_scorer(y_true, y_pred):
    return -sqrt(mean_squared_error(y_true, y_pred))

grid = GridSearchCV(
    rf,
    param_grid,
    scoring=make_scorer(rmse_scorer),
    cv=3,
    n_jobs=-1,
    verbose=1
)
grid.fit(X_train, y_train)

best_rf = grid.best_estimator_
print(f"üèÜ Best Params: {grid.best_params_}")

pred_val_rf = best_rf.predict(X_val)
mae_rf, rmse_rf, r2_rf = evaluate("Random Forest (Val)", y_val, pred_val_rf)

# --------------------------------------------------------------
# TEST EVALUATION
# --------------------------------------------------------------
pred_test_rf = best_rf.predict(X_test)
mae_test, rmse_test, r2_test = evaluate("Random Forest (Test)", y_test, pred_test_rf)

# --------------------------------------------------------------
# SUMMARY
# --------------------------------------------------------------
print("\n============================================================")
print("üèÅ SUMMARY")
print(f"Ridge Regression Val R¬≤: {r2_ridge:.3f}")
print(f"Random Forest Val R¬≤:     {r2_rf:.3f}")
print(f"Random Forest Test R¬≤:    {r2_test:.3f}")
print("============================================================")


‚úÖ Loaded 105,571 rows across 600 engines
üß© Features: 35 | Target: 'RUL'
üß© Split by engines ‚Üí Train: 420, Val: 90, Test: 90

üîπ MODEL 1: RIDGE REGRESSION (Regularized Linear)
üèÜ Best Ridge Alpha: 100.0
üìä Ridge Regression (Val) ‚Üí MAE: 13.318, RMSE: 17.507, R¬≤: 0.892

üîπ MODEL 2: RANDOM FOREST (GridSearch)
Fitting 3 folds for each of 48 candidates, totalling 144 fits
üèÜ Best Params: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
üìä Random Forest (Val) ‚Üí MAE: 12.875, RMSE: 18.164, R¬≤: 0.884
üìä Random Forest (Test) ‚Üí MAE: 15.527, RMSE: 20.881, R¬≤: 0.867

üèÅ SUMMARY
Ridge Regression Val R¬≤: 0.892
Random Forest Val R¬≤:     0.884
Random Forest Test R¬≤:    0.867


In [14]:
# ==============================================================
# SAVE BEST MODEL + RETEST BOTH ON TEST SET
# ==============================================================

import os
import pickle
import json
from datetime import datetime

print("\nüîÅ Re-evaluating both models on TEST set...")

# --------------------------------------------------------------
# 1Ô∏è‚É£ Re-test Ridge Regression on test set
# --------------------------------------------------------------
pred_test_ridge = best_ridge.predict(X_test)
mae_test_ridge, rmse_test_ridge, r2_test_ridge = evaluate("Ridge Regression (Test)", y_test, pred_test_ridge)

# --------------------------------------------------------------
# 2Ô∏è‚É£ Re-test Random Forest on test set
# --------------------------------------------------------------
pred_test_rf = best_rf.predict(X_test)
mae_test_rf, rmse_test_rf, r2_test_rf = evaluate("Random Forest (Test)", y_test, pred_test_rf)

# --------------------------------------------------------------
# 3Ô∏è‚É£ Select best model based on validation R¬≤
# --------------------------------------------------------------
if r2_rf >= r2_ridge:
    best_model = best_rf
    model_name = "RandomForestRegressor"
    mae_val, rmse_val, r2_val = mae_rf, rmse_rf, r2_rf
    mae_test, rmse_test, r2_test = mae_test_rf, rmse_test_rf, r2_test_rf
else:
    best_model = best_ridge
    model_name = "RidgeRegression"
    mae_val, rmse_val, r2_val = mae_ridge, rmse_ridge, r2_ridge
    mae_test, rmse_test, r2_test = mae_test_ridge, rmse_test_ridge, r2_test_ridge

print("\nüèÜ BEST MODEL SELECTED")
print("------------------------------------------------------------")
print(f"Model Type : {model_name}")
print(f"Val MAE     : {mae_val:.3f}")
print(f"Val RMSE    : {rmse_val:.3f}")
print(f"Val R¬≤      : {r2_val:.3f}")
print(f"Test MAE    : {mae_test:.3f}")
print(f"Test RMSE   : {rmse_test:.3f}")
print(f"Test R¬≤     : {r2_test:.3f}")
print("------------------------------------------------------------")

# --------------------------------------------------------------
# 4Ô∏è‚É£ Save best model and metadata
# --------------------------------------------------------------
model_bank_directory = "model_bank/"
os.makedirs(model_bank_directory, exist_ok=True)

model_artefact = {
    "model_name": model_name,
    "model_version": f"engine_rul_prediction_{datetime.now():%Y-%m-%d}",
    "train_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
    "metrics": {
        "Ridge": {
            "Val": {"MAE": mae_ridge, "RMSE": rmse_ridge, "R2": r2_ridge},
            "Test": {"MAE": mae_test_ridge, "RMSE": rmse_test_ridge, "R2": r2_test_ridge},
        },
        "RandomForest": {
            "Val": {"MAE": mae_rf, "RMSE": rmse_rf, "R2": r2_rf},
            "Test": {"MAE": mae_test_rf, "RMSE": rmse_test_rf, "R2": r2_test_rf},
        },
        "Selected": {
            "Model": model_name,
            "Val_R2": r2_val,
            "Test_R2": r2_test
        }
    },
    "features_used": X.columns.tolist(),
    "train_size": len(X_train),
    "val_size": len(X_val),
    "test_size": len(X_test),
    "random_seed": 42
}

# Save model binary
model_filename = os.path.join(model_bank_directory, model_artefact["model_version"] + ".pkl")
with open(model_filename, "wb") as f:
    pickle.dump(best_model, f)

# Save metadata JSON
artefact_filename = os.path.join(model_bank_directory, model_artefact["model_version"] + "_meta.json")
with open(artefact_filename, "w") as f:
    json.dump(model_artefact, f, indent=4)

print(f"‚úÖ Best model ({model_name}) saved successfully!")
print(f"üíæ Model file     ‚Üí {model_filename}")
print(f"üìò Metadata file  ‚Üí {artefact_filename}")
print("------------------------------------------------------------")



üîÅ Re-evaluating both models on TEST set...
üìä Ridge Regression (Test) ‚Üí MAE: 14.504, RMSE: 19.231, R¬≤: 0.887
üìä Random Forest (Test) ‚Üí MAE: 15.527, RMSE: 20.881, R¬≤: 0.867

üèÜ BEST MODEL SELECTED
------------------------------------------------------------
Model Type : RidgeRegression
Val MAE     : 13.318
Val RMSE    : 17.507
Val R¬≤      : 0.892
Test MAE    : 14.504
Test RMSE   : 19.231
Test R¬≤     : 0.887
------------------------------------------------------------
‚úÖ Best model (RidgeRegression) saved successfully!
üíæ Model file     ‚Üí model_bank/engine_rul_prediction_2025-11-02.pkl
üìò Metadata file  ‚Üí model_bank/engine_rul_prediction_2025-11-02_meta.json
------------------------------------------------------------


In [15]:
# ==============================================================
# OUT-OF-TIME (OOT) BOOTSTRAP USING BEST-SAVED MODEL
# ==============================================================

import os
import numpy as np
import pandas as pd
import pickle
import json

# --------------------------------------------------------------
# 1Ô∏è‚É£ Load best model metadata
# --------------------------------------------------------------
# Latest artefact saved by model training
model_bank_directory = "model_bank/"
meta_files = sorted(
    [f for f in os.listdir(model_bank_directory) if f.endswith("_meta.json")],
    reverse=True
)
if not meta_files:
    raise FileNotFoundError("No model metadata found in model_bank/. Did you run model training?")

latest_meta_path = os.path.join(model_bank_directory, meta_files[0])
with open(latest_meta_path, "r") as f:
    model_artefact = json.load(f)

model_name = model_artefact["model_name"]
model_version = model_artefact["model_version"]
print(f"\nüì¶ Loaded model artefact: {model_name} ({model_version})")

# Load the corresponding trained model (.pkl)
model_file = os.path.join(model_bank_directory, model_version + ".pkl")
with open(model_file, "rb") as f:
    best_model = pickle.load(f)
print(f"‚úÖ Model loaded from {model_file}")

# --------------------------------------------------------------
# 2Ô∏è‚É£ Bootstrap OOT sample (from real test engines)
# --------------------------------------------------------------
print("\nüöÄ Generating OOT bootstrap from test set...")

test_units_unique = df.loc[df["unit"].isin(test_units), "unit"].unique()
n_oot = 90
boot_units = np.random.choice(test_units_unique, size=n_oot, replace=True)
oot_units = [f"OOT_{i+1}" for i in range(n_oot)]

oot_list = []

for new_id, base_unit in zip(oot_units, boot_units):
    engine_df = df[df["unit"] == base_unit].copy()
    engine_df = engine_df.sort_values("cycle").reset_index(drop=True)
    engine_df = engine_df[["cycle"] + X.columns.tolist() + ["RUL"]]
    engine_df["unit"] = new_id
    oot_list.append(engine_df)

oot_df = pd.concat(oot_list, ignore_index=True)

# --------------------------------------------------------------
# 3Ô∏è‚É£ Predict using loaded best model
# --------------------------------------------------------------
oot_X = oot_df[X.columns]
oot_y_true = oot_df["RUL"]
oot_df["RUL_pred"] = best_model.predict(oot_X)

# --------------------------------------------------------------
# 4Ô∏è‚É£ Evaluate performance
# --------------------------------------------------------------
mae_oot, rmse_oot, r2_oot = evaluate(f"{model_name} (OOT)", oot_y_true, oot_df["RUL_pred"])

# --------------------------------------------------------------
# 5Ô∏è‚É£ Save results
# --------------------------------------------------------------
oot_path = f"datamart/gold/oot_predictions_{model_name.lower()}.parquet"
os.makedirs(os.path.dirname(oot_path), exist_ok=True)
oot_df.to_parquet(oot_path, index=False)

print("\n‚úÖ OOT results saved successfully!")
print(f"üíæ Path: {oot_path}")
print(f"üìä {model_name} (OOT) ‚Üí MAE: {mae_oot:.3f}, RMSE: {rmse_oot:.3f}, R¬≤: {r2_oot:.3f}")



üì¶ Loaded model artefact: RidgeRegression (engine_rul_prediction_2025-11-02)
‚úÖ Model loaded from model_bank/engine_rul_prediction_2025-11-02.pkl

üöÄ Generating OOT bootstrap from test set...
üìä RidgeRegression (OOT) ‚Üí MAE: 13.864, RMSE: 18.757, R¬≤: 0.883

‚úÖ OOT results saved successfully!
üíæ Path: datamart/gold/oot_predictions_ridgeregression.parquet
üìä RidgeRegression (OOT) ‚Üí MAE: 13.864, RMSE: 18.757, R¬≤: 0.883
