In [None]:
# ==============================================================
# MODEL TRAINING PIPELINE — PROPER TRAIN/VALIDATE/TEST SPLIT
# ==============================================================

import os
import pandas as pd
import pickle
import numpy as np
import pprint
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from itertools import product

# --------------------------------------------------------------
# 1️⃣ CONFIGURATION
# --------------------------------------------------------------
# All pipeline settings live in one literal. The training date is stamped at
# run time and later becomes part of the saved artefact's version string.
config = {
    "model_train_date_str": datetime.now().strftime("%Y-%m-%d"),
    "train_ratio": 0.70,
    "validation_ratio": 0.15,
    "test_ratio": 0.15,
    "random_seed": 42,
    "model_name": "engine_rul_prediction",
    "models_directory": "model_bank/",
    "enable_hyperparameter_tuning": True,
}

pprint.pprint(config)

# Seed NumPy's global generator with the configured seed.
np.random.seed(config["random_seed"])

# --------------------------------------------------------------
# 2️⃣ LOAD GOLD FEATURE & LABEL STORES (MERGED)
# --------------------------------------------------------------
print("\n📦 Loading Gold data...")

feature_path = "datamart/gold/feature_store.parquet"
label_path = "datamart/gold/label_store.parquet"

# Load both
X_feat = pd.read_parquet(feature_path)
y_df = pd.read_parquet(label_path)

# The stores are joined purely by row position. With unequal lengths the
# inner join would silently drop rows and hide a feature/label misalignment,
# so fail loudly instead.
# NOTE(review): positional alignment also assumes both stores are written in
# the same row order — confirm against the job that produces them.
if len(X_feat) != len(y_df):
    raise ValueError(
        f"Feature store has {len(X_feat)} rows but label store has {len(y_df)}; "
        "cannot align them by row position."
    )

df = X_feat.reset_index(drop=True).merge(
    y_df.reset_index(drop=True), left_index=True, right_index=True
)

print(f"Loaded {len(df)} rows with {len(df['unit'].unique())} engines.")

# Separate features and target
y = df["RUL"]
X = df.drop(columns=["RUL"])

# --------------------------------------------------------------
# 3️⃣ TRAIN/VALIDATE/TEST SPLIT (BY ENGINE UNIT ID) - 70/15/15
# --------------------------------------------------------------
# Splitting on engine ids (not rows) keeps every row of an engine in exactly
# one of the three sets.
engine_ids = X["unit"].unique()
holdout_ratio = config["validation_ratio"] + config["test_ratio"]

# First split: train vs. temp (validation + test)
train_engines, temp_engines = train_test_split(
    engine_ids,
    test_size=holdout_ratio,
    random_state=config["random_seed"]
)

# Second split: divide temp according to the configured validation/test
# ratios (0.5 with the default 15%/15% settings).
validation_engines, test_engines = train_test_split(
    temp_engines,
    test_size=config["test_ratio"] / holdout_ratio,
    random_state=config["random_seed"]
)

# Build each row mask once and reuse it for both features and target
train_mask = X["unit"].isin(train_engines)
val_mask = X["unit"].isin(validation_engines)
test_mask = X["unit"].isin(test_engines)

X_train, y_train = X[train_mask], y[train_mask]
X_val, y_val = X[val_mask], y[val_mask]
X_test, y_test = X[test_mask], y[test_mask]

print(f"\n📊 Data Split (by engine IDs):")
print(f"  → Train:      {len(train_engines):3d} engines ({len(X_train):6d} samples) - {len(train_engines)/len(engine_ids)*100:.1f}%")
print(f"  → Validation: {len(validation_engines):3d} engines ({len(X_val):6d} samples) - {len(validation_engines)/len(engine_ids)*100:.1f}%")
print(f"  → Test:       {len(test_engines):3d} engines ({len(X_test):6d} samples) - {len(test_engines)/len(engine_ids)*100:.1f}%")

# Drop non-feature columns before scaling
drop_cols = [c for c in ["unit", "cycle", "processing_timestamp", "op_regime", "early_life"] if c in X_train.columns]
X_train_clean = X_train.drop(columns=drop_cols, errors="ignore")
X_val_clean = X_val.drop(columns=drop_cols, errors="ignore")
X_test_clean = X_test.drop(columns=drop_cols, errors="ignore")

# NaN safeguard: impute with column means learned from the TRAINING rows
# only, so no statistic from the validation/test engines leaks into training.
train_means = X_train_clean.mean()
X_train_clean = X_train_clean.fillna(train_means)
X_val_clean = X_val_clean.fillna(train_means)
X_test_clean = X_test_clean.fillna(train_means)

# --------------------------------------------------------------
# 4️⃣ DATA SCALING (fit on train, transform on val & test)
# --------------------------------------------------------------
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_clean)
X_val_scaled = scaler.transform(X_val_clean)
X_test_scaled = scaler.transform(X_test_clean)

print("\n✅ Scaling complete.")

# --------------------------------------------------------------
# 5️⃣ TRAIN LINEAR REGRESSION (NO HYPERPARAMETER TUNING)
# --------------------------------------------------------------
print("\n" + "="*60)
print("🔹 MODEL 1: LINEAR REGRESSION (Baseline)")
print("="*60)
print("   No hyperparameters to tune - training directly on train set...")

# Fit the baseline on the scaled training matrix (fit() returns the estimator)
lr_model = LinearRegression().fit(X_train_scaled, y_train)

# Score the baseline on the validation engines
y_val_pred_lr = lr_model.predict(X_val_scaled)
val_mae_lr = mean_absolute_error(y_val, y_val_pred_lr)
val_rmse_lr = np.sqrt(mean_squared_error(y_val, y_val_pred_lr))
val_r2_lr = r2_score(y_val, y_val_pred_lr)

print(f"\n   📊 Validation Performance:")
# Labels are padded by hand so the three values line up in one column
for label, value in (("MAE: ", val_mae_lr), ("RMSE:", val_rmse_lr), ("R²:  ", val_r2_lr)):
    print(f"      {label} {value:.3f}")

# --------------------------------------------------------------
# 6️⃣ TRAIN RANDOM FOREST (WITH HYPERPARAMETER TUNING ON VALIDATION)
# --------------------------------------------------------------
print("\n" + "="*60)
print("🔹 MODEL 2: RANDOM FOREST (With Hyperparameter Tuning)")
print("="*60)

# STEP 1: Train baseline Random Forest with default parameters
print("   STEP 1: Training baseline Random Forest (default params)...")
# Kept as a dict so the same values can be reported as the chosen
# hyperparameters if tuning fails to beat the baseline.
baseline_params = {
    "n_estimators": 100,
    "max_depth": None,
    "min_samples_split": 2,
    "min_samples_leaf": 1,
}
rf_baseline = RandomForestRegressor(
    **baseline_params,
    random_state=config["random_seed"],
    n_jobs=-1
)
rf_baseline.fit(X_train_scaled, y_train)

# Evaluate baseline on validation set
y_val_pred_baseline = rf_baseline.predict(X_val_scaled)
val_mae_baseline = mean_absolute_error(y_val, y_val_pred_baseline)
val_rmse_baseline = np.sqrt(mean_squared_error(y_val, y_val_pred_baseline))
val_r2_baseline = r2_score(y_val, y_val_pred_baseline)

print(f"\n   📊 Baseline Performance (PRE-TUNING):")
print(f"      MAE:  {val_mae_baseline:.3f}")
print(f"      RMSE: {val_rmse_baseline:.3f}")
print(f"      R²:   {val_r2_baseline:.3f}")

# STEP 2: Hyperparameter tuning on validation set
if config["enable_hyperparameter_tuning"]:
    print(f"\n   STEP 2: Tuning hyperparameters on validation set...")

    # Define hyperparameter grid
    param_grid = {
        "n_estimators": [50, 100, 150],
        "max_depth": [10, 20, 30],
        "min_samples_split": [2, 5],
        "min_samples_leaf": [1, 2]
    }

    # Materialise the grid once. product() varies the last key fastest, which
    # is the same visiting order as nesting one loop per key.
    combinations = list(product(*param_grid.values()))
    total_combinations = len(combinations)

    print(f"   Testing {total_combinations} parameter combinations...\n")

    best_rmse = float('inf')
    best_params = None
    best_rf_model = None

    # Manual grid search using validation set
    for combo_count, (n_est, max_d, min_split, min_leaf) in enumerate(combinations, start=1):
        # Train model with current parameters
        rf = RandomForestRegressor(
            n_estimators=n_est,
            max_depth=max_d,
            min_samples_split=min_split,
            min_samples_leaf=min_leaf,
            random_state=config["random_seed"],
            n_jobs=-1
        )
        rf.fit(X_train_scaled, y_train)

        # Evaluate on validation set
        y_val_pred = rf.predict(X_val_scaled)
        val_rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))

        print(f"   [{combo_count}/{total_combinations}] n_est={n_est}, max_depth={max_d}, "
              f"min_split={min_split}, min_leaf={min_leaf} → Val RMSE: {val_rmse:.3f}")

        # Track best model
        if val_rmse < best_rmse:
            best_rmse = val_rmse
            best_params = {
                "n_estimators": n_est,
                "max_depth": max_d,
                "min_samples_split": min_split,
                "min_samples_leaf": min_leaf
            }
            best_rf_model = rf

    # best_rf_model stays None only if no candidate produced a comparable
    # (non-NaN) RMSE; treat that as "tuning did not help".
    if best_rf_model is None:
        print("   ⚠️ No grid candidate produced a valid RMSE - keeping baseline model")
        rf_model = rf_baseline
        best_params = baseline_params
        val_mae_rf = val_mae_baseline
        val_rmse_rf = val_rmse_baseline
        val_r2_rf = val_r2_baseline
    else:
        print(f"\n   ✅ Best hyperparameters found:")
        pprint.pprint(best_params)

        # Evaluate tuned model on validation set
        y_val_pred_rf = best_rf_model.predict(X_val_scaled)
        val_mae_rf = mean_absolute_error(y_val, y_val_pred_rf)
        val_rmse_rf = np.sqrt(mean_squared_error(y_val, y_val_pred_rf))
        val_r2_rf = r2_score(y_val, y_val_pred_rf)

        print(f"\n   📊 Tuned Performance (POST-TUNING):")
        print(f"      MAE:  {val_mae_rf:.3f}")
        print(f"      RMSE: {val_rmse_rf:.3f}")
        print(f"      R²:   {val_r2_rf:.3f}")

        # Show improvement (guard the percentage against a zero baseline RMSE)
        rmse_improvement = val_rmse_baseline - val_rmse_rf
        if val_rmse_baseline > 0:
            improvement_pct = (rmse_improvement / val_rmse_baseline) * 100
        else:
            improvement_pct = 0.0

        print(f"\n   📈 IMPROVEMENT FROM TUNING:")
        print(f"      RMSE Reduction: {rmse_improvement:.3f} ({improvement_pct:+.1f}%)")

        if rmse_improvement > 0:
            print(f"      ✅ Hyperparameter tuning improved performance!")
            rf_model = best_rf_model
        else:
            # The grid never contains the baseline setting (max_depth=None),
            # so the best candidate can be worse than the baseline. Carry the
            # baseline forward rather than a model known to be worse.
            print(f"      ⚠️ Baseline was already optimal - tuning didn't help")
            print(f"      ↩️ Keeping baseline Random Forest for model comparison")
            rf_model = rf_baseline
            best_params = baseline_params
            y_val_pred_rf = y_val_pred_baseline
            val_mae_rf = val_mae_baseline
            val_rmse_rf = val_rmse_baseline
            val_r2_rf = val_r2_baseline

else:
    # Use baseline model
    print("   Skipping hyperparameter tuning (disabled in config)...")
    rf_model = rf_baseline
    val_mae_rf = val_mae_baseline
    val_rmse_rf = val_rmse_baseline
    val_r2_rf = val_r2_baseline
    best_params = None

# --------------------------------------------------------------
# 7️⃣ SELECT BEST MODEL (BASED ON VALIDATION RMSE)
# --------------------------------------------------------------
print("\n" + "="*60)
print("📊 MODEL COMPARISON (Validation Set)")
print("="*60)
print(f"  Linear Regression → RMSE: {val_rmse_lr:.3f}, MAE: {val_mae_lr:.3f}, R²: {val_r2_lr:.3f}")
print(f"  Random Forest     → RMSE: {val_rmse_rf:.3f}, MAE: {val_mae_rf:.3f}, R²: {val_r2_rf:.3f}")

# Random Forest wins only with a strictly lower validation RMSE; a tie keeps
# Linear Regression.
rf_wins = val_rmse_rf < val_rmse_lr

best_model = rf_model if rf_wins else lr_model
best_model_name = "Random Forest" if rf_wins else "Linear Regression"
val_metrics = (
    {"MAE": val_mae_rf, "RMSE": val_rmse_rf, "R2": val_r2_rf}
    if rf_wins
    else {"MAE": val_mae_lr, "RMSE": val_rmse_lr, "R2": val_r2_lr}
)

print(f"\n🏆 Best Model Selected: {best_model_name}")

# --------------------------------------------------------------
# 8️⃣ FINAL EVALUATION ON TEST SET (UNSEEN DATA)
# --------------------------------------------------------------
print("\n" + "="*60)
print("🧪 FINAL TEST SET EVALUATION (Unseen Data)")
print("="*60)

# Score the selected model on the test engines
y_test_pred = best_model.predict(X_test_scaled)
test_metrics = {
    "MAE": mean_absolute_error(y_test, y_test_pred),
    "RMSE": np.sqrt(mean_squared_error(y_test, y_test_pred)),
    "R2": r2_score(y_test, y_test_pred),
}
# Keep the individual names available alongside the dict
test_mae = test_metrics["MAE"]
test_rmse = test_metrics["RMSE"]
test_r2 = test_metrics["R2"]

print(f"\n   Model: {best_model_name}")
print(f"   MAE:  {test_mae:.3f}")
print(f"   RMSE: {test_rmse:.3f}")
print(f"   R²:   {test_r2:.3f}")

# --------------------------------------------------------------
# 9️⃣ SAVE BEST MODEL ARTEFACT
# --------------------------------------------------------------
os.makedirs(config["models_directory"], exist_ok=True)

# The version string (model name + training date) is also the file name.
model_version = f"{config['model_name']}_{config['model_train_date_str']}"

# Hyperparameters are recorded only when tuning ran and Random Forest won.
if config["enable_hyperparameter_tuning"] and best_model_name == "Random Forest":
    recorded_hyperparameters = best_params
else:
    recorded_hyperparameters = None

# Engine ids per split, stored as plain lists, plus the configured ratios
split_record = {
    "train_engines": train_engines.tolist(),
    "validation_engines": validation_engines.tolist(),
    "test_engines": test_engines.tolist(),
    "train_ratio": config["train_ratio"],
    "validation_ratio": config["validation_ratio"],
    "test_ratio": config["test_ratio"],
}

# Bundle the model with the scaler it was trained behind and its metrics
model_artefact = {
    "model_name": best_model_name,
    "model_version": model_version,
    "model": best_model,
    "scaler": scaler,
    "validation_metrics": val_metrics,
    "test_metrics": test_metrics,
    "best_hyperparameters": recorded_hyperparameters,
    "data_split": split_record,
    "created_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
}

file_path = os.path.join(config["models_directory"], model_version + ".pkl")

with open(file_path, "wb") as artefact_file:
    pickle.dump(model_artefact, artefact_file)

print(f"\n✅ Saved best model to: {file_path}")

# --------------------------------------------------------------
# 🔟 VALIDATE LOADED MODEL
# --------------------------------------------------------------
# Round-trip check: reload the artefact that was just written and confirm it
# reproduces the in-memory model's test predictions.
# NOTE: pickle.load executes arbitrary code from the file; it is acceptable
# here only because the file was produced by this same run.
with open(file_path, "rb") as f:
    loaded_model = pickle.load(f)

loaded_scaler = loaded_model["scaler"]
loaded_model_instance = loaded_model["model"]

y_test_check = loaded_model_instance.predict(loaded_scaler.transform(X_test_clean))

# Compare against the predictions made before saving, so that "passed" means
# the reloaded scaler + model behave the same as the originals.
if not np.allclose(y_test_check, y_test_pred):
    raise RuntimeError(
        f"Model reload check failed: predictions from {file_path} differ "
        "from the in-memory model's test predictions."
    )

print(f"✅ Model reload check passed. Mean test prediction: {np.mean(y_test_check):.2f}")

# --------------------------------------------------------------
# 1️⃣1️⃣ FINAL SUMMARY
# --------------------------------------------------------------
print("\n" + "="*60)
print("🎯 FINAL SUMMARY")
print("="*60)
print(f"Best Model: {best_model_name}")

print("\nValidation Performance (used for model selection):")
pprint.pprint(val_metrics)

print("\nTest Performance (final unseen data evaluation):")
pprint.pprint(test_metrics)

# Printed only when the artefact recorded hyperparameters (non-empty value)
saved_hyperparameters = model_artefact["best_hyperparameters"]
if saved_hyperparameters:
    print("\nBest Hyperparameters:")
    pprint.pprint(saved_hyperparameters)

print("\n✅ Training pipeline complete!")

{'cv_folds': 3,
 'enable_hyperparameter_tuning': True,
 'model_name': 'engine_rul_prediction',
 'model_train_date_str': '2025-11-01',
 'models_directory': 'model_bank/',
 'random_seed': 42,
 'train_test_ratio': 0.8}

📦 Loading Gold data...
Loaded 105571 rows with 600 engines.

Split by engine IDs:
  → Train engines: 480 | Test engines: 120
  → Train samples: 84097 | Test samples: 21474

✅ Scaling complete.

🔹 Training Linear Regression...
   ✅ Trained with default parameters
   📊 Linear Regression → MAE: 48.557, RMSE: 58.190, R²: -0.022

🔹 Training Random Forest...
   Running GridSearchCV with 3-fold cross-validation...
Fitting 3 folds for each of 36 candidates, totalling 108 fits


KeyboardInterrupt: 

In [2]:
# ==============================================================
# ENGINE RUL MODELLING PIPELINE — TRAIN / VALIDATION / TEST / OOT
# ==============================================================
import os
import pandas as pd
import numpy as np
import pickle
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import pprint

# --------------------------------------------------------------
# 1️⃣ CONFIGURATION
# --------------------------------------------------------------
# Stamp the run date first; it becomes part of the artefact version string.
training_date = datetime.now().strftime("%Y-%m-%d")

# Pipeline settings, filled in key by key
config = {}
config["model_name"] = "engine_rul_prediction"
config["model_train_date_str"] = training_date
config["train_ratio"] = 0.70
config["val_ratio"] = 0.15
config["test_ratio"] = 0.15
config["random_seed"] = 42
config["enable_hyperparameter_tuning"] = True
config["models_directory"] = "model_bank/"

pprint.pprint(config)
# Seed NumPy's global generator with the configured seed.
np.random.seed(config["random_seed"])


# --------------------------------------------------------------
# 2️⃣ LOAD GOLD FEATURE + LABEL STORE
# --------------------------------------------------------------
print("\n📦 Loading Gold data...")
feature_path = "datamart/gold/feature_store.parquet"
label_path = "datamart/gold/label_store.parquet"

X_feat = pd.read_parquet(feature_path)
y_df = pd.read_parquet(label_path)

# The stores are joined purely by row position. With unequal lengths the
# inner join would silently drop rows and hide a feature/label misalignment,
# so fail loudly instead.
# NOTE(review): positional alignment also assumes both stores are written in
# the same row order — confirm against the job that produces them.
if len(X_feat) != len(y_df):
    raise ValueError(
        f"Feature store has {len(X_feat)} rows but label store has {len(y_df)}; "
        "cannot align them by row position."
    )

df = X_feat.reset_index(drop=True).merge(y_df.reset_index(drop=True), left_index=True, right_index=True)

print(f"✅ Loaded {len(df):,} rows across {df['unit'].nunique()} engines.")
y = df["RUL"]
X = df.drop(columns=["RUL"])

# --------------------------------------------------------------
# 3️⃣ TRAIN / VALIDATION / TEST SPLIT (BY ENGINE)
# --------------------------------------------------------------
# Splitting on engine ids (not rows) keeps every row of an engine in exactly
# one of the three sets.
engine_ids = np.array(sorted(X["unit"].unique()))
holdout_ratio = config["val_ratio"] + config["test_ratio"]
train_eng, temp_eng = train_test_split(
    engine_ids,
    test_size=holdout_ratio,
    random_state=config["random_seed"]
)
# Divide the hold-out engines according to the configured val/test ratios
# (0.5 with the default 15%/15% settings).
val_eng, test_eng = train_test_split(
    temp_eng,
    test_size=config["test_ratio"] / holdout_ratio,
    random_state=config["random_seed"]
)

def subset_by_engine(df, y, engines):
    """Return the rows of ``df`` and ``y`` whose ``unit`` is in ``engines``.

    Args:
        df: Feature frame containing a ``unit`` column.
        y: Target indexed like ``df``.
        engines: Collection of engine ids to keep.

    Returns:
        A ``(features, target)`` tuple filtered by the same row mask.
    """
    mask = df["unit"].isin(engines)
    return df[mask], y[mask]

X_train, y_train = subset_by_engine(X, y, train_eng)
X_val, y_val = subset_by_engine(X, y, val_eng)
X_test, y_test = subset_by_engine(X, y, test_eng)

print(f"\n📊 Engine Split Summary:")
print(f"   Train: {len(train_eng)} engines, {len(X_train):,} samples")
print(f"   Val:   {len(val_eng)} engines, {len(X_val):,} samples")
print(f"   Test:  {len(test_eng)} engines, {len(X_test):,} samples")

# Drop identifiers
drop_cols = [c for c in ["unit", "cycle", "processing_timestamp", "op_regime", "early_life"] if c in X_train.columns]
X_train = X_train.drop(columns=drop_cols, errors="ignore")
X_val = X_val.drop(columns=drop_cols, errors="ignore")
X_test = X_test.drop(columns=drop_cols, errors="ignore")

# Handle missing values with column means learned from the TRAINING rows
# only, so no statistic from the validation/test engines leaks into training.
train_means = X_train.mean()
X_train = X_train.fillna(train_means)
X_val = X_val.fillna(train_means)
X_test = X_test.fillna(train_means)

# --------------------------------------------------------------
# 4️⃣ SCALE DATA
# --------------------------------------------------------------
# The scaler is fitted on the training rows and only applied to val/test.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)
print("\n✅ Scaling complete.")

# --------------------------------------------------------------
# 5️⃣ DEFINE MODEL CANDIDATES
# --------------------------------------------------------------
# Candidate estimators, all seeded from the config.
# The forest uses n_jobs=-1 like the forests in the tuning step: tree fitting
# runs on all cores, and with a fixed random_state the fitted model does not
# depend on the number of jobs.
models = {
    "LinearRegression": LinearRegression(),
    "RandomForest": RandomForestRegressor(random_state=config["random_seed"], n_jobs=-1),
    "GradientBoosting": GradientBoostingRegressor(random_state=config["random_seed"])
}

# --------------------------------------------------------------
# 6️⃣ TRAIN + VALIDATE MODELS
# --------------------------------------------------------------
def evaluate_model(model, X_train, y_train, X_val, y_val, model_name):
    """Fit ``model`` on the training data and score it on the validation data.

    Prints one summary line labelled with ``model_name``.

    Returns:
        A ``(metrics, model)`` tuple: ``metrics`` is a dict with keys
        ``"MAE"``, ``"RMSE"`` and ``"R2"``; ``model`` is the same estimator
        object that was passed in, now fitted.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_val)

    scores = {
        "MAE": mean_absolute_error(y_val, predictions),
        "RMSE": np.sqrt(mean_squared_error(y_val, predictions)),
        "R2": r2_score(y_val, predictions),
    }
    print(f"{model_name:<20} → MAE={scores['MAE']:.3f}, RMSE={scores['RMSE']:.3f}, R²={scores['R2']:.3f}")
    return scores, model

# Fit every candidate and store its validation metrics next to the fitted
# estimator, keyed by candidate name.
results = {}
for name in models:
    model = models[name]
    print(f"\n🔹 Training {name}...")
    metrics, trained_model = evaluate_model(
        model,
        X_train_scaled,
        y_train,
        X_val_scaled,
        y_val,
        name,
    )
    results[name] = dict(metrics=metrics, model=trained_model)

# --------------------------------------------------------------
# 7️⃣ HYPERPARAMETER TUNING (Optional)
# --------------------------------------------------------------
if config["enable_hyperparameter_tuning"]:
    print("\n⚙️ Starting Hyperparameter Tuning for Random Forest...")
    # Small manual grid search: each candidate is fitted on the training set
    # and ranked by validation RMSE.
    best_rmse = float("inf")
    best_params = None
    best_model = None
    best_val_pred = None
    for n_estimators in [100, 200]:
        for max_depth in [10, 20, None]:
            rf = RandomForestRegressor(
                n_estimators=n_estimators,
                max_depth=max_depth,
                random_state=config["random_seed"],
                n_jobs=-1
            )
            rf.fit(X_train_scaled, y_train)
            val_pred = rf.predict(X_val_scaled)
            rmse = np.sqrt(mean_squared_error(y_val, val_pred))
            print(f"   n_estimators={n_estimators}, max_depth={max_depth} → RMSE={rmse:.3f}")
            if rmse < best_rmse:
                best_rmse = rmse
                best_params = {"n_estimators": n_estimators, "max_depth": max_depth}
                best_model = rf
                # Keep the winner's predictions so the remaining metrics can
                # be computed without predicting again.
                best_val_pred = val_pred

    # Explicit None check: a "nothing selected" test should not depend on how
    # the estimator object evaluates in a boolean context.
    if best_model is not None:
        results["RandomForest_Tuned"] = {
            "metrics": {
                "RMSE": best_rmse,
                "MAE": mean_absolute_error(y_val, best_val_pred),
                "R2": r2_score(y_val, best_val_pred)
            },
            "model": best_model,
            "params": best_params
        }
        print(f"\n✅ Best RF params: {best_params}, RMSE={best_rmse:.3f}")

# --------------------------------------------------------------
# 8️⃣ SELECT BEST MODEL (BASED ON VALIDATION RMSE)
# --------------------------------------------------------------
# Pick the candidate whose validation RMSE is lowest
best_name = min(results, key=lambda candidate: results[candidate]["metrics"]["RMSE"])
best_entry = results[best_name]
best_model = best_entry["model"]
best_metrics = best_entry["metrics"]
print(f"\n🏆 Best Model Selected: {best_name} → RMSE={best_metrics['RMSE']:.3f}")

# --------------------------------------------------------------
# 9️⃣ EVALUATE ON TEST + OOT (Out-of-Time)
# --------------------------------------------------------------
print("\n🧪 Evaluating on Test + OOT splits...")

# Create OOT set = rows in the top decile of `cycle` (simulate future period).
# Rows from training engines are excluded: scoring the model on rows it was
# fitted on would make the OOT metrics optimistic.
oot_threshold = df["cycle"].quantile(0.9)
oot_mask = (df["cycle"] >= oot_threshold) & ~df["unit"].isin(train_eng)
oot_df = df[oot_mask]
oot_y = oot_df["RUL"]
oot_X = oot_df.drop(columns=["RUL"] + drop_cols, errors="ignore")
oot_X_scaled = scaler.transform(oot_X)

# `df` is the raw merged frame, so missing values may still be present here.
# StandardScaler passes NaN through transform; a standardized value of 0 is
# the column mean the scaler learned from the training rows, so replacing NaN
# with 0 is training-mean imputation and keeps NaN away from predict().
oot_X_scaled = np.where(np.isnan(oot_X_scaled), 0.0, oot_X_scaled)

# Evaluate
def evaluate_final(model, X, y, name):
    """Score an already-fitted ``model`` on ``X`` against the targets ``y``.

    ``name`` is accepted for call-site readability but is not used.

    Returns:
        A dict with keys ``"MAE"``, ``"RMSE"`` and ``"R2"``.
    """
    predictions = model.predict(X)
    mae = mean_absolute_error(y, predictions)
    mse = mean_squared_error(y, predictions)
    r2 = r2_score(y, predictions)
    return {"MAE": mae, "RMSE": np.sqrt(mse), "R2": r2}

# Score the selected model on the test split and on the OOT split
test_metrics = evaluate_final(best_model, X_test_scaled, y_test, "Test")
oot_metrics = evaluate_final(best_model, oot_X_scaled, oot_y, "OOT")

# One report line per split; the labels are padded so the arrows line up
for split_label, split_metrics in (("Test →", test_metrics), ("OOT  →", oot_metrics)):
    print(f"   {split_label} RMSE={split_metrics['RMSE']:.3f}, MAE={split_metrics['MAE']:.3f}, R²={split_metrics['R2']:.3f}")

# --------------------------------------------------------------
# 🔟 SAVE BEST MODEL ARTEFACT
# --------------------------------------------------------------
os.makedirs(config["models_directory"], exist_ok=True)

# The version string (model name + training date) is also the file name.
model_version = f"{config['model_name']}_{config['model_train_date_str']}"

# Bundle the selected model with the scaler it was trained behind and the
# metrics from each evaluation split.
model_artefact = {
    "model_name": best_name,
    "model_version": model_version,
    "model": best_model,
    "scaler": scaler,
    "validation_metrics": best_metrics,
    "test_metrics": test_metrics,
    "oot_metrics": oot_metrics,
    "created_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
}

file_path = os.path.join(config["models_directory"], model_version + ".pkl")
with open(file_path, "wb") as artefact_file:
    pickle.dump(model_artefact, artefact_file)

print(f"\n✅ Saved model artefact to: {file_path}")
print("🏁 Training pipeline complete.")


{'enable_hyperparameter_tuning': True,
 'model_name': 'engine_rul_prediction',
 'model_train_date_str': '2025-11-01',
 'models_directory': 'model_bank/',
 'random_seed': 42,
 'test_ratio': 0.15,
 'train_ratio': 0.7,
 'val_ratio': 0.15}

📦 Loading Gold data...
✅ Loaded 105,571 rows across 600 engines.

📊 Engine Split Summary:
   Train: 420 engines, 73,179 samples
   Val:   90 engines, 16,145 samples
   Test:  90 engines, 16,247 samples

✅ Scaling complete.

🔹 Training LinearRegression...
LinearRegression     → MAE=49.384, RMSE=60.504, R²=-0.025

🔹 Training RandomForest...


KeyboardInterrupt: 