In [12]:
# ==============================================================
# MODEL TRAINING PIPELINE (Notebook Version)
# ==============================================================

import os
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, make_scorer
from math import sqrt

# --------------------------------------------------------------
# LOAD GOLD DATA
# --------------------------------------------------------------
# Gold-layer inputs: one CSV with features, one CSV with labels.
feature_path = "datamart/gold/feature_store.csv"
label_path = "datamart/gold/label_store.csv"
df_feat, df_label = (pd.read_csv(csv) for csv in (feature_path, label_path))

# Inner join: keep only (unit, cycle) pairs that appear in both files.
df = pd.merge(df_feat, df_label, how="inner", on=["unit", "cycle"])
print(f"✅ Loaded {len(df):,} rows across {df['unit'].nunique()} engines")

# --------------------------------------------------------------
# PREPARE FEATURES / LABELS
# --------------------------------------------------------------
# Identifier columns and the target itself are not model inputs.
drop_cols = ["unit", "cycle", "RUL"]
units = df["unit"]
y = df["RUL"]
# Every remaining column of the merged frame is treated as a feature.
X = df.drop(drop_cols, axis="columns")

# --------------------------------------------------------------
# SPLIT ENGINE-WISE (avoid leakage)
# --------------------------------------------------------------
# Fixed seed so the shuffled engine order (and therefore the split) repeats.
np.random.seed(42)
engine_ids = df["unit"].unique()
np.random.shuffle(engine_ids)  # shuffles the id array in place

# Cut points at 70 % and 85 % of the engines: train / validation / test.
n = len(engine_ids)
train_cut, val_cut = int(n * 0.7), int(n * 0.85)

train_units, val_units, test_units = (
    engine_ids[:train_cut],
    engine_ids[train_cut:val_cut],
    engine_ids[val_cut:],
)

def subset(units_subset):
    """Return the features and RUL target for the given engines.

    Reads the module-level ``df`` and ``X``: rows of ``df`` whose ``unit``
    value is contained in *units_subset* are selected, and the feature
    frame is limited to the columns of ``X``.

    Args:
        units_subset: Collection of engine ids to keep.

    Returns:
        A ``(features, target)`` pair: a DataFrame with the columns of ``X``
        and the matching ``RUL`` Series.
    """
    in_subset = df["unit"].isin(units_subset)
    features = df.loc[in_subset, X.columns]
    target = df.loc[in_subset, "RUL"]
    return features, target

# Build the three partitions in one pass; each call returns (features, target).
(X_train, y_train), (X_val, y_val), (X_test, y_test) = (
    subset(ids) for ids in (train_units, val_units, test_units)
)

print(f"🧩 Split by engines → Train: {len(train_units)}, Val: {len(val_units)}, Test: {len(test_units)}")

# --------------------------------------------------------------
# METRIC HELPER
# --------------------------------------------------------------
def evaluate(name, y_true, y_pred):
    """Print and return MAE, RMSE and R² for one set of predictions.

    Args:
        name: Label shown at the start of the printed line.
        y_true: Ground-truth target values.
        y_pred: Predicted target values.

    Returns:
        Tuple ``(mae, rmse, r2)``.
    """
    abs_err = mean_absolute_error(y_true, y_pred)
    # RMSE is the square root of the mean squared error.
    root_mse = sqrt(mean_squared_error(y_true, y_pred))
    r_squared = r2_score(y_true, y_pred)
    print(f"📊 {name} → MAE: {abs_err:.3f}, RMSE: {root_mse:.3f}, R²: {r_squared:.3f}")
    return abs_err, root_mse, r_squared

# --------------------------------------------------------------
# MODEL 1 — LINEAR REGRESSION
# --------------------------------------------------------------
from sklearn.linear_model import LinearRegression

print("\n🔹 MODEL 1: LINEAR REGRESSION (Baseline)")
# Fit on the training engines; fit() returns the estimator itself.
lr = LinearRegression().fit(X_train, y_train)
pred_val_lr = lr.predict(X_val)
mae_lr, rmse_lr, r2_lr = evaluate(
    "Linear Regression (Val)",
    y_val,
    pred_val_lr,
)

# --------------------------------------------------------------
# MODEL 2 — RANDOM FOREST (GridSearch)
# --------------------------------------------------------------
from sklearn.ensemble import RandomForestRegressor

print("\n🔹 MODEL 2: RANDOM FOREST (GridSearch)")
# NOTE(review): this estimator and the GridSearchCV further down both ask for
# n_jobs=-1 — worth confirming that this does not oversubscribe the CPU.
rf = RandomForestRegressor(n_jobs=-1, random_state=42)

# 2 * 3 * 2 * 2 * 2 = 48 hyperparameter combinations.
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[10, 20, None],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
    max_features=["sqrt", "log2"],
)

def rmse_scorer(y_true, y_pred):
    """Return the negated RMSE of *y_pred* against *y_true*.

    The sign is flipped so that a larger value means a smaller error.
    """
    mse = mean_squared_error(y_true, y_pred)
    return -sqrt(mse)

from sklearn.metrics import make_scorer
# Local import, in line with the per-section imports used in this script.
from sklearn.model_selection import GroupKFold

# Engine id of every training row, looked up through X_train's index.
# X_train is a row subset of df, so the labels line up as long as df's index
# is unique (NOTE(review): confirm if df is ever re-indexed before this point).
train_groups = df.loc[X_train.index, "unit"]

# The train/val/test split above is engine-wise to avoid leakage. The inner
# cross-validation has to follow the same rule: with a plain integer `cv`,
# cycles of one engine can land in both the fitting and the validation part of
# a fold, which can bias which hyperparameters win. GroupKFold keeps all rows
# of an engine inside a single fold.
grid = GridSearchCV(
    rf,
    param_grid,
    scoring=make_scorer(rmse_scorer),  # rmse_scorer is negated RMSE: higher is better
    cv=GroupKFold(n_splits=3),
    n_jobs=-1,
    verbose=1,
)
grid.fit(X_train, y_train, groups=train_groups)

# Report the winning parameter set, then keep the matching estimator.
print(f"🏆 Best Params: {grid.best_params_}")
best_rf = grid.best_estimator_

# Score the tuned forest on the validation engines.
pred_val_rf = best_rf.predict(X_val)
mae_rf, rmse_rf, r2_rf = evaluate(
    "Random Forest (Val)",
    y_val,
    pred_val_rf,
)

# --------------------------------------------------------------
# TEST EVALUATION
# --------------------------------------------------------------
# Score the tuned forest on the held-out test engines.
pred_test_rf = best_rf.predict(X_test)
mae_test, rmse_test, r2_test = evaluate(
    name="Random Forest (Test)",
    y_true=y_test,
    y_pred=pred_test_rf,
)

# --------------------------------------------------------------
# SUMMARY
# --------------------------------------------------------------
# Emit the whole summary with a single print; the text is the same as
# printing each line separately.
print(
    "\n".join(
        (
            "\n============================================================",
            "🏁 SUMMARY",
            f"Linear Regression Val R²: {r2_lr:.3f}",
            f"Random Forest Val R²:     {r2_rf:.3f}",
            f"Random Forest Test R²:    {r2_test:.3f}",
            "============================================================",
        )
    )
)


✅ Loaded 105,571 rows across 600 engines
🧩 Split by engines → Train: 420, Val: 90, Test: 90

🔹 MODEL 1: LINEAR REGRESSION (Baseline)
📊 Linear Regression (Val) → MAE: 14.060, RMSE: 18.237, R²: 0.891

🔹 MODEL 2: RANDOM FOREST (GridSearch)
Fitting 3 folds for each of 48 candidates, totalling 144 fits


KeyboardInterrupt: 