In [6]:
# ==============================================================
# MODEL TRAINING PIPELINE — LINEAR REGRESSION & SVR (RUL PREDICTION)
# ==============================================================

import os
import pandas as pd
import pickle
import numpy as np
import pprint
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# --------------------------------------------------------------
# 1. CONFIGURATION
# --------------------------------------------------------------
# Run-level settings are collected in a single dict so they can be printed
# once and looked up by the later pipeline steps.
config = {
    # Training date; it becomes part of the saved artefact's version string.
    "model_train_date_str": datetime.now().strftime("%Y-%m-%d"),
    "train_test_ratio": 0.8,
    "random_seed": 42,
    "model_name": "engine_rul_prediction",
    "models_directory": "model_bank/",
}

pprint.pprint(config)

# Seed NumPy's global random number generator with the configured seed.
np.random.seed(config["random_seed"])

# --------------------------------------------------------------
# 2. LOAD GOLD FEATURE & LABEL STORES (MERGED)
# --------------------------------------------------------------
print("\nüì¶ Loading Gold data...")

feature_path = "datamart/gold/feature_store.parquet"
label_path = "datamart/gold/label_store.parquet"

# Read the feature table and the label table.
X_feat = pd.read_parquet(feature_path)
y_df = pd.read_parquet(label_path)

# The two tables are joined purely by row position (both indexes are reset
# below). An inner join on position would silently drop the surplus rows of
# the longer table, so refuse to continue when the row counts differ.
# NOTE(review): this also assumes both stores share the same row order —
# confirm against the job that writes them.
if len(X_feat) != len(y_df):
    raise ValueError(
        f"Feature store has {len(X_feat)} rows but label store has "
        f"{len(y_df)}; the stores cannot be aligned by position."
    )

df = X_feat.reset_index(drop=True).merge(
    y_df.reset_index(drop=True), left_index=True, right_index=True
)

print(f"Loaded {len(df)} rows with {len(df['unit'].unique())} engines.")

# Split the merged frame into the target column and everything else.
y = df["RUL"]
X = df.drop(columns=["RUL"])

# --------------------------------------------------------------
# 3. TRAIN/TEST SPLIT (BY ENGINE UNIT ID)
# --------------------------------------------------------------
# Split on engine IDs rather than on rows so that every row of a given
# engine ends up on the same side of the split.
engine_ids = X["unit"].unique()
train_engines, test_engines = train_test_split(
    engine_ids,
    # Driven by the config value instead of a hard-coded test_size, so the
    # ratio recorded in the saved artefact is the one actually used.
    train_size=config["train_test_ratio"],
    random_state=config["random_seed"],
)

train_mask = X["unit"].isin(train_engines)
test_mask = X["unit"].isin(test_engines)

X_train, y_train = X[train_mask], y[train_mask]
X_test, y_test = X[test_mask], y[test_mask]

print("\nSplit by engine IDs:")
print(f"  ‚Üí Train engines: {len(train_engines)} | Test engines: {len(test_engines)}")
print(f"  ‚Üí Train samples: {len(X_train)} | Test samples: {len(X_test)}")

# Drop non-feature columns before imputing and scaling.
drop_cols = [
    c
    for c in ["unit", "cycle", "processing_timestamp", "op_regime", "early_life"]
    if c in X_train.columns
]
X_train = X_train.drop(columns=drop_cols)
X_test = X_test.drop(columns=drop_cols)

# NaN safeguard: fill gaps with column means computed on the training rows
# only. Computing the means before the split would let values from the test
# engines influence the training data.
feature_means = X_train.mean()
X_train = X_train.fillna(feature_means)
X_test = X_test.fillna(feature_means)

# --------------------------------------------------------------
# 4. DATA SCALING
# --------------------------------------------------------------
# Fit the scaler on the training rows only, then apply the fitted transform
# to both the training and the test rows.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("\n‚úÖ Scaling complete.")

# --------------------------------------------------------------
# 5. TRAIN LINEAR REGRESSION MODEL
# --------------------------------------------------------------
print("\nüîπ Training Linear Regression Model...")
lr_model = LinearRegression().fit(X_train_scaled, y_train)

# Score the fitted model on the scaled test rows.
y_pred_lr = lr_model.predict(X_test_scaled)
mse_lr = mean_squared_error(y_test, y_pred_lr)
rmse_lr = np.sqrt(mse_lr)  # RMSE is the square root of the MSE
mae_lr = mean_absolute_error(y_test, y_pred_lr)
r2_lr = r2_score(y_test, y_pred_lr)

print(f"Linear Regression ‚Üí MAE: {mae_lr:.3f}, RMSE: {rmse_lr:.3f}, R¬≤: {r2_lr:.3f}")

# --------------------------------------------------------------
# 6. TRAIN SVR MODEL
# --------------------------------------------------------------
print("\nüîπ Training SVR Model...")
svr_model = SVR(kernel="rbf", C=10, epsilon=0.1).fit(X_train_scaled, y_train)

# Score the fitted model on the scaled test rows.
y_pred_svr = svr_model.predict(X_test_scaled)
mse_svr = mean_squared_error(y_test, y_pred_svr)
rmse_svr = np.sqrt(mse_svr)  # RMSE is the square root of the MSE
mae_svr = mean_absolute_error(y_test, y_pred_svr)
r2_svr = r2_score(y_test, y_pred_svr)

print(f"SVR (RBF) ‚Üí MAE: {mae_svr:.3f}, RMSE: {rmse_svr:.3f}, R¬≤: {r2_svr:.3f}")

# --------------------------------------------------------------
# 7. SELECT BEST MODEL (BASED ON RMSE)
# --------------------------------------------------------------
# The model with the strictly lower test RMSE wins; on a tie the linear
# model is kept.
# NOTE(review): the winner is chosen on the same test rows whose metrics are
# then reported, so the reported scores are likely somewhat optimistic —
# consider a separate validation split.
if rmse_svr < rmse_lr:
    best_model, best_name = svr_model, "SVR (RBF)"
    best_mae, best_rmse, best_r2 = mae_svr, rmse_svr, r2_svr
else:
    best_model, best_name = lr_model, "Linear Regression"
    best_mae, best_rmse, best_r2 = mae_lr, rmse_lr, r2_lr

# Store plain Python floats: np.sqrt yields a NumPy scalar for RMSE, which
# would otherwise leave the metrics dict with mixed numeric types.
metrics = {
    "MAE": float(best_mae),
    "RMSE": float(best_rmse),
    "R2": float(best_r2),
}

print(f"\nüèÜ Best Model Selected: {best_name}")
print(f"   ‚Üí RMSE: {metrics['RMSE']:.3f}, MAE: {metrics['MAE']:.3f}, R¬≤: {metrics['R2']:.3f}")

# --------------------------------------------------------------
# 8. SAVE BEST MODEL ARTEFACT
# --------------------------------------------------------------
os.makedirs(config["models_directory"], exist_ok=True)

# Everything needed to reuse the model is bundled into one dict: the fitted
# estimator, the fitted scaler, the input columns, the scores and the split.
model_artefact = {
    "model_name": best_name,
    "model_version": f"{config['model_name']}_{config['model_train_date_str']}",
    "model": best_model,
    "metrics": metrics,
    "scaler": scaler,
    # Column names, in order, of the frame the scaler was fitted on. Without
    # them a consumer of the artefact cannot rebuild the input matrix with
    # the same columns in the same order.
    "feature_columns": list(X_train.columns),
    "train_test_split": {
        "train_engines": train_engines.tolist(),
        "test_engines": test_engines.tolist(),
        "train_ratio": config["train_test_ratio"],
    },
    "created_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
}

# The file name carries only the training date, so a second run on the same
# day overwrites the earlier artefact.
file_path = os.path.join(
    config["models_directory"], model_artefact["model_version"] + ".pkl"
)

with open(file_path, "wb") as f:
    pickle.dump(model_artefact, f)

print(f"\n‚úÖ Saved best model to: {file_path}")

# --------------------------------------------------------------
# 9. VALIDATE LOADED MODEL
# --------------------------------------------------------------
# pickle.load can execute arbitrary code; it is acceptable here only because
# the file was written by this same script a moment ago.
with open(file_path, "rb") as f:
    loaded_model = pickle.load(f)

loaded_scaler = loaded_model["scaler"]
loaded_model_instance = loaded_model["model"]

# Run the reloaded scaler + model on the test rows and compare against the
# in-memory model, so that "passed" is only printed when the round trip
# really reproduces the same predictions.
y_pred_check = loaded_model_instance.predict(loaded_scaler.transform(X_test))
y_pred_expected = best_model.predict(X_test_scaled)
if not np.allclose(y_pred_check, y_pred_expected):
    raise RuntimeError(
        "Reloaded model artefact does not reproduce the in-memory model's "
        f"predictions (file: {file_path})."
    )
print(f"\n‚úÖ Model reload check passed. Mean prediction: {np.mean(y_pred_check):.2f}")

# --------------------------------------------------------------
# 10. PRINT FINAL SUMMARY
# --------------------------------------------------------------
print("\nüìä FINAL MODEL PERFORMANCE SUMMARY")
pprint.pprint(model_artefact["metrics"])


{'model_name': 'engine_rul_prediction',
 'model_train_date_str': '2025-11-01',
 'models_directory': 'model_bank/',
 'random_seed': 42,
 'train_test_ratio': 0.8}

üì¶ Loading Gold data...
Loaded 20631 rows with 100 engines.

Split by engine IDs:
  ‚Üí Train engines: 80 | Test engines: 20
  ‚Üí Train samples: 16561 | Test samples: 4070

‚úÖ Scaling complete.

üîπ Training Linear Regression Model...
Linear Regression ‚Üí MAE: 23.449, RMSE: 36.932, R¬≤: 0.684

üîπ Training SVR Model...
SVR (RBF) ‚Üí MAE: 23.146, RMSE: 36.787, R¬≤: 0.686

üèÜ Best Model Selected: SVR (RBF)
   ‚Üí RMSE: 36.787, MAE: 23.146, R¬≤: 0.686

‚úÖ Saved best model to: model_bank/engine_rul_prediction_2025-11-01.pkl

‚úÖ Model reload check passed. Mean prediction: 101.78

üìä FINAL MODEL PERFORMANCE SUMMARY
{'MAE': 23.146113234911326,
 'R2': 0.6860258979034353,
 'RMSE': np.float64(36.78698968917228)}
