# **INSTALLS AND PACKAGES**

In [14]:
import os, json, joblib, warnings, sys, time
from pprint import pformat

import numpy as np
import pandas as pd

warnings.filterwarnings("ignore")

from skopt import BayesSearchCV
from skopt.space import Real, Integer, Categorical

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, OneHotEncoder, PowerTransformer
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, StratifiedKFold, GroupKFold, cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, classification_report, accuracy_score, f1_score
from sklearn.linear_model import Ridge, Lasso, LinearRegression, ElasticNet, LogisticRegression
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.svm import SVR, SVC
from sklearn.neural_network import MLPRegressor as SKMLPRegressor, MLPClassifier as SKMLPClassifier
from sklearn.inspection import permutation_importance

USE_TORCH = False
try:
    import torch
    import torch.nn as nn
    import torch.optim as optim
    from torch.utils.data import TensorDataset, DataLoader
    USE_TORCH = True
    TORCH_CUDA = torch.cuda.is_available()
except Exception:
    USE_TORCH = False
    TORCH_CUDA = False

if USE_TORCH:
    print(f"[OK]  PyTorch loaded")
    print(f"       CUDA available: {TORCH_CUDA}")
    if TORCH_CUDA:
        print(f"       GPU device: {torch.cuda.get_device_name(0)}")
    else:
        print("       Running on CPU")
else:
    print("[INFO] PyTorch not installed — using CPU-only sklearn MLP")
print(f"Python version: {sys.version.split()[0]}")

print(f"Packages loaded successfully......!")

[OK]  PyTorch loaded
       CUDA available: False
       Running on CPU
Python version: 3.13.2
Packages loaded successfully......!


# **SETTINGS AND PATH**

In [15]:
DATA_PATH = "Data/data.csv"
BEST_REGRESSOR_PATH = "models/best_regressor.joblib"
BEST_CLASSIFIER_PATH = "models/best_classifier.joblib"
LEADERBOARD_PATH = "reports/leaderboards.json"

RANDOM_STATE = 42
TARGET = "msrp"
TARGET_CLS = "performance_category"

CV = StratifiedKFold(5, shuffle=True, random_state=RANDOM_STATE)
CV5 = KFold(5, shuffle=True, random_state=RANDOM_STATE)
GKF = GroupKFold(n_splits=5)
USE_GROUP_CV = True

SCORE_REG = "neg_root_mean_squared_error"
SCORE_CLA = "f1_macro"
CACHE_SIZE = 2000
MAX_ITER = 25000
N_ITER = 25

# Feature lists
NUMERIC = ["year", "engine_hp", "engine_cylinders", "highway_mpg", "city_mpg", "popularity", "number_of_doors"]
CATEGORICAL = ["make", "model", "engine_fuel_type", "transmission_type", "driven_wheels", "market_category", "vehicle_size", "vehicle_style"]

# Hypertune lists
ALPHA  = [1e-4,1e-3,1e-2,1e-1,0.5,1,10,50,100]
COEF = 10**np.linspace(-3,3,101)
EPSILON = np.linspace(0,0.1,11)
DEGREE = [2,3,4,5]
GAMMA = ['scale','auto']
COEF0 = [0.0,1.0,5.0]
N_NEIGHBORS = [1,2,3,5,7,9,11,15,20,25,30,40,50,75,100]
WEIGHTS = ["uniform","distance"]
LOGREG_PENALTY = ["l2"]
LOGREG_C = [0.01,0.1,0.5,1,2,5,10]
LOGREG_SOLVER = ["lbfgs","saga"]
LDA_SOLVER = ["svd","lsqr","eigen"]
QDA_REG_PARAM = np.linspace(0.05, 0.9, 10)

print(f"DATA_PATH:               {DATA_PATH}")
print(f"BEST_REGRESSOR_PATH:     {BEST_REGRESSOR_PATH}")
print(f"BEST_CLASSIFIER_PATH:    {BEST_CLASSIFIER_PATH}")
print(f"LEADERBOARD_PATH:        {LEADERBOARD_PATH}")
print(f"\nRandom State:            {RANDOM_STATE}")
print(f"Use GroupKFold:          {USE_GROUP_CV}")
print(f"Regression Metric:       {SCORE_REG}")
print(f"Classification Metric:   {SCORE_CLA}")
print("\n--- Feature Columns ---")
print(f"Numeric ({len(NUMERIC)}):        {NUMERIC}")
print(f"Categorical ({len(CATEGORICAL)}): {CATEGORICAL}")
print("\n--- Hyperparameter Search Settings ---")
print(f"SVR Iterations:          {N_ITER}")
print(f"SVR Max Iter:            {MAX_ITER}")
print(f"SVR Cache Size:          {CACHE_SIZE} MB")

print("\nCore configuration loaded successfully.\n")


DATA_PATH:               Data/data.csv
BEST_REGRESSOR_PATH:     models/best_regressor.joblib
BEST_CLASSIFIER_PATH:    models/best_classifier.joblib
LEADERBOARD_PATH:        reports/leaderboards.json

Random State:            42
Use GroupKFold:          True
Regression Metric:       neg_root_mean_squared_error
Classification Metric:   f1_macro

--- Feature Columns ---
Numeric (7):        ['year', 'engine_hp', 'engine_cylinders', 'highway_mpg', 'city_mpg', 'popularity', 'number_of_doors']
Categorical (8): ['make', 'model', 'engine_fuel_type', 'transmission_type', 'driven_wheels', 'market_category', 'vehicle_size', 'vehicle_style']

--- Hyperparameter Search Settings ---
SVR Iterations:          25
SVR Max Iter:            25000
SVR Cache Size:          2000 MB

Core configuration loaded successfully.



# **PREPROCESS AND SPLIT DATA**

In [16]:
car_data = pd.read_csv(DATA_PATH)
car_data.columns = [c.strip().lower().replace(" ","_").replace("-","_") for c in car_data.columns]

# Needed columns check
needed = set(NUMERIC + CATEGORICAL + [TARGET])
missing = [col for col in needed if col not in car_data.columns]
if missing:
    raise ValueError(f"Missing required columns: {missing}")

# Numeric coercion
for col in NUMERIC + [TARGET]:
    car_data[col] = pd.to_numeric(car_data[col], errors="coerce")

# Create performance_category from engine_hp
hp = pd.to_numeric(car_data["engine_hp"], errors="coerce")
mask = hp.notna()
car_data.loc[mask, TARGET_CLS] = pd.qcut(hp[mask], q=3, labels=["Economy","Mid","Sport"])
car_data = car_data.dropna(subset=[TARGET_CLS])

# Drop rows missing target
car_data = car_data.dropna(subset=[TARGET])

# Fill categoricals
for col in CATEGORICAL:
    car_data[col] = car_data[col].fillna("")

# Cast some ints
for col in ["year","engine_cylinders","number_of_doors","popularity"]:
    if col in car_data.columns:
        car_data[col] = pd.to_numeric(car_data[col], errors="coerce").round().astype("Int64")

# Feature engineering
car_data["combined_mpg"] = (car_data["city_mpg"] + car_data["highway_mpg"]) / 2.0
den = car_data["engine_cylinders"].replace({0:np.nan})
car_data["hp_per_cyl"] = (car_data["engine_hp"] / den).replace([np.inf,-np.inf], np.nan)
car_data["drivetrain_simple"] = car_data["driven_wheels"].astype(str).str.extract(r"(front|rear|all)", expand=False).fillna(car_data["driven_wheels"].astype(str))

# Trim top 1% msrp
upper_limit = car_data["msrp"].quantile(0.99)
car_data = car_data[car_data["msrp"] <= upper_limit]

# Log target
car_data["log_msrp"] = np.log1p(car_data["msrp"])

# Extra features
car_data["hp_x_year"] = car_data["engine_hp"] * car_data["year"]
car_data["mpg_ratio"] = car_data["highway_mpg"] / (car_data["city_mpg"] + 1)
car_data["hp_per_door"] = car_data["engine_hp"] / (car_data["number_of_doors"] + 1)
car_data["is_luxury"] = car_data["market_category"].str.contains("Luxury", case=False, na=False).astype(int)
car_data["is_suv"] = car_data["vehicle_style"].str.contains("SUV", case=False, na=False).astype(int)
car_data["is_performance"] = car_data["market_category"].str.contains("Performance", case=False, na=False).astype(int)
top_makes = car_data["make"].value_counts().nlargest(15).index
car_data["make_grouped"] = car_data["make"].where(car_data["make"].isin(top_makes), "Other")

# year -> age
car_data["age"] = car_data["year"].max() - car_data["year"]

# Final cleaned lists
NUMERIC_CLEAN = ["age","engine_hp","engine_cylinders","highway_mpg","city_mpg","number_of_doors"]
CATEGORICAL_SMALL = ["engine_fuel_type","transmission_type","driven_wheels"]
CATEGORICAL_TE = ["make"]

# Split datasets properly: regression removes performance_category, classification keeps it
X_all = car_data.drop(columns=["msrp","log_msrp"])
y_reg_full = car_data["log_msrp"]
y_cls_full = car_data[TARGET_CLS]

# For regression X, drop derived columns and classification target to prevent leakage
DROP_FOR_REG = ["performance_category","model","model_grouped","make_grouped","market_category","vehicle_style","vehicle_size",
                "is_performance","is_luxury","is_suv","hp_x_year","hp_per_cyl","hp_per_door","combined_mpg","mpg_ratio","popularity"]
X_reg = X_all.drop(columns=[c for c in DROP_FOR_REG if c in X_all.columns], errors="ignore")

# For classification, keep performance_category, but remove engine_hp leakage
X_cls = X_all.copy()

# Train/test splits
X_train, X_test, y_train, y_test = train_test_split(X_reg, y_reg_full, test_size=0.3, random_state=RANDOM_STATE)
Xc_train, Xc_test, yc_train, yc_test = train_test_split(X_cls, y_cls_full, test_size=0.3, stratify=y_cls_full, random_state=RANDOM_STATE)

# For classification, drop horsepower-based columns to avoid leakage
leakage_features = ["engine_hp","hp_per_cyl","hp_x_year"]
Xc_train = Xc_train.drop(columns=[c for c in leakage_features if c in Xc_train.columns], errors="ignore")
Xc_test = Xc_test.drop(columns=[c for c in leakage_features if c in Xc_test.columns], errors="ignore")

NUMERIC_CLS = [c for c in NUMERIC_CLEAN if c in Xc_train.columns and c not in leakage_features]
CATEGORICAL_CLS = [c for c in CATEGORICAL if c in Xc_train.columns]

print(f"[OK] Loaded data: {len(car_data)} rows")
print(f"[OK] Required columns present: {missing if missing else 'All good'}")
print(f"[OK] Regression target OK: '{TARGET}'")
print(f"[OK] Classification target OK: '{TARGET_CLS}'")
critical_features = ["age", "hp_per_cyl", "drivetrain_simple"]
for feat in critical_features: print(f"[OK] Feature ready: {feat}" if feat in car_data.columns else f"[FAIL] Missing: {feat}")
print(f"[OK] Regression features after leakage drop: {X_train.shape[1]} columns")
print(f"[OK] Classification features (HP leakage removed): {Xc_train.shape[1]} columns")
print(f"[OK] Regression split: {X_train.shape} train | {X_test.shape} test")
print(f"[OK] Classification split: {Xc_train.shape} train | {Xc_test.shape} test")


[OK] Loaded data: 11726 rows
[OK] Required columns present: All good
[OK] Regression target OK: 'msrp'
[OK] Classification target OK: 'performance_category'
[OK] Feature ready: age
[OK] Feature ready: hp_per_cyl
[OK] Feature ready: drivetrain_simple
[OK] Regression features after leakage drop: 12 columns
[OK] Classification features (HP leakage removed): 24 columns
[OK] Regression split: (8208, 12) train | (3518, 12) test
[OK] Classification split: (8208, 24) train | (3518, 24) test


# **PIPELINE BUILT**

In [18]:
class ToDenseTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None): return self
    def transform(self, X): return X.toarray() if hasattr(X, "toarray") else X

numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")), 
    ("scaler", RobustScaler())])
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")), 
    ("ohe", OneHotEncoder(handle_unknown="ignore"))])

preprocessor_reg = ColumnTransformer([
    ("num", numeric_transformer, NUMERIC_CLEAN), 
    ("cat", categorical_transformer, CATEGORICAL_SMALL)], remainder="drop")
preprocessor_cls = ColumnTransformer([
    ("num", numeric_transformer, NUMERIC_CLS), 
    ("cat", categorical_transformer, CATEGORICAL_CLS)], remainder="drop")

if USE_TORCH:
    class TorchMLPRegressor(BaseEstimator):
        def __init__(self, hidden_layer_sizes=(64,64), lr=1e-3, batch_size=128, epochs=50, device=None, random_state=RANDOM_STATE):
            self.hidden_layer_sizes = hidden_layer_sizes
            self.lr = lr
            self.batch_size = batch_size
            self.epochs = epochs
            self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
            self.random_state = random_state
            self.model_ = None

        def _build_model(self, input_dim):
            layers = []
            in_dim = input_dim
            for h in self.hidden_layer_sizes:
                layers.append(nn.Linear(in_dim, h))
                layers.append(nn.ReLU())
                in_dim = h
            layers.append(nn.Linear(in_dim, 1))
            return nn.Sequential(*layers).to(self.device)

        def fit(self, X, y):
            torch.manual_seed(self.random_state)
            if isinstance(X, np.ndarray):
                X_np = X.astype(np.float32)
            else:
                X_np = X.values.astype(np.float32)
            y_np = y.values.astype(np.float32).reshape(-1,1)
            dataset = TensorDataset(torch.from_numpy(X_np), torch.from_numpy(y_np))
            loader = DataLoader(dataset, batch_size=self.batch_size, shuffle=True)
            self.model_ = self._build_model(X_np.shape[1])
            opt = optim.Adam(self.model_.parameters(), lr=self.lr)
            loss_fn = nn.MSELoss()
            self.model_.train()
            for epoch in range(self.epochs):
                epoch_loss = 0.0
                for xb, yb in loader:
                    xb, yb = xb.to(self.device), yb.to(self.device)
                    opt.zero_grad()
                    out = self.model_(xb)
                    loss = loss_fn(out, yb)
                    loss.backward()
                    opt.step()
                    epoch_loss += float(loss.item()) * xb.size(0)
            return self

        def predict(self, X):
            if isinstance(X, np.ndarray):
                X_np = X.astype(np.float32)
            else:
                X_np = X.values.astype(np.float32)
            self.model_.eval()
            with torch.no_grad():
                xb = torch.from_numpy(X_np).to(self.device)
                out = self.model_(xb).cpu().numpy().reshape(-1)
            return out

    class TorchMLPClassifier(BaseEstimator):
        def __init__(self, hidden_layer_sizes=(64,64), lr=1e-3, batch_size=128, epochs=50, device=None, random_state=RANDOM_STATE):
            self.hidden_layer_sizes = hidden_layer_sizes
            self.lr = lr
            self.batch_size = batch_size
            self.epochs = epochs
            self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
            self.random_state = random_state
            self.model_ = None
            self.classes_ = None

        def _build_model(self, input_dim, n_classes):
            layers = []
            in_dim = input_dim
            for h in self.hidden_layer_sizes:
                layers.append(nn.Linear(in_dim, h))
                layers.append(nn.ReLU())
                in_dim = h
            layers.append(nn.Linear(in_dim, n_classes))
            return nn.Sequential(*layers).to(self.device)

        def fit(self, X, y):
            torch.manual_seed(self.random_state)
            if isinstance(X, np.ndarray):
                X_np = X.astype(np.float32)
            else:
                X_np = X.values.astype(np.float32)
            y_np_raw = y.values
            classes, y_np = np.unique(y_np_raw, return_inverse=True)
            self.classes_ = classes
            dataset = TensorDataset(torch.from_numpy(X_np), torch.from_numpy(y_np.astype(np.int64)))
            loader = DataLoader(dataset, batch_size=self.batch_size, shuffle=True)
            self.model_ = self._build_model(X_np.shape[1], len(classes))
            opt = optim.Adam(self.model_.parameters(), lr=self.lr)
            loss_fn = nn.CrossEntropyLoss()
            self.model_.train()
            for epoch in range(self.epochs):
                for xb, yb in loader:
                    xb, yb = xb.to(self.device), yb.to(self.device)
                    opt.zero_grad()
                    out = self.model_(xb)
                    loss = loss_fn(out, yb)
                    loss.backward()
                    opt.step()
            return self

        def predict(self, X):
            if isinstance(X, np.ndarray):
                X_np = X.astype(np.float32)
            else:
                X_np = X.values.astype(np.float32)
            self.model_.eval()
            with torch.no_grad():
                xb = torch.from_numpy(X_np).to(self.device)
                logits = self.model_(xb).cpu().numpy()
                preds = np.argmax(logits, axis=1)
            return self.classes_[preds]

else:
    TorchMLPRegressor = None
    TorchMLPClassifier = None

def sanitize_for_json(params):
    clean = {}
    for k, v in params.items():
        if isinstance(v, (int, float, str, bool, list, dict)):
            clean[k] = v
        else:
            clean[k] = str(v)
    return clean

print(f" Regression preprocessor ready: {len(NUMERIC_CLEAN)} numeric, {len(CATEGORICAL_SMALL)} categorical")
print(f" Classification preprocessor ready: {len(NUMERIC_CLS)} numeric, {len(CATEGORICAL_CLS)} categorical")
print(f" ToDenseTransformer loaded: {callable(getattr(ToDenseTransformer, 'transform', None))}")
print(f" TorchMLPRegressor: {'Enabled' if TorchMLPRegressor else 'Disabled'}")
print(f" TorchMLPClassifier: {'Enabled' if TorchMLPClassifier else 'Disabled'}")


 Regression preprocessor ready: 6 numeric, 3 categorical
 Classification preprocessor ready: 5 numeric, 8 categorical
 ToDenseTransformer loaded: True
 TorchMLPRegressor: Enabled
 TorchMLPClassifier: Enabled


# **SELECTING AND TRAINING MODEL WITH BEST PARAMS**

In [None]:

if USE_GROUP_CV and "make" in X_train.columns:
    groups = X_train["make"]
    CV_REG = list(GKF.split(X_train, y_train, groups=groups))
    print("Using GroupKFold for regression.")
else:
    CV_REG = CV5
    print("Using standard KFold for regression.")

regressors = {
    "LinearRegression": LinearRegression(),
    "Ridge": Ridge(),
    "Lasso": Lasso(),
    "ElasticNet": ElasticNet(),
    "KNeighborsRegressor": KNeighborsRegressor()
}
if USE_TORCH and TORCH_CUDA:
    regressors["TorchMLPRegressor"] = TorchMLPRegressor(hidden_layer_sizes=(128,64), lr=1e-3, batch_size=128, epochs=60)
elif USE_TORCH and not TORCH_CUDA:
    regressors["TorchMLPRegressor"] = TorchMLPRegressor(hidden_layer_sizes=(128,64), lr=1e-3, batch_size=64, epochs=60, device="cpu")
else:
    regressors["SKMLPRegressor"] = SKMLPRegressor(hidden_layer_sizes=(128,64), max_iter=200, random_state=RANDOM_STATE)

classifiers = {
    "LogisticRegression": LogisticRegression(max_iter=5000, random_state=RANDOM_STATE),
    "KNeighborsClassifier": KNeighborsClassifier(),
    "LinearDiscriminantAnalysis": LinearDiscriminantAnalysis(),
    "QuadraticDiscriminantAnalysis": QuadraticDiscriminantAnalysis(),
    "GaussianNB": GaussianNB()
}
# MLP classifier
if USE_TORCH and TORCH_CUDA:
    classifiers["TorchMLPClassifier"] = TorchMLPClassifier(hidden_layer_sizes=(128,64), lr=1e-3, batch_size=128, epochs=60)
elif USE_TORCH and not TORCH_CUDA:
    classifiers["TorchMLPClassifier"] = TorchMLPClassifier(hidden_layer_sizes=(128,64), lr=1e-3, batch_size=64, epochs=60, device="cpu")
else:
    classifiers["SKMLPClassifier"] = SKMLPClassifier(hidden_layer_sizes=(128,64), max_iter=200, random_state=RANDOM_STATE)

GRID_PARAM = [
    {
        "preprocessor__num__imputer": [SimpleImputer(strategy="median"), KNNImputer(n_neighbors=5)],
        "preprocessor__num__scaler": [StandardScaler(), RobustScaler(), PowerTransformer()],
        "regressor": [Ridge(), Lasso(), ElasticNet()],
        "regressor__alpha": ALPHA
    },
    {
        "preprocessor__num__imputer": [KNNImputer(n_neighbors=5), SimpleImputer(strategy="median")],
        "preprocessor__num__scaler": [StandardScaler(), MinMaxScaler(), RobustScaler()],
        "regressor": [LinearRegression()]
    },
    {
        "preprocessor__num__imputer": [KNNImputer(n_neighbors=5), SimpleImputer(strategy="median")],
        "preprocessor__num__scaler": [MinMaxScaler(), RobustScaler()],
        "regressor": [KNeighborsRegressor()],
        "regressor__n_neighbors": N_NEIGHBORS,
        "regressor__weights": WEIGHTS
    }
]

BAYES_PARAM_SPACE = {
    "regressor": Categorical([
        SVR(kernel="linear", max_iter=MAX_ITER, cache_size=CACHE_SIZE),
        SVR(kernel="poly", max_iter=MAX_ITER, cache_size=CACHE_SIZE),
        SVR(kernel="rbf", max_iter=MAX_ITER, cache_size=CACHE_SIZE),
    ]),
    "regressor__C": COEF,
    "regressor__epsilon": EPSILON,
    "regressor__gamma": GAMMA,
    "regressor__degree": DEGREE,
    "regressor__coef0": COEF0
}

GRID_PARAM_CLS = [
    {
        "preprocessor__num__imputer": [SimpleImputer(strategy="median")],
        "preprocessor__num__scaler": [StandardScaler(), RobustScaler()],
        "clf": [LogisticRegression(max_iter=5000, random_state=RANDOM_STATE)],
        "clf__C": LOGREG_C,
        "clf__solver": LOGREG_SOLVER,
        "clf__penalty": LOGREG_PENALTY
    },
    {
        "preprocessor__num__imputer": [SimpleImputer(strategy="median")],
        "preprocessor__num__scaler": [StandardScaler()],
        "clf": [KNeighborsClassifier()],
        "clf__n_neighbors": [3,5,7,11],
        "clf__weights": ["uniform","distance"]
    },
]

GRID_PARAM_CLS_2 = [
    {"clf": [LinearDiscriminantAnalysis()], "clf__solver": LDA_SOLVER},
    {"clf": [QuadraticDiscriminantAnalysis()], "clf__reg_param": QDA_REG_PARAM},
    {"clf": [GaussianNB()]},
]

BAYES_PARAM_SPACE_CLS = {
    "clf": Categorical([
        SVC(kernel="linear", probability=True, max_iter=MAX_ITER, cache_size=CACHE_SIZE),
        SVC(kernel="rbf", probability=True, max_iter=MAX_ITER, cache_size=CACHE_SIZE),
        SVC(kernel="poly", probability=True, max_iter=MAX_ITER, cache_size=CACHE_SIZE)
    ]),
    "clf__C": Real(1e-3, 1e3, prior="log-uniform"),
    "clf__gamma": Categorical(["scale","auto"]),
    "clf__degree": Integer(2,5),
    "clf__coef0": Real(0.0,5.0)
}

model_reg = Pipeline([("preprocessor", preprocessor_reg), ("regressor", Ridge())])
model_cls = Pipeline([("preprocessor", preprocessor_cls), ("clf", LogisticRegression(max_iter=5000, random_state=RANDOM_STATE))])
model_cls_2 = Pipeline([("preprocessor", preprocessor_cls), ("to_dense", ToDenseTransformer()), ("clf", LinearDiscriminantAnalysis())])

models_reg = {
    "TraditionalFamily": GridSearchCV(
        estimator=model_reg, param_grid=GRID_PARAM,
        cv=CV5, scoring=SCORE_REG, n_jobs=-1, refit=True
    ),
    "SVR-Bayesian": BayesSearchCV(
        estimator=model_reg, search_spaces=BAYES_PARAM_SPACE,
        n_iter=N_ITER, cv=CV5, n_jobs=-1, scoring=SCORE_REG,
        random_state=RANDOM_STATE, refit=True
    )
}

models_cls = {
    "GridSearch_LogReg_KNN": GridSearchCV(
        estimator=model_cls, param_grid=GRID_PARAM_CLS,
        cv=CV5, scoring=SCORE_CLA, n_jobs=-1
    ),
    "GridSearch_LDA_QDA_NB": GridSearchCV(
        estimator=model_cls_2, param_grid=GRID_PARAM_CLS_2,
        cv=CV5, scoring=SCORE_CLA, n_jobs=-1
    ),
    "BayesSearch_SVM": BayesSearchCV(
        estimator=model_cls, search_spaces=BAYES_PARAM_SPACE_CLS,
        n_iter=N_ITER, cv=CV, n_jobs=-1, scoring=SCORE_CLA,
        random_state=RANDOM_STATE
    )
}

best_regressor = None
best_classifier = None
results_reg, results_cls = {}, {}

os.makedirs(os.path.dirname(BEST_REGRESSOR_PATH), exist_ok=True)
os.makedirs(os.path.dirname(BEST_CLASSIFIER_PATH), exist_ok=True)
os.makedirs(os.path.dirname(LEADERBOARD_PATH), exist_ok=True)

def safe_load_model(path):
    try:
        model = joblib.load(path)
        meta = {}
        try:
            if hasattr(model, "named_steps"):
                if "regressor" in model.named_steps:
                    meta["estimator_name"] = model.named_steps["regressor"].__class__.__name__
                elif "clf" in model.named_steps:
                    meta["estimator_name"] = model.named_steps["clf"].__class__.__name__
        except Exception:
            pass
        return model, meta
    except Exception as e:
        print(f"Failed to load {path}: {e}")
        return None, {}

if os.path.exists(BEST_REGRESSOR_PATH):
    print("\nFound existing best regressor — loading.")
    best_regressor = joblib.load(BEST_REGRESSOR_PATH)
else:
    print("\nNo saved regressor found — training regression models...\n")
    print("================= REGRESSION RESULTS =================\n")
    for name, search in models_reg.items():
        print(f"\n Training {name} ...")
        try:
            t0 = time.time()
            search.fit(X_train, y_train)
            t1 = time.time()
            best_model = search.best_estimator_
            # Evaluate on original scale
            y_pred = np.expm1(best_model.predict(X_test))
            y_true = np.expm1(y_test)
            rmse = np.sqrt(mean_squared_error(y_true, y_pred))
            mae = mean_absolute_error(y_true, y_pred)
            r2 = r2_score(y_true, y_pred)
            reg_name = best_model.named_steps["regressor"].__class__.__name__
            kernel = getattr(best_model.named_steps["regressor"], "kernel", "N/A")
            print(f"   {name} Results:")
            print(f"   Best Params: {search.best_params_}")
            print(f"   Regressor: {reg_name}")
            print(f"   Kernel: {kernel}")
            print(f"   RMSE: {rmse:.2f}")
            print(f"   MAE:  {mae:.2f}")
            print(f"   R²:   {r2:.3f}")
            print(f"   Time: {t1-t0:.1f}s")
            results_reg[name] = {"Regressor": reg_name, "RMSE": rmse, "MAE": mae, "R2": r2, "Kernel": kernel, "Best_Params": sanitize_for_json(search.best_params_)}
        except Exception as e:
            print(f" {name} failed due to: {e}")

    if results_reg:
        print("\n===== REGRESSION MODEL COMPARISON =====")
        print(f"{'Model':<20} | {'Regressor':<20} | {'Kernel':<10} | {'RMSE':>10} | {'MAE':>10} | {'R²':>6}")
        print("-" * 80)
        for name, res in results_reg.items():
            print(f"{name:<20} | {res['Regressor']:<20} | {res['Kernel']:<10} | "
                  f"{res['RMSE']:>10.2f} | {res['MAE']:>10.2f} | {res['R2']:>6.3f}")
    else:
        print(" No regression results available.")
        raise RuntimeError("Regression training failed — no valid models produced.")

    best_reg_name, best_reg = max(results_reg.items(), key=lambda x: x[1]["R2"])
    best_regressor = models_reg[best_reg_name].best_estimator_
    joblib.dump(best_regressor, BEST_REGRESSOR_PATH)
    print(f"\nBest Regressor: {best_reg_name} ({best_reg['Regressor']}) with R²={best_reg['R2']:.2f} saved to {BEST_REGRESSOR_PATH}")


if os.path.exists(BEST_CLASSIFIER_PATH):
    print("\nFound existing best classifier — loading.")
    best_classifier = joblib.load(BEST_CLASSIFIER_PATH)
else:
    print("\nNo saved classifier found — training classification models...\n")
    print("================= CLASSIFICATION RESULTS =================\n")
    for name, search in models_cls.items():
        print(f"\n Training {name} ...")
        try:
            t0 = time.time()
            search.fit(Xc_train, yc_train)
            t1 = time.time()
            best_model = search.best_estimator_
            clf_name = best_model.named_steps["clf"].__class__.__name__
            kernel = getattr(best_model.named_steps["clf"], "kernel", "N/A")
            y_pred = best_model.predict(Xc_test)
            acc = accuracy_score(yc_test, y_pred)
            f1 = f1_score(yc_test, y_pred, average="macro")
            print(f"   {name} ({clf_name}) Results:")
            print(f"   Best Params: {search.best_params_}")
            print(f"   Accuracy: {acc:.2f} | F1_macro: {f1:.2f} | Kernel: {kernel}")
            print("\n   Classification report:")
            print(classification_report(yc_test, y_pred, digits=3))
            print(f"   Time: {t1-t0:.1f}s")
            results_cls[name] = {"Classifier": clf_name, "Accuracy": acc, "F1_macro": f1, "Kernel": kernel, "Best_Params": sanitize_for_json(search.best_params_)
}
        except Exception as e:
            print(f" {name} failed due to: {e}")

    if results_cls:
        print("\n===== CLASSIFICATION MODEL COMPARISON =====")
        print(f"{'Model':<25} | {'Classifier':<20} | {'Kernel':<10} | {'Accuracy':>10} | {'F1_macro':>10}")
        print("-" * 80)
        for name, res in results_cls.items():
            print(f"{name:<25} | {res['Classifier']:<20} | {res['Kernel']:<10} | "
                  f"{res['Accuracy']:>10.2f} | {res['F1_macro']:>10.2f}")
    else:
        print(" No classification results available.")
        raise RuntimeError("Classification training failed — no valid models produced.")

    best_cls_name, best_cls = max(results_cls.items(), key=lambda x: x[1]["Accuracy"])
    best_classifier = models_cls[best_cls_name].best_estimator_
    joblib.dump(best_classifier, BEST_CLASSIFIER_PATH)
    print(f"\nBest Classifier: {best_cls_name} ({best_cls['Classifier']}) with Accuracy={best_cls['Accuracy']:.2f} saved to {BEST_CLASSIFIER_PATH}")

# Leaderboard
leaderboard = {
    "Best Regressor": best_reg_name if 'best_reg_name' in globals() else "Loaded existing model",
    "Best Regressor Type": best_reg.get("Regressor") if 'best_reg' in globals() else "Loaded existing model",
    "Best Regressor R²": best_reg.get("R2") if 'best_reg' in globals() else "N/A",
    "Best Classifier": best_cls_name if 'best_cls_name' in globals() else "Loaded existing model",
    "Best Classifier Type": best_cls.get("Classifier") if 'best_cls' in globals() else "Loaded existing model",
    "Best Classifier Accuracy": best_cls.get("Accuracy") if 'best_cls' in globals() else "N/A",
    "Regression Results": results_reg,
    "Classification Results": results_cls,
    "timestamp": pd.Timestamp.now().isoformat()
}

with open(LEADERBOARD_PATH, "w") as f:
    json.dump(leaderboard, f, indent=4)

print(f"\nLeaderboard saved at: {LEADERBOARD_PATH}")
print("\nBest models ready for use:")
print(f"   Regressor: {BEST_REGRESSOR_PATH}")
print(f"   Classifier: {BEST_CLASSIFIER_PATH}")



Using GroupKFold for regression.

No saved regressor found — training regression models...



 Training TraditionalFamily ...
   TraditionalFamily Results:
   Best Params: {'preprocessor__num__imputer': SimpleImputer(strategy='median'), 'preprocessor__num__scaler': MinMaxScaler(), 'regressor': KNeighborsRegressor(), 'regressor__n_neighbors': 5, 'regressor__weights': 'distance'}
   Regressor: KNeighborsRegressor
   Kernel: N/A
   RMSE: 9236.70
   MAE:  4376.55
   R²:   0.929
   Time: 53.9s

 Training SVR-Bayesian ...
   SVR-Bayesian Results:
   Best Params: OrderedDict({'regressor': SVR(cache_size=2000, max_iter=25000), 'regressor__C': 23.9883291901949, 'regressor__coef0': 0.0, 'regressor__degree': 3, 'regressor__epsilon': 0.09, 'regressor__gamma': 'scale'})
   Regressor: SVR
   Kernel: rbf
   RMSE: 9442.48
   MAE:  5063.62
   R²:   0.925
   Time: 352.0s

===== REGRESSION MODEL COMPARISON =====
Model                | Regressor            | Kernel     |       RMSE |        MAE |     R²
-

In [13]:
def run_groupkfold_baseline(X_df, y_ser, group_col="make"):
    if group_col not in X_df.columns:
        print(f"Group column '{group_col}' not present in X; skipping GroupKFold baseline.")
        return None
    gkf = GroupKFold(n_splits=5)
    baseline = Pipeline([("preprocessor", preprocessor_reg), ("regressor", Ridge(alpha=1.0))])
    scores = cross_val_score(baseline, X_df, y_ser, cv=gkf.split(X_df, y_ser, groups=X_df[group_col]), scoring=SCORE_REG, n_jobs=-1)
    print(f"GroupKFold baseline (neg RMSE) mean score: {np.mean(scores):.4f}")
    return scores

def compute_permutation_importance(model, X_df, y_ser, n_repeats=10, subset=500):
    if model is None:
        print("No fitted model provided for permutation importance.")
        return None
    n = min(subset, len(X_df))
    subX = X_df.sample(n, random_state=RANDOM_STATE)
    suby = y_ser.loc[subX.index]
    try:
        res = permutation_importance(model, subX, suby, n_repeats=n_repeats, scoring=SCORE_REG, n_jobs=-1, random_state=RANDOM_STATE)
        imp_df = pd.Series(res.importances_mean, index=subX.columns).sort_values(ascending=False)
        print("Top features by permutation importance (top 30):")
        print(imp_df.head(30))
        return imp_df
    except Exception as e:
        print(f"Permutation importance failed: {e}")
        return None

print("Script finished.") 
print("Running GroupKFold baseline...") 
run_groupkfold_baseline(X_train, y_train) 
print("\nRunning permutation importance on best regressor...") 
compute_permutation_importance(best_regressor, X_train, y_train)

Script finished.
Running GroupKFold baseline...
GroupKFold baseline (neg RMSE) mean score: -0.4807

Running permutation importance on best regressor...
Top features by permutation importance (top 30):
age                  0.971427
engine_fuel_type     0.602611
engine_hp            0.317449
driven_wheels        0.315745
transmission_type    0.313065
engine_cylinders     0.225823
number_of_doors      0.181892
city_mpg             0.100115
highway_mpg          0.069855
year                -0.001125
make                -0.001125
drivetrain_simple   -0.001125
dtype: float64


age                  0.971427
engine_fuel_type     0.602611
engine_hp            0.317449
driven_wheels        0.315745
transmission_type    0.313065
engine_cylinders     0.225823
number_of_doors      0.181892
city_mpg             0.100115
highway_mpg          0.069855
year                -0.001125
make                -0.001125
drivetrain_simple   -0.001125
dtype: float64