# **INSTALLS AND PACKAGES**

In [None]:
import warnings
import os
import joblib
import json
import pandas as pd
import numpy as np

from skopt import BayesSearchCV
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, OneHotEncoder, PowerTransformer, QuantileTransformer
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Ridge, Lasso, ElasticNet, LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import KFold, StratifiedKFold
from skopt.space import Real, Integer, Categorical
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score, accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

warnings.filterwarnings("ignore")





# **SETTINGS AND PATH**

In [76]:


DATA_PATH = "Data/data.csv"
BEST_REGRESSOR_PATH = "models/best_regressor.joblib"
BEST_CLASSIFIER_PATH = "models/best_classifier.joblib"
LEADERBOARD_PATH = "reports/leaderboards.json"

# General Settings
RANDOM_STATE = 42
TARGET = "msrp"
TARGET_CLS = "performance_category"
CV = StratifiedKFold(5, shuffle=True, random_state=RANDOM_STATE)
CV5 = KFold(5, shuffle=True, random_state=RANDOM_STATE)
SCORE_REG = "neg_root_mean_squared_error"
SCORE_CLA = "f1_macro"
CACHE_SIZE = 2000
MAX_ITER = 25000
N_ITER = 20


NUMERIC = ["year", "engine_hp", "engine_cylinders", "highway_mpg", "city_mpg", "popularity", "number_of_doors"]
CATEGORICAL = ["make", "model", "engine_fuel_type", "transmission_type", "driven_wheels","market_category", "vehicle_size", "vehicle_style"]

# LASSO/RIDGE settings
ALPHA  =[1e-4, 1e-3, 1e-2, 1e-1, 0.5, 1, 10, 50, 100]

# SVR settings
COEF =10**np.linspace(-3, 3, 101)
EPSILON = np.linspace(0, 0.1, 11)
DEGREE = [2, 3, 4, 5]
GAMMA = ['scale', 'auto']
COEF0 = [0.0, 1.0, 5.0]
# KNN settings
N_NEIGHBORS = [1, 2, 3, 5, 7, 9, 11, 15, 20, 25, 30, 40, 50, 75, 100]
WEIGHTS = ["uniform", "distance"]
LEAF_SIZE = [5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
METRIC = ["euclidean", "manhattan", "minkowski", "chebyshev", "cosine"]
P = [1, 1.5, 2, 3]
ALGORITHM = ["auto", "ball_tree", "kd_tree", "brute"]

# LogReg setting
LOGREG_PENALTY = ["l2"]
LOGREG_C = [0.01, 0.1, 0.5, 1, 2, 5, 10]
LOGREG_SOLVER = ["lbfgs", "saga"]

# LDA / QDA settings
LDA_SOLVER = ["svd", "lsqr", "eigen"]
QDA_REG_PARAM = np.linspace(0.05, 0.9, 10) #QDA_REG_PARAM = np.linspace(0.0, 0.5, 6)


print("............................Configuration and Settings loaded successfully.")
print(f"Data path: {DATA_PATH}")
print(f"Random state: {RANDOM_STATE}")
print(f"Target column: {TARGET}")
print(f"Numeric features: {len(NUMERIC)} | Categorical features: {len(CATEGORICAL)}")

............................Configuration and Settings loaded successfully.
Data path: Data/data.csv
Random state: 42
Target column: msrp
Numeric features: 7 | Categorical features: 8


# **PREPROCESS AND SPLIT DATA**

In [77]:

car_data = pd.read_csv(DATA_PATH)
print (f" Data loaded successfully: {car_data.shape[0]:,} rows × {car_data.shape[1]} columns")

car_data.columns = [c.strip().lower().replace(" ","_").replace("-","_") for c in car_data.columns] # Normalize column names

rename_map = {"engine_hp": "engine_hp", "engine_cylinders": "engine_cylinders", 
              "highway_mpg": "highway_mpg", "city_mpg": "city_mpg", "number_of_doors": "number_of_doors", 
              "driven_wheels": "driven_wheels", "engine_fuel_type": "engine_fuel_type", "transmission_type": "transmission_type", 
              "market_category": "market_category", "vehicle_size": "vehicle_size", "vehicle_style": "vehicle_style", 
              "popularity": "popularity", "msrp": "msrp", "make": "make", "model": "model", "year": "year"}
car_data = car_data.rename(columns=rename_map).copy()
needed = set(NUMERIC + CATEGORICAL + [TARGET]) # Validate required columns
missing = [col for col in needed if col not in car_data.columns]
if missing:
    raise ValueError(f"Missing required columns: {missing}")
print(f"\nAll required columns found ({len(car_data.columns)} total).")

for col in NUMERIC + [TARGET]: # Convert numeric columns
    car_data[col] = pd.to_numeric(car_data[col], errors="coerce")

##########class
hp = pd.to_numeric(car_data["engine_hp"], errors="coerce")
mask = hp.notna()
car_data[TARGET_CLS] = pd.qcut(hp[mask], q=3, labels=["Economy", "Mid", "Sport"])
car_data = car_data.dropna(subset=[TARGET_CLS])
print("\nclass Target ",car_data[TARGET_CLS].value_counts())
######

car_data = car_data.dropna(subset=[TARGET]) # Drop rows with missing target
print(f"\nTarget '{TARGET}' cleaned. Remaining rows: {len(car_data):,}")

for col in CATEGORICAL: # Fill missing categorical values
    car_data[col] = car_data[col].fillna("")

for col in ["year", "engine_cylinders", "number_of_doors", "popularity"]: # Convert certain columns to Int64
    if col in car_data.columns:
        car_data[col] = pd.to_numeric(car_data[col], errors="coerce").round().astype("Int64")

original_cols = set(car_data.columns) # Feature engineering
car_data["combined_mpg"] = (car_data["city_mpg"] + car_data["highway_mpg"]) / 2.0
den = car_data["engine_cylinders"].replace({0: np.nan})
car_data["hp_per_cyl"] = (car_data["engine_hp"] / den).replace([np.inf, -np.inf], np.nan)
car_data["drivetrain_simple"] = (car_data["driven_wheels"].astype(str) .str.extract(r"(front|rear|all)", expand=False).fillna(car_data["driven_wheels"].astype(str)))

num_cols = NUMERIC + ["combined_mpg", "hp_per_cyl"]
cat_cols = CATEGORICAL + ["drivetrain_simple"]
new_features = sorted(set(car_data.columns) - original_cols)
print (f"\nFeature engineering complete.\n new Features {new_features}")

# Remove top 1% price outliers
upper_limit = car_data["msrp"].quantile(0.99)
car_data = car_data[car_data["msrp"] <= upper_limit]
print(f"\nAfter outlier removal: {len(car_data):,} rows")

# Add log-transformed target
car_data["log_msrp"] = np.log1p(car_data["msrp"])

# ---------- 2. Strengthen features ----------
car_data["hp_x_year"] = car_data["engine_hp"] * car_data["year"]
car_data["mpg_ratio"] = car_data["highway_mpg"] / (car_data["city_mpg"] + 1)
car_data["hp_per_door"] = car_data["engine_hp"] / (car_data["number_of_doors"] + 1)
car_data["is_luxury"] = car_data["market_category"].str.contains("Luxury", case=False, na=False).astype(int) # Premium indicators
car_data["is_suv"] = car_data["vehicle_style"].str.contains("SUV", case=False, na=False).astype(int)
car_data["is_performance"] = car_data["market_category"].str.contains("Performance", case=False, na=False).astype(int)
top_makes = car_data["make"].value_counts().nlargest(15).index # Group rare makes
car_data["make_grouped"] = car_data["make"].where(car_data["make"].isin(top_makes), "Other")

class ToDenseTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X.toarray() if hasattr(X, "toarray") else X

X = car_data.drop(columns=["msrp", "log_msrp"])
y = car_data["log_msrp"]
y_cls = car_data[TARGET_CLS]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=RANDOM_STATE)

X_cls_train, X_cls_test, y_cls_train, y_cls_test = train_test_split(X, y_cls, test_size=0.3, stratify=y_cls, random_state=RANDOM_STATE)    

# --- Prevent target leakage by removing horsepower-based columns ---
leakage_features = ["engine_hp", "hp_per_cyl", "hp_x_year"]
X_cls_train = X_cls_train.drop(columns=leakage_features, errors="ignore")
X_cls_test = X_cls_test.drop(columns=leakage_features, errors="ignore")


NUMERIC_CLS = [c for c in NUMERIC if c in X_cls_train.columns and c not in leakage_features]
CATEGORICAL_CLS = [c for c in CATEGORICAL if c in X_cls_train.columns]

print(f"\nRemoved leakage features for classification: {leakage_features}")
print(f"\nClassification training shape:  X_cls_train = {X_cls_train.shape}, y_cls_train = {y_cls_train.shape}")
print(f"Classification testing shape:   X_cls_test  = {X_cls_test.shape},  y_cls_test  = {y_cls_test.shape}")
print(f"Regression training shape:      X_train     = {X_train.shape}, y_train     = {y_train.shape}")
print(f"Regression testing shape:       X_test      = {X_test.shape},  y_test      = {y_test.shape}")

print("\nData preparation completed successfully.")



 Data loaded successfully: 11,914 rows × 16 columns

All required columns found (16 total).

class Target  performance_category
Economy    4045
Sport      3926
Mid        3874
Name: count, dtype: int64

Target 'msrp' cleaned. Remaining rows: 11,845

Feature engineering complete.
 new Features ['combined_mpg', 'drivetrain_simple', 'hp_per_cyl']

After outlier removal: 11,726 rows

Removed leakage features for classification: ['engine_hp', 'hp_per_cyl', 'hp_x_year']

Classification training shape:  X_cls_train = (8208, 23), y_cls_train = (8208,)
Classification testing shape:   X_cls_test  = (3518, 23),  y_cls_test  = (3518,)
Regression training shape:      X_train     = (8208, 26), y_train     = (8208,)
Regression testing shape:       X_test      = (3518, 26),  y_test      = (3518,)

Data preparation completed successfully.


# **PIPELINE BUILT**

In [78]:
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', RobustScaler())
])

preprocessor_reg = ColumnTransformer([
    ("num", numeric_transformer, NUMERIC),
    ("cat", categorical_transformer, CATEGORICAL)
])

preprocessor_cls = ColumnTransformer([
    ("num", numeric_transformer, NUMERIC_CLS),
    ("cat", categorical_transformer, CATEGORICAL_CLS)
])

model_reg = Pipeline([
    ("preprocessor", preprocessor_reg),
    ("regressor", LinearRegression())
])

model_cls = Pipeline([
    ("preprocessor", preprocessor_cls),
    ("clf", LogisticRegression(max_iter=5000, random_state=RANDOM_STATE))
])

model_cls_2 = Pipeline([
    ("preprocessor", preprocessor_cls),
    ("to_dense", ToDenseTransformer()),
    ("clf", LinearDiscriminantAnalysis())
])

print("\nPipelines built successfully.")
print(f"   Total features (numeric + categorical): {len(NUMERIC) + len(CATEGORICAL)}")



Pipelines built successfully.
   Total features (numeric + categorical): 15


# **SELECTING AND TRAINING MODEL WITH BEST PARAMS**

In [None]:

GRID_PARAM_CLS = [
    {
        "preprocessor__num__imputer": [SimpleImputer(strategy="median")],
        "preprocessor__num__scaler": [StandardScaler(), RobustScaler()],
        "clf": [LogisticRegression(max_iter=5000, random_state=RANDOM_STATE)],
        "clf__C": LOGREG_C,
        "clf__solver": LOGREG_SOLVER,
        "clf__penalty": LOGREG_PENALTY
    },
    {
        "preprocessor__num__imputer": [SimpleImputer(strategy="median")],
        "preprocessor__num__scaler": [StandardScaler()],
        "clf": [KNeighborsClassifier()],
        "clf__n_neighbors": [3, 5, 7, 11],
        "clf__weights": ["uniform", "distance"]
    },
]

GRID_PARAM_CLS_2 = [
    {"clf": [LinearDiscriminantAnalysis()], "clf__solver": LDA_SOLVER},
    {"clf": [QuadraticDiscriminantAnalysis()], "clf__reg_param": QDA_REG_PARAM},
    {"clf": [GaussianNB()]},
]

BAYES_PARAM_SPACE_CLS = {
    "clf": Categorical([
        SVC(kernel="linear", probability=True, max_iter=MAX_ITER, cache_size=CACHE_SIZE),
        SVC(kernel="rbf", probability=True, max_iter=MAX_ITER, cache_size=CACHE_SIZE),
        SVC(kernel="poly", probability=True, max_iter=MAX_ITER, cache_size=CACHE_SIZE)
    ]),
    "clf__C": Real(1e-3, 1e3, prior="log-uniform"),
    "clf__gamma": Categorical(["scale", "auto"]),
    "clf__degree": Integer(2, 5),
    "clf__coef0": Real(0.0, 5.0)
}

GRID_PARAM = [
    {
        "preprocessor__num__imputer": [SimpleImputer(strategy="median"), KNNImputer(n_neighbors=5)],
        "preprocessor__num__scaler": [StandardScaler(), RobustScaler(), PowerTransformer()],
        "regressor": [Ridge(), Lasso()],
        "regressor__alpha": ALPHA
    },
    {
        "preprocessor__num__imputer": [SimpleImputer(strategy="median"), KNNImputer(n_neighbors=5)],
        "preprocessor__num__scaler": [StandardScaler(), MinMaxScaler(), RobustScaler()],
        "regressor": [LinearRegression()]
    },
    {
        "preprocessor__num__imputer": [KNNImputer(n_neighbors=5), SimpleImputer(strategy="median")],
        "preprocessor__num__scaler": [MinMaxScaler(), RobustScaler()],
        "regressor": [KNeighborsRegressor()],
        "regressor__n_neighbors": N_NEIGHBORS,
        "regressor__weights": WEIGHTS
    },
]

BAYES_PARAM_SPACE = {
    "regressor": Categorical([
        SVR(kernel="linear", max_iter=MAX_ITER, cache_size=CACHE_SIZE),
        SVR(kernel="poly", max_iter=MAX_ITER, cache_size=CACHE_SIZE),
        SVR(kernel="rbf", max_iter=MAX_ITER, cache_size=CACHE_SIZE)
    ]),
    "regressor__C": COEF,
    "regressor__epsilon": EPSILON,
    "regressor__gamma": GAMMA,
    "regressor__degree": DEGREE,
    "regressor__coef0": COEF0
}

models_reg = {
    "TraditionalFamily": GridSearchCV(
        estimator=model_reg, param_grid=GRID_PARAM,
        cv=CV5, scoring=SCORE_REG, n_jobs=-1, refit=True
    ),
    "SVR-Bayesian": BayesSearchCV(
        estimator=model_reg, search_spaces=BAYES_PARAM_SPACE,
        n_iter=N_ITER, cv=CV5, n_jobs=-1, scoring=SCORE_REG,
        random_state=RANDOM_STATE, refit=True
    )
}

models_cls = {
    "GridSearch_LogReg_KNN": GridSearchCV(
        estimator=model_cls, param_grid=GRID_PARAM_CLS,
        cv=CV5, scoring=SCORE_CLA, n_jobs=-1
    ),
    "GridSearch_LDA_QDA_NB": GridSearchCV(
        estimator=model_cls_2, param_grid=GRID_PARAM_CLS_2,
        cv=CV5, scoring=SCORE_CLA, n_jobs=-1
    ),
    "BayesSearch_SVM": BayesSearchCV(
        estimator=model_cls, search_spaces=BAYES_PARAM_SPACE_CLS,
        n_iter=N_ITER, cv=CV, n_jobs=-1, scoring=SCORE_CLA,
        random_state=RANDOM_STATE
    )
}

os.makedirs(os.path.dirname(BEST_REGRESSOR_PATH), exist_ok=True)
os.makedirs(os.path.dirname(LEADERBOARD_PATH), exist_ok=True)
os.makedirs(os.path.dirname(LEADERBOARD_PATH), exist_ok=True)

best_regressor = None
best_classifier = None
results_reg, results_cls = {}, {}

if os.path.exists(BEST_REGRESSOR_PATH):
    print("\nFound existing best regressor — skipping regression retraining.")
    best_regressor = joblib.load(BEST_REGRESSOR_PATH)
else:
    print("\nNo saved regressor found — training regression models...")

    print("\n================= REGRESSION RESULTS =================")
    for name, search in models_reg.items():
        print(f"\n Training {name} ...")
        try:
            search.fit(X_train, y_train)
            best_model = search.best_estimator_

            y_pred = np.expm1(best_model.predict(X_test))
            y_true = np.expm1(y_test)

            rmse = np.sqrt(mean_squared_error(y_true, y_pred))
            mae = mean_absolute_error(y_true, y_pred)
            r2 = r2_score(y_true, y_pred)

            reg_name = best_model.named_steps["regressor"].__class__.__name__
            kernel = getattr(best_model.named_steps["regressor"], "kernel", "N/A")

            print(f"   {name} Results:")
            print(f"   Best Params: {search.best_params_}")
            print(f"   Regressor: {reg_name}")
            print(f"   Kernel: {kernel}")
            print(f"   RMSE: {rmse:.2f}")
            print(f"   MAE:  {mae:.2f}")
            print(f"   R²:   {r2:.3f}")

            results_reg[name] = {
                "Regressor": reg_name,
                "RMSE": rmse,
                "MAE": mae,
                "R2": r2,
                "Kernel": kernel
            }
        except Exception as e:
            print(f" {name} failed due to: {e}")

    if results_reg:
        print("\n===== REGRESSION MODEL COMPARISON =====")
        print(f"{'Model':<20} | {'Regressor':<20} | {'Kernel':<10} | {'RMSE':>10} | {'MAE':>10} | {'R²':>6}")
        print("-" * 80)
        for name, res in results_reg.items():
            print(f"{name:<20} | {res['Regressor']:<20} | {res['Kernel']:<10} | "
                  f"{res['RMSE']:>10.2f} | {res['MAE']:>10.2f} | {res['R2']:>6.3f}")
    else:
        print(" No regression results available.")
        raise RuntimeError("Regression training failed — no valid models produced.")

    best_reg_name, best_reg = max(results_reg.items(), key=lambda x: x[1]["R2"])
    best_regressor = models_reg[best_reg_name].best_estimator_
    joblib.dump(best_regressor, BEST_REGRESSOR_PATH)
    print(f"\nBest Regressor: {best_reg_name} ({best_reg['Regressor']}) with R²={best_reg['R2']:.2f} saved to {BEST_REGRESSOR_PATH}")


if os.path.exists(BEST_CLASSIFIER_PATH):
    print("\nFound existing best classifier — skipping classification retraining.")
    best_classifier = joblib.load(BEST_CLASSIFIER_PATH)
else:
    print("\nNo saved classifier found — training classification models...")

    print("\n================= CLASSIFICATION RESULTS =================")
    for name, search in models_cls.items():
        print(f"\n Training {name} ...")
        try:
            search.fit(X_cls_train, y_cls_train)
            best_model = search.best_estimator_

            clf_name = best_model.named_steps["clf"].__class__.__name__
            kernel = getattr(best_model.named_steps["clf"], "kernel", "N/A")

            y_pred = best_model.predict(X_cls_test)
            acc = accuracy_score(y_cls_test, y_pred)
            f1 = f1_score(y_cls_test, y_pred, average="macro")

            print(f"   {name} ({clf_name}) Results:")
            print(f"   Best Params: {search.best_params_}")
            print(f"   Accuracy: {acc:.2f} | F1_macro: {f1:.2f} | Kernel: {kernel}")
            print("\n   Classification report:")
            print(classification_report(y_cls_test, y_pred, digits=3))

            results_cls[name] = {
                "Classifier": clf_name,
                "Accuracy": acc,
                "F1_macro": f1,
                "Kernel": kernel
            }
        except Exception as e:
            print(f" {name} failed due to: {e}")

    if results_cls:
        print("\n===== CLASSIFICATION MODEL COMPARISON =====")
        print(f"{'Model':<25} | {'Classifier':<20} | {'Kernel':<10} | {'Accuracy':>10} | {'F1_macro':>10}")
        print("-" * 80)
        for name, res in results_cls.items():
            print(f"{name:<25} | {res['Classifier']:<20} | {res['Kernel']:<10} | "
                  f"{res['Accuracy']:>10.2f} | {res['F1_macro']:>10.2f}")
    else:
        print(" No classification results available.")
        raise RuntimeError("Classification training failed — no valid models produced.")

    best_cls_name, best_cls = max(results_cls.items(), key=lambda x: x[1]["Accuracy"])
    best_classifier = models_cls[best_cls_name].best_estimator_
    joblib.dump(best_classifier, BEST_CLASSIFIER_PATH)
    print(f"\nBest Classifier: {best_cls_name} ({best_cls['Classifier']}) with Accuracy={best_cls['Accuracy']:.2f} saved to {BEST_CLASSIFIER_PATH}")

leaderboard = {
    "Best Regressor": best_reg_name if best_regressor else "Loaded existing model",
    "Best Regressor Type": best_reg["Regressor"] if best_regressor else "Loaded existing model",
    "Best Regressor R²": best_reg["R2"] if best_regressor else "N/A",
    "Best Classifier": best_cls_name if best_classifier else "Loaded existing model",
    "Best Classifier Type": best_cls["Classifier"] if best_classifier else "Loaded existing model",
    "Best Classifier Accuracy": best_cls["Accuracy"] if best_classifier else "N/A",
    "Regression Results": results_reg,
    "Classification Results": results_cls
}

with open(LEADERBOARD_PATH, "w") as f:
    json.dump(leaderboard, f, indent=4)

print(f"\nLeaderboard saved at: {LEADERBOARD_PATH}")
print("\nBest models ready for use:")
print(f"   Regressor: {BEST_REGRESSOR_PATH}")
print(f"   Classifier: {BEST_CLASSIFIER_PATH}")



No saved regressor found — training regression models...


 Training TraditionalFamily ...
   TraditionalFamily Results:
   Best Params: {'preprocessor__num__imputer': SimpleImputer(strategy='median'), 'preprocessor__num__scaler': RobustScaler(), 'regressor': KNeighborsRegressor(), 'regressor__n_neighbors': 3, 'regressor__weights': 'distance'}
   Regressor: KNeighborsRegressor
   Kernel: N/A
   RMSE: 5506.73
   MAE:  2964.11
   R²:   0.975

 Training SVR-Bayesian ...
   SVR-Bayesian Results:
   Best Params: OrderedDict({'regressor': SVR(cache_size=2000, max_iter=25000), 'regressor__C': 436.51583224016565, 'regressor__coef0': 0.0, 'regressor__degree': 2, 'regressor__epsilon': 0.08, 'regressor__gamma': 'scale'})
   Regressor: SVR
   Kernel: rbf
   RMSE: 5533.41
   MAE:  3069.24
   R²:   0.974

===== REGRESSION MODEL COMPARISON =====
Model                | Regressor            | Kernel     |       RMSE |        MAE |     R²
----------------------------------------------------------------