packages


In [29]:
import os
import pandas as pd
import json, joblib

from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV, KFold, StratifiedKFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, f1_score, roc_auc_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso, LogisticRegression
from sklearn.svm import SVR, SVC
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA, QuadraticDiscriminantAnalysis as QDA
from sklearn.neural_network import MLPRegressor, MLPClassifier


Paths and Settings

In [30]:
DATA_PATH ="Data/CarPrice_Assignment.csv"
RANDOM_STATE = 42
TARGET = "price"

NUMERIC = ["enginesize","horsepower","citympg","highwaympg","curbweight","wheelbase","carlength","carwidth"]
CATEGORICAL = ["carname","fueltype","aspiration","doornumber","carbody","drivewheel","enginetype","cylindernumber","fuelsystem"]


Preprocessing Data


In [None]:
data = pd.read_csv(DATA_PATH)
data.columns = [c.strip().lower().replace(" ", "").replace("-", "") for c in data.columns]

required_numeric = list(dict.fromkeys(NUMERIC + ["price", "horsepower", "enginesize", "curbweight", "citympg", "highwaympg"]))
required_cat = list(CATEGORICAL)
required_all = required_numeric + required_cat

missing = [col for col in required_all if col not in data.columns]
if missing: raise ValueError(f"Missing required columns: {missing}")

for coe in required_numeric: data[coe] = pd.to_numeric(data[coe], errors="coerce")

data = data.copy()
data["power_to_weight"] = data["horsepower"] / data["curbweight"]
data["efficiency"] = (data["citympg"] + data["highwaympg"]) / 2.0

needed_for_model = list(dict.fromkeys(NUMERIC + ["price", "power_to_weight", "efficiency"]))
data.replace([float("inf"), float("-inf")], pd.NA, inplace=True)
data = data.dropna(subset=needed_for_model)

y_reg = data["price"].astype(float)
try:    y_cls = pd.qcut(data["horsepower"], q=3, labels=["Economy", "Mid", "Sport"], duplicates="drop")
except ValueError:
    ranks = pd.qcut(data["horsepower"].rank(method="average"), q=3, labels=["Economy", "Mid", "Sport"])
    y_cls = ranks
X = data[NUMERIC + CATEGORICAL + ["power_to_weight", "efficiency"]]

num_cols = NUMERIC + ["power_to_weight", "efficiency"]
numerical_pipeline = Pipeline([("scaler", StandardScaler())])
try:    categorical_pipeline = Pipeline([("ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=False))])
except TypeError:   categorical_pipeline = Pipeline([("ohe", OneHotEncoder(handle_unknown="ignore", sparse=False))])

preprocessor = ColumnTransformer([("num", numerical_pipeline, num_cols), ("cat", categorical_pipeline, CATEGORICAL)])

 **Train Models**


In [None]:


def regression_grids(pre):
    return {
        "Linear": GridSearchCV(Pipeline([("pre", pre), ("model", LinearRegression())]),
                               {}, cv=KFold(5, shuffle=True, random_state=RANDOM_STATE),
                               n_jobs=-1, scoring="neg_root_mean_squared_error"),
        "Ridge": GridSearchCV(Pipeline([("pre", pre), ("model", Ridge())]),
                              {"model__alpha":[0.1,1,10,100]}, cv=KFold(5, shuffle=True, random_state=RANDOM_STATE),
                              n_jobs=-1, scoring="neg_root_mean_squared_error"),
        "Lasso": GridSearchCV(Pipeline([("pre", pre), ("model", Lasso(max_iter=10000))]),
                              {"model__alpha":[0.0005,0.001,0.01,0.1]}, cv=KFold(5, shuffle=True, random_state=RANDOM_STATE),
                              n_jobs=-1, scoring="neg_root_mean_squared_error"),
        "SVR-linear": GridSearchCV(Pipeline([("pre", pre), ("model", SVR(kernel="linear"))]),
                                   {"model__C":[0.1,1,10], "model__epsilon":[0.05,0.1,0.2]},
                                   cv=KFold(5, shuffle=True, random_state=RANDOM_STATE), n_jobs=-1,
                                   scoring="neg_root_mean_squared_error"),
        "SVR-poly": GridSearchCV(Pipeline([("pre", pre), ("model", SVR(kernel="poly"))]),
                                 {"model__degree":[2,3,4],"model__C":[0.5,1,10],"model__epsilon":[0.05,0.1],"model__coef0":[0,1]},
                                 cv=KFold(5, shuffle=True, random_state=RANDOM_STATE), n_jobs=-1,
                                 scoring="neg_root_mean_squared_error"),
        "SVR-rbf": GridSearchCV(Pipeline([("pre", pre), ("model", SVR(kernel="rbf"))]),
                                {"model__C":[1,10,50],"model__epsilon":[0.05,0.1],"model__gamma":["scale",0.01,0.1]},
                                cv=KFold(5, shuffle=True, random_state=RANDOM_STATE), n_jobs=-1,
                                scoring="neg_root_mean_squared_error"),
        "KNN": GridSearchCV(Pipeline([("pre", pre), ("model", KNeighborsRegressor())]),
                            {"model__n_neighbors":[3,5,9,15],"model__weights":["uniform","distance"]},
                            cv=KFold(5, shuffle=True, random_state=RANDOM_STATE), n_jobs=-1,
                            scoring="neg_root_mean_squared_error"),
        "MLPReg": GridSearchCV(Pipeline([("pre", pre), ("model", MLPRegressor(max_iter=1000, early_stopping=True, random_state=RANDOM_STATE))]),
                               {"model__hidden_layer_sizes":[(64,32),(128,64)],"model__alpha":[1e-5,1e-4],"model__learning_rate_init":[1e-3,5e-4]},
                               cv=KFold(5, shuffle=True, random_state=RANDOM_STATE), n_jobs=-1,
                               scoring="neg_root_mean_squared_error"),
    }

def classification_grids(pre):
    return {
        "LogReg": GridSearchCV(Pipeline([("pre", pre), ("model", LogisticRegression(max_iter=1000))]),
                               {"model__penalty":["l2"],"model__C":[0.5,1,5],"model__solver":["lbfgs"]},
                               cv=StratifiedKFold(5, shuffle=True, random_state=RANDOM_STATE), n_jobs=-1,
                               scoring="f1_macro"),
        "SVM-linear": GridSearchCV(Pipeline([("pre", pre), ("model", SVC(kernel="linear", probability=True))]),
                                   {"model__C":[0.5,1,5]},
                                   cv=StratifiedKFold(5, shuffle=True, random_state=RANDOM_STATE), n_jobs=-1,
                                   scoring="f1_macro"),
        "SVM-poly": GridSearchCV(Pipeline([("pre", pre), ("model", SVC(kernel="poly", probability=True))]),
                                 {"model__degree":[2,3],"model__C":[1,5],"model__coef0":[0,1]},
                                 cv=StratifiedKFold(5, shuffle=True, random_state=RANDOM_STATE), n_jobs=-1,
                                 scoring="f1_macro"),
        "SVM-rbf": GridSearchCV(Pipeline([("pre", pre), ("model", SVC(kernel="rbf", probability=True))]),
                                {"model__C":[1,5,10],"model__gamma":["scale",0.01]},
                                cv=StratifiedKFold(5, shuffle=True, random_state=RANDOM_STATE), n_jobs=-1,
                                scoring="f1_macro"),
        "KNN": GridSearchCV(Pipeline([("pre", pre), ("model", KNeighborsClassifier())]),
                            {"model__n_neighbors":[3,5,9,15],"model__weights":["uniform","distance"]},
                            cv=StratifiedKFold(5, shuffle=True, random_state=RANDOM_STATE), n_jobs=-1,
                            scoring="f1_macro"),
        "NB": GridSearchCV(Pipeline([("pre", pre), ("model", GaussianNB())]),
                           {}, cv=StratifiedKFold(5, shuffle=True, random_state=RANDOM_STATE),
                           n_jobs=-1, scoring="f1_macro"),
        "LDA": GridSearchCV(Pipeline([("pre", pre), ("model", LDA())]),
                            {"model__solver":["svd","lsqr"]},
                            cv=StratifiedKFold(5, shuffle=True, random_state=RANDOM_STATE), n_jobs=-1,
                            scoring="f1_macro"),
        "QDA": GridSearchCV(Pipeline([("pre", pre), ("model", QDA())]),
                            {"model__reg_param":[0.0,0.1,0.2]},
                            cv=StratifiedKFold(5, shuffle=True, random_state=RANDOM_STATE), n_jobs=-1,
                            scoring="f1_macro"),
        "MLPCls": GridSearchCV(Pipeline([("pre", pre), ("model", MLPClassifier(max_iter=1000, early_stopping=True, random_state=RANDOM_STATE))]),
                               {"model__hidden_layer_sizes":[(64,32),(128,64)],"model__alpha":[1e-5,1e-4],"model__learning_rate_init":[1e-3,5e-4]},
                               cv=StratifiedKFold(5, shuffle=True, random_state=RANDOM_STATE), n_jobs=-1,
                               scoring="f1_macro"),
    }

def train_all():
    os.makedirs("models", exist_ok=True)
    os.makedirs("reports", exist_ok=True)

    # --- regression grids ---
    reg = regression_grids(preprocessor)
    reg_results = []
    for name, gs in reg.items():
        gs.fit(X, y_reg)
        reg_results.append({
            "model": name,
            "best_params": gs.best_params_,
            "cv_rmse": -gs.best_score_,
        })
    reg_results.sort(key=lambda d: d["cv_rmse"])
    best_reg = reg[reg_results[0]["model"]].best_estimator_
    joblib.dump(best_reg, "models/best_regressor.joblib")

    # --- classification grids ---
    cls = classification_grids(preprocessor)
    cls_results = []
    for name, gs in cls.items():
        gs.fit(X, y_cls)
        cls_results.append({
            "model": name,
            "best_params": gs.best_params_,
            "cv_f1_macro": gs.best_score_,
        })
    cls_results.sort(key=lambda d: d["cv_f1_macro"], reverse=True)
    best_cls = cls[cls_results[0]["model"]].best_estimator_
    joblib.dump(best_cls, "models/best_classifier.joblib")

    with open("reports/leaderboards.json", "w") as f:
        json.dump({"regression": reg_results, "classification": cls_results}, f, indent=2)

if __name__ == "__main__":
    train_all()


  model = cd_fast.enet_coordinate_descent(


ValueError: 
All the 40 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
40 fits failed with the following error:
Traceback (most recent call last):
  File "d:\chris\Career\Udemy\Machine learning\Main project\carPrediction\Lib\site-packages\sklearn\model_selection\_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\chris\Career\Udemy\Machine learning\Main project\carPrediction\Lib\site-packages\sklearn\base.py", line 1365, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "d:\chris\Career\Udemy\Machine learning\Main project\carPrediction\Lib\site-packages\sklearn\pipeline.py", line 663, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
    ~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\chris\Career\Udemy\Machine learning\Main project\carPrediction\Lib\site-packages\sklearn\base.py", line 1365, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "d:\chris\Career\Udemy\Machine learning\Main project\carPrediction\Lib\site-packages\sklearn\neural_network\_multilayer_perceptron.py", line 849, in fit
    return self._fit(X, y, sample_weight=sample_weight, incremental=False)
           ~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\chris\Career\Udemy\Machine learning\Main project\carPrediction\Lib\site-packages\sklearn\neural_network\_multilayer_perceptron.py", line 508, in _fit
    self._fit_stochastic(
    ~~~~~~~~~~~~~~~~~~~~^
        X,
        ^^
    ...<7 lines>...
        incremental,
        ^^^^^^^^^^^^
    )
    ^
  File "d:\chris\Career\Udemy\Machine learning\Main project\carPrediction\Lib\site-packages\sklearn\neural_network\_multilayer_perceptron.py", line 748, in _fit_stochastic
    self._update_no_improvement_count(
    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^
        early_stopping, X_val, y_val, sample_weight_val
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    )
    ^
  File "d:\chris\Career\Udemy\Machine learning\Main project\carPrediction\Lib\site-packages\sklearn\neural_network\_multilayer_perceptron.py", line 798, in _update_no_improvement_count
    val_score = self._score(X, y, sample_weight=sample_weight)
  File "d:\chris\Career\Udemy\Machine learning\Main project\carPrediction\Lib\site-packages\sklearn\neural_network\_multilayer_perceptron.py", line 1289, in _score
    return super()._score_with_function(
           ~~~~~~~~~~~~~~~~~~~~~~~~~~~~^
        X, y, sample_weight=sample_weight, score_function=accuracy_score
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    )
    ^
  File "d:\chris\Career\Udemy\Machine learning\Main project\carPrediction\Lib\site-packages\sklearn\neural_network\_multilayer_perceptron.py", line 864, in _score_with_function
    if np.isnan(y_pred).any() or np.isinf(y_pred).any():
       ~~~~~~~~^^^^^^^^
TypeError: ufunc 'isnan' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''
