# [DM 2025/26] Hyperparameter Tuning

In [10]:
# Utilities
import pandas as pd
import json
import numpy as np

# ML
from sklearn.model_selection import RandomizedSearchCV

from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from xgboost import XGBClassifier

In [11]:
RANDOM_SEED = 42
N_SEARCH_ITERATIONS = 10
N_CROSS_VALIDATION_FOLDS = 5
SCORING = "accuracy"

In [12]:
model_search = {
    "DecisionTree": {
        "instance": DecisionTreeClassifier(random_state=RANDOM_SEED),
        "hyperparameters": {
            "max_depth": list(range(1, 50)) + [None],
            "criterion": ["gini", "entropy", "log_loss"]
		}
	},
    "NaiveBayes": {
        "instance": GaussianNB(),
        "hyperparameters": {}
	},
    "MultinomialNB": {
        "instance": MultinomialNB(),
        "hyperparameters": {}
	},
    "BernoulliNB": {
        "instance": BernoulliNB(),
        "hyperparameters": {}
	},
    "KNN": {
        "instance": KNeighborsClassifier(),
        "hyperparameters": {
            "n_neighbors": range(1, 15),
            "metric": ["euclidean", "cosine", "minkowski"]
		}
	},
	"SVC": {
		"instance": SVC(random_state=RANDOM_SEED),
		"hyperparameters": {
			"kernel": ["linear", "rbf"],
			"C": [0.1, 1, 10, 100],
			"gamma": ["scale", 0.01, 0.1, 1]
		}
	},
    "MLP": {
        "instance": MLPClassifier(random_state=RANDOM_SEED),
        "hyperparameters": {
            "hidden_layer_sizes": range(1, 50),
            "learning_rate": ["constant", "invscaling", "adaptive"]
		}
	},
    "RandomForest": {
        "instance": RandomForestClassifier(random_state=RANDOM_SEED),
        "hyperparameters": {
            "n_estimators": range(100, 200),
            "max_depth": list(range(1, 50)) + [None],
            "criterion": ["gini", "entropy", "log_loss"]
		}
	},
    "AdaBoost": {
        "instance": AdaBoostClassifier(random_state=RANDOM_SEED),
        "hyperparameters": {
            "n_estimators": range(100, 200)
		}
	},
    "XGBoost": {
        "instance": XGBClassifier(),
        "hyperparameters": {
            "n_estimators": range(50, 200),
            "max_depth": list(range(1, 50)) + [None],
		}
	}
    
}

Import dei dataset estratti dal Notebook Main

In [13]:
# Import dei dataset originali

X_train = pd.read_csv("data/X_train.csv")
X_test = pd.read_csv("data/X_test.csv")
y_train = pd.read_csv("data/y_train.csv")
y_test = pd.read_csv("data/y_test.csv")

np.unique_counts(y_train)

UniqueCountsResult(values=array([0, 1]), counts=array([2857, 1196]))

In [14]:
# Import dei dataset dopo undersampling

X_train_under = pd.read_csv("data/X_train_under.csv", sep=",")
y_train_under = pd.read_csv("data/y_train_under.csv", sep=",")#.squeeze().map(str)

np.unique_counts(y_train_under)

UniqueCountsResult(values=array([0, 1]), counts=array([1196, 1196]))

In [15]:
# Import dei dataset dopo oversampling

X_train_over = pd.read_csv("data/X_train_over.csv")
y_train_over = pd.read_csv("data/y_train_over.csv")#.squeeze().map(str)

np.unique_counts(y_train_over)

UniqueCountsResult(values=array([0, 1]), counts=array([2857, 2960]))

In [16]:
tuned_models = {}

for sampling in ("over", "under", "none"):
	data = X_train
	labels = y_train
	if sampling == "under":
		data = X_train_under
		labels = y_train_under
	elif sampling == "over":
		data = X_train_over
		labels = y_train_over

	tuned_models[sampling] = {}

	for model in model_search:
		random_search = RandomizedSearchCV(
			estimator=model_search[model].get("instance"),
			param_distributions=model_search[model].get("hyperparameters"),
			n_iter=N_SEARCH_ITERATIONS,
			scoring=SCORING,
			cv=N_CROSS_VALIDATION_FOLDS,
			random_state=RANDOM_SEED
		)

		res = random_search.fit(data, labels)
		res.best_params_, res.best_score_
		print(f"Hyperparameters for {model} selected!")

		tuned_models[sampling][model] = {"params": res.best_params_, "accuracy": res.best_score_}

		with open("tuned_hyperparameters.json", mode="w") as file:
			json.dump(tuned_models, file, indent=4)

Hyperparameters for DecisionTree selected!
Hyperparameters for NaiveBayes selected!


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


ValueError: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\fraca\Desktop\Universita\Applied AI\DM\phoneme_classification\DM\Lib\site-packages\sklearn\model_selection\_validation.py", line 833, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\fraca\Desktop\Universita\Applied AI\DM\phoneme_classification\DM\Lib\site-packages\sklearn\base.py", line 1336, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\fraca\Desktop\Universita\Applied AI\DM\phoneme_classification\DM\Lib\site-packages\sklearn\naive_bayes.py", line 794, in fit
    self._count(X, Y)
  File "c:\Users\fraca\Desktop\Universita\Applied AI\DM\phoneme_classification\DM\Lib\site-packages\sklearn\naive_bayes.py", line 921, in _count
    check_non_negative(X, "MultinomialNB (input X)")
  File "c:\Users\fraca\Desktop\Universita\Applied AI\DM\phoneme_classification\DM\Lib\site-packages\sklearn\utils\validation.py", line 1775, in check_non_negative
    raise ValueError(f"Negative values in data passed to {whom}.")
ValueError: Negative values in data passed to MultinomialNB (input X).
