In [19]:
!pip install scikit-optimize --quiet

In [20]:
# Import necessary libraries
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score

from skopt import BayesSearchCV
from skopt.space import Real, Integer, Categorical

In [21]:
# Load the Iris sample dataset and hold out 20% of the rows as a test set.
iris = load_iris()
X = iris.data
y = iris.target
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [22]:
# 1. Define the model you want to tune.
#    Only random_state is fixed here; the remaining hyperparameters are left at their
#    defaults for the search to override.
model = RandomForestClassifier(random_state=42)

In [23]:
# 2. Define the hyperparameter search space.
#    Every key must be the name of a parameter of the model being tuned; every value is
#    a dimension built from skopt.space:
#    - Real: a continuous range (e.g., a learning rate)
#    - Integer: an integer range (e.g., the number of trees)
#    - Categorical: a fixed set of choices (e.g., the split criterion)
param_space = dict(
    # Number of trees in the forest (between 50 and 200)
    n_estimators=Integer(50, 200),
    # Maximum depth of each tree (between 5 and 15)
    max_depth=Integer(5, 15),
    # Minimum number of samples required to split an internal node (between 2 and 10)
    min_samples_split=Integer(2, 10),
    # Minimum number of samples required at a leaf node (between 1 and 5)
    min_samples_leaf=Integer(1, 5),
    # Function used to measure the quality of a split (either 'gini' or 'entropy')
    criterion=Categorical(['gini', 'entropy']),
)

In [39]:
def weighted_f1(estimator, X, y):
    """
    Scoring callable: weighted-average F1 of ``estimator`` on ``(X, y)``.

    Calls ``estimator.predict(X)`` and compares the predictions against ``y``
    with ``f1_score(..., average='weighted')``. Returns that score.
    """
    return f1_score(y, estimator.predict(X), average='weighted')

# 3. Initialize the BayesSearchCV object
bayes_search = BayesSearchCV(
    estimator=model,            # the model to tune (our RandomForestClassifier)
    search_spaces=param_space,  # the hyperparameter search space defined above
    scoring=weighted_f1,        # metric to maximize: the custom weighted F1 scorer (not accuracy)
    n_iter=50,                  # how many hyperparameter combinations to try
    cv=3,                       # cross-validation folds used to evaluate each combination
    n_jobs=-1,                  # jobs to run in parallel (-1 means use all available cores)
    random_state=42,            # for reproducibility
)

In [40]:
# 4. Run the Bayesian Optimization
#    The search is fitted on the training split only; the test split is kept aside
#    for the final evaluation.
print("Starting Bayesian Optimization...")
bayes_search.fit(X_train, y_train)
print("Bayesian Optimization finished!")

Starting Bayesian Optimization...
Bayesian Optimization finished!


In [44]:
# 5. Get the best hyperparameters and the best score.
#    The search was scored with weighted_f1, so best_score_ is a weighted F1-score,
#    not an accuracy; the labels below say so explicitly.
print("\nBest hyperparameters found:")
print(bayes_search.best_params_)
print("\nBest weighted F1-score (on cross-validation):")
print(bayes_search.best_score_)


# 6. Evaluate the model with the best hyperparameters on the test set,
#    using the same metric (weighted F1) that the search optimized.
best_model = bayes_search.best_estimator_
y_pred = best_model.predict(X_test)
test_weighted_f1 = f1_score(y_test, y_pred, average='weighted')
# Old name kept as an alias so any later cell that reads it keeps working.
# Note that it holds a weighted F1-score, not an accuracy.
test_accuracy = test_weighted_f1
print("\nTest weighted F1-score with the best hyperparameters:")
print(test_weighted_f1)


Best hyperparameters found:
OrderedDict([('criterion', 'gini'), ('max_depth', 12), ('min_samples_leaf', 5), ('min_samples_split', 5), ('n_estimators', 151)])

Best accuracy score (on cross-validation):
0.9580911025968497

Test accuracy with the best hyperparameters:
1.0
