<a href="https://colab.research.google.com/github/dsmohiit/Machine-Learning-Repo/blob/main/Bayesian_Optimization_Hyperparameter_Tuning_using_Optuna.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.4.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.16.3-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.4.0-py3-none-any.whl (395 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m395.9/395.9 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.16.3-py3-none-any.whl (246 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m246.9/246.9 kB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, alembic, optuna
Successfully installed alembic-1.16.3 colorlog-6.9.0 optuna-4.4.0


In [87]:
import numpy as np
import pandas as pd
import optuna

In [88]:
df = pd.read_csv("/content/diabetes.csv")

In [89]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [90]:
col_with_missing_values = ["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI",	"DiabetesPedigreeFunction",	"Age"]
df[col_with_missing_values] = df[col_with_missing_values].replace(0, np.nan)

In [91]:
df.isnull().sum()

Unnamed: 0,0
Pregnancies,0
Glucose,5
BloodPressure,35
SkinThickness,227
Insulin,374
BMI,11
DiabetesPedigreeFunction,0
Age,0
Outcome,0


In [92]:
df = df.fillna(df.mean())

In [93]:
df.isnull().sum()

Unnamed: 0,0
Pregnancies,0
Glucose,0
BloodPressure,0
SkinThickness,0
Insulin,0
BMI,0
DiabetesPedigreeFunction,0
Age,0
Outcome,0


In [94]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [95]:
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.3, shuffle= True, random_state= 42)

print("X_train shape: ", X_train.shape)
print("X_test shape: ", X_test.shape)

scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

X_train shape:  (537, 8)
X_test shape:  (231, 8)


### BayesianSearch

In [96]:
# Define objective function
def objective(trail):

  # Search space
  n_estimators = trail.suggest_int("n_estimators", 50, 200)
  max_depth = trail.suggest_int("max_depth", 3, 20)

  # Model
  model = RandomForestClassifier(
      n_estimators= n_estimators,
      max_depth= max_depth,
      random_state= 42,
  )

  # KFold CV
  k_fold = KFold(
      n_splits= 10,
      shuffle= True,
      random_state= 42
  )

  scores = cross_val_score(
      estimator= model,
      X= X_train,
      y= y_train,
      cv= k_fold,
      scoring= "accuracy",
      verbose= 2,
      n_jobs= -1
  ).mean()

  return scores

In [97]:
study = optuna.create_study(direction= "maximize", sampler= optuna.samplers.TPESampler())
study.optimize(objective, n_trials= 50)

[I 2025-07-09 12:53:19,141] A new study created in memory with name: no-name-2ad4571d-9933-4314-b734-08886790862d
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    6.6s finished
[I 2025-07-09 12:53:25,790] Trial 0 finished with value: 0.7709993011879804 and parameters: {'n_estimators': 135, 'max_depth': 12}. Best is trial 0 with value: 0.7709993011879804.
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    2.9s finished
[I 2025-07-09 12:53:28,691] Trial 1 finished with value: 0.7673654786862334 and parameters: {'n_estimators': 134, 'max_depth': 17}. Best is trial 0 with value: 0.7709993011879804.
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    1.9s finished
[I 2025-07-09 12:53:30,613] Trial 2 finished with value: 0.7561844863731657 and param

In [98]:
print("Best trial accuracy: ", study.best_trial.value)
print("Best hyperparameters: ", study.best_trial.params)

Best trial accuracy:  0.7729210342417889
Best hyperparameters:  {'n_estimators': 144, 'max_depth': 18}


In [99]:
best_model = RandomForestClassifier(**study.best_trial.params, random_state= 42)

best_model.fit(X_train, y_train)

In [100]:
y_pred = best_model.predict(X_test)
print("Acuuracy: ", accuracy_score(y_test, y_pred))

Acuuracy:  0.7445887445887446


In [101]:
from optuna.visualization import plot_optimization_history, plot_parallel_coordinate, plot_slice, plot_contour, plot_param_importances

In [102]:
plot_optimization_history(study).show()

In [104]:
plot_parallel_coordinate(study).show()

In [106]:
plot_slice(study)

In [107]:
plot_contour(study)

In [109]:
plot_param_importances(study)

In [60]:
"""
Above we have performed Bayesian Search using TPESampler, but we can also perform RandomizedSearch and GridSearch as well
"""

'\nAbove we have performed Bayesian Search using TPESampler, but we can also perform RandomizedSearch and GridSearch as well\n'

### RandomizedSearch using Optuna

In [110]:
def objective(trial):

  n_estimators= trial.suggest_int("n_estimators", 50, 250)
  max_depth = trial.suggest_int("max_depth", 3, 20)

  model = RandomForestClassifier(
      n_estimators= n_estimators,
      max_depth= max_depth,
      random_state= 42
  )

  k_fold = KFold(
      n_splits= 5,
      shuffle = True,
      random_state= 42
  )

  scores = cross_val_score(
      estimator= model,
      X= X_train,
      y= y_train,
      scoring= "accuracy",
      cv= k_fold,
      verbose= 2,
      n_jobs= -1
  ).mean()

  return scores

In [111]:
study = optuna.create_study(direction= "maximize", sampler= optuna.samplers.RandomSampler())
study.optimize(objective, n_trials= 50)

[I 2025-07-09 13:00:25,792] A new study created in memory with name: no-name-258ed350-810d-4f8c-8507-11c22ce61309
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    4.7s finished
[I 2025-07-09 13:00:30,505] Trial 0 finished with value: 0.7802180685358255 and parameters: {'n_estimators': 179, 'max_depth': 11}. Best is trial 0 with value: 0.7802180685358255.
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.7s finished
[I 2025-07-09 13:00:31,206] Trial 1 finished with value: 0.7597265489788854 and parameters: {'n_estimators': 90, 'max_depth': 5}. Best is trial 0 with value: 0.7802180685358255.
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.8s finished
[I 2025-07-09 13:00:31,996] Trial 2 finished with value: 0.7653686396677051 and paramet

In [112]:
print("Best trial accuracy: ", study.best_trial.value)
print("Best trial parameters: ", study.best_trial.params)

Best trial accuracy:  0.7802526825891312
Best trial parameters:  {'n_estimators': 176, 'max_depth': 10}


In [113]:
best_model = RandomForestClassifier(**study.best_trial.params, random_state= 42)

best_model.fit(X_train, y_train)

In [114]:
y_pred = best_model.predict(X_test)

In [115]:
print("Accuracy: ", accuracy_score(y_test, y_pred))

Accuracy:  0.7489177489177489


In [116]:
plot_optimization_history(study)

In [117]:
plot_parallel_coordinate(study)

In [118]:
plot_slice(study)

In [119]:
plot_contour(study)

In [121]:
plot_param_importances(study)

### GridSearch using Optuna

In [122]:
search_space = {
    "n_estimators": [50, 100, 150, 200, 250],
    "max_depth": [5, 10, 15, 20]
}

In [123]:
study = optuna.create_study(direction= "maximize", sampler= optuna.samplers.GridSampler(search_space))
study.optimize(objective)

[I 2025-07-09 13:03:58,028] A new study created in memory with name: no-name-4ba4c8fd-1ccb-4545-aa10-56029af2e3bb
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    1.8s finished
[I 2025-07-09 13:03:59,875] Trial 0 finished with value: 0.7709415022499135 and parameters: {'n_estimators': 200, 'max_depth': 20}. Best is trial 0 with value: 0.7709415022499135.
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.8s finished
[I 2025-07-09 13:04:00,709] Trial 1 finished with value: 0.7634648667358948 and parameters: {'n_estimators': 100, 'max_depth': 5}. Best is trial 0 with value: 0.7709415022499135.
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    3.5s finished
[I 2025-07-09 13:04:04,182] Trial 2 finished with value: 0.7672031844929041 and parame

In [124]:
plot_optimization_history(study).show()

In [126]:
plot_parallel_coordinate(study)

In [127]:
plot_slice(study)

In [128]:
plot_contour(study)

In [129]:
plot_param_importances(study)