<a href="https://colab.research.google.com/github/dsmohiit/Machine-Learning-Repo/blob/main/Bayesian_Optimization_Hyperparameter_Tuning_using_Optuna.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.4.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.16.3-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.4.0-py3-none-any.whl (395 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m395.9/395.9 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.16.3-py3-none-any.whl (246 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m246.9/246.9 kB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, alembic, optuna
Successfully installed alembic-1.16.3 colorlog-6.9.0 optuna-4.4.0


In [51]:
import numpy as np
import pandas as pd
import optuna

In [52]:
# Read the diabetes CSV from the Colab runtime's /content directory into a DataFrame.
df = pd.read_csv(filepath_or_buffer="/content/diabetes.csv")

In [53]:
# Preview the first five rows to sanity-check the columns that were loaded.
df.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [54]:
# Columns in which a recorded 0 is treated as a placeholder for a missing measurement.
col_with_missing_values = [
    "Glucose",
    "BloodPressure",
    "SkinThickness",
    "Insulin",
    "BMI",
    "DiabetesPedigreeFunction",
    "Age",
]

# Turn the zero placeholders into NaN so pandas counts them as missing values.
df[col_with_missing_values] = df[col_with_missing_values].replace(
    to_replace=0,
    value=np.nan,
)

In [55]:
# Count the missing values per column after the zero -> NaN replacement.
df.isna().sum()

Unnamed: 0,0
Pregnancies,0
Glucose,5
BloodPressure,35
SkinThickness,227
Insulin,374
BMI,11
DiabetesPedigreeFunction,0
Age,0
Outcome,0


In [56]:
# Fill every NaN with the mean of its own column.
# NOTE(review): the means are computed over the full frame, i.e. before the
# train/test split further down, so test rows contribute to the imputed values.
# Consider imputing after the split using training-set statistics only.
df = df.fillna(value=df.mean())

In [57]:
# Confirm that no missing values remain after imputation.
df.isna().sum()

Unnamed: 0,0
Pregnancies,0
Glucose,0
BloodPressure,0
SkinThickness,0
Insulin,0
BMI,0
DiabetesPedigreeFunction,0
Age,0
Outcome,0


In [58]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [59]:
# Every column except the last is a feature; the last column is the target.
X, y = df.iloc[:, :-1], df.iloc[:, -1]

# Hold out 30% of the rows for the final evaluation, with a fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    shuffle=True,
    random_state=42,
)

print("X_train shape: ", X_train.shape)
print("X_test shape: ", X_test.shape)

# Learn the scaling statistics from the training rows only, then apply them to both splits.
scaler = StandardScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

X_train shape:  (537, 8)
X_test shape:  (231, 8)


### BayesianSearch using Optuna (TPE sampler)

In [32]:
# Define objective function
def objective(trail):
  """Return the mean 10-fold CV accuracy of a random forest for one Optuna trial.

  Args:
    trail: Optuna trial used to sample `n_estimators` (50-200) and
      `max_depth` (3-20).

  Returns:
    Mean of the per-fold accuracy scores computed on `X_train` / `y_train`.
  """
  # Hyperparameters sampled for this trial, keyed by the names Optuna records.
  sampled_params = {
      "n_estimators": trail.suggest_int("n_estimators", 50, 200),
      "max_depth": trail.suggest_int("max_depth", 3, 20),
  }

  forest = RandomForestClassifier(random_state=42, **sampled_params)

  # Shuffled folds with a fixed seed so every trial is scored on the same partitions.
  splitter = KFold(n_splits=10, shuffle=True, random_state=42)

  fold_accuracies = cross_val_score(
      estimator=forest,
      X=X_train,
      y=y_train,
      cv=splitter,
      scoring="accuracy",
      verbose=2,
      n_jobs=-1,
  )
  return fold_accuracies.mean()

In [33]:
# Bayesian search: the TPE sampler proposes each new trial based on earlier results.
# The sampler is seeded so the sequence of suggested hyperparameters (and therefore
# the best trial) is repeatable across reruns, matching the fixed random_state=42
# used for the model and the CV folds.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50)

[I 2025-07-09 11:57:09,137] A new study created in memory with name: no-name-605d2322-6844-4c16-9310-863eaf4a2983
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    5.8s finished
[I 2025-07-09 11:57:14,972] Trial 0 finished with value: 0.7672955974842768 and parameters: {'n_estimators': 128, 'max_depth': 12}. Best is trial 0 with value: 0.7672955974842768.
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    4.9s finished
[I 2025-07-09 11:57:19,842] Trial 1 finished with value: 0.7728860936408106 and parameters: {'n_estimators': 185, 'max_depth': 18}. Best is trial 1 with value: 0.7728860936408106.
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    2.0s finished
[I 2025-07-09 11:57:21,889] Trial 2 finished with value: 0.7618099231306779 and param

In [41]:
# Report the best cross-validated score found by the search and the parameters that produced it.
best_trial = study.best_trial
print("Best trial accuracy: ", best_trial.value)
print("Best hyperparameters: ", best_trial.params)

Best trial accuracy:  0.7747728860936409
Best hyperparameters:  {'n_estimators': 142, 'max_depth': 19}


In [46]:
# Rebuild the forest with the best hyperparameters and refit it on the full training split.
best_params = study.best_trial.params
best_model = RandomForestClassifier(random_state=42, **best_params)

best_model.fit(X_train, y_train)

In [47]:
# Score the tuned forest on the held-out test split.
y_pred = best_model.predict(X_test)
print("Accuracy: ", accuracy_score(y_test, y_pred))

Acuuracy:  0.7445887445887446


In [60]:
"""
Above we performed Bayesian search using TPESampler; Optuna can also perform randomized search and grid search by swapping the sampler.
"""

'\nAbove we performed Bayesian search using TPESampler; Optuna can also perform randomized search and grid search by swapping the sampler.\n'

### RandomizedSearch using Optuna

In [67]:
def objective(trial):
  """Return the mean 5-fold CV accuracy of a random forest for one Optuna trial.

  Note: this redefines the `objective` declared earlier in the notebook, with a
  wider `n_estimators` range (50-250) and 5 folds instead of 10.

  Args:
    trial: Optuna trial used to sample `n_estimators` (50-250) and
      `max_depth` (3-20).

  Returns:
    Mean of the per-fold accuracy scores computed on `X_train` / `y_train`.
  """
  # Sample in the same order as the parameters are declared: tree count, then depth.
  tree_count = trial.suggest_int("n_estimators", 50, 250)
  depth_limit = trial.suggest_int("max_depth", 3, 20)

  forest = RandomForestClassifier(
      n_estimators=tree_count,
      max_depth=depth_limit,
      random_state=42,
  )

  # Shuffled folds with a fixed seed so every trial is scored on the same partitions.
  splitter = KFold(n_splits=5, shuffle=True, random_state=42)

  fold_accuracies = cross_val_score(
      estimator=forest,
      X=X_train,
      y=y_train,
      scoring="accuracy",
      cv=splitter,
      verbose=2,
      n_jobs=-1,
  )
  return fold_accuracies.mean()

In [68]:
# Randomized search: every trial is sampled independently from the search space.
# The sampler is seeded so the 50 sampled configurations (and therefore the best
# trial) are repeatable across reruns.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.RandomSampler(seed=42),
)
study.optimize(objective, n_trials=50)

[I 2025-07-09 12:23:58,348] A new study created in memory with name: no-name-f199df6e-b0f6-46d3-ac88-54dc5c352e6a
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    4.5s finished
[I 2025-07-09 12:24:02,816] Trial 0 finished with value: 0.7578746971270336 and parameters: {'n_estimators': 67, 'max_depth': 3}. Best is trial 0 with value: 0.7578746971270336.
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    1.3s finished
[I 2025-07-09 12:24:04,136] Trial 1 finished with value: 0.7578920041536863 and parameters: {'n_estimators': 182, 'max_depth': 3}. Best is trial 1 with value: 0.7578920041536863.
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    1.2s finished
[I 2025-07-09 12:24:05,397] Trial 2 finished with value: 0.7764970578054691 and paramete

In [71]:
# Report the best cross-validated score found by the search and the parameters that produced it.
best_trial = study.best_trial
print("Best trial accuracy: ", best_trial.value)
print("Best trial parameters: ", best_trial.params)

Best trial accuracy:  0.7821391484942886
Best trial parameters:  {'n_estimators': 114, 'max_depth': 10}


In [72]:
# Rebuild the forest with the best hyperparameters and refit it on the full training split.
best_params = study.best_trial.params
best_model = RandomForestClassifier(random_state=42, **best_params)

best_model.fit(X_train, y_train)

In [73]:
# Predict labels for the held-out test split with the refitted model.
y_pred = best_model.predict(X=X_test)

In [74]:
# Test-set accuracy of the model tuned in this section.
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy: ", test_accuracy)

Accuracy:  0.7532467532467533


### GridSearch using Optuna

In [78]:
# Discrete grid for the grid search: 5 forest sizes x 4 depth limits = 20 combinations.
search_space = dict(
    n_estimators=list(range(50, 251, 50)),
    max_depth=list(range(5, 21, 5)),
)

In [79]:
# Grid search: GridSampler evaluates the combinations listed in `search_space`.
# The grid is visited in a shuffled order, so the sampler is seeded to make the
# trial order (and the logged trial numbers) repeatable across reruns.
# No n_trials is passed: per the Optuna docs the sampler stops the study once
# every combination has been evaluated.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.GridSampler(search_space, seed=42),
)
study.optimize(objective)

[I 2025-07-09 12:41:23,710] A new study created in memory with name: no-name-5e439656-a871-4d83-bd01-93b8222880d2
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    3.0s finished
[I 2025-07-09 12:41:26,748] Trial 0 finished with value: 0.7709415022499135 and parameters: {'n_estimators': 200, 'max_depth': 20}. Best is trial 0 with value: 0.7709415022499135.
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.9s finished
[I 2025-07-09 12:41:27,659] Trial 1 finished with value: 0.7634648667358948 and parameters: {'n_estimators': 100, 'max_depth': 5}. Best is trial 0 with value: 0.7709415022499135.
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    2.2s finished
[I 2025-07-09 12:41:29,847] Trial 2 finished with value: 0.7672031844929041 and parame