In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import optuna
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [7]:
# Load the diabetes CSV from the Kaggle input directory into a DataFrame.
df = pd.read_csv(
    filepath_or_buffer="/kaggle/input/pima-indians-diabetes-database/diabetes.csv",
)

In [8]:
# Per-column count of NaN cells (values stored as 0 are not counted here).
df.isna().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [9]:
# Preview the first ten rows of the frame.
df.head(n=10)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
5,5,116,74,0,0,25.6,0.201,30,0
6,3,78,50,32,88,31.0,0.248,26,1
7,10,115,0,0,0,35.3,0.134,29,0
8,2,197,70,45,543,30.5,0.158,53,1
9,8,125,96,0,0,0.0,0.232,54,1


In [12]:
# In these columns a zero is not a valid value, so treat it as a missing
# entry and turn it into NaN.
cols_with_missing_vals = [
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
]
zero_free = df[cols_with_missing_vals].replace(to_replace=0, value=np.nan)
df[cols_with_missing_vals] = zero_free

In [13]:
# Fill every NaN with the mean of its column. Rebind instead of using
# inplace=True so the cell produces a new frame rather than mutating the
# object that earlier cells displayed.
# NOTE(review): the means are computed over the full frame, before the
# train/test split further down, so test rows influence the imputed values.
df = df.fillna(df.mean())

In [14]:
# The "Outcome" column is the target; every remaining column is a feature.
y = df["Outcome"]
X = df.drop(columns=["Outcome"])

In [15]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [17]:
# Learn the scaling statistics from the training features only, then apply
# the same fitted scaler to both splits.
# Note: X_train / X_test are rebound to the scaled arrays, so re-running this
# cell without re-running the split would scale already-scaled data.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Report the dimensions of both splits
print(f'Training set shape: {X_train.shape}')
print(f'Test set shape: {X_test.shape}')

Training set shape: (614, 8)
Test set shape: (154, 8)


In [19]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

In [22]:
def objective(trials):
    """Optuna objective: mean cross-validated accuracy of a random forest.

    Parameters
    ----------
    trials
        Despite the plural name, this is used as a single trial handle: only
        its ``suggest_int`` method is called to draw the hyperparameters.

    Returns
    -------
    The mean of the accuracy scores returned by ``cross_val_score`` with
    ``cv=5`` on the notebook-level ``X_train`` / ``y_train``.
    """
    # Draw the hyperparameters (n_estimators first, then max_depth).
    hyperparams = {
        "n_estimators": trials.suggest_int("n_estimators", 50, 200),
        "max_depth": trials.suggest_int("max_depth", 3, 20),
    }

    # random_state is fixed so the forest's own randomness is the same in
    # every trial.
    forest = RandomForestClassifier(random_state=42, **hyperparams)

    # 5-fold cross-validation (cv=5); the per-fold accuracies are averaged
    # into the single value the study optimizes.
    fold_scores = cross_val_score(forest, X_train, y_train, cv=5, scoring="accuracy")
    return fold_scores.mean()

In [23]:
# Create a study that maximizes the objective's return value and run 50 trials.
# The TPE sampler is seeded so the sequence of suggested hyperparameters, and
# therefore best_value / best_params, is the same on every fresh run.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50)

[I 2025-05-20 18:25:59,378] A new study created in memory with name: no-name-e21f6a46-88c2-46cf-a4df-8e93bb1a7bb8
[I 2025-05-20 18:26:00,797] Trial 0 finished with value: 0.7655337864854058 and parameters: {'n_estimators': 163, 'max_depth': 10}. Best is trial 0 with value: 0.7655337864854058.
[I 2025-05-20 18:26:01,572] Trial 1 finished with value: 0.7524723443955752 and parameters: {'n_estimators': 102, 'max_depth': 3}. Best is trial 0 with value: 0.7655337864854058.
[I 2025-05-20 18:26:03,005] Trial 2 finished with value: 0.767119818739171 and parameters: {'n_estimators': 164, 'max_depth': 19}. Best is trial 2 with value: 0.767119818739171.
[I 2025-05-20 18:26:03,931] Trial 3 finished with value: 0.7638411302145808 and parameters: {'n_estimators': 105, 'max_depth': 16}. Best is trial 2 with value: 0.767119818739171.
[I 2025-05-20 18:26:04,683] Trial 4 finished with value: 0.7654671464747433 and parameters: {'n_estimators': 86, 'max_depth': 13}. Best is trial 2 with value: 0.767119818

In [24]:
# Objective value (mean CV accuracy) of the best trial in the study.
study.best_trial.value

0.7753431960549115

In [26]:
# Hyperparameters of the best trial in the study.
study.best_trial.params

{'n_estimators': 70, 'max_depth': 6}