In [46]:
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
import optuna
from sklearn.model_selection import cross_val_score


In [47]:
# Load the data
df_resampled = pd.read_csv('../../data/processed/heart_resampled_se.csv')
df = pd.read_csv('../../data/processed/heart_se.csv')

# Original Data

In [48]:
# Split the data into features and target variable
X = df.drop(['target','index'], axis=1)
y = df['target']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [49]:
# Define the objective function for Optuna
def objective(trial):
    # Define the hyperparameters to tune
    param = {
        'verbosity': 0,
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'use_label_encoder': False,
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'max_depth': trial.suggest_int('max_depth', 2, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 5),
        'lambda': trial.suggest_float('lambda', 1e-8, 10.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-8, 10.0, log=True)
    }

    # Initialize the model with the current hyperparameters
    model = xgb.XGBClassifier(**param)
    
    # Perform cross-validation
    score = cross_val_score(model, X_train, y_train, cv=3, scoring='accuracy').mean()
    
    return score

# Create a study and optimize the objective function
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=200)

# Get the best hyperparameters
best_params = study.best_params
print("Best hyperparameters: ", best_params)

[I 2024-08-27 16:10:29,624] A new study created in memory with name: no-name-c529402b-5bfc-4460-9eaf-fe0ddd493a4c
[I 2024-08-27 16:10:29,788] Trial 0 finished with value: 0.7934156378600822 and parameters: {'n_estimators': 86, 'max_depth': 4, 'learning_rate': 0.11501332386207593, 'subsample': 0.7776218542270288, 'colsample_bytree': 0.9219422285725694, 'gamma': 1.2909553112049266, 'lambda': 0.004460455728215631, 'alpha': 0.21179983392466206}. Best is trial 0 with value: 0.7934156378600822.
[I 2024-08-27 16:10:30,143] Trial 1 finished with value: 0.7562242798353909 and parameters: {'n_estimators': 276, 'max_depth': 3, 'learning_rate': 0.05617206963081353, 'subsample': 0.5842627755842238, 'colsample_bytree': 0.9796231094179786, 'gamma': 2.266841618110922, 'lambda': 3.623690428637826e-06, 'alpha': 1.1002280073202675e-06}. Best is trial 0 with value: 0.7934156378600822.
[I 2024-08-27 16:10:30,400] Trial 2 finished with value: 0.7728395061728396 and parameters: {'n_estimators': 264, 'max_dep

Best hyperparameters:  {'n_estimators': 295, 'max_depth': 3, 'learning_rate': 0.02253757253299915, 'subsample': 0.5540065503593498, 'colsample_bytree': 0.5743780883012337, 'gamma': 1.1610130113565666, 'lambda': 1.7078121471117709, 'alpha': 0.01110114998230853}


In [50]:
from optuna.visualization import plot_optimization_history, plot_param_importances, plot_parallel_coordinate, plot_slice

# Plot the optimization history
opt_history = plot_optimization_history(study)
opt_history.show()


In [51]:
# Plot the hyperparameter importances
param_importance = plot_param_importances(study)
param_importance.show()


In [52]:

# Plot the parallel coordinate plot to visualize hyperparameter interactions
parallel_coordinate = plot_parallel_coordinate(study)
parallel_coordinate.show()

In [53]:

# Plot the slice plot for each hyperparameter
slice_plot = plot_slice(study)
slice_plot.show()

# Resampled

In [54]:
# Split the data into features and target variable
X = df_resampled.drop(['target','index'], axis=1)
y = df_resampled['target']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [55]:
# Define the objective function for Optuna
def objective(trial):
    # Define the hyperparameters to tune
    param = {
        'verbosity': 0,
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'use_label_encoder': False,
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'max_depth': trial.suggest_int('max_depth', 2, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 5),
        'lambda': trial.suggest_float('lambda', 1e-8, 10.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-8, 10.0, log=True)
    }

    # Initialize the model with the current hyperparameters
    model = xgb.XGBClassifier(**param)
    
    # Perform cross-validation
    score = cross_val_score(model, X_train, y_train, cv=3, scoring='accuracy').mean()
    
    return score

# Create a study and optimize the objective function
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=200)

# Get the best hyperparameters
best_params = study.best_params
print("Best hyperparameters: ", best_params)

[I 2024-08-27 16:11:53,714] A new study created in memory with name: no-name-ac0f2511-64ef-4b44-98fe-504db2ba6815
[I 2024-08-27 16:11:53,891] Trial 0 finished with value: 0.8068181818181818 and parameters: {'n_estimators': 151, 'max_depth': 4, 'learning_rate': 0.11944849030910791, 'subsample': 0.7868387015390614, 'colsample_bytree': 0.6342947798363692, 'gamma': 4.762158402994604, 'lambda': 0.0007408676297227807, 'alpha': 0.008005767183012269}. Best is trial 0 with value: 0.8068181818181818.
[I 2024-08-27 16:11:54,359] Trial 1 finished with value: 0.8333333333333334 and parameters: {'n_estimators': 271, 'max_depth': 5, 'learning_rate': 0.0739292593424097, 'subsample': 0.7058933510532241, 'colsample_bytree': 0.5354427636882164, 'gamma': 0.7077900462537484, 'lambda': 0.0028077766389702353, 'alpha': 3.291260580044283e-07}. Best is trial 1 with value: 0.8333333333333334.
[I 2024-08-27 16:11:54,598] Trial 2 finished with value: 0.8257575757575757 and parameters: {'n_estimators': 202, 'max_de

Best hyperparameters:  {'n_estimators': 253, 'max_depth': 2, 'learning_rate': 0.044710775279735104, 'subsample': 0.5100416591617843, 'colsample_bytree': 0.7031910108218973, 'gamma': 0.1083525346070166, 'lambda': 3.690239673495391e-05, 'alpha': 8.633763251494675e-08}


In [56]:
from optuna.visualization import plot_optimization_history, plot_param_importances, plot_parallel_coordinate, plot_slice

# Plot the optimization history
opt_history = plot_optimization_history(study)
opt_history.show()



In [57]:
# Plot the hyperparameter importances
param_importance = plot_param_importances(study)
param_importance.show()


In [58]:

# Plot the parallel coordinate plot to visualize hyperparameter interactions
parallel_coordinate = plot_parallel_coordinate(study)
parallel_coordinate.show()



In [59]:

# Plot the slice plot for each hyperparameter
slice_plot = plot_slice(study)
slice_plot.show()