In [2]:
import optuna
import pandas as pd
import xgboost as xgb
from optuna.visualization import plot_optimization_history, plot_param_importances, plot_parallel_coordinate, plot_slice
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split


In [3]:
# Load the data

# Original (non-resampled) dataset, stored directly under data/processed/
df = pd.read_csv('../../data/processed/heart_se.csv')

# Resampled train/test splits, stored in the data/processed/resampled/ subfolder
df_resampled_test = pd.read_csv('../../data/processed/resampled/heart_resampled_test_se.csv')
df_resampled_train = pd.read_csv('../../data/processed/resampled/heart_resampled_train_se.csv')

# Original Data

In [4]:
# Separate the target column from the features; the 'index' column is
# dropped as well so it is not used as a feature.
y = df['target']
X = df.drop(columns=['target', 'index'])

# Hold out 20% of the rows for testing (fixed random_state for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [5]:
# Define the objective function for Optuna
def objective(trial):
    """Evaluate one set of XGBoost hyperparameters for an Optuna trial.

    Hyperparameters are drawn from ``trial`` and an ``XGBClassifier`` built
    from them is scored with 3-fold cross-validation on the notebook-level
    ``X_train`` / ``y_train`` (looked up when the function is called).

    Args:
        trial: Object exposing ``suggest_int`` / ``suggest_float``, as passed
            in by ``study.optimize``.

    Returns:
        Mean accuracy over the 3 cross-validation folds.
    """
    # Settings that stay the same for every trial
    fixed_settings = {
        'verbosity': 0,
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'use_label_encoder': False,
    }

    # Search space; 'lambda' and 'alpha' are sampled on a log scale
    sampled_settings = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'max_depth': trial.suggest_int('max_depth', 2, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 5),
        'lambda': trial.suggest_float('lambda', 1e-8, 10.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-8, 10.0, log=True),
    }

    # Build the classifier for this trial
    classifier = xgb.XGBClassifier(**fixed_settings, **sampled_settings)

    # One accuracy value per fold; the trial's score is their mean
    fold_scores = cross_val_score(classifier, X_train, y_train, cv=3, scoring='accuracy')
    return fold_scores.mean()

# Create a study and optimize the objective function.
# The sampler is seeded explicitly: without a seed the suggested
# hyperparameters (and therefore the reported best parameters) change on
# every run. TPESampler is the sampler Optuna uses by default, so only the
# seeding differs. Reproducibility assumes the default sequential
# execution of study.optimize (no n_jobs parallelism).
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=200)

# Get the best hyperparameters
best_params = study.best_params
print("Best hyperparameters: ", best_params)

[I 2024-08-28 16:25:38,799] A new study created in memory with name: no-name-de4285d8-a929-4b3c-9b24-41be3b9b47cc
[I 2024-08-28 16:25:39,013] Trial 0 finished with value: 0.8018004115226337 and parameters: {'n_estimators': 156, 'max_depth': 5, 'learning_rate': 0.11328435618675448, 'subsample': 0.6174112628001176, 'colsample_bytree': 0.569845260906967, 'gamma': 4.986504265491216, 'lambda': 0.025272273430754486, 'alpha': 1.3364572432190542e-05}. Best is trial 0 with value: 0.8018004115226337.
[I 2024-08-28 16:25:41,111] Trial 1 finished with value: 0.8100308641975308 and parameters: {'n_estimators': 106, 'max_depth': 2, 'learning_rate': 0.10277133861761001, 'subsample': 0.6808103970295638, 'colsample_bytree': 0.7389390362055256, 'gamma': 1.6351282597604904, 'lambda': 4.498911418623525, 'alpha': 0.7878367022115941}. Best is trial 1 with value: 0.8100308641975308.
[I 2024-08-28 16:25:44,619] Trial 2 finished with value: 0.8058641975308642 and parameters: {'n_estimators': 206, 'max_depth': 

Best hyperparameters:  {'n_estimators': 103, 'max_depth': 10, 'learning_rate': 0.2558638309958244, 'subsample': 0.7815190959981169, 'colsample_bytree': 0.5477180325115623, 'gamma': 3.295864580160686, 'lambda': 0.005778390590953065, 'alpha': 1.339034529117194e-05}


In [6]:
# Plot the optimization history of the current `study`.
# The optuna.visualization helpers are imported in the import cell at the
# top of the notebook rather than mid-notebook.
opt_history = plot_optimization_history(study)
opt_history.show()


In [7]:
# Plot the hyperparameter importances for whichever `study` is currently
# bound in the notebook, then render the returned figure.
param_importance = plot_param_importances(study)
param_importance.show()


In [8]:

# Plot the parallel coordinate plot to visualize hyperparameter interactions
# for whichever `study` is currently bound in the notebook.
parallel_coordinate = plot_parallel_coordinate(study)
parallel_coordinate.show()

In [9]:

# Plot the slice plot for each hyperparameter of whichever `study` is
# currently bound in the notebook.
slice_plot = plot_slice(study)
slice_plot.show()

# Resampled Data

In [10]:
# Split the pre-made resampled train/test frames into features and target.
# NOTE: this rebinds X_train / y_train / X_test / y_test, so any cell run
# after this one sees the resampled data under those names.

# Training split
y_train = df_resampled_train['target']
X_train = df_resampled_train.drop(columns=['target', 'index'])

# Test split
y_test = df_resampled_test['target']
X_test = df_resampled_test.drop(columns=['target', 'index'])

In [11]:
# Define the objective function for Optuna
def objective(trial):
    """Evaluate one set of XGBoost hyperparameters for an Optuna trial.

    Hyperparameters are drawn from ``trial`` and an ``XGBClassifier`` built
    from them is scored with 3-fold cross-validation on whatever the
    notebook-level ``X_train`` / ``y_train`` are bound to when the function
    is called.

    Args:
        trial: Object exposing ``suggest_int`` / ``suggest_float``, as passed
            in by ``study.optimize``.

    Returns:
        Mean accuracy over the 3 cross-validation folds.
    """
    # Settings that stay the same for every trial
    fixed_settings = {
        'verbosity': 0,
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'use_label_encoder': False,
    }

    # Search space; 'lambda' and 'alpha' are sampled on a log scale
    sampled_settings = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'max_depth': trial.suggest_int('max_depth', 2, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 5),
        'lambda': trial.suggest_float('lambda', 1e-8, 10.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-8, 10.0, log=True),
    }

    # Build the classifier for this trial
    classifier = xgb.XGBClassifier(**fixed_settings, **sampled_settings)

    # One accuracy value per fold; the trial's score is their mean
    fold_scores = cross_val_score(classifier, X_train, y_train, cv=3, scoring='accuracy')
    return fold_scores.mean()

# Create a study and optimize the objective function.
# The sampler is seeded explicitly: without a seed the suggested
# hyperparameters (and therefore the reported best parameters) change on
# every run. TPESampler is the sampler Optuna uses by default, so only the
# seeding differs. Reproducibility assumes the default sequential
# execution of study.optimize (no n_jobs parallelism).
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=200)

# Get the best hyperparameters
best_params = study.best_params
print("Best hyperparameters: ", best_params)

[I 2024-08-28 16:26:25,507] A new study created in memory with name: no-name-be4a06c0-ddf0-4bef-b74f-6c5671c17446
[I 2024-08-28 16:26:26,112] Trial 0 finished with value: 0.8246691048415187 and parameters: {'n_estimators': 161, 'max_depth': 9, 'learning_rate': 0.2282795210580631, 'subsample': 0.6254601873069723, 'colsample_bytree': 0.5716843609004163, 'gamma': 2.6350686597041255, 'lambda': 5.768385230810761e-08, 'alpha': 4.037740389723292e-05}. Best is trial 0 with value: 0.8246691048415187.
[I 2024-08-28 16:26:26,204] Trial 1 finished with value: 0.8513584117032392 and parameters: {'n_estimators': 93, 'max_depth': 4, 'learning_rate': 0.14463661404658984, 'subsample': 0.8972063130444372, 'colsample_bytree': 0.7115246767917041, 'gamma': 3.4532045067039734, 'lambda': 1.6311921970215565e-06, 'alpha': 0.012641842225630239}. Best is trial 1 with value: 0.8513584117032392.
[I 2024-08-28 16:26:26,893] Trial 2 finished with value: 0.8322448624172761 and parameters: {'n_estimators': 209, 'max_d

Best hyperparameters:  {'n_estimators': 264, 'max_depth': 8, 'learning_rate': 0.14433644027234646, 'subsample': 0.7228904679902127, 'colsample_bytree': 0.524644543316522, 'gamma': 4.735260805098788, 'lambda': 1.0630311591443133e-06, 'alpha': 0.29818289285993715}


In [12]:
# Plot the optimization history of the current `study`.
# The optuna.visualization helpers are imported in the import cell at the
# top of the notebook, so they are not re-imported here.
opt_history = plot_optimization_history(study)
opt_history.show()



In [13]:
# Plot the hyperparameter importances for whichever `study` is currently
# bound in the notebook, then render the returned figure.
param_importance = plot_param_importances(study)
param_importance.show()


In [14]:

# Plot the parallel coordinate plot to visualize hyperparameter interactions
# for whichever `study` is currently bound in the notebook.
parallel_coordinate = plot_parallel_coordinate(study)
parallel_coordinate.show()



In [15]:

# Plot the slice plot for each hyperparameter of whichever `study` is
# currently bound in the notebook.
slice_plot = plot_slice(study)
slice_plot.show()