In [20]:
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
import optuna
from sklearn.model_selection import cross_val_score


# Original Data

In [21]:
# Read the processed dataset from disk
df = pd.read_csv('../../data/processed/heart_se.csv')

# Separate the label column from the predictors; 'index' is dropped together with 'target'
y = df['target']
X = df.drop(columns=['target', 'index'])

# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [22]:
# Objective function that Optuna evaluates once per trial
def objective(trial):
    """Return the mean 3-fold cross-validated accuracy for one Optuna trial.

    The hyperparameters are sampled from ``trial``; the resulting XGBoost
    classifier is scored on the module-level ``X_train`` / ``y_train``.
    """
    # Settings that are identical for every trial
    fixed_settings = {
        'verbosity': 0,
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'use_label_encoder': False,
    }

    # Search space sampled for this trial (regularisation terms on a log scale)
    sampled = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'max_depth': trial.suggest_int('max_depth', 2, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 5),
        'lambda': trial.suggest_float('lambda', 1e-8, 10.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-8, 10.0, log=True),
    }

    # Build the classifier from the fixed settings plus the sampled values
    classifier = xgb.XGBClassifier(**fixed_settings, **sampled)

    # Score with 3-fold cross-validation and report the average accuracy
    fold_accuracies = cross_val_score(
        classifier, X_train, y_train, cv=3, scoring='accuracy'
    )
    return fold_accuracies.mean()

# Create a study and optimize the objective function.
# The sampler is seeded explicitly: without a seed the TPE sampler draws
# different hyperparameters on every run, so the search (and the reported
# best parameters) could not be reproduced after a kernel restart.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=200)

# Get the best hyperparameters (only the values suggested through the trial)
best_params = study.best_params
print("Best hyperparameters: ", best_params)

[I 2024-08-28 16:27:49,068] A new study created in memory with name: no-name-4e8c7db8-3874-4da9-b385-b37246c60a31
[I 2024-08-28 16:27:49,465] Trial 0 finished with value: 0.8059156378600822 and parameters: {'n_estimators': 160, 'max_depth': 7, 'learning_rate': 0.17175795383858317, 'subsample': 0.539076640586668, 'colsample_bytree': 0.8288952816558466, 'gamma': 0.06325391571148808, 'lambda': 5.423310961444724, 'alpha': 3.3932755164572092e-06}. Best is trial 0 with value: 0.8059156378600822.
[I 2024-08-28 16:27:49,549] Trial 1 finished with value: 0.78940329218107 and parameters: {'n_estimators': 56, 'max_depth': 6, 'learning_rate': 0.26463256271319646, 'subsample': 0.8060186070738407, 'colsample_bytree': 0.5915110580551031, 'gamma': 2.407713457636787, 'lambda': 4.101867555856247e-06, 'alpha': 1.1055879336014804}. Best is trial 0 with value: 0.8059156378600822.
[I 2024-08-28 16:27:49,768] Trial 2 finished with value: 0.810082304526749 and parameters: {'n_estimators': 152, 'max_depth': 10

Best hyperparameters:  {'n_estimators': 295, 'max_depth': 2, 'learning_rate': 0.1713515675766338, 'subsample': 0.6711020900286584, 'colsample_bytree': 0.5894852866253448, 'gamma': 4.078978104097588, 'lambda': 2.9678118505560818, 'alpha': 1.330661010552242e-05}


In [23]:
from optuna.visualization import plot_optimization_history, plot_param_importances, plot_parallel_coordinate, plot_slice

# Render the optimization history of the study
opt_history = plot_optimization_history(study=study)
opt_history.show()


In [24]:
# Render the estimated importance of each tuned hyperparameter
param_importance = plot_param_importances(study=study)
param_importance.show()


In [25]:

# Render the parallel-coordinate view of the trials to inspect hyperparameter interactions
parallel_coordinate = plot_parallel_coordinate(study=study)
parallel_coordinate.show()

In [26]:

# Render one slice plot per hyperparameter
slice_plot = plot_slice(study=study)
slice_plot.show()

In [27]:
# Evaluate the model with the best hyperparameters on the held-out test set.
# study.best_params only holds the values suggested through the trial, so the
# fixed settings used during tuning are re-applied here; otherwise the final
# model would silently fall back to library defaults for them.
final_params = {
    'verbosity': 0,
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'use_label_encoder': False,
    **best_params,
}
model = xgb.XGBClassifier(**final_params)
model.fit(X_train, y_train)

# Predict on the test split and report accuracy plus per-class metrics
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: ", accuracy)
print(classification_report(y_test, y_pred))


Accuracy:  0.8852459016393442
              precision    recall  f1-score   support

           0       0.89      0.86      0.88        29
           1       0.88      0.91      0.89        32

    accuracy                           0.89        61
   macro avg       0.89      0.88      0.88        61
weighted avg       0.89      0.89      0.89        61



# Resampled

In [28]:
# Read the pre-split resampled training and test sets
df_train = pd.read_csv('../../data/processed/resampled/heart_resampled_train_se.csv')
df_test = pd.read_csv('../../data/processed/resampled/heart_resampled_test_se.csv')

# Separate the label column from the predictors in each split;
# 'index' is dropped together with 'target'
y_train = df_train['target']
X_train = df_train.drop(columns=['target', 'index'])
y_test = df_test['target']
X_test = df_test.drop(columns=['target', 'index'])

In [29]:
# Objective function that Optuna evaluates once per trial
def objective(trial):
    """Return the mean 3-fold cross-validated accuracy for one Optuna trial.

    The hyperparameters are sampled from ``trial``; the resulting XGBoost
    classifier is scored on the module-level ``X_train`` / ``y_train``.
    """
    # Settings that are identical for every trial
    fixed_settings = {
        'verbosity': 0,
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'use_label_encoder': False,
    }

    # Search space sampled for this trial (regularisation terms on a log scale)
    sampled = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'max_depth': trial.suggest_int('max_depth', 2, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 5),
        'lambda': trial.suggest_float('lambda', 1e-8, 10.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-8, 10.0, log=True),
    }

    # Build the classifier from the fixed settings plus the sampled values
    classifier = xgb.XGBClassifier(**fixed_settings, **sampled)

    # Score with 3-fold cross-validation and report the average accuracy
    fold_accuracies = cross_val_score(
        classifier, X_train, y_train, cv=3, scoring='accuracy'
    )
    return fold_accuracies.mean()

# Create a study and optimize the objective function.
# The sampler is seeded explicitly: without a seed the TPE sampler draws
# different hyperparameters on every run, so the search (and the reported
# best parameters) could not be reproduced after a kernel restart.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=200)

# Get the best hyperparameters (only the values suggested through the trial)
best_params = study.best_params
print("Best hyperparameters: ", best_params)

[I 2024-08-28 16:28:46,105] A new study created in memory with name: no-name-b9622fcc-be82-4013-b837-09ed008417de
[I 2024-08-28 16:28:46,343] Trial 0 finished with value: 0.8247126436781609 and parameters: {'n_estimators': 221, 'max_depth': 10, 'learning_rate': 0.07523568050613057, 'subsample': 0.8117440542238263, 'colsample_bytree': 0.7939680639971506, 'gamma': 4.7038945947844795, 'lambda': 3.1531510640041273e-06, 'alpha': 0.03573269021273079}. Best is trial 0 with value: 0.8247126436781609.
[I 2024-08-28 16:28:46,578] Trial 1 finished with value: 0.8361198188784394 and parameters: {'n_estimators': 226, 'max_depth': 3, 'learning_rate': 0.13765598974206833, 'subsample': 0.6783146672740197, 'colsample_bytree': 0.8642719452243807, 'gamma': 3.1420930799278364, 'lambda': 2.065393044085008, 'alpha': 0.055667381015656034}. Best is trial 1 with value: 0.8361198188784394.
[I 2024-08-28 16:28:46,732] Trial 2 finished with value: 0.8170498084291188 and parameters: {'n_estimators': 89, 'max_depth

Best hyperparameters:  {'n_estimators': 236, 'max_depth': 8, 'learning_rate': 0.10528579077781224, 'subsample': 0.5974745975393383, 'colsample_bytree': 0.5250567887949182, 'gamma': 4.275084610563515, 'lambda': 2.7092455306149965e-08, 'alpha': 0.0001850339902081556}


In [30]:
from optuna.visualization import plot_optimization_history, plot_param_importances, plot_parallel_coordinate, plot_slice

# Render the optimization history of the study
opt_history = plot_optimization_history(study=study)
opt_history.show()



In [31]:
# Render the estimated importance of each tuned hyperparameter
param_importance = plot_param_importances(study=study)
param_importance.show()


In [32]:

# Render the parallel-coordinate view of the trials to inspect hyperparameter interactions
parallel_coordinate = plot_parallel_coordinate(study=study)
parallel_coordinate.show()



In [33]:

# Render one slice plot per hyperparameter
slice_plot = plot_slice(study=study)
slice_plot.show()

In [34]:
# Evaluate the model with the best hyperparameters on the resampled test set.
# study.best_params only holds the values suggested through the trial, so the
# fixed settings used during tuning are re-applied here; otherwise the final
# model would silently fall back to library defaults for them.
final_params = {
    'verbosity': 0,
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'use_label_encoder': False,
    **best_params,
}
model = xgb.XGBClassifier(**final_params)
model.fit(X_train, y_train)

# Predict on the test split and report accuracy plus per-class metrics
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: ", accuracy)
print(classification_report(y_test, y_pred))

Accuracy:  0.8688524590163934
              precision    recall  f1-score   support

           0       0.83      0.89      0.86        27
           1       0.91      0.85      0.88        34

    accuracy                           0.87        61
   macro avg       0.87      0.87      0.87        61
weighted avg       0.87      0.87      0.87        61

