In [1]:
import xgboost as xgb
print(xgb.__version__)

2.1.4


In [1]:
import pickle
import numpy as np
from xgboost import XGBClassifier

X_train = np.load('X_train_values_DWT_2025_4_level.npy')
y_train = np.load('y_train_values_DWT_2025_4_level.npy')

In [2]:
model = XGBClassifier(
    tree_method="hist",
    device="cuda"
)

In [3]:
X_test = np.load('X_test_values_DWT_2025_4_level.npy')
y_test = np.load('y_test_values_DWT_2025_4_level.npy')

In [4]:
X_eval = np.load('X_val_values_DWT_2025_4_level.npy')
y_eval = np.load('y_val_values_DWT_2025_4_level.npy')

In [5]:
X_train.shape

(66873, 126)

In [6]:
import gc
import optuna
import xgboost as xgb
from sklearn.model_selection import cross_val_score
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import roc_curve, auc, confusion_matrix, hamming_loss, accuracy_score, classification_report, precision_recall_fscore_support, balanced_accuracy_score

# Define the objective function
group_1 = {1, 2, 3}  # Group 1 (mapped to 0)
group_2 = {4, 5, 6}  # Group 2 (mapped to 1)

true_labels = [0 if cls in group_1 else 1 for cls in y_test]

def objective(trial):
    # Suggest values for hyperparameters
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 500),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "gamma": trial.suggest_float("gamma", 0, 5),
        "lambda": trial.suggest_float("lambda", 0, 5),
        "alpha": trial.suggest_float("alpha", 0, 5),
        "device": "cuda",
    }

    # Train XGBoost model
    model = xgb.XGBClassifier(**params, eval_metric="logloss")
    model.fit(X_train, y_train)

    # Evaluate on test data
    y_pred = model.predict(X_test)
    predicted_labels = [0 if cls in group_1 else 1 for cls in y_pred]
    
    balanced_accuracy = balanced_accuracy_score(true_labels, predicted_labels)

    del model, y_pred
    gc.collect()

    return balanced_accuracy

In [11]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=20)

# Print best parameters
print("Best hyperparameters:", study.best_params)

[I 2025-03-26 18:54:41,506] A new study created in memory with name: no-name-fc3d6919-a9f5-4672-b829-5aa73f21c22e
[I 2025-03-26 18:54:44,705] Trial 0 finished with value: 0.7122790770991445 and parameters: {'n_estimators': 256, 'max_depth': 5, 'learning_rate': 0.14257815982172312, 'subsample': 0.7965543186283437, 'colsample_bytree': 0.7588907297134788, 'gamma': 1.1942617957541972, 'lambda': 4.047567528341189, 'alpha': 4.404118903930209}. Best is trial 0 with value: 0.7122790770991445.
[I 2025-03-26 18:54:50,401] Trial 1 finished with value: 0.6725858271745434 and parameters: {'n_estimators': 415, 'max_depth': 5, 'learning_rate': 0.020558665282941244, 'subsample': 0.6546191962127821, 'colsample_bytree': 0.8516772603437668, 'gamma': 4.973892415830808, 'lambda': 2.9164379033499754, 'alpha': 1.458099095639724}. Best is trial 0 with value: 0.7122790770991445.
[I 2025-03-26 18:54:53,469] Trial 2 finished with value: 0.6985268255870448 and parameters: {'n_estimators': 277, 'max_depth': 5, 'le

Best hyperparameters: {'n_estimators': 338, 'max_depth': 4, 'learning_rate': 0.2820666980334658, 'subsample': 0.767681539188025, 'colsample_bytree': 0.9528118151560168, 'gamma': 2.468124423833511, 'lambda': 2.07116788353338, 'alpha': 2.7438301377984136}


In [14]:
best_params = study.best_params
best_model = xgb.XGBClassifier(**best_params, eval_metric="logloss")
best_model.fit(X_train, y_train)

y_pred = best_model.predict(X_test)
predicted_labels = [0 if cls in group_1 else 1 for cls in y_pred]
print("Final Accuracy:", balanced_accuracy_score(true_labels, predicted_labels))

Final Accuracy: 0.7136291736994537


In [16]:
# with open("xgb_model_4_level_gpu.pkl", "wb") as file:
#     pickle.dump(best_model, file)