In [22]:
import joblib
import numpy as np
import pandas as pd

X_train = joblib.load("../data/mutation/X_train.pkl")
y_train = joblib.load("../data/mutation/y_train.pkl")
X_test = joblib.load("../data/mutation/X_test.pkl")
y_test = joblib.load("../data/mutation/y_test.pkl")
LASSO_model = joblib.load("../models/LASSO_with_mutation")


In [23]:
import numpy as np
import pandas as pd
import time
import joblib
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, precision_score, recall_score
from xgboost import XGBClassifier
from itertools import product

def run_XGBoost_from_LASSO_with_grid(
    lasso_model_path="../models/LASSO_with_mutation",
    X=None, y=None, top_k=500, seed=42,
    param_grid=None, n_splits=3
):
    """
    Load LASSO model, select top_k features, and run custom CV grid search with XGBoost.
    No early stopping (for full compatibility).
    """
    # Load LASSO
    model = joblib.load(lasso_model_path)
    coefs = model.named_steps["clf"].coef_.ravel()
    feature_names = X.columns
    abs_coefs = np.abs(coefs)

    # Select top_k features
    top_k = min(top_k, X.shape[1])
    top_idx = np.argsort(abs_coefs)[-top_k:]
    selected_features = feature_names[top_idx]
    X_sub = X[selected_features].copy()

    print(f"Selected top {top_k} features from LASSO model")

    # Imbalance handling
    y_arr = y.astype(int) if hasattr(y, "astype") else np.asarray(y).astype(int)
    neg, pos = np.bincount(y_arr)
    spw = neg / pos if pos > 0 else 1.0
    print(f"‚öñÔ∏è scale_pos_weight set to {spw:.2f} (neg={neg}, pos={pos})")

    # Default param grid
    if param_grid is None:
        param_grid = {
            "n_estimators": [200, 400],
            "max_depth": [3, 5],
            "learning_rate": [0.01, 0.1],
            "subsample": [0.8, 1.0],
            "colsample_bytree": [0.5, 0.8],
        }

    keys = list(param_grid.keys())
    grid_combos = [dict(zip(keys, v)) for v in product(*param_grid.values())]

    print(f"üîç Running grid search over {len(grid_combos)} parameter sets")

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    results = []

    for combo_i, params in enumerate(grid_combos, 1):
        print(f"\n=== Grid {combo_i}/{len(grid_combos)}: {params} ===")

        fold_metrics = []
        for fold, (train_idx, val_idx) in enumerate(skf.split(X_sub, y_arr), 1):
            start = time.time()
            X_train, X_val = X_sub.iloc[train_idx], X_sub.iloc[val_idx]
            y_train, y_val = y.iloc[train_idx].astype(int), y.iloc[val_idx].astype(int)

            clf_kwargs = {
                "objective": "binary:logistic",
                "eval_metric": "auc",
                "random_state": seed,
                "n_jobs": -1,
                "tree_method": "hist",
                "scale_pos_weight": spw,
                "use_label_encoder": False,
                **params
            }
            clf = XGBClassifier(**clf_kwargs)

            # Plain fit
            clf.fit(X_train, y_train, verbose=False)

            # Predictions
            y_pred_proba = clf.predict_proba(X_val)[:, 1]
            y_pred = (y_pred_proba >= 0.5).astype(int)

            auc = roc_auc_score(y_val, y_pred_proba)
            prec = precision_score(y_val, y_pred, zero_division=0)
            rec = recall_score(y_val, y_pred, zero_division=0)

            elapsed = time.time() - start
            print(f"‚úÖ Fold {fold}/{n_splits} | AUC={auc:.3f}, Prec={prec:.3f}, Rec={rec:.3f} | Time {elapsed:.1f}s")

            fold_metrics.append((auc, prec, rec))

        mean_auc = np.mean([m[0] for m in fold_metrics])
        mean_prec = np.mean([m[1] for m in fold_metrics])
        mean_rec = np.mean([m[2] for m in fold_metrics])

        results.append({
            "params": params,
            "auc": mean_auc,
            "precision": mean_prec,
            "recall": mean_rec
        })

        print(f"üìä Combo {combo_i} mean: AUC={mean_auc:.3f}, Prec={mean_prec:.3f}, Rec={mean_rec:.3f}")

    results_df = pd.DataFrame(results)
    best_idx = results_df["auc"].idxmax()
    best_params = results_df.iloc[best_idx]["params"]

    print("\nüèÜ Best params:", best_params)
    print("üèÜ Best CV AUC:", results_df.iloc[best_idx]['auc'])

    return results_df, selected_features


In [24]:
fold_results, selected_features = run_XGBoost_from_LASSO_with_grid(
    lasso_model_path="../models/LASSO_with_mutation",
    X=X_train,
    y=y_train,
    top_k=500, seed=42)

Selected top 500 features from LASSO model
‚öñÔ∏è scale_pos_weight set to 4.47 (neg=295, pos=66)
üîç Running grid search over 32 parameter sets

=== Grid 1/32: {'n_estimators': 200, 'max_depth': 3, 'learning_rate': 0.01, 'subsample': 0.8, 'colsample_bytree': 0.5} ===


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


‚úÖ Fold 1/3 | AUC=0.738, Prec=0.467, Rec=0.318 | Time 3.2s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


‚úÖ Fold 2/3 | AUC=0.704, Prec=0.467, Rec=0.318 | Time 2.5s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


‚úÖ Fold 3/3 | AUC=0.693, Prec=0.455, Rec=0.455 | Time 3.0s
üìä Combo 1 mean: AUC=0.712, Prec=0.463, Rec=0.364

=== Grid 2/32: {'n_estimators': 200, 'max_depth': 3, 'learning_rate': 0.01, 'subsample': 0.8, 'colsample_bytree': 0.8} ===


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


‚úÖ Fold 1/3 | AUC=0.731, Prec=0.353, Rec=0.273 | Time 3.6s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


‚úÖ Fold 2/3 | AUC=0.716, Prec=0.500, Rec=0.318 | Time 2.9s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


‚úÖ Fold 3/3 | AUC=0.698, Prec=0.455, Rec=0.455 | Time 3.1s
üìä Combo 2 mean: AUC=0.715, Prec=0.436, Rec=0.348

=== Grid 3/32: {'n_estimators': 200, 'max_depth': 3, 'learning_rate': 0.01, 'subsample': 1.0, 'colsample_bytree': 0.5} ===


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


‚úÖ Fold 1/3 | AUC=0.714, Prec=0.333, Rec=0.273 | Time 2.6s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


‚úÖ Fold 2/3 | AUC=0.710, Prec=0.375, Rec=0.273 | Time 2.5s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


‚úÖ Fold 3/3 | AUC=0.676, Prec=0.409, Rec=0.409 | Time 2.5s
üìä Combo 3 mean: AUC=0.700, Prec=0.372, Rec=0.318

=== Grid 4/32: {'n_estimators': 200, 'max_depth': 3, 'learning_rate': 0.01, 'subsample': 1.0, 'colsample_bytree': 0.8} ===


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


‚úÖ Fold 1/3 | AUC=0.719, Prec=0.348, Rec=0.364 | Time 2.9s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


‚úÖ Fold 2/3 | AUC=0.688, Prec=0.286, Rec=0.273 | Time 2.9s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


‚úÖ Fold 3/3 | AUC=0.674, Prec=0.300, Rec=0.273 | Time 2.8s
üìä Combo 4 mean: AUC=0.694, Prec=0.311, Rec=0.303

=== Grid 5/32: {'n_estimators': 200, 'max_depth': 3, 'learning_rate': 0.1, 'subsample': 0.8, 'colsample_bytree': 0.5} ===


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


‚úÖ Fold 1/3 | AUC=0.735, Prec=0.375, Rec=0.136 | Time 2.0s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


‚úÖ Fold 2/3 | AUC=0.781, Prec=0.667, Rec=0.273 | Time 1.9s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


‚úÖ Fold 3/3 | AUC=0.749, Prec=0.474, Rec=0.409 | Time 2.2s
üìä Combo 5 mean: AUC=0.755, Prec=0.505, Rec=0.273

=== Grid 6/32: {'n_estimators': 200, 'max_depth': 3, 'learning_rate': 0.1, 'subsample': 0.8, 'colsample_bytree': 0.8} ===


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


‚úÖ Fold 1/3 | AUC=0.773, Prec=0.600, Rec=0.273 | Time 2.3s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


‚úÖ Fold 2/3 | AUC=0.807, Prec=0.615, Rec=0.364 | Time 2.2s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


‚úÖ Fold 3/3 | AUC=0.746, Prec=0.579, Rec=0.500 | Time 2.1s
üìä Combo 6 mean: AUC=0.775, Prec=0.598, Rec=0.379

=== Grid 7/32: {'n_estimators': 200, 'max_depth': 3, 'learning_rate': 0.1, 'subsample': 1.0, 'colsample_bytree': 0.5} ===


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


‚úÖ Fold 1/3 | AUC=0.726, Prec=0.300, Rec=0.136 | Time 2.0s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


‚úÖ Fold 2/3 | AUC=0.788, Prec=0.625, Rec=0.455 | Time 2.1s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


‚úÖ Fold 3/3 | AUC=0.744, Prec=0.474, Rec=0.409 | Time 2.1s
üìä Combo 7 mean: AUC=0.752, Prec=0.466, Rec=0.333

=== Grid 8/32: {'n_estimators': 200, 'max_depth': 3, 'learning_rate': 0.1, 'subsample': 1.0, 'colsample_bytree': 0.8} ===


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


‚úÖ Fold 1/3 | AUC=0.753, Prec=0.500, Rec=0.227 | Time 2.0s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


‚úÖ Fold 2/3 | AUC=0.770, Prec=0.500, Rec=0.273 | Time 2.0s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


‚úÖ Fold 3/3 | AUC=0.745, Prec=0.600, Rec=0.409 | Time 2.0s
üìä Combo 8 mean: AUC=0.756, Prec=0.533, Rec=0.303

=== Grid 9/32: {'n_estimators': 200, 'max_depth': 5, 'learning_rate': 0.01, 'subsample': 0.8, 'colsample_bytree': 0.5} ===


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


‚úÖ Fold 1/3 | AUC=0.755, Prec=0.429, Rec=0.136 | Time 4.6s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


‚úÖ Fold 2/3 | AUC=0.689, Prec=0.375, Rec=0.136 | Time 5.1s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


‚úÖ Fold 3/3 | AUC=0.692, Prec=0.500, Rec=0.273 | Time 4.5s
üìä Combo 9 mean: AUC=0.712, Prec=0.435, Rec=0.182

=== Grid 10/32: {'n_estimators': 200, 'max_depth': 5, 'learning_rate': 0.01, 'subsample': 0.8, 'colsample_bytree': 0.8} ===


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


‚úÖ Fold 1/3 | AUC=0.746, Prec=0.333, Rec=0.136 | Time 4.9s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


‚úÖ Fold 2/3 | AUC=0.712, Prec=0.375, Rec=0.136 | Time 5.0s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


‚úÖ Fold 3/3 | AUC=0.699, Prec=0.545, Rec=0.273 | Time 5.1s
üìä Combo 10 mean: AUC=0.719, Prec=0.418, Rec=0.182

=== Grid 11/32: {'n_estimators': 200, 'max_depth': 5, 'learning_rate': 0.01, 'subsample': 1.0, 'colsample_bytree': 0.5} ===


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


‚úÖ Fold 1/3 | AUC=0.754, Prec=0.375, Rec=0.136 | Time 4.8s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


‚úÖ Fold 2/3 | AUC=0.674, Prec=0.455, Rec=0.227 | Time 5.1s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


‚úÖ Fold 3/3 | AUC=0.693, Prec=0.467, Rec=0.318 | Time 4.7s
üìä Combo 11 mean: AUC=0.707, Prec=0.432, Rec=0.227

=== Grid 12/32: {'n_estimators': 200, 'max_depth': 5, 'learning_rate': 0.01, 'subsample': 1.0, 'colsample_bytree': 0.8} ===


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


‚úÖ Fold 1/3 | AUC=0.802, Prec=0.500, Rec=0.227 | Time 6.0s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


‚úÖ Fold 2/3 | AUC=0.670, Prec=0.417, Rec=0.227 | Time 6.2s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


‚úÖ Fold 3/3 | AUC=0.697, Prec=0.529, Rec=0.409 | Time 6.6s
üìä Combo 12 mean: AUC=0.723, Prec=0.482, Rec=0.288

=== Grid 13/32: {'n_estimators': 200, 'max_depth': 5, 'learning_rate': 0.1, 'subsample': 0.8, 'colsample_bytree': 0.5} ===


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


‚úÖ Fold 1/3 | AUC=0.730, Prec=0.429, Rec=0.136 | Time 2.4s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


‚úÖ Fold 2/3 | AUC=0.759, Prec=0.385, Rec=0.227 | Time 2.5s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


‚úÖ Fold 3/3 | AUC=0.716, Prec=0.471, Rec=0.364 | Time 2.6s
üìä Combo 13 mean: AUC=0.735, Prec=0.428, Rec=0.242

=== Grid 14/32: {'n_estimators': 200, 'max_depth': 5, 'learning_rate': 0.1, 'subsample': 0.8, 'colsample_bytree': 0.8} ===


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


‚úÖ Fold 1/3 | AUC=0.758, Prec=0.444, Rec=0.182 | Time 2.6s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


‚úÖ Fold 2/3 | AUC=0.791, Prec=0.500, Rec=0.227 | Time 3.0s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


‚úÖ Fold 3/3 | AUC=0.726, Prec=0.529, Rec=0.409 | Time 2.7s
üìä Combo 14 mean: AUC=0.758, Prec=0.491, Rec=0.273

=== Grid 15/32: {'n_estimators': 200, 'max_depth': 5, 'learning_rate': 0.1, 'subsample': 1.0, 'colsample_bytree': 0.5} ===


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


‚úÖ Fold 1/3 | AUC=0.739, Prec=0.333, Rec=0.136 | Time 2.6s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


‚úÖ Fold 2/3 | AUC=0.769, Prec=0.600, Rec=0.273 | Time 2.5s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


‚úÖ Fold 3/3 | AUC=0.714, Prec=0.667, Rec=0.455 | Time 2.8s
üìä Combo 15 mean: AUC=0.741, Prec=0.533, Rec=0.288

=== Grid 16/32: {'n_estimators': 200, 'max_depth': 5, 'learning_rate': 0.1, 'subsample': 1.0, 'colsample_bytree': 0.8} ===


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


‚úÖ Fold 1/3 | AUC=0.778, Prec=0.500, Rec=0.273 | Time 2.7s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


‚úÖ Fold 2/3 | AUC=0.778, Prec=0.600, Rec=0.273 | Time 2.7s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


‚úÖ Fold 3/3 | AUC=0.744, Prec=0.571, Rec=0.364 | Time 2.7s
üìä Combo 16 mean: AUC=0.767, Prec=0.557, Rec=0.303

=== Grid 17/32: {'n_estimators': 400, 'max_depth': 3, 'learning_rate': 0.01, 'subsample': 0.8, 'colsample_bytree': 0.5} ===


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


‚úÖ Fold 1/3 | AUC=0.749, Prec=0.500, Rec=0.227 | Time 5.0s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


‚úÖ Fold 2/3 | AUC=0.747, Prec=0.462, Rec=0.273 | Time 4.9s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


‚úÖ Fold 3/3 | AUC=0.714, Prec=0.533, Rec=0.364 | Time 5.0s
üìä Combo 17 mean: AUC=0.737, Prec=0.498, Rec=0.288

=== Grid 18/32: {'n_estimators': 400, 'max_depth': 3, 'learning_rate': 0.01, 'subsample': 0.8, 'colsample_bytree': 0.8} ===


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


‚úÖ Fold 1/3 | AUC=0.738, Prec=0.455, Rec=0.227 | Time 5.1s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


‚úÖ Fold 2/3 | AUC=0.749, Prec=0.429, Rec=0.273 | Time 5.6s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


‚úÖ Fold 3/3 | AUC=0.717, Prec=0.529, Rec=0.409 | Time 5.4s
üìä Combo 18 mean: AUC=0.735, Prec=0.471, Rec=0.303

=== Grid 19/32: {'n_estimators': 400, 'max_depth': 3, 'learning_rate': 0.01, 'subsample': 1.0, 'colsample_bytree': 0.5} ===


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


‚úÖ Fold 1/3 | AUC=0.716, Prec=0.308, Rec=0.182 | Time 5.2s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


‚úÖ Fold 2/3 | AUC=0.740, Prec=0.462, Rec=0.273 | Time 5.2s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


‚úÖ Fold 3/3 | AUC=0.706, Prec=0.500, Rec=0.455 | Time 5.2s
üìä Combo 19 mean: AUC=0.721, Prec=0.423, Rec=0.303

=== Grid 20/32: {'n_estimators': 400, 'max_depth': 3, 'learning_rate': 0.01, 'subsample': 1.0, 'colsample_bytree': 0.8} ===


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


‚úÖ Fold 1/3 | AUC=0.704, Prec=0.357, Rec=0.227 | Time 5.7s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


‚úÖ Fold 2/3 | AUC=0.726, Prec=0.385, Rec=0.227 | Time 6.1s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


‚úÖ Fold 3/3 | AUC=0.706, Prec=0.450, Rec=0.409 | Time 6.1s
üìä Combo 20 mean: AUC=0.712, Prec=0.397, Rec=0.288

=== Grid 21/32: {'n_estimators': 400, 'max_depth': 3, 'learning_rate': 0.1, 'subsample': 0.8, 'colsample_bytree': 0.5} ===


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


‚úÖ Fold 1/3 | AUC=0.746, Prec=0.444, Rec=0.182 | Time 3.1s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


‚úÖ Fold 2/3 | AUC=0.789, Prec=0.600, Rec=0.273 | Time 3.1s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


‚úÖ Fold 3/3 | AUC=0.749, Prec=0.474, Rec=0.409 | Time 3.0s
üìä Combo 21 mean: AUC=0.761, Prec=0.506, Rec=0.288

=== Grid 22/32: {'n_estimators': 400, 'max_depth': 3, 'learning_rate': 0.1, 'subsample': 0.8, 'colsample_bytree': 0.8} ===


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


‚úÖ Fold 1/3 | AUC=0.767, Prec=0.600, Rec=0.273 | Time 3.4s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


‚úÖ Fold 2/3 | AUC=0.814, Prec=0.727, Rec=0.364 | Time 3.2s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


‚úÖ Fold 3/3 | AUC=0.752, Prec=0.524, Rec=0.500 | Time 3.6s
üìä Combo 22 mean: AUC=0.778, Prec=0.617, Rec=0.379

=== Grid 23/32: {'n_estimators': 400, 'max_depth': 3, 'learning_rate': 0.1, 'subsample': 1.0, 'colsample_bytree': 0.5} ===


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


‚úÖ Fold 1/3 | AUC=0.732, Prec=0.333, Rec=0.136 | Time 3.2s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


‚úÖ Fold 2/3 | AUC=0.800, Prec=0.643, Rec=0.409 | Time 3.2s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


‚úÖ Fold 3/3 | AUC=0.745, Prec=0.476, Rec=0.455 | Time 3.2s
üìä Combo 23 mean: AUC=0.759, Prec=0.484, Rec=0.333

=== Grid 24/32: {'n_estimators': 400, 'max_depth': 3, 'learning_rate': 0.1, 'subsample': 1.0, 'colsample_bytree': 0.8} ===


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


‚úÖ Fold 1/3 | AUC=0.747, Prec=0.444, Rec=0.182 | Time 3.4s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


‚úÖ Fold 2/3 | AUC=0.777, Prec=0.667, Rec=0.273 | Time 3.6s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


‚úÖ Fold 3/3 | AUC=0.747, Prec=0.611, Rec=0.500 | Time 3.5s
üìä Combo 24 mean: AUC=0.757, Prec=0.574, Rec=0.318

=== Grid 25/32: {'n_estimators': 400, 'max_depth': 5, 'learning_rate': 0.01, 'subsample': 0.8, 'colsample_bytree': 0.5} ===


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


‚úÖ Fold 1/3 | AUC=0.748, Prec=0.500, Rec=0.136 | Time 9.8s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


‚úÖ Fold 2/3 | AUC=0.738, Prec=0.375, Rec=0.136 | Time 8.5s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


‚úÖ Fold 3/3 | AUC=0.699, Prec=0.545, Rec=0.273 | Time 8.6s
üìä Combo 25 mean: AUC=0.729, Prec=0.473, Rec=0.182

=== Grid 26/32: {'n_estimators': 400, 'max_depth': 5, 'learning_rate': 0.01, 'subsample': 0.8, 'colsample_bytree': 0.8} ===


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


‚úÖ Fold 1/3 | AUC=0.755, Prec=0.571, Rec=0.182 | Time 9.8s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


‚úÖ Fold 2/3 | AUC=0.742, Prec=0.375, Rec=0.136 | Time 9.4s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


‚úÖ Fold 3/3 | AUC=0.719, Prec=0.556, Rec=0.227 | Time 9.8s
üìä Combo 26 mean: AUC=0.738, Prec=0.501, Rec=0.182

=== Grid 27/32: {'n_estimators': 400, 'max_depth': 5, 'learning_rate': 0.01, 'subsample': 1.0, 'colsample_bytree': 0.5} ===


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


‚úÖ Fold 1/3 | AUC=0.742, Prec=0.375, Rec=0.136 | Time 8.6s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


‚úÖ Fold 2/3 | AUC=0.715, Prec=0.455, Rec=0.227 | Time 8.7s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


‚úÖ Fold 3/3 | AUC=0.704, Prec=0.533, Rec=0.364 | Time 8.7s
üìä Combo 27 mean: AUC=0.720, Prec=0.454, Rec=0.242

=== Grid 28/32: {'n_estimators': 400, 'max_depth': 5, 'learning_rate': 0.01, 'subsample': 1.0, 'colsample_bytree': 0.8} ===


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


‚úÖ Fold 1/3 | AUC=0.792, Prec=0.444, Rec=0.182 | Time 10.5s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


‚úÖ Fold 2/3 | AUC=0.715, Prec=0.455, Rec=0.227 | Time 10.4s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


‚úÖ Fold 3/3 | AUC=0.711, Prec=0.467, Rec=0.318 | Time 10.8s
üìä Combo 28 mean: AUC=0.739, Prec=0.455, Rec=0.242

=== Grid 29/32: {'n_estimators': 400, 'max_depth': 5, 'learning_rate': 0.1, 'subsample': 0.8, 'colsample_bytree': 0.5} ===


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


‚úÖ Fold 1/3 | AUC=0.729, Prec=0.429, Rec=0.136 | Time 3.3s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


‚úÖ Fold 2/3 | AUC=0.763, Prec=0.455, Rec=0.227 | Time 3.5s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


‚úÖ Fold 3/3 | AUC=0.720, Prec=0.412, Rec=0.318 | Time 3.6s
üìä Combo 29 mean: AUC=0.737, Prec=0.432, Rec=0.227

=== Grid 30/32: {'n_estimators': 400, 'max_depth': 5, 'learning_rate': 0.1, 'subsample': 0.8, 'colsample_bytree': 0.8} ===


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


‚úÖ Fold 1/3 | AUC=0.754, Prec=0.400, Rec=0.182 | Time 3.9s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


‚úÖ Fold 2/3 | AUC=0.796, Prec=0.556, Rec=0.227 | Time 3.5s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


‚úÖ Fold 3/3 | AUC=0.731, Prec=0.500, Rec=0.364 | Time 4.7s
üìä Combo 30 mean: AUC=0.760, Prec=0.485, Rec=0.258

=== Grid 31/32: {'n_estimators': 400, 'max_depth': 5, 'learning_rate': 0.1, 'subsample': 1.0, 'colsample_bytree': 0.5} ===


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


‚úÖ Fold 1/3 | AUC=0.738, Prec=0.300, Rec=0.136 | Time 4.4s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


‚úÖ Fold 2/3 | AUC=0.778, Prec=0.636, Rec=0.318 | Time 3.9s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


‚úÖ Fold 3/3 | AUC=0.725, Prec=0.625, Rec=0.455 | Time 3.9s
üìä Combo 31 mean: AUC=0.747, Prec=0.520, Rec=0.303

=== Grid 32/32: {'n_estimators': 400, 'max_depth': 5, 'learning_rate': 0.1, 'subsample': 1.0, 'colsample_bytree': 0.8} ===


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


‚úÖ Fold 1/3 | AUC=0.770, Prec=0.500, Rec=0.273 | Time 4.3s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


‚úÖ Fold 2/3 | AUC=0.782, Prec=0.667, Rec=0.364 | Time 3.7s


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


‚úÖ Fold 3/3 | AUC=0.746, Prec=0.600, Rec=0.409 | Time 3.7s
üìä Combo 32 mean: AUC=0.766, Prec=0.589, Rec=0.348

üèÜ Best params: {'n_estimators': 400, 'max_depth': 3, 'learning_rate': 0.1, 'subsample': 0.8, 'colsample_bytree': 0.8}
üèÜ Best CV AUC: 0.7776934465246154


In [25]:
import xgboost

print(xgboost.__version__)


3.0.0


In [None]:
# Training Data Evaluation Metrics

# Predictions (discrete)
y_train_pred = best_model.predict(X_train)

print("Training Data Evaluation Metrics:")

# Confusion matrix
cm_train = confusion_matrix(y_train, y_train_pred)
tn, fp, fn, tp = cm_train.ravel()
print(f"True Positives (TP): {tp}")
print(f"False Positives (FP): {fp}")
print(f"True Negatives (TN): {tn}")
print(f"False Negatives (FN): {fn}")

# Precision & Recall
precision = precision_score(y_train, y_train_pred)
recall = recall_score(y_train, y_train_pred)
print(f"Precision: {precision:.3f}")
print(f"Recall: {recall:.3f}")

# Continuous scores for AUROC
if hasattr(best_model, "predict_proba"):
    y_scores = best_model.predict_proba(X_train)[:, 1]
elif hasattr(best_model, "decision_function"):
    y_scores = best_model.decision_function(X_train)
else:
    y_scores = y_train_pred  # fallback (not ideal)

# AUROC
roc_auc = roc_auc_score(y_train, y_scores)
print(f"AUC-ROC Score: {roc_auc:.4f}")

# ROC curve
fpr, tpr, _ = roc_curve(y_train, y_scores)
plt.figure(figsize=(6, 6))
plt.plot(fpr, tpr, linestyle='-', label=f'ROC Curve (AUC = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], linestyle='--', color='gray')  # Random classifier line
plt.xlabel("False Positive Rate (FPR)")
plt.ylabel("True Positive Rate (TPR)")
plt.title("ROC Curve on Training Data")
plt.legend()
plt.show()