In [1]:
import functools

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import matthews_corrcoef
from sklearn.model_selection import StratifiedKFold, train_test_split

!pip install optuna-integration[xgboost]
import optuna

#keeps optuna silent
import logging
logging.getLogger('optuna').setLevel(logging.WARNING)

Collecting optuna-integration[xgboost]
  Downloading optuna_integration-4.1.0-py3-none-any.whl.metadata (12 kB)
Downloading optuna_integration-4.1.0-py3-none-any.whl (97 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m97.4/97.4 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: optuna-integration
Successfully installed optuna-integration-4.1.0


In [2]:
import logging


In [3]:
# Read the competition tables; the "id" column becomes the row index of each frame.
train = pd.read_csv(
    "/kaggle/input/playground-series-s4e8/train.csv",
    index_col="id",
)
test = pd.read_csv(
    "/kaggle/input/playground-series-s4e8/test.csv",
    index_col="id",
)

In [4]:
# Features are every training column except the "class" target.
X = train.drop("class", axis=1)
y = train["class"]

#--------
# Encode the target labels as integers. The fitted encoder `le` is kept at
# module level; `y` becomes a fresh Series holding the encoded labels.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = pd.Series(le.fit_transform(y))
#--------

# Every object-dtype feature is treated as categorical.
cat_cols = list(X.select_dtypes(include="object").columns)
X[cat_cols] = X[cat_cols].astype("category")

# Cast the test columns with the categorical dtypes learned from the training
# features. Casting test independently would derive its own category set (and
# therefore its own integer codes) per column, so the same label could map to
# different codes in train and test. Labels that never occur in training become
# NaN here instead of receiving a code the model has not seen.
test[cat_cols] = test[cat_cols].astype({col: X[col].dtype for col in cat_cols})

In [5]:
#adapted from https://github.com/optuna/optuna-examples/blob/main/xgboost/xgboost_integration.py
@functools.lru_cache(maxsize=1)
def _validation_split():
    """Build the train/validation DMatrix pair once and reuse it for every trial.

    The split is seeded (random_state=42), so it is identical for all trials;
    memoising it avoids re-splitting and re-building both DMatrix objects on
    each call of the objective. Re-run this cell after changing ``X`` or ``y``
    so the cached matrices are rebuilt.

    Returns:
        (dtrain, dvalid, valid_y): the training matrix, the validation matrix
        and the validation labels.
    """
    train_x, valid_x, train_y, valid_y = train_test_split(X, y, stratify=y, random_state=42)
    dtrain = xgb.DMatrix(train_x, label=train_y, enable_categorical=True)
    dvalid = xgb.DMatrix(valid_x, label=valid_y, enable_categorical=True)
    return dtrain, dvalid, valid_y


def objective(trial):
    """Optuna objective: train one XGBoost booster and score it on the validation split.

    Args:
        trial: the Optuna trial used to sample the hyperparameters.

    Returns:
        Matthews correlation coefficient between the validation labels and the
        predicted probabilities rounded to 0/1.

    Notes:
        Pruning is driven by the per-round validation AUC reported through the
        callback, while the value returned to the study is the MCC.
        No ``num_boost_round`` is passed, so ``xgb.train`` uses its default
        number of boosting rounds.
    """
    dtrain, dvalid, valid_y = _validation_split()

    param = {
        "verbosity": 0,
        "objective": "binary:logistic",
        "eval_metric": "auc",

        "booster": trial.suggest_categorical("booster", ["gbtree", "gblinear", "dart"]),
        "lambda": trial.suggest_float("lambda", 1e-8, 1.0, log=True),
        "alpha": trial.suggest_float("alpha", 1e-8, 1.0, log=True),
        # NOTE(review): categorical support is already enabled on the DMatrix
        # objects; this key is presumably ignored by xgb.train - confirm before removing.
        "enable_categorical": True
    }

    # Tree-structure parameters apply to both tree-based boosters.
    if param["booster"] in ("gbtree", "dart"):
        param["max_depth"] = trial.suggest_int("max_depth", 1, 9)
        param["eta"] = trial.suggest_float("eta", 1e-8, 1.0, log=True)
        param["gamma"] = trial.suggest_float("gamma", 1e-8, 1.0, log=True)
        param["grow_policy"] = trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"])
    # Dropout parameters exist only for the dart booster.
    if param["booster"] == "dart":
        param["sample_type"] = trial.suggest_categorical("sample_type", ["uniform", "weighted"])
        param["normalize_type"] = trial.suggest_categorical("normalize_type", ["tree", "forest"])
        param["rate_drop"] = trial.suggest_float("rate_drop", 1e-8, 1.0, log=True)
        param["skip_drop"] = trial.suggest_float("skip_drop", 1e-8, 1.0, log=True)

    # "validation-auc" = name of the eval set below + the configured eval_metric.
    pruning_callback = optuna.integration.XGBoostPruningCallback(trial, "validation-auc")
    bst = xgb.train(param, dtrain, evals=[(dvalid, "validation")],
                    verbose_eval=False,
                    callbacks=[pruning_callback])

    # Turn predicted probabilities into hard 0/1 labels before scoring.
    preds = bst.predict(dvalid)
    pred_labels = np.rint(preds)
    score = matthews_corrcoef(valid_y, pred_labels)
    return score

# Maximise the objective's return value. Trials whose intermediate values fall
# below the running median are pruned, but only after 5 warm-up steps.
# The sampler is the same TPE algorithm create_study uses by default, but
# seeded explicitly so the sequence of suggested hyperparameters is repeatable
# on a fresh kernel.
study = optuna.create_study(
    sampler=optuna.samplers.TPESampler(seed=42),
    pruner=optuna.pruners.MedianPruner(n_warmup_steps=5),
    direction="maximize",
)

study.optimize(objective, n_trials=50)
print(study.best_trial)
print(study.best_params)

FrozenTrial(number=36, state=TrialState.COMPLETE, values=[0.9782625620625603], datetime_start=datetime.datetime(2024, 11, 21, 12, 19, 21, 824947), datetime_complete=datetime.datetime(2024, 11, 21, 12, 19, 45, 983069), params={'booster': 'dart', 'lambda': 9.39318378907274e-06, 'alpha': 0.010873857185666884, 'max_depth': 9, 'eta': 0.3416204570494483, 'gamma': 1.4735242024551962e-07, 'grow_policy': 'depthwise', 'sample_type': 'uniform', 'normalize_type': 'forest', 'rate_drop': 0.0042143607743783725, 'skip_drop': 0.058401354559312106}, user_attrs={}, system_attrs={}, intermediate_values={0: 0.9793558589823413, 1: 0.9889632852146755, 2: 0.9913047481759562, 3: 0.9934122440799413, 4: 0.994043845685501, 5: 0.9949120513069023, 6: 0.9952616745819369, 7: 0.9955571863479221, 8: 0.9957309773856846, 9: 0.9958972810077009}, distributions={'booster': CategoricalDistribution(choices=('gbtree', 'gblinear', 'dart')), 'lambda': FloatDistribution(high=1.0, log=True, low=1e-08, step=None), 'alpha': FloatDis