In [1]:
!pip install optuna catboost xgboost lightgbm

Collecting optuna
  Downloading optuna-3.2.0-py3-none-any.whl (390 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m390.6/390.6 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting catboost
  Downloading catboost-1.2-cp310-cp310-manylinux2014_x86_64.whl (98.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.6/98.6 MB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.11.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m16.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting cmaes>=0.9.1 (from optuna)
  Downloading cmaes-0.10.0-py3-none-any.whl (29 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.7.0-py2.py3-none-any.whl (11 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.2.4-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.7/78.7 kB[0m [31m8

In [5]:
from sklearn.datasets import load_breast_cancer
from optuna import trial
from sklearn.model_selection import cross_val_score, train_test_split
import catboost as cb
import lightgbm as lgb
import xgboost as xgb
import optuna
import pandas as pd
import numpy as np

def objective(trial):
    # data, target = load_breast_cancer(return_X_y=True)
    # train_x, valid_x, train_y, valid_y = train_test_split(data, target, test_size=0.3)
    model_name = trial.suggest_categorical("classifier", ['xgboost','lightGBM','catboost'])
    if model_name == 'xgboost':
        param = {
            "verbosity": 0,
            "objective": "binary:logistic",
            # use exact for small dataset. [exact, approx, hist and gpu_hist]
            # "tree_method": "exact",
            # defines booster, gblinear for linear functions.
            "booster": trial.suggest_categorical("booster", ["gbtree", "gblinear", "dart"]),
            # L2 regularization weight.
            "lambda": trial.suggest_float("lambda", 1e-8, 1.0, log=True),
            # L1 regularization weight.
            "alpha": trial.suggest_float("alpha", 1e-8, 1.0, log=True),
            # sampling ratio for training data.
            "subsample": trial.suggest_float("subsample", 0.2, 1.0),
            # sampling according to each tree.
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.2, 1.0),
        }

        if param["booster"] in ["gbtree", "dart"]:
            # maximum depth of the tree, signifies complexity of the tree.
            param["max_depth"] = trial.suggest_int("max_depth", 3, 9, step=2)
            # minimum child weight, larger the term more conservative the tree.
            param["min_child_weight"] = trial.suggest_int("min_child_weight", 2, 10)
            param["eta"] = trial.suggest_float("eta", 1e-8, 1.0, log=True)
            # defines how selective algorithm is.
            param["gamma"] = trial.suggest_float("gamma", 1e-8, 1.0, log=True)
            param["grow_policy"] = trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"])

        if param["booster"] == "dart":
            param["sample_type"] = trial.suggest_categorical("sample_type", ["uniform", "weighted"])
            param["normalize_type"] = trial.suggest_categorical("normalize_type", ["tree", "forest"])
            param["rate_drop"] = trial.suggest_float("rate_drop", 1e-8, 1.0, log=True)
            param["skip_drop"] = trial.suggest_float("skip_drop", 1e-8, 1.0, log=True)

        clf = xgb.XGBClassifier(**param)
    elif model_name == 'catboost':
        param = {
            "objective": trial.suggest_categorical("objective", ["Logloss", "CrossEntropy"]),
            "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 0.1),
            "depth": trial.suggest_int("depth", 1, 12),
            "boosting_type": trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
            "bootstrap_type": trial.suggest_categorical(
                "bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]
            ),
            "used_ram_limit": "3gb",
        }
        if param["bootstrap_type"] == "Bayesian":
            param["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 10)
        elif param["bootstrap_type"] == "Bernoulli":
            param["subsample"] = trial.suggest_float("subsample", 0.1, 1)

        clf = cb.CatBoostClassifier(**param)
    elif model_name == 'lightGBM':
        param = {
            #'boosting_type': trial.suggest_categorical('boosting_type', ['gbdt','dart','goss','rf']),
            'num_leaves': trial.suggest_int('num_leaves', 10, 50),
            'max_depth': trial.suggest_int('max_depth', 1, 20),
            'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1, log=True),
            'n_estimators': trial.suggest_int('n_estimators', 50, 200),
        }

        clf = lgb.LGBMClassifier(**param)
    # Perform cross-validation
    scores = cross_val_score(clf, train_x, train_y, cv=5, scoring='accuracy')
    # Calculate the mean accuracy across all folds
    accuracy = np.mean(scores)

    return accuracy
if __name__ == "__main__":
    data, target = load_breast_cancer(return_X_y=True)
    train_x, test_x, train_y, test_y = train_test_split(data, target, test_size=0.2)

    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=100, timeout=600)

    print("Number of finished trials: {}".format(len(study.trials)))

    print("Best trial:")
    trial = study.best_trial

    print("  Value: {}".format(trial.value))

    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))

[I 2023-07-24 14:41:34,430] A new study created in memory with name: no-name-77b4299b-a8d9-4033-8f8c-6f0434a9b019


Learning rate set to 0.006692
0:	learn: 0.6898763	total: 7.45ms	remaining: 7.45s
1:	learn: 0.6828301	total: 21.4ms	remaining: 10.7s
2:	learn: 0.6754126	total: 33.7ms	remaining: 11.2s
3:	learn: 0.6750682	total: 37.1ms	remaining: 9.24s
4:	learn: 0.6662516	total: 48ms	remaining: 9.54s
5:	learn: 0.6567430	total: 60.6ms	remaining: 10s
6:	learn: 0.6467807	total: 73.1ms	remaining: 10.4s
7:	learn: 0.6425547	total: 77ms	remaining: 9.55s
8:	learn: 0.6347492	total: 88.8ms	remaining: 9.78s
9:	learn: 0.6248646	total: 98.1ms	remaining: 9.71s
10:	learn: 0.6162815	total: 110ms	remaining: 9.93s
11:	learn: 0.6076549	total: 120ms	remaining: 9.91s
12:	learn: 0.5992590	total: 131ms	remaining: 9.91s
13:	learn: 0.5901034	total: 145ms	remaining: 10.2s
14:	learn: 0.5809807	total: 154ms	remaining: 10.1s
15:	learn: 0.5733451	total: 164ms	remaining: 10.1s
16:	learn: 0.5659535	total: 174ms	remaining: 10.1s
17:	learn: 0.5603490	total: 183ms	remaining: 10s
18:	learn: 0.5520422	total: 200ms	remaining: 10.3s
19:	learn

[I 2023-07-24 14:42:20,280] Trial 0 finished with value: 0.9802197802197803 and parameters: {'classifier': 'catboost', 'objective': 'Logloss', 'colsample_bylevel': 0.08127377226652699, 'depth': 7, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bernoulli', 'subsample': 0.3856684253738113}. Best is trial 0 with value: 0.9802197802197803.


980:	learn: 0.0230576	total: 8.1s	remaining: 157ms
981:	learn: 0.0230055	total: 8.11s	remaining: 149ms
982:	learn: 0.0229750	total: 8.12s	remaining: 140ms
983:	learn: 0.0229316	total: 8.12s	remaining: 132ms
984:	learn: 0.0229022	total: 8.13s	remaining: 124ms
985:	learn: 0.0228818	total: 8.14s	remaining: 116ms
986:	learn: 0.0228439	total: 8.15s	remaining: 107ms
987:	learn: 0.0228291	total: 8.15s	remaining: 99ms
988:	learn: 0.0227562	total: 8.16s	remaining: 90.8ms
989:	learn: 0.0227020	total: 8.17s	remaining: 82.6ms
990:	learn: 0.0226524	total: 8.18s	remaining: 74.3ms
991:	learn: 0.0226524	total: 8.19s	remaining: 66ms
992:	learn: 0.0226000	total: 8.2s	remaining: 57.8ms
993:	learn: 0.0225745	total: 8.2s	remaining: 49.5ms
994:	learn: 0.0225421	total: 8.21s	remaining: 41.3ms
995:	learn: 0.0225019	total: 8.21s	remaining: 33ms
996:	learn: 0.0224569	total: 8.22s	remaining: 24.7ms
997:	learn: 0.0224256	total: 8.23s	remaining: 16.5ms
998:	learn: 0.0224256	total: 8.23s	remaining: 8.24ms
999:	lear

[I 2023-07-24 14:42:20,553] Trial 1 finished with value: 0.9208791208791208 and parameters: {'classifier': 'lightGBM', 'num_leaves': 47, 'max_depth': 6, 'learning_rate': 0.004052298554390506, 'n_estimators': 106}. Best is trial 0 with value: 0.9802197802197803.


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
3:	learn: 0.6598027	total: 18ms	remaining: 4.47s
4:	learn: 0.6517138	total: 18ms	remaining: 4.47s
5:	learn: 0.6431693	total: 18ms	remaining: 4.47s
6:	learn: 0.6349345	total: 59.6ms	remaining: 11.8s
7:	learn: 0.6259374	total: 90.8ms	remaining: 15s
8:	learn: 0.6171946	total: 123ms	remaining: 17.5s
9:	learn: 0.6095107	total: 132ms	remaining: 16.4s
10:	learn: 0.6016243	total: 954ms	remaining: 1m 44s
11:	learn: 0.5937847	total: 1.2s	remaining: 1m 58s
12:	learn: 0.5853667	total: 1.5s	remaining: 2m 14s
13:	learn: 0.5760492	total: 1.51s	remaining: 2m 4s
14:	learn: 0.5695593	total: 1.8s	remaining: 2m 16s
15:	learn: 0.5620029	total: 2.56s	remaining: 2m 59s
16:	learn: 0.5534602	total: 2.56s	remaining: 2m 48s
17:	learn: 0.5473134	total: 3.44s	remaining: 3m 30s
18:	learn: 0.5406670	total: 4.47s	remaining: 4m 17s
19:	learn: 0.5343380	total: 5.1s	remaining: 4m 37s
20:	learn: 0.5270453	total: 5.96s	remaining: 5m 7s
21:	learn: 0.5203660	t

[I 2023-07-24 14:59:51,370] Trial 2 finished with value: 0.9780219780219781 and parameters: {'classifier': 'catboost', 'objective': 'Logloss', 'colsample_bylevel': 0.08832646177852788, 'depth': 12, 'boosting_type': 'Ordered', 'bootstrap_type': 'MVS'}. Best is trial 0 with value: 0.9802197802197803.


999:	learn: 0.0129148	total: 3m 20s	remaining: 0us
Number of finished trials: 3
Best trial:
  Value: 0.9802197802197803
  Params: 
    classifier: catboost
    objective: Logloss
    colsample_bylevel: 0.08127377226652699
    depth: 7
    boosting_type: Ordered
    bootstrap_type: Bernoulli
    subsample: 0.3856684253738113
