In [1]:
import lightgbm as lgb
from sklearn import datasets
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
from bayes_opt import BayesianOptimization
from sklearn.model_selection import KFold
from sklearn.metrics import log_loss

In [2]:
X = datasets.fetch_covtype().data[:3000]
y = datasets.fetch_covtype().target[:3000]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

print(X_train.shape, y_train.shape)
print(np.unique(y_train))  # 7分类任务

(2700, 54) (2700,)
[1 2 3 4 5 6 7]


In [3]:
enc = OrdinalEncoder()
y_train_enc = enc.fit_transform(y_train.reshape(-1, 1)).reshape(-1, )
y_test_enc = enc.transform(y_test.reshape(-1, 1)).reshape(-1, )

print(np.unique(y_train_enc))
print(y_train_enc.shape)

[0. 1. 2. 3. 4. 5. 6.]
(2700,)


In [4]:
def model_cv(reg_alpha, subsample, max_depth, min_child_samples, X_train_data, y_train_data, kfold, loss_func, great):
    # 参数传递
    # 不同模型需要修改部分
    params = {"n_jobs": -1, "num_class": 7, 'verbose': -1, "objective": "multiclass", 'reg_alpha': reg_alpha,
              'subsample': subsample, 'max_depth': max_depth, 'min_child_samples': min_child_samples}


    eval = 0
    for fold, (trn_ind, val_ind) in enumerate(kfold.split(X_train_data)):
        print('*' * 100)
        x_train, x_val = X_train_data[trn_ind], X_train_data[val_ind]
        y_train, y_val = y_train_data[trn_ind], y_train_data[val_ind]

        # 模型训练
        # 不同模型需要修改部分
        train_dataset = lgb.Dataset(x_train, y_train)
        val_dataset = lgb.Dataset(x_val, y_val)
        estimator = lgb.train(params=params, train_set=train_dataset, valid_sets=[train_dataset, val_dataset],
                              early_stopping_rounds=100, verbose_eval=200)

        eval += (loss_func(y_val, estimator.predict(x_val)) / kfold.n_splits)
        if great:
            eval = eval
        else:
            eval = -eval  # 最大化-eval,等于最小化eval

    return eval  # 返回一个评估指标

In [5]:
def optimize_lgb(X_train_data, y_train_data, kfold, loss_func, pbounds, great=False):
    # 不同模型需要修改部分
    def model_cv_out(reg_alpha, subsample, max_depth, min_child_samples):
        return model_cv(
            reg_alpha=reg_alpha,
            subsample=subsample,
            max_depth=int(np.around(max_depth)),  # max_depth必须为int类型,故这里使其变为整数
            min_child_samples=int(np.around(min_child_samples)),  # 同理
            X_train_data=X_train_data,
            y_train_data=y_train_data,
            kfold=kfold,
            loss_func=loss_func,
            great=great
        )

    optimizer = BayesianOptimization(
        f=model_cv_out,
        pbounds=pbounds,
        random_state=1234,
        verbose=2
    )

    optimizer.maximize(n_iter=10)  # 最大化这个评估指标

    max_result = optimizer.max
    print("Final result:", max_result)
    return max_result

In [153]:
# 不同模型需要修改部分
pbounds_dict = {"reg_alpha": (0, 1),  # reg_alpha这个指标的最小值为0,最大值为1
                "subsample": (0.1, 1),  # subsample这个指标的最小值为0,最大值为1
                "max_depth": (1, 6),
                "min_child_samples": (14, 30)}

kf = KFold(n_splits=5, shuffle=True)

result = optimize_lgb(X_train, y_train_enc, kfold=kf, loss_func=log_loss, pbounds=pbounds_dict, great=False)
result

|   iter    |  target   | max_depth | min_ch... | reg_alpha | subsample |
-------------------------------------------------------------------------
****************************************************************************************************
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[100]	training's multi_logloss: 0.385919	valid_1's multi_logloss: 0.540511
****************************************************************************************************
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[100]	training's multi_logloss: 0.383605	valid_1's multi_logloss: 0.535173
****************************************************************************************************
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[100]	training's multi_logloss: 0.394352	valid_1's multi_logloss: 0.5

{'target': -0.06789718089935763,
 'params': {'max_depth': 5.790696768418526,
  'min_child_samples': 28.014922155873514,
  'reg_alpha': 0.35781726995786667,
  'subsample': 0.5508956129711129}}

In [156]:
# 最优分数
result_loss = -result['target']
result_loss

0.06789718089935763

In [158]:
# 最优参数组合
# 这里max_depth和min_child_samples需要四舍五入后转换为int类型
result_params = result['params']
result_params

{'max_depth': 5.790696768418526,
 'min_child_samples': 28.014922155873514,
 'reg_alpha': 0.35781726995786667,
 'subsample': 0.5508956129711129}