## lightgbm主要调节参数

1. 其他参数
    * boosting_type
    * n_jobs==>xgboost
    * verbosity==>xgboost
    * objective==>xgboost

2. 树调节参数
    * n_estimators==>xgboost
    * max_depth(重要程度高)==>xgboost
    * min_child_weight(重要程度高,与max_depth一起进行网格搜索)==>xgboost
    * min_child_samples
    * num_leaves

2. 防止过拟合参数
    * subsample==>xgboost
    * subsample_freq
    * learning_rate==>xgboost
    * colsample_bytree==>xgboost
    * reg_alpha=0.0==>xgboost
    * reg_lambda=0.0==>xgboost
    * max_bin

In [1]:
from lightgbm import LGBMClassifier
from sklearn import datasets
from sklearn.model_selection import train_test_split
import numpy as np

In [2]:
X = datasets.fetch_covtype().data[:3000]
y = datasets.fetch_covtype().target[:3000]
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [3]:
print(X_train.shape)  # 数据集有54个特征
print(np.unique(y_train))  # 7分类

(2250, 54)
[1 2 3 4 5 6 7]


In [4]:
boosting_type = ['gbdt', 'rf', 'dart', 'goss']

for i in boosting_type:
    if i == 'goss':
        # 注意:Cannot use bagging in GOSS
        lgb = LGBMClassifier(n_jobs=-1, boosting_type=i)
    else:
        # 注意:若boosting_type='rf',则必须进行bagging操作
        '''
        subsample_freq:frequency for bagging
            0 means disable bagging; k means perform bagging at every k iteration.

        sumsample:Subsample ratio of the training instances
            * 0.0 < bagging_fraction <= 1.0
            * to enable bagging, bagging_freq should be set to a non zero value as well
        '''
        lgb = LGBMClassifier(n_jobs=-1, boosting_type=i, subsample_freq=1, subsample=0.9, bagging_seed=1)
    lgb.fit(X_train, y_train)
    print('booster=' + str(i) + ',  score=', lgb.score(X_test, y_test))

booster=gbdt,  score= 0.8533333333333334
booster=rf,  score= 0.7786666666666666
booster=dart,  score= 0.8493333333333334
booster=goss,  score= 0.836


In [5]:
# controls the level of LightGBM’s verbosity(< 0: Fatal, = 0: Error (Warning), = 1: Info, > 1: Debug
lgb = LGBMClassifier(n_jobs=0, verbosity=0)  # 默认不输出其他信息
lgb.fit(X_train, y_train)
print(lgb.score(X_test, y_test))

You can set `force_col_wise=true` to remove the overhead.
0.8453333333333334


In [6]:
objective = ["regression",  # L2损失
             "regression_l1",  # L1损失
             "binary",  #  binary log loss classification
             'softmax',
             'cross_entropy']  # 交叉熵损失
for i in objective:
    '''
    objective : string, callable or None, optional (default=None)
            Specify the learning task and the corresponding learning objective or
            a custom objective function to be used (see note below).
            Default: 'regression' for LGBMRegressor, 'binary' or 'multiclass' for LGBMClassifier, 'lambdarank' for LGBMRanker.
    '''
    lgb = LGBMClassifier(n_jobs=-1, objective=i)
    lgb.fit(X_train, y_train)
    print("objective=" + str(i) + ', score=', lgb.score(X_test, y_test))

objective=regression, score= 0.8453333333333334
objective=regression_l1, score= 0.8453333333333334
objective=binary, score= 0.8453333333333334
objective=softmax, score= 0.8453333333333334
objective=cross_entropy, score= 0.8453333333333334


In [21]:
learning_rate = [0.01, 0.02, 0.05, 0.1, 0.15, 0.3, 0.5, 0.7, 0.9]  # 默认learning_rate=0.1
for i in learning_rate:
    lgb = LGBMClassifier(n_jobs=-1, learning_rate=i,
                         cat_feature=0)  # 指定分类特征;默认自动
    lgb.fit(X_train, y_train)
    print('learing_rate=' + str(i) + ',  score=', lgb.score(X_test, y_test))

learing_rate=0.01,  score= 0.7826666666666666
learing_rate=0.02,  score= 0.788
learing_rate=0.05,  score= 0.7826666666666666
learing_rate=0.1,  score= 0.7773333333333333
learing_rate=0.15,  score= 0.7933333333333333
learing_rate=0.3,  score= 0.8
learing_rate=0.5,  score= 0.36533333333333334
learing_rate=0.7,  score= 0.7946666666666666
learing_rate=0.9,  score= 0.2786666666666667


In [8]:
colsample_bytree = [0.1, 0.3, 0.4, 0.6, 0.7, 0.8, 0.85, 0.95, 1]
for i in colsample_bytree:
    lgb = LGBMClassifier(n_jobs=-1, colsample_bytree=i)  # 默认colsample_bytree=1.0
    lgb.fit(X_train, y_train)
    print('colsample_bytree=' + str(i) + ',  score=', lgb.score(X_test, y_test))

colsample_bytree=0.1,  score= 0.7866666666666666
colsample_bytree=0.3,  score= 0.8293333333333334
colsample_bytree=0.4,  score= 0.8493333333333334
colsample_bytree=0.6,  score= 0.8466666666666667
colsample_bytree=0.7,  score= 0.8573333333333333
colsample_bytree=0.8,  score= 0.848
colsample_bytree=0.85,  score= 0.852
colsample_bytree=0.95,  score= 0.848
colsample_bytree=1,  score= 0.8453333333333334


In [9]:
reg_alpha = [0, 0.25, 0.5, 0.75, 1, 3, 9]
for i in reg_alpha:
    lgb = LGBMClassifier(n_jobs=-1, reg_alpha=i)  # 默认reg_alpha=0.0
    lgb.fit(X_train, y_train)
    print('reg_alpha=' + str(i) + ',  score=', lgb.score(X_test, y_test))

reg_alpha=0,  score= 0.8453333333333334
reg_alpha=0.25,  score= 0.8453333333333334
reg_alpha=0.5,  score= 0.8426666666666667
reg_alpha=0.75,  score= 0.8386666666666667
reg_alpha=1,  score= 0.836
reg_alpha=3,  score= 0.832
reg_alpha=9,  score= 0.7986666666666666


In [10]:
reg_lambda = [0, 1, 3, 9, 27, 81]
for i in reg_lambda:
    lgb = LGBMClassifier(n_jobs=-1, reg_lambda=i)  # 默认reg_lambda=0.0
    lgb.fit(X_train, y_train)
    print('reg_lambda=' + str(i) + ',  score=', lgb.score(X_test, y_test))

reg_lambda=0,  score= 0.8453333333333334
reg_lambda=1,  score= 0.8466666666666667
reg_lambda=3,  score= 0.8373333333333334
reg_lambda=9,  score= 0.84
reg_lambda=27,  score= 0.8373333333333334
reg_lambda=81,  score= 0.8093333333333333


In [11]:
max_bin = [50, 120, 255, 300, 500]
for i in max_bin:
    '''
    max number of bins that feature values will be bucketed in
    small number of bins may reduce training accuracy but may increase general power (deal with over-fitting)
    '''
    lgb = LGBMClassifier(n_jobs=-1, max_bin=i)  # 默认max_bin=255
    lgb.fit(X_train, y_train)
    print('max_bin=' + str(i) + ',  score=', lgb.score(X_test, y_test))

max_bin=50,  score= 0.848
max_bin=120,  score= 0.848
max_bin=255,  score= 0.8453333333333334
max_bin=300,  score= 0.852
max_bin=500,  score= 0.8506666666666667


In [12]:
n_estimators = [10, 20, 50, 100, 200, 500]  # 默认n_estimators=100
for i in n_estimators:
    lgb = LGBMClassifier(n_jobs=-1, n_estimators=i)
    lgb.fit(X_train, y_train)
    print('n_estimators=' + str(i) + ',  score=', lgb.score(X_test, y_test))

n_estimators=10,  score= 0.82
n_estimators=20,  score= 0.84
n_estimators=50,  score= 0.8506666666666667
n_estimators=100,  score= 0.8453333333333334
n_estimators=200,  score= 0.856
n_estimators=500,  score= 0.852


In [13]:
max_depth = [1, 3, 6, 9, 12, 15, 18, 21, -1]
for i in max_depth:
    lgb = LGBMClassifier(n_jobs=-1, max_depth=i)  # 默认max_depth=-1
    lgb.fit(X_train, y_train)
    print('max_depth=' + str(i) + ',  score=', lgb.score(X_test, y_test))

max_depth=1,  score= 0.7586666666666667
max_depth=3,  score= 0.8146666666666667
max_depth=6,  score= 0.8373333333333334
max_depth=9,  score= 0.848
max_depth=12,  score= 0.852
max_depth=15,  score= 0.844
max_depth=18,  score= 0.8533333333333334
max_depth=21,  score= 0.8453333333333334
max_depth=-1,  score= 0.8453333333333334


In [14]:
min_child_weight = [10, 20, 30, 40, 50, 60]
for i in min_child_weight:
    lgb = LGBMClassifier(n_jobs=-1, min_child_weight=i)  # 默认min_child_weight=1e3
    lgb.fit(X_train, y_train)
    print('min_child_weight=' + str(i) + ',  score=', lgb.score(X_test, y_test))

min_child_weight=10,  score= 0.8266666666666667
min_child_weight=20,  score= 0.8146666666666667
min_child_weight=30,  score= 0.792
min_child_weight=40,  score= 0.768
min_child_weight=50,  score= 0.7506666666666667
min_child_weight=60,  score= 0.696


In [15]:
min_child_sample = [10, 20, 30, 40, 50, 60]
for i in min_child_weight:
    # minimal number of data in one leaf. Can be used to deal with over-fitting
    lgb = LGBMClassifier(n_jobs=-1, min_child_samples=i)  # 默认min_child_sample=20
    lgb.fit(X_train, y_train)
    print('min_child_sample=' + str(i) + ',  score=', lgb.score(X_test, y_test))

min_child_sample=10,  score= 0.86
min_child_sample=20,  score= 0.8453333333333334
min_child_sample=30,  score= 0.8453333333333334
min_child_sample=40,  score= 0.844
min_child_sample=50,  score= 0.8533333333333334
min_child_sample=60,  score= 0.844


In [16]:
num_leaves = [6, 13, 31, 51, 63]
for i in num_leaves:
    # max number of leaves in one tree
    lgb = LGBMClassifier(n_jobs=-1, num_leaves=i, max_bin=2)  # 默认num_leaves=31
    lgb.fit(X_train, y_train)
    print('num_leaves=' + str(i) + ',  score=', lgb.score(X_test, y_test))

num_leaves=6,  score= 0.6026666666666667
num_leaves=13,  score= 0.6053333333333333
num_leaves=31,  score= 0.6053333333333333
num_leaves=51,  score= 0.6053333333333333
num_leaves=63,  score= 0.6053333333333333
