定义quality<6为低品质葡萄酒标识为0，quality>=6为高品质葡萄酒标识为1

In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
import matplotlib.pylab as plt
from xgboost.sklearn import XGBClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn import metrics

%matplotlib inline

## 特征工程

In [2]:
df = pd.read_csv('https://query.data.world/s/4ee2mcqmzj55nta6nhj7nu7mmyifob', sep=';')
df['quality'].value_counts()

5    681
6    638
7    199
4     53
8     18
3     10
Name: quality, dtype: int64

In [3]:
# 类似数据处理，可参考https://stackoverflow.com/questions/49228596/pandas-case-when-default-in-pandas
# 方法1
df['quality'] = df['quality'].apply(lambda item: 0 if item<6 else 1)

# # 方法2
# df['quality'] = np.where(
#     df['quality'].between(0, 6, inclusive=False), 0,
#     np.where(df['quality'].between(6, 8, inclusive=True), 1, -1)
# )

# 切分训练与测试数据集
X_train, X_test, y_train, y_test = train_test_split(df.iloc[:,0:11], df['quality'], test_size=0.2, random_state=666)


## 基本模型

In [4]:
def model_cv(model, X_train, y_train, nfold=5, early_stopping_rounds=30):
    params = model.get_xgb_params()
    dtrain = xgb.DMatrix(X_train.values, y_train.values)
    # 交叉验证
    result = xgb.cv(
        params,
        dtrain,
        nfold=nfold,
        metrics=['auc'],
        early_stopping_rounds=early_stopping_rounds
    )
    
    print(u'最优轮数: ', result.shape[0])
    print(u'最优论详情')
    print(result[result.shape[0]-1:])
    
    return result
    

In [5]:
def model_fit(model, X_train, y_train, X_test, y_test, cv_result):
    model.set_params(n_estimators=cv_result.shape[0])
    
    # 训练集拟合模型
    model.fit(X_train, y_train, eval_metric=['auc'])
    
    # 预测训练集 并评估模型
    train_predict_proba = model.predict_proba(X_train)[:,1]
    train_auc = metrics.roc_auc_score(y_train, train_predict_proba)
    print("训练集AUC得分: ", train_auc)
    
    # 测试数据集
    test_predict_proba = model.predict_proba(X_test)[:,1]
    test_auc = metrics.roc_auc_score(y_test, test_predict_proba)
    print("测试集AUC得分: ", test_auc)
    

In [6]:
model = XGBClassifier(
    objective='binary:logistic', 
    n_estimators=100,
    learning_rate=0.3,
    max_depth=6,
    min_child_weight=1,
    gamma=0,
    subsample=1,
    colsample_bytree=1,    
    reg_alpha=0,
    nthread=8,
    seed=666
)

In [7]:
cv_result = model_cv(model, X_train, y_train)
model_fit(model, X_train, y_train, X_test, y_test, cv_result)

最优轮数:  10
最优论详情
   train-auc-mean  train-auc-std  test-auc-mean  test-auc-std
9        0.980733       0.003281       0.855685      0.019876
训练集AUC得分:  0.977800838018978
测试集AUC得分:  0.8276476881017473


## max_depth、min_child_weight参数调优

### 初步设置参数范围
尽量范围跨度大写，对应的步长也大些

In [8]:
bst1 = XGBClassifier(
    objective='binary:logistic', 
    n_estimators=100,
    learning_rate=0.3,
    max_depth=6,
    min_child_weight=1,
    gamma=0,
    subsample=1,
    colsample_bytree=1,    
    reg_alpha=0,
    nthread=8,
    seed=666
)

param1 = {
    'max_depth': range(2,20,2),
    'min_child_weight': range(1,6,2)
}
# jupyter后台可看打印的日志
grid_search1 = GridSearchCV(
    estimator=bst1,
    param_grid=param1,
    scoring='roc_auc',
    n_jobs=4,
    cv=5,
    verbose=10
)

grid_search1.fit(X_train, y_train)

Fitting 5 folds for each of 27 candidates, totalling 135 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   5 tasks      | elapsed:    4.1s
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:    5.0s
[Parallel(n_jobs=4)]: Done  17 tasks      | elapsed:    7.0s
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:    8.7s
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:   11.4s
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   14.5s
[Parallel(n_jobs=4)]: Done  53 tasks      | elapsed:   17.9s
[Parallel(n_jobs=4)]: Done  64 tasks      | elapsed:   21.2s
[Parallel(n_jobs=4)]: Done  77 tasks      | elapsed:   31.0s
[Parallel(n_jobs=4)]: Done  90 tasks      | elapsed:   35.8s
[Parallel(n_jobs=4)]: Done 105 tasks      | elapsed:   39.4s
[Parallel(n_jobs=4)]: Done 120 tasks      | elapsed:   42.4s
[Parallel(n_jobs=4)]: Done 135 out of 135 | elapsed:   45.4s finished


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None, colsample_bytree=1,
                                     gamma=0, gpu_id=None,
                                     importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=0.3, max_delta_step=None,
                                     max_depth=6, min_child_weight=1,
                                     missing=nan, monotone_constraints=None,
                                     n_es...
                                     objective='binary:logistic',
                                     random_state=None, reg_alpha=0,
                                     reg_lambda=None, scale_pos_weight=None,
                                     seed=666, subsample=1, tree_method

In [9]:
print("best_params: ", grid_search1.best_params_)
print("best_score: ", grid_search1.best_score_)

best_params:  {'max_depth': 14, 'min_child_weight': 1}
best_score:  0.8711365940529558


### 微调
微调这两个参数 max_depth=[7,8,9], min_child_weight=[1,2]后，发现分数没有差异。说明这两个max_depth=9, min_child_weight为最好参数。

In [10]:
param2 = {
    'max_depth': [13,14,15],
    'min_child_weight': [1,2]
}
# jupyter后台可看打印的日志
grid_search2 = GridSearchCV(
    estimator=bst1,
    param_grid=param2,
    scoring='roc_auc',
    n_jobs=4,
    cv=5,
    verbose=10
)

grid_search2.fit(X_train, y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   5 tasks      | elapsed:    4.7s
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:    5.6s
[Parallel(n_jobs=4)]: Done  17 tasks      | elapsed:    7.3s
[Parallel(n_jobs=4)]: Done  27 out of  30 | elapsed:    9.2s remaining:    1.0s
[Parallel(n_jobs=4)]: Done  30 out of  30 | elapsed:    9.7s finished


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None, colsample_bytree=1,
                                     gamma=0, gpu_id=None,
                                     importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=0.3, max_delta_step=None,
                                     max_depth=6, min_child_weight=1,
                                     missing=nan, monotone_constraints=None,
                                     n_es...
                                     num_parallel_tree=None,
                                     objective='binary:logistic',
                                     random_state=None, reg_alpha=0,
                                     reg_lambda=None, scale_pos_weight=None,
          

In [11]:
print("best_params: ", grid_search1.best_params_)
print("best_score: ", grid_search1.best_score_)

best_params:  {'max_depth': 14, 'min_child_weight': 1}
best_score:  0.8711365940529558


## Gamma参数调优

### 初步设置参数范围

范围：0~1，步长度0.1

In [12]:
gamma_bst = XGBClassifier(
    objective='binary:logistic', 
    n_estimators=100,
    learning_rate=0.3,
    max_depth=14,
    min_child_weight=1,
    gamma=0,
    subsample=1,
    colsample_bytree=1,    
    reg_alpha=0,
    nthread=8,
    seed=666
)

gamma_param_01 = {
    'gamma': [item/10.0 for item in range(0,11)]
}
# jupyter后台可看打印的日志
gamma_grid_search_01 = GridSearchCV(
    estimator=gamma_bst,
    param_grid=gamma_param_01,
    scoring='roc_auc',
    n_jobs=4,
    cv=5,
    verbose=10
)

gamma_grid_search_01.fit(X_train, y_train)

Fitting 5 folds for each of 11 candidates, totalling 55 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   5 tasks      | elapsed:    7.2s
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:    9.9s
[Parallel(n_jobs=4)]: Done  17 tasks      | elapsed:   15.2s
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:   19.3s
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:   32.1s
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   41.4s
[Parallel(n_jobs=4)]: Done  55 out of  55 | elapsed:   50.5s finished


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None, colsample_bytree=1,
                                     gamma=0, gpu_id=None,
                                     importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=0.3, max_delta_step=None,
                                     max_depth=14, min_child_weight=1,
                                     missing=nan, monotone_constraints=None,
                                     n_e...
                                     objective='binary:logistic',
                                     random_state=None, reg_alpha=0,
                                     reg_lambda=None, scale_pos_weight=None,
                                     seed=666, subsample=1, tree_method

In [13]:
print("best_params: ", gamma_grid_search_01.best_params_)
print("best_score: ", gamma_grid_search_01.best_score_)

best_params:  {'gamma': 0.1}
best_score:  0.8723163688862158


### 微调

范围:0~0.2 步长：0.02

In [14]:
gamma_param_02 = {
    'gamma': [item/100.0 for item in range(0,20,2)]
}
# jupyter后台可看打印的日志
gamma_grid_search_02 = GridSearchCV(
    estimator=gamma_bst,
    param_grid=gamma_param_02,
    scoring='roc_auc',
    n_jobs=4,
    cv=5,
    verbose=10
)

gamma_grid_search_02.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   5 tasks      | elapsed:    5.2s
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:    6.4s
[Parallel(n_jobs=4)]: Done  17 tasks      | elapsed:    8.5s
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:    9.7s
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:   13.0s
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   18.2s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:   22.4s finished


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None, colsample_bytree=1,
                                     gamma=0, gpu_id=None,
                                     importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=0.3, max_delta_step=None,
                                     max_depth=14, min_child_weight=1,
                                     missing=nan, monotone_constraints=None,
                                     n_e...
                                     objective='binary:logistic',
                                     random_state=None, reg_alpha=0,
                                     reg_lambda=None, scale_pos_weight=None,
                                     seed=666, subsample=1, tree_method

In [15]:
print("best_params: ", gamma_grid_search_02.best_params_)
print("best_score: ", gamma_grid_search_02.best_score_)

best_params:  {'gamma': 0.02}
best_score:  0.8733551345368942


结论：score一样，说明gamma=0是最优参数

### 将当前已调优的参数放入训练模型

In [16]:
gamma_bst = XGBClassifier(
    objective='binary:logistic', 
    n_estimators=100,
    learning_rate=0.3,
    max_depth=14,
    min_child_weight=1,
    gamma=0.02,
    subsample=1,
    colsample_bytree=1,    
    reg_alpha=0,
    nthread=8,
    seed=666
)

gamma_cv_result = model_cv(gamma_bst, X_train, y_train)

最优轮数:  10
最优论详情
   train-auc-mean  train-auc-std  test-auc-mean  test-auc-std
9        0.999758       0.000063       0.864245      0.028974


In [17]:
model_fit(gamma_bst, X_train, y_train, X_test, y_test, gamma_cv_result)

训练集AUC得分:  0.9997448654165072
测试集AUC得分:  0.8428622370141449


结论：比最开始的分数好。基础(训练集AUC: 0.9778, 测试集AUC:0.8276); 调优后(训练集AUC: 0.9980, 测试集AUC:0.8355)

## subsample和closample_bytree

先粗后细, 首先取 0.6、0.7、0.8、0.9、1.0

In [18]:
sample_bst = XGBClassifier(
    objective='binary:logistic', 
    n_estimators=100,
    learning_rate=0.3,
    max_depth=14,
    min_child_weight=1,
    gamma=0.02,
    subsample=1,
    colsample_bytree=1,    
    reg_alpha=0,
    nthread=8,
    seed=666
)

sample_param_01 = {
    'subsample': [item/10.0 for item in range(5,11)],
    'colsample_bytree': [item/10.0 for item in range(5,11)],
}
# jupyter后台可看打印的日志
sample_grid_search_01 = GridSearchCV(
    estimator=sample_bst,
    param_grid=sample_param_01,
    scoring='roc_auc',
    n_jobs=4,
    cv=5,
    verbose=10
)

sample_grid_search_01.fit(X_train, y_train)

Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   5 tasks      | elapsed:    6.6s
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:    8.0s
[Parallel(n_jobs=4)]: Done  17 tasks      | elapsed:   10.3s
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:   12.1s
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:   16.2s
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   19.6s
[Parallel(n_jobs=4)]: Done  53 tasks      | elapsed:   23.3s
[Parallel(n_jobs=4)]: Done  64 tasks      | elapsed:   26.2s
[Parallel(n_jobs=4)]: Done  77 tasks      | elapsed:   29.7s
[Parallel(n_jobs=4)]: Done  90 tasks      | elapsed:   32.5s
[Parallel(n_jobs=4)]: Done 105 tasks      | elapsed:   37.4s
[Parallel(n_jobs=4)]: Done 120 tasks      | elapsed:   44.1s
[Parallel(n_jobs=4)]: Done 137 tasks      | elapsed:   48.3s
[Parallel(n_jobs=4)]: Done 154 tasks      | elapsed:   52.1s
[Parallel(n_jobs=4)]: Done 173 tasks      | elapsed:  1.1min
[Parallel(

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None, colsample_bytree=1,
                                     gamma=0.02, gpu_id=None,
                                     importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=0.3, max_delta_step=None,
                                     max_depth=14, min_child_weight=1,
                                     missing=nan, monotone_constraints=None,...
                                     random_state=None, reg_alpha=0,
                                     reg_lambda=None, scale_pos_weight=None,
                                     seed=666, subsample=1, tree_method=None,
                                     validate_parameters=False,
                                 

In [19]:
print("best_params: ", sample_grid_search_01.best_params_)
print("best_score: ", sample_grid_search_01.best_score_)

best_params:  {'colsample_bytree': 0.5, 'subsample': 1.0}
best_score:  0.8739694096158872


### 微调

- colsample_bytree 0.4~0.6 步长：0.05
- subsample： 0.9~0.1 步长0.02


In [20]:
sample_param_02 = {
    'subsample': [item/100.0 for item in range(90,101,2)],
    'colsample_bytree': [item/100.0 for item in range(40,60,5)],
}
# jupyter后台可看打印的日志
sample_grid_search_02 = GridSearchCV(
    estimator=sample_bst,
    param_grid=sample_param_02,
    scoring='roc_auc',
    n_jobs=4,
    cv=5,
    verbose=10
)

sample_grid_search_02.fit(X_train, y_train)

Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   5 tasks      | elapsed:    8.2s
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:   10.6s
[Parallel(n_jobs=4)]: Done  17 tasks      | elapsed:   13.9s
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:   16.2s
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:   20.5s
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   23.1s
[Parallel(n_jobs=4)]: Done  53 tasks      | elapsed:   25.7s
[Parallel(n_jobs=4)]: Done  64 tasks      | elapsed:   29.2s
[Parallel(n_jobs=4)]: Done  77 tasks      | elapsed:   34.4s
[Parallel(n_jobs=4)]: Done  90 tasks      | elapsed:   37.2s
[Parallel(n_jobs=4)]: Done 105 tasks      | elapsed:   41.5s
[Parallel(n_jobs=4)]: Done 120 out of 120 | elapsed:   52.7s finished


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None, colsample_bytree=1,
                                     gamma=0.02, gpu_id=None,
                                     importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=0.3, max_delta_step=None,
                                     max_depth=14, min_child_weight=1,
                                     missing=nan, monotone_constraints=None,...
                                     random_state=None, reg_alpha=0,
                                     reg_lambda=None, scale_pos_weight=None,
                                     seed=666, subsample=1, tree_method=None,
                                     validate_parameters=False,
                                 

In [21]:
print("best_params: ", sample_grid_search_02.best_params_)
print("best_score: ", sample_grid_search_02.best_score_)

best_params:  {'colsample_bytree': 0.4, 'subsample': 0.98}
best_score:  0.8758019123866231


结论：没有微调后基本没有提升

## 正则参数调优

In [22]:
reg_bst = XGBClassifier(
    objective='binary:logistic', 
    n_estimators=100,
    learning_rate=0.3,
    max_depth=14,
    min_child_weight=1,
    gamma=0.02,
    subsample=0.98,
    colsample_bytree=0.4,    
    reg_alpha=0,
    nthread=8,
    seed=666
)

reg_param_01 = {
    'reg_alpha': [0, 1e-5, 1e-2, 1e-1, 1, 100]
}
# jupyter后台可看打印的日志
reg_grid_search_01 = GridSearchCV(
    estimator=reg_bst,
    param_grid=reg_param_01,
    scoring='roc_auc',
    n_jobs=4,
    cv=5,
    verbose=10
)

reg_grid_search_01.fit(X_train, y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   5 tasks      | elapsed:   13.8s
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:   16.5s
[Parallel(n_jobs=4)]: Done  17 tasks      | elapsed:   18.8s
[Parallel(n_jobs=4)]: Done  27 out of  30 | elapsed:   20.1s remaining:    2.2s
[Parallel(n_jobs=4)]: Done  30 out of  30 | elapsed:   21.0s finished


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=0.4, gamma=0.02,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=0.3, max_delta_step=None,
                                     max_depth=14, min_child_weight=1,
                                     missing=nan, monotone_constraints=Non...
                                     num_parallel_tree=None,
                                     objective='binary:logistic',
                                     random_state=None, reg_alpha=0,
                                     reg_lambda=None, scale_pos_weight=None,
                                     seed=666, s

In [23]:
print("best_params: ", reg_grid_search_01.best_params_)
print("best_score: ", reg_grid_search_01.best_score_)

best_params:  {'reg_alpha': 1e-05}
best_score:  0.8761072747294019


### 微调

In [24]:
reg_param_02 = {
    'reg_alpha': [1e-5, 1e-6, 1e-7, 1e-8, 1e-9]
}
# jupyter后台可看打印的日志
reg_grid_search_02 = GridSearchCV(
    estimator=reg_bst,
    param_grid=reg_param_02,
    scoring='roc_auc',
    n_jobs=4,
    cv=5,
    verbose=10
)

reg_grid_search_02.fit(X_train, y_train)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   5 tasks      | elapsed:    6.5s
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:    7.8s
[Parallel(n_jobs=4)]: Done  17 tasks      | elapsed:   10.1s
[Parallel(n_jobs=4)]: Done  21 out of  25 | elapsed:   11.3s remaining:    2.2s
[Parallel(n_jobs=4)]: Done  25 out of  25 | elapsed:   11.8s finished


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=0.4, gamma=0.02,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=0.3, max_delta_step=None,
                                     max_depth=14, min_child_weight=1,
                                     missing=nan, monotone_constraints=Non...
                                     num_parallel_tree=None,
                                     objective='binary:logistic',
                                     random_state=None, reg_alpha=0,
                                     reg_lambda=None, scale_pos_weight=None,
                                     seed=666, s

In [25]:
print("best_params: ", reg_grid_search_02.best_params_)
print("best_score: ", reg_grid_search_02.best_score_)

best_params:  {'reg_alpha': 1e-05}
best_score:  0.8761072747294019


结论： reg_alpha最优参数为0

## 降低学习率

In [26]:
learn_bst = XGBClassifier(
    objective='binary:logistic', 
    n_estimators=100,
    learning_rate=0.3,
    max_depth=14,
    min_child_weight=1,
    gamma=0.02,
    subsample=0.98,
    colsample_bytree=0.4,    
    reg_alpha=1e-05,
    nthread=8,
    seed=666
)

learn_param_01 = {
    'learning_rate': [item/10 for item in range(1, 10)]
}
# jupyter后台可看打印的日志
learn_grid_search_01 = GridSearchCV(
    estimator=learn_bst,
    param_grid=learn_param_01,
    scoring='roc_auc',
    n_jobs=4,
    cv=5,
    verbose=10
)

learn_grid_search_01.fit(X_train, y_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   5 tasks      | elapsed:    6.8s
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:    8.3s
[Parallel(n_jobs=4)]: Done  17 tasks      | elapsed:   10.3s
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:   11.1s
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:   13.0s
[Parallel(n_jobs=4)]: Done  43 out of  45 | elapsed:   14.4s remaining:    0.7s
[Parallel(n_jobs=4)]: Done  45 out of  45 | elapsed:   14.7s finished


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=0.4, gamma=0.02,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=0.3, max_delta_step=None,
                                     max_depth=14, min_child_weight=1,
                                     missing=nan, monotone_constraints=Non...
                                     objective='binary:logistic',
                                     random_state=None, reg_alpha=1e-05,
                                     reg_lambda=None, scale_pos_weight=None,
                                     seed=666, subsample=0.98, tree_method=None,
                        

In [27]:
print("best_params: ", learn_grid_search_01.best_params_)
print("best_score: ", learn_grid_search_01.best_score_)

best_params:  {'learning_rate': 0.1}
best_score:  0.8778614277641134


### 微调

0.0~0.2 步长0.01

In [None]:
learn_param_02 = {
    'learning_rate': [item/100 for item in range(0, 21, 1)]
}
# jupyter后台可看打印的日志
learn_grid_search_02 = GridSearchCV(
    estimator=learn_bst,
    param_grid=learn_param_02,
    scoring='roc_auc',
    n_jobs=4,
    cv=5,
    verbose=10
)

learn_grid_search_02.fit(X_train, y_train)

Fitting 5 folds for each of 21 candidates, totalling 105 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


In [None]:
print("best_params: ", learn_grid_search_02.best_params_)
print("best_score: ", learn_grid_search_02.best_score_)

结论: 最好参数 learning_rate=0.25

### 将当前已调优的参数放入训练模型

In [None]:
gamma_bst = XGBClassifier(
    objective='binary:logistic', 
    n_estimators=100,
    learning_rate=0.05,
    max_depth=14,
    min_child_weight=1,
    gamma=0.02,
    subsample=0.98,
    colsample_bytree=0.4,    
    reg_alpha=1e-05,
    nthread=8,
    seed=666
)

gamma_cv_result = model_cv(gamma_bst, X_train, y_train)
model_fit(gamma_bst, X_train, y_train, X_test, y_test, gamma_cv_result)

调参完毕，提升太少了。看来重头还是在模型融合和特征工程上。