定义quality<6为低品质葡萄酒标识为0，quality>=6为高品质葡萄酒标识为1

In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
import matplotlib.pylab as plt
from xgboost.sklearn import XGBClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn import metrics

%matplotlib inline

## 特征工程

In [2]:
df = pd.read_csv('https://query.data.world/s/4ee2mcqmzj55nta6nhj7nu7mmyifob', sep=';')
df['quality'].value_counts()

5    681
6    638
7    199
4     53
8     18
3     10
Name: quality, dtype: int64

In [3]:
# 类似数据处理，可参考https://stackoverflow.com/questions/49228596/pandas-case-when-default-in-pandas
# 方法1
df['quality'] = df['quality'].apply(lambda item: 0 if item<6 else 1)

# # 方法2
# df['quality'] = np.where(
#     df['quality'].between(0, 6, inclusive=False), 0,
#     np.where(df['quality'].between(6, 8, inclusive=True), 1, -1)
# )

# 切分训练与测试数据集
X_train, X_test, y_train, y_test = train_test_split(df.iloc[:,0:11], df['quality'], test_size=0.2, random_state=666)


## 基本模型

In [4]:
def model_cv(model, X_train, y_train, nfold=5, early_stopping_rounds=30):
    params = model.get_xgb_params()
    dtrain = xgb.DMatrix(X_train.values, y_train.values)
    # 交叉验证
    result = xgb.cv(
        params,
        dtrain,
        nfold=nfold,
        metrics=['auc'],
        early_stopping_rounds=early_stopping_rounds
    )
    
    print(u'最优轮数: ', result.shape[0])
    print(u'最优论详情')
    print(result[result.shape[0]-1:])
    
    return result
    

In [5]:
def model_fit(model, X_train, y_train, X_test, y_test, cv_result):
    model.set_params(n_estimators=cv_result.shape[0])
    
    # 训练集拟合模型
    model.fit(X_train, y_train, eval_metric=['auc'])
    
    # 预测训练集 并评估模型
    train_predict_proba = model.predict_proba(X_train)[:,1]
    train_auc = metrics.roc_auc_score(y_train, train_predict_proba)
    print("训练集AUC得分: ", train_auc)
    
    # 测试数据集
    test_predict_proba = model.predict_proba(X_test)[:,1]
    test_auc = metrics.roc_auc_score(y_test, test_predict_proba)
    print("测试集AUC得分: ", test_auc)
    

In [7]:
model = XGBClassifier(
    objective='binary:logistic',    
    n_estimators=100,
    max_depth=5,
    min_child_weight=1,
    nthread=8,
    scale_pos_weight=1,
    subsample=0.8,
    colsample_bytree=0.8,
    seed=666
)

In [8]:
cv_result = model_cv(model, X_train, y_train)

最优轮数:  10
最优论详情
   train-auc-mean  train-auc-std  test-auc-mean  test-auc-std
9        0.952595       0.001735       0.845133      0.021583


In [9]:
model_fit(model, X_train, y_train, X_test, y_test, cv_result)

训练集AUC得分:  0.945128892029007
测试集AUC得分:  0.8059946907563692


## max_depth、min_child_weight参数调优

### 初步设置参数范围
尽量范围跨度大写，对应的步长也大些

In [19]:
bst1 = XGBClassifier(
    objective='binary:logistic',    
    n_estimators=100,
    max_depth=5,
    min_child_weight=1,
    nthread=8,
    scale_pos_weight=1,
    subsample=0.8,
    colsample_bytree=0.8,
    seed=666
)

param1 = {
    'max_depth': range(3,15,2),
    'min_child_weight': range(1,6,2)
}
# jupyter后台可看打印的日志
grid_search1 = GridSearchCV(
    estimator=bst1,
    param_grid=param1,
    scoring='roc_auc',
    n_jobs=4,
    cv=5,
    verbose=10
)

grid_search1.fit(X_train, y_train)

Fitting 5 folds for each of 18 candidates, totalling 90 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   5 tasks      | elapsed:    3.9s
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:    4.2s
[Parallel(n_jobs=4)]: Done  17 tasks      | elapsed:    4.6s
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:    5.1s
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:    6.1s
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    6.9s
[Parallel(n_jobs=4)]: Done  53 tasks      | elapsed:    8.4s
[Parallel(n_jobs=4)]: Done  64 tasks      | elapsed:   11.9s
[Parallel(n_jobs=4)]: Done  77 tasks      | elapsed:   14.6s
[Parallel(n_jobs=4)]: Done  90 out of  90 | elapsed:   16.1s finished


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=4, min_child_weight=1,
                                     missing=nan, monotone_constraints=No...
                                     objective='binary:logistic',
                                     random_state=None, reg_alpha=None,
                                     reg_lambda=None, scale_pos_weight=None,
                                     seed=666, subsample=None, tree_method=None,
                         

In [20]:
print("best_params: ", grid_search1.best_params_)
print("best_score: ", grid_search1.best_score_)

best_params:  {'max_depth': 13, 'min_child_weight': 1}
best_score:  0.8683381597285968


### 微调
微调这两个参数 max_depth=[7,8,9], min_child_weight=[1,2]后，发现分数没有差异。说明这两个max_depth=9, min_child_weight为最好参数。

In [11]:
param2 = {
    'max_depth': [7,8,9],
    'min_child_weight': [1,2]
}
# jupyter后台可看打印的日志
grid_search2 = GridSearchCV(
    estimator=bst1,
    param_grid=param2,
    scoring='roc_auc',
    n_jobs=4,
    cv=5,
    verbose=10
)

grid_search2.fit(X_train, y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   5 tasks      | elapsed:    3.3s
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:    3.9s
[Parallel(n_jobs=4)]: Done  17 tasks      | elapsed:    5.3s
[Parallel(n_jobs=4)]: Done  27 out of  30 | elapsed:    6.8s remaining:    0.8s
[Parallel(n_jobs=4)]: Done  30 out of  30 | elapsed:    7.2s finished


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=0.8, gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=4, min_child_weight=1,
                                     missing=nan, monotone_constraints=Non...
                                     num_parallel_tree=None,
                                     objective='binary:logistic',
                                     random_state=None, reg_alpha=None,
                                     reg_lambda=None, scale_pos_weight=1,
                                     seed=10, su

In [12]:
print("best_params: ", grid_search1.best_params_)
print("best_score: ", grid_search1.best_score_)

best_params:  {'max_depth': 9, 'min_child_weight': 1}
best_score:  0.8706992836328089


## Gamma参数调优

### 初步设置参数范围

范围：0~1，步长度0.1

In [23]:
gamma_bst = XGBClassifier(
    objective='binary:logistic',    
    n_estimators=100,
    max_depth=9,
    min_child_weight=1,
    nthread=8,
    scale_pos_weight=1,
    subsample=0.8,
    colsample_bytree=0.8,
    seed=666
)

gamma_param_01 = {
    'gamma': [item/10.0 for item in range(0,10)]
}
# jupyter后台可看打印的日志
gamma_grid_search_01 = GridSearchCV(
    estimator=gamma_bst,
    param_grid=gamma_param_01,
    scoring='roc_auc',
    n_jobs=4,
    cv=5,
    verbose=10
)

gamma_grid_search_01.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   5 tasks      | elapsed:    5.7s
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:    6.7s
[Parallel(n_jobs=4)]: Done  17 tasks      | elapsed:    8.3s
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:    9.2s
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:   13.3s
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   15.8s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:   17.8s finished


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=0.8, gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=9, min_child_weight=1,
                                     missing=nan, monotone_constraints=Non...
                                     objective='binary:logistic',
                                     random_state=None, reg_alpha=None,
                                     reg_lambda=None, scale_pos_weight=1,
                                     seed=666, subsample=0.8, tree_method=None,
                             

In [14]:
print("best_params: ", gamma_grid_search_01.best_params_)
print("best_score: ", gamma_grid_search_01.best_score_)

best_params:  {'gamma': 0.0}
best_score:  0.8706992836328089


### 微调

范围:0~0.1 步长：0.01

In [15]:
gamma_param_02 = {
    'gamma': [item/10.0 for item in range(0,10)]
}
# jupyter后台可看打印的日志
gamma_grid_search_02 = GridSearchCV(
    estimator=gamma_bst,
    param_grid=gamma_param_02,
    scoring='roc_auc',
    n_jobs=4,
    cv=5,
    verbose=10
)

gamma_grid_search_02.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   5 tasks      | elapsed:    8.0s
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:   10.7s
[Parallel(n_jobs=4)]: Done  17 tasks      | elapsed:   13.4s
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:   15.1s
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:   18.8s
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   23.8s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:   26.1s finished


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=0.8, gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=9, min_child_weight=1,
                                     missing=nan, monotone_constraints=Non...
                                     num_parallel_tree=None,
                                     objective='binary:logistic',
                                     random_state=None, reg_alpha=None,
                                     reg_lambda=None, scale_pos_weight=1,
                                     seed=10, su

In [16]:
print("best_params: ", gamma_grid_search_02.best_params_)
print("best_score: ", gamma_grid_search_02.best_score_)

best_params:  {'gamma': 0.0}
best_score:  0.8706992836328089


结论：score一样，说明gamma=0是最优参数

### 将当前已调优的参数放入训练模型

In [24]:
gamma_bst = XGBClassifier(
    objective='binary:logistic',    
    n_estimators=100,
    max_depth=9,
    min_child_weight=1,
    nthread=8,
    scale_pos_weight=1,
    subsample=0.8,
    colsample_bytree=0.8,
    seed=666
)

gamma_cv_result = model_cv(gamma_bst, X_train, y_train)

最优轮数:  10
最优论详情
   train-auc-mean  train-auc-std  test-auc-mean  test-auc-std
9        0.994709       0.001281       0.866575      0.021553


In [25]:
model_fit(gamma_bst, X_train, y_train, X_test, y_test, gamma_cv_result)

训练集AUC得分:  0.9916762342135476
测试集AUC得分:  0.8132255636118705


结论：比最开始的分数好。基础(训练集AUC: 0.9778, 测试集AUC:0.8276); 调优后(训练集AUC: 0.9980, 测试集AUC:0.8355)

## subsample和closample_bytree

先粗后细, 首先取 0.6、0.7、0.8、0.9、1.0

In [30]:
sample_bst = XGBClassifier(
    objective='binary:logistic', 
    n_estimators=100,
    max_depth=9,
    min_child_weight=1,
    gamma=0,
    nthread=8,
    scale_pos_weight=1,
    subsample=0.8,
    colsample_bytree=0.8,
    seed=10 
)

sample_param_01 = {
    'subsample': [item/10.0 for item in range(5,10)],
    'colsample_bytree': [item/10.0 for item in range(5,10)],
}
# jupyter后台可看打印的日志
sample_grid_search_01 = GridSearchCV(
    estimator=sample_bst,
    param_grid=sample_param_01,
    scoring='roc_auc',
    n_jobs=4,
    cv=5,
    verbose=10
)

sample_grid_search_01.fit(X_train, y_train)

Fitting 5 folds for each of 25 candidates, totalling 125 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   5 tasks      | elapsed:    5.0s
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:    5.7s
[Parallel(n_jobs=4)]: Done  17 tasks      | elapsed:    7.1s
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:    7.9s
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:    9.9s
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   11.4s
[Parallel(n_jobs=4)]: Done  53 tasks      | elapsed:   13.5s
[Parallel(n_jobs=4)]: Done  64 tasks      | elapsed:   15.1s
[Parallel(n_jobs=4)]: Done  77 tasks      | elapsed:   17.9s
[Parallel(n_jobs=4)]: Done  90 tasks      | elapsed:   20.2s
[Parallel(n_jobs=4)]: Done 105 tasks      | elapsed:   23.3s
[Parallel(n_jobs=4)]: Done 125 out of 125 | elapsed:   27.5s finished


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=0.8, gamma=0, gpu_id=None,
                                     importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=9, min_child_weight=1,
                                     missing=nan, monotone_constraints=None,
                                     n...
                                     objective='binary:logistic',
                                     random_state=None, reg_alpha=None,
                                     reg_lambda=None, scale_pos_weight=1,
                                     seed=10, subsample=0.8, tree_metho

In [31]:
print("best_params: ", sample_grid_search_01.best_params_)
print("best_score: ", sample_grid_search_01.best_score_)

best_params:  {'colsample_bytree': 0.8, 'subsample': 0.7}
best_score:  0.8720401329322487


### 微调

- colsample_bytree 0.7~0.9 步长：0.05
- subsample： 0.6~0.8 步长0.05


In [32]:
sample_param_02 = {
    'subsample': [item/100.0 for item in range(60,80,5)],
    'colsample_bytree': [item/100.0 for item in range(70,90,5)],
}
# jupyter后台可看打印的日志
sample_grid_search_02 = GridSearchCV(
    estimator=sample_bst,
    param_grid=sample_param_02,
    scoring='roc_auc',
    n_jobs=4,
    cv=5,
    verbose=10
)

sample_grid_search_02.fit(X_train, y_train)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   5 tasks      | elapsed:    7.5s
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:    8.6s
[Parallel(n_jobs=4)]: Done  17 tasks      | elapsed:   11.4s
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:   12.8s
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:   17.2s
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   19.7s
[Parallel(n_jobs=4)]: Done  53 tasks      | elapsed:   22.5s
[Parallel(n_jobs=4)]: Done  64 tasks      | elapsed:   24.3s
[Parallel(n_jobs=4)]: Done  80 out of  80 | elapsed:   27.4s finished


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=0.8, gamma=0, gpu_id=None,
                                     importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=9, min_child_weight=1,
                                     missing=nan, monotone_constraints=None,
                                     n...
                                     objective='binary:logistic',
                                     random_state=None, reg_alpha=None,
                                     reg_lambda=None, scale_pos_weight=1,
                                     seed=10, subsample=0.8, tree_metho

In [33]:
print("best_params: ", sample_grid_search_02.best_params_)
print("best_score: ", sample_grid_search_02.best_score_)

best_params:  {'colsample_bytree': 0.75, 'subsample': 0.7}
best_score:  0.8720401329322487


结论：没有微调后基本没有提升

## 正则参数调优

In [34]:
reg_bst = XGBClassifier(
    objective='binary:logistic', 
    n_estimators=100,
    max_depth=9,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.75,    
    nthread=8,
    scale_pos_weight=1,
    seed=10 
)

reg_param_01 = {
    'reg_alpha': [0, 1e-5, 1e-2, 1e-1, 1, 100]
}
# jupyter后台可看打印的日志
reg_grid_search_01 = GridSearchCV(
    estimator=reg_bst,
    param_grid=reg_param_01,
    scoring='roc_auc',
    n_jobs=4,
    cv=5,
    verbose=10
)

reg_grid_search_01.fit(X_train, y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   5 tasks      | elapsed:    4.7s
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:    5.5s
[Parallel(n_jobs=4)]: Done  17 tasks      | elapsed:    6.9s
[Parallel(n_jobs=4)]: Done  27 out of  30 | elapsed:    8.0s remaining:    0.9s
[Parallel(n_jobs=4)]: Done  30 out of  30 | elapsed:    8.3s finished


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=0.75, gamma=0,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=9, min_child_weight=1,
                                     missing=nan, monotone_constraints=None,...
                                     num_parallel_tree=None,
                                     objective='binary:logistic',
                                     random_state=None, reg_alpha=None,
                                     reg_lambda=None, scale_pos_weight=1,
                                     seed=10, su

In [35]:
print("best_params: ", reg_grid_search_01.best_params_)
print("best_score: ", reg_grid_search_01.best_score_)

best_params:  {'reg_alpha': 0}
best_score:  0.8706992836328089


### 微调

In [37]:
reg_param_02 = {
    'reg_alpha': [0, 1e-6, 1e-6, 1e-7, 1e-8]
}
# jupyter后台可看打印的日志
reg_grid_search_02 = GridSearchCV(
    estimator=reg_bst,
    param_grid=reg_param_01,
    scoring='roc_auc',
    n_jobs=4,
    cv=5,
    verbose=10
)

reg_grid_search_02.fit(X_train, y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   5 tasks      | elapsed:    5.5s
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:    6.6s
[Parallel(n_jobs=4)]: Done  17 tasks      | elapsed:    8.8s
[Parallel(n_jobs=4)]: Done  27 out of  30 | elapsed:   10.0s remaining:    1.1s
[Parallel(n_jobs=4)]: Done  30 out of  30 | elapsed:   10.4s finished


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=0.75, gamma=0,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=9, min_child_weight=1,
                                     missing=nan, monotone_constraints=None,...
                                     num_parallel_tree=None,
                                     objective='binary:logistic',
                                     random_state=None, reg_alpha=None,
                                     reg_lambda=None, scale_pos_weight=1,
                                     seed=10, su

In [38]:
print("best_params: ", reg_grid_search_02.best_params_)
print("best_score: ", reg_grid_search_02.best_score_)

best_params:  {'reg_alpha': 0}
best_score:  0.8706992836328089


结论： reg_alpha最优参数为0

## 降低学习率

In [39]:
learn_bst = XGBClassifier(
    objective='binary:logistic', 
    n_estimators=100,
    max_depth=9,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.75,    
    nthread=8,
    scale_pos_weight=1,
    seed=10 
)

learn_param_01 = {
    'learning_rate': [item/10 for item in range(1, 10)]
}
# jupyter后台可看打印的日志
learn_grid_search_01 = GridSearchCV(
    estimator=learn_bst,
    param_grid=learn_param_01,
    scoring='roc_auc',
    n_jobs=4,
    cv=5,
    verbose=10
)

learn_grid_search_01.fit(X_train, y_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   5 tasks      | elapsed:    8.1s
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:   10.0s
[Parallel(n_jobs=4)]: Done  17 tasks      | elapsed:   12.5s
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:   13.9s
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:   16.6s
[Parallel(n_jobs=4)]: Done  43 out of  45 | elapsed:   21.4s remaining:    1.0s
[Parallel(n_jobs=4)]: Done  45 out of  45 | elapsed:   22.1s finished


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=0.75, gamma=0,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=9, min_child_weight=1,
                                     missing=nan, monotone_constraints=None,...
                                     objective='binary:logistic',
                                     random_state=None, reg_alpha=None,
                                     reg_lambda=None, scale_pos_weight=1,
                                     seed=10, subsample=0.8, tree_method=None,
                              

In [40]:
print("best_params: ", learn_grid_search_01.best_params_)
print("best_score: ", learn_grid_search_01.best_score_)

best_params:  {'learning_rate': 0.3}
best_score:  0.8706992836328089


### 微调

0.2~0.4 步长0.01

In [42]:
learn_param_02 = {
    'learning_rate': [item/100 for item in range(20, 40, 1)]
}
# jupyter后台可看打印的日志
learn_grid_search_02 = GridSearchCV(
    estimator=learn_bst,
    param_grid=learn_param_01,
    scoring='roc_auc',
    n_jobs=4,
    cv=5,
    verbose=10
)

learn_grid_search_02.fit(X_train, y_train)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   5 tasks      | elapsed:    4.9s
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:    6.0s
[Parallel(n_jobs=4)]: Done  17 tasks      | elapsed:    7.9s
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:    9.0s
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:   11.5s
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   13.1s
[Parallel(n_jobs=4)]: Done  53 tasks      | elapsed:   15.4s
[Parallel(n_jobs=4)]: Done  64 tasks      | elapsed:   17.1s
[Parallel(n_jobs=4)]: Done  77 tasks      | elapsed:   20.0s
[Parallel(n_jobs=4)]: Done  90 tasks      | elapsed:   22.4s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:   26.2s finished


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=0.75, gamma=0,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=9, min_child_weight=1,
                                     missing=nan, monotone_constraints=None,...
                                     reg_lambda=None, scale_pos_weight=1,
                                     seed=10, subsample=0.8, tree_method=None,
                                     validate_parameters=False,
                                     verbosity=None),
             iid='warn', n_jobs=4,
             pa

In [43]:
print("best_params: ", learn_grid_search_02.best_params_)
print("best_score: ", learn_grid_search_02.best_score_)

best_params:  {'learning_rate': 0.25}
best_score:  0.8707096134898394


结论: 最好参数 learning_rate=0.25

### 将当前已调优的参数放入训练模型

In [10]:
gamma_bst = XGBClassifier(
    objective='binary:logistic', 
    n_estimators=100,
    max_depth=9,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.75,    
    learning_rate=0.25,
    nthread=8,
    scale_pos_weight=1,
    seed=666
)

gamma_cv_result = model_cv(gamma_bst, X_train, y_train)
model_fit(gamma_bst, X_train, y_train, X_test, y_test, gamma_cv_result)

最优轮数:  10
最优论详情
   train-auc-mean  train-auc-std  test-auc-mean  test-auc-std
9        0.992925       0.001101       0.857606       0.02199
训练集AUC得分:  0.9908004356913658
测试集AUC得分:  0.8192083680019018


调参完毕，提升太少了。看来重头还是在模型融合和特征工程上。