## Xgboost调参步骤
参考：https://zhuanlan.zhihu.com/p/29649128  
使用泰坦尼克号数据进行训练

In [137]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import xgboost as xgb
from xgboost import XGBClassifier,XGBRegressor
from sklearn.metrics import accuracy_score,mean_squared_error, roc_auc_score
from sklearn.model_selection import train_test_split, cross_val_score,GridSearchCV,learning_curve
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

### 数据导入和预处理

In [138]:
df = pd.read_csv('../data/taitanic_data/data.csv')
df.info()
#数据预处理
df.drop(['PassengerId','Name','Ticket', 'Fare', 'Cabin'], axis=1, inplace=True)
df['Age'] = SimpleImputer(strategy='mean').fit_transform(df[['Age']]) #均值填充age
df.dropna(axis=0, inplace=True)     #删除有缺失值的行
df.reset_index(drop=True, inplace=True)
df

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Embarked
0,0,3,male,22.000000,1,0,S
1,1,1,female,38.000000,1,0,C
2,1,3,female,26.000000,0,0,S
3,1,1,female,35.000000,1,0,S
4,0,3,male,35.000000,0,0,S
...,...,...,...,...,...,...,...
884,0,2,male,27.000000,0,0,S
885,1,1,female,19.000000,0,0,S
886,0,3,female,29.699118,1,2,S
887,1,1,male,26.000000,0,0,C


In [139]:
# 分割出标签
y_data = df['Survived']
x_data = df.drop(['Survived'], axis=1)
# 编码
enc = OneHotEncoder(dtype=int)
result = enc.fit_transform( x_data[['Sex','Embarked']]).toarray()
print(enc.categories_)
print(enc.feature_names_in_) #参与one-hot编码的特征名字
code_df = pd.DataFrame(result)
print(code_df)
column_names = enc.get_feature_names_out()
column_names

[array(['female', 'male'], dtype=object), array(['C', 'Q', 'S'], dtype=object)]
['Sex' 'Embarked']
     0  1  2  3  4
0    0  1  0  0  1
1    1  0  1  0  0
2    1  0  0  0  1
3    1  0  0  0  1
4    0  1  0  0  1
..  .. .. .. .. ..
884  0  1  0  0  1
885  1  0  0  0  1
886  1  0  0  0  1
887  0  1  1  0  0
888  0  1  0  1  0

[889 rows x 5 columns]


array(['Sex_female', 'Sex_male', 'Embarked_C', 'Embarked_Q', 'Embarked_S'],
      dtype=object)

In [140]:
# 拼接one-hot编码
x_data = pd.concat([x_data, code_df], axis=1)
x_data.drop(columns=enc.feature_names_in_,inplace=True)
keys = range(column_names.shape[0])
x_data.rename(columns=dict(zip(keys, column_names)), inplace=True)
x_train,x_test,y_train,y_test = train_test_split(x_data,y_data,test_size=0.3)
x_train

Unnamed: 0,Pclass,Age,SibSp,Parch,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
670,1,31.000000,1,0,0,1,0,0,1
125,3,29.699118,0,0,0,1,0,1,0
570,1,53.000000,2,0,1,0,0,0,1
811,2,35.000000,0,0,0,1,0,0,1
253,3,41.000000,0,2,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...
266,3,25.000000,1,0,0,1,0,0,1
264,2,36.000000,0,0,0,1,0,0,1
514,1,47.000000,0,0,0,1,0,0,1
510,3,29.699118,0,0,0,1,0,0,1


### Xgboost调参

In [141]:
# 初始参数
params_best = {
    'n_estimators':120,
    'max_depth':6,
    'eta':0.1,
    'gamma':1,
    'reg_lambda':3,
    'reg_alpha':0,
    'eval_metric':'auc',
    'seed':1024
}

### 步骤一：调n_estimators和eta

In [142]:
# early_stopping_rounds : 
#       cv过程中验证集出现early_stopping_rounds次数的score decrease后将提前停止迭代（一般设置为迭代次数的10%）
def modelfit(alg, x_train,y_train, useTrainCV=True, cv_folds=5, early_stopping_rounds=30):

    if useTrainCV:
        # train
        xgb_param = alg.get_xgb_params()
        dtrain = xgb.DMatrix(x_train, y_train)
        cvresult = xgb.cv(xgb_param, dtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
             early_stopping_rounds=early_stopping_rounds)
        alg.set_params(n_estimators=cvresult.shape[0])
        alg.fit(x_train, y_train)

    # pred
    dtrain_predictions = alg.predict(x_train)
    dtrain_predprob = alg.predict_proba(x_train)[:,1]


    # eval
    print("当前树数量：%d" % alg.n_estimators)
    print ("关于现在这个模型(在训练集上表现)：")
    print ("准确率 : %.4g" % accuracy_score(y_train, dtrain_predictions))
    print ("AUC 得分 : %f" % roc_auc_score(y_train, dtrain_predprob))

def plot_feat_importance(alg, columns):
    feat_imp = pd.Series(alg.feature_importances_,index=list(columns)).sort_values(ascending=False)
    fig, ax = plt.subplots(figsize=(12,5))
    feat_imp.plot(kind='bar', ax=ax, title='Feature Importances')
    ax.set_ylabel('Feature Importance Score')
    

In [143]:

def search_best_xgbc(init_params,x_train, y_train):
    best_xgbc = None
    best_score = 0
    for i in np.arange(0.03,0.3,0.03):
        print(f"\n---eta={i}---")
        params_best['eta'] = i
        xgbc_1 = XGBClassifier(**init_params)
        modelfit(xgbc_1, x_train, y_train)
        score = xgbc_1.score(x_test,y_test)
        if(best_score < score):
            best_score = score
            best_xgbc = xgbc_1
        print(f"----测试集score:{score}---")
    return best_xgbc

best_xgbc = search_best_xgbc(params_best, x_data, y_data)
params_best['n_estimators'] = best_xgbc.n_estimators
params_best['eta'] = best_xgbc.get_params()['eta']
params_best


---eta=0.03---
当前树数量：67
关于现在这个模型(在训练集上表现)：
准确率 : 0.8549
AUC 得分 : 0.897126
----测试集score:0.8689138576779026---

---eta=0.06---
当前树数量：35
关于现在这个模型(在训练集上表现)：
准确率 : 0.8538
AUC 得分 : 0.897940
----测试集score:0.8689138576779026---

---eta=0.09---
当前树数量：24
关于现在这个模型(在训练集上表现)：
准确率 : 0.8493
AUC 得分 : 0.899973
----测试集score:0.8614232209737828---

---eta=0.12---
当前树数量：17
关于现在这个模型(在训练集上表现)：
准确率 : 0.8594
AUC 得分 : 0.897710
----测试集score:0.8651685393258427---

---eta=0.15---
当前树数量：14
关于现在这个模型(在训练集上表现)：
准确率 : 0.8526
AUC 得分 : 0.898154
----测试集score:0.8614232209737828---

---eta=0.18---
当前树数量：13
关于现在这个模型(在训练集上表现)：
准确率 : 0.8515
AUC 得分 : 0.903498
----测试集score:0.8651685393258427---

---eta=0.21---
当前树数量：10
关于现在这个模型(在训练集上表现)：
准确率 : 0.847
AUC 得分 : 0.898195
----测试集score:0.8614232209737828---

---eta=0.24---
当前树数量：9
关于现在这个模型(在训练集上表现)：
准确率 : 0.8504
AUC 得分 : 0.898473
----测试集score:0.8614232209737828---

---eta=0.27---
当前树数量：9
关于现在这个模型(在训练集上表现)：
准确率 : 0.8549
AUC 得分 : 0.904902
----测试集score:0.8614232209737828---

---eta=0.300

{'n_estimators': 67,
 'max_depth': 6,
 'eta': 0.03,
 'gamma': 1,
 'reg_lambda': 3,
 'reg_alpha': 0,
 'eval_metric': 'auc',
 'seed': 1024}

### 步骤二：max_depth 和 min_weight 参数调优

In [152]:
#先粗略估计max_depth
param_grid = {
    'max_depth':range(3,10),
}
gs = GridSearchCV(estimator = XGBClassifier(**params_best), 
                       param_grid = param_grid, scoring='roc_auc',cv=5)
gs.fit(x_data, y_data)
gs.best_params_, gs.best_score_

({'max_depth': 7}, 0.8694553058921652)

In [146]:
#先粗略估计
param_grid = {
   'min_child_weight':[1,2,3]
}
gs = GridSearchCV(estimator = XGBClassifier(**params_best), 
                       param_grid = param_grid, scoring='roc_auc',cv=5)
gs.fit(x_data, y_data)
gs.best_params_, gs.best_score_

({'min_child_weight': 1}, 0.8671699455428543)

In [151]:
# min_child_weight 对数据量较大的情况有用
param_grid = {
    'max_depth':[3,4,5,6,7],
    'min_child_weight':[0,1,2,3]
}
gs = GridSearchCV(estimator = XGBClassifier(**params_best), 
                       param_grid=param_grid, scoring='roc_auc',cv=5)
gs.fit(x_train, y_train)

print(gs.best_params_, gs.best_score_)


{'max_depth': 5, 'min_child_weight': 2} 0.8571817174499875


In [153]:
params_best['max_depth'] = gs.best_params_['max_depth']
# params_best['min_child_weight'] = gs.best_params_['min_child_weight'] min_child_weight不调
params_best

{'n_estimators': 67,
 'max_depth': 7,
 'eta': 0.03,
 'gamma': 1,
 'reg_lambda': 3,
 'reg_alpha': 0,
 'eval_metric': 'auc',
 'seed': 1024}

### 步骤三：gamma参数调优

In [154]:
param_grid = {
   'gamma':[*np.arange(0,1,0.1)]
}
gs = GridSearchCV(estimator = XGBClassifier(**params_best), 
                       param_grid = param_grid, scoring='roc_auc',cv=5)
gs.fit(x_data, y_data)
gs.best_params_, gs.best_score_

({'gamma': 0.9}, 0.8658973163911101)

In [156]:
param_grid = {
   'gamma':[*np.arange(0,0.3,0.02)]
}
gs = GridSearchCV(estimator = XGBClassifier(**params_best), 
                       param_grid = param_grid, scoring='roc_auc',cv=5)
gs.fit(x_data, y_data)
gs.best_params_, gs.best_score_

({'gamma': 0.24}, 0.8642948780846783)

In [120]:
params_best['gamma'] = gs.best_params_['gamma']
params_best

{'n_estimators': 22,
 'max_depth': 4,
 'eta': 0.03,
 'gamma': 0.26,
 'reg_lambda': 3,
 'reg_alpha': 0,
 'eval_metric': 'auc',
 'seed': 1024,
 'min_child_weight': 3}

In [158]:
# 看一下测试集分数
xgbc = XGBClassifier(**params_best)
xgbc.fit(x_train,y_train)
xgbc.score(x_test,y_test)

0.8164794007490637

### 步骤四：调subsample 和colsample_bytree

In [129]:
param_grid = {
   'subsample':[*np.arange(0.8,1,0.01)],
}
gs = GridSearchCV(estimator = XGBClassifier(**params_best), 
                       param_grid = param_grid, scoring='roc_auc',cv=5)
gs.fit(x_data, y_data)
gs.best_params_, gs.best_score_

({'subsample': 0.8400000000000001}, 0.860084629347986)

In [131]:
param_grid = {
   'colsample_bytree':[*np.arange(0.8,1.0001,0.05)],
}
gs = GridSearchCV(estimator = XGBClassifier(**params_best), 
                       param_grid = param_grid, scoring='roc_auc',cv=5)
gs.fit(x_data, y_data)
gs.best_params_, gs.best_score_

({'colsample_bytree': 1.0000000000000002}, 0.8556174262866113)

In [159]:
param_grid = {
    'subsample':[*np.arange(0.8,0.9,0.01)],
   'colsample_bytree':[*np.arange(0.9,1.0001,0.05)],
}
gs = GridSearchCV(estimator = XGBClassifier(**params_best), 
                       param_grid = param_grid, scoring='roc_auc',cv=5)
gs.fit(x_data, y_data)
gs.best_params_, gs.best_score_

({'colsample_bytree': 1.0, 'subsample': 0.8}, 0.87101469361723)

In [160]:
# colsample_bytree默认值最好，可以不调（因为本身我们之前就去除了不重要特征）
params_best['subsample'] = gs.best_params_['subsample']
params_best

{'n_estimators': 67,
 'max_depth': 7,
 'eta': 0.03,
 'gamma': 1,
 'reg_lambda': 3,
 'reg_alpha': 0,
 'eval_metric': 'auc',
 'seed': 1024,
 'subsample': 0.8}

In [161]:
# 看一下测试集分数
xgbc = XGBClassifier(**params_best)
xgbc.fit(x_train,y_train)
xgbc.score(x_test,y_test)

0.8239700374531835

### 步骤五：调正则项
gamma本身可以很好防止过拟合，正则项重要性不大

In [167]:
param_grid = {
   'reg_lambda':[*np.arange(0,4,0.2)]
}
gs = GridSearchCV(estimator = XGBClassifier(**params_best), 
                       param_grid = param_grid, scoring='roc_auc',cv=5)
gs.fit(x_data, y_data)
gs.best_params_, gs.best_score_

({'reg_lambda': 1.8}, 0.8722642643379286)

In [168]:
params_best['reg_lambda'] = gs.best_params_['reg_lambda']
params_best

{'n_estimators': 67,
 'max_depth': 7,
 'eta': 0.03,
 'gamma': 1,
 'reg_lambda': 1.8,
 'reg_alpha': 0,
 'eval_metric': 'auc',
 'seed': 1024,
 'subsample': 0.8}

In [174]:
# 看一下测试集分数
params_best['reg_lambda'] = 3 # 下降了，所以不调lamda
xgbc = XGBClassifier(**params_best)
xgbc.fit(x_train,y_train)
xgbc.score(x_test,y_test)

0.8239700374531835

### 最后再回去调一下 eta和n_estimators

In [175]:

best_xgbc = search_best_xgbc(params_best, x_data, y_data)
params_best['n_estimators'] = best_xgbc.n_estimators
params_best['eta'] = best_xgbc.get_params()['eta']
params_best


---eta=0.03---
当前树数量：67
关于现在这个模型(在训练集上表现)：
准确率 : 0.8583
AUC 得分 : 0.903605
----测试集score:0.8651685393258427---

---eta=0.06---
当前树数量：67
关于现在这个模型(在训练集上表现)：
准确率 : 0.8616
AUC 得分 : 0.914765
----测试集score:0.8614232209737828---

---eta=0.09---
当前树数量：67
关于现在这个模型(在训练集上表现)：
准确率 : 0.8616
AUC 得分 : 0.918445
----测试集score:0.8614232209737828---

---eta=0.12---
当前树数量：33
关于现在这个模型(在训练集上表现)：
准确率 : 0.8594
AUC 得分 : 0.913519
----测试集score:0.8614232209737828---

---eta=0.15---
当前树数量：10
关于现在这个模型(在训练集上表现)：
准确率 : 0.856
AUC 得分 : 0.885160
----测试集score:0.8576779026217228---

---eta=0.18---
当前树数量：9
关于现在这个模型(在训练集上表现)：
准确率 : 0.8583
AUC 得分 : 0.886639
----测试集score:0.850187265917603---

---eta=0.21---
当前树数量：10
关于现在这个模型(在训练集上表现)：
准确率 : 0.8571
AUC 得分 : 0.900881
----测试集score:0.8539325842696629---

---eta=0.24---
当前树数量：10
关于现在这个模型(在训练集上表现)：
准确率 : 0.8583
AUC 得分 : 0.904165
----测试集score:0.8576779026217228---

---eta=0.27---
当前树数量：10
关于现在这个模型(在训练集上表现)：
准确率 : 0.8583
AUC 得分 : 0.905631
----测试集score:0.8576779026217228---

---eta=0.300

{'n_estimators': 67,
 'max_depth': 7,
 'eta': 0.03,
 'gamma': 1,
 'reg_lambda': 3,
 'reg_alpha': 0,
 'eval_metric': 'auc',
 'seed': 1024,
 'subsample': 0.8}

In [176]:
# 看一下测试集分数
xgbc = XGBClassifier(**params_best)
xgbc.fit(x_train,y_train)
xgbc.score(x_test,y_test)

0.8239700374531835

In [181]:
# 改一下seed
params_best['seed'] = 100
xgbc = XGBClassifier(**params_best)
xgbc.fit(x_train,y_train)
xgbc.score(x_test,y_test)

0.8127340823970037