In [2]:
import pandas as pd
import numpy as np
import xgboost as xgb
import matplotlib.pylab as plt
%matplotlib inline

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

In [10]:
from sklearn.datasets import make_classification
# X holds the sample features and y the class labels: 10000 samples with
# 20 features each, 2 output classes, no redundant features, and one cluster
# (dense sub-region of the data) per class. flip_y=0.1 is passed to
# make_classification to inject label noise.
# random_state pins the generated data so that a fresh Restart & Run All
# reproduces the same dataset and therefore the same metrics downstream.
X, y = make_classification(n_samples=10000, n_features=20, n_redundant=0,
                             n_clusters_per_class=1, n_classes=2, flip_y=0.1,
                             random_state=1)

In [12]:
# Split into train and test partitions; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
# Report the shape of each partition, one per line.
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape, sep="\n")

(7500, 20)
(7500,)
(2500, 20)
(2500,)


XGBoost 使用原生API

In [34]:
# Wrap both partitions in DMatrix objects, which is what xgb.train consumes.
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Booster settings for the native training API.
param = dict(
    max_depth=5,                  # maximum depth of each tree
    learning_rate=0.5,            # learning rate
    verbosity=1,                  # logging verbosity
    objective='binary:logistic',  # binary classification task
)

# Train the model for 20 boosting rounds.
raw_model = xgb.train(param, dtrain, num_boost_round=20)

In [22]:
from sklearn.metrics import accuracy_score

# Predict on the training set, then convert the scores to binary labels by
# thresholding at 0.5. The comparison is vectorized instead of looping over
# every element in Python, and the result is cast to float32 so it has the
# same dtype as the array returned by dtrain.get_label().
pred_train_raw = (raw_model.predict(dtrain) > 0.5).astype(np.float32)
print(accuracy_score(dtrain.get_label(), pred_train_raw))

0.9477333333333333


In [28]:
# Display the thresholded training predictions next to the true training labels.
(pred_train_raw, dtrain.get_label())

(array([1., 0., 1., ..., 1., 1., 1.], dtype=float32),
 array([1., 0., 1., ..., 1., 1., 1.], dtype=float32))

In [30]:
# Predict on the held-out set and threshold the scores at 0.5 to obtain
# binary labels. Vectorized rather than an element-by-element Python loop;
# cast to float32 to match the dtype of dtest.get_label().
pred_test_raw = (raw_model.predict(dtest) > 0.5).astype(np.float32)
print(accuracy_score(dtest.get_label(), pred_test_raw))

0.918


XGBoost 使用sklearn wrapper，仍然使用原始API的参数

In [71]:
# Parameter dict that uses the key 'eta' for the step size; it is unpacked
# into XGBClassifier in the following cell.
param = dict(
    max_depth=5,
    eta=0.5,
    verbosity=1,
    objective='binary:logistic',
)

In [73]:
# Build the sklearn-style classifier from the parameter dict, then fit it
# while passing the held-out split as the evaluation set.
sklearn_model_raw = xgb.XGBClassifier(**param)
sklearn_model_raw.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
)

[0]	validation_0-logloss:0.42984
[1]	validation_0-logloss:0.33743
[2]	validation_0-logloss:0.29842
[3]	validation_0-logloss:0.28212
[4]	validation_0-logloss:0.27493
[5]	validation_0-logloss:0.27340
[6]	validation_0-logloss:0.27483
[7]	validation_0-logloss:0.27640
[8]	validation_0-logloss:0.27749
[9]	validation_0-logloss:0.27857
[10]	validation_0-logloss:0.27899
[11]	validation_0-logloss:0.28154
[12]	validation_0-logloss:0.28455
[13]	validation_0-logloss:0.28627
[14]	validation_0-logloss:0.28648
[15]	validation_0-logloss:0.28689
[16]	validation_0-logloss:0.28847
[17]	validation_0-logloss:0.29030
[18]	validation_0-logloss:0.29166
[19]	validation_0-logloss:0.29283
[20]	validation_0-logloss:0.29206
[21]	validation_0-logloss:0.29187
[22]	validation_0-logloss:0.29312
[23]	validation_0-logloss:0.29403
[24]	validation_0-logloss:0.29647
[25]	validation_0-logloss:0.29639
[26]	validation_0-logloss:0.29837
[27]	validation_0-logloss:0.29886
[28]	validation_0-logloss:0.29912
[29]	validation_0-loglos

XGBoost 使用sklearn wrapper，使用sklearn风格的参数(推荐)

In [49]:
# Classifier configured with sklearn-style keyword arguments, including
# early stopping (10 rounds) monitored with the "error" metric.
sklearn_model_new = xgb.XGBClassifier(
    max_depth=5,
    learning_rate=0.5,
    verbosity=1,
    early_stopping_rounds=10,
    eval_metric="error",
    objective='binary:logistic',
    random_state=1,
)

In [51]:
# Fit with the held-out split as the evaluation set that early stopping monitors.
sklearn_model_new.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
)

[0]	validation_0-error:0.09000
[1]	validation_0-error:0.08680
[2]	validation_0-error:0.08560
[3]	validation_0-error:0.08480
[4]	validation_0-error:0.08320
[5]	validation_0-error:0.08320
[6]	validation_0-error:0.08160
[7]	validation_0-error:0.08120
[8]	validation_0-error:0.08160
[9]	validation_0-error:0.08080
[10]	validation_0-error:0.07960
[11]	validation_0-error:0.08160
[12]	validation_0-error:0.08120
[13]	validation_0-error:0.08160
[14]	validation_0-error:0.08000
[15]	validation_0-error:0.08000
[16]	validation_0-error:0.08120
[17]	validation_0-error:0.08200
[18]	validation_0-error:0.08160
[19]	validation_0-error:0.08200


使用sklearn网格搜索调参
一般先固定步长，调好框架参数 n_estimators；再调弱学习器参数 max_depth、min_child_weight、gamma 等；接着调正则化相关参数 subsample、colsample_by*（colsample_bytree / colsample_bylevel / colsample_bynode）、reg_alpha 以及 reg_lambda；最后固定前面调好的参数，再来调步长 learning_rate。

In [35]:
# Grid-search max_depth and n_estimators with cross-validation.
# The search runs on a dedicated estimator that carries the same fixed
# settings as sklearn_model_new but WITHOUT early_stopping_rounds:
# GridSearchCV calls fit(X, y) with no eval_set, and XGBoost refuses to
# train with early stopping enabled when no validation data is supplied,
# so reusing sklearn_model_new here makes every CV fit fail on a fresh run.
grid_base_model = xgb.XGBClassifier(max_depth=5, learning_rate=0.5, verbosity=1,
                                    objective='binary:logistic', random_state=1)
gsCv = GridSearchCV(grid_base_model,
                    {'max_depth': [4, 5, 6],
                     'n_estimators': [5, 10, 20]})
gsCv.fit(X_train, y_train)

In [36]:
# Best cross-validated score and the parameter combination that achieved it.
print(gsCv.best_score_, gsCv.best_params_, sep="\n")

0.9305333333333333
{'max_depth': 4, 'n_estimators': 10}


In [75]:
# Hold max_depth and n_estimators at the values chosen by the previous
# search, then grid-search the learning rate on its own.
sklearn_model_new2 = xgb.XGBClassifier(
    max_depth=4,
    n_estimators=10,
    verbosity=1,
    objective='binary:logistic',
    random_state=1,
)
gsCv2 = GridSearchCV(
    sklearn_model_new2,
    {'learning_rate': [0.3, 0.5, 0.7]},
)
gsCv2.fit(X_train, y_train)

In [76]:
# Best cross-validated score and the selected learning rate.
print(gsCv2.best_score_, gsCv2.best_params_, sep="\n")

0.9309333333333335
{'learning_rate': 0.5}


In [99]:
# Rebuild the classifier with the tuned values (max_depth=4, n_estimators=10,
# learning_rate=0.5) plus early stopping, and fit it with the held-out split
# as the evaluation set.
sklearn_model_new2 = xgb.XGBClassifier(
    max_depth=4,
    learning_rate=0.5,
    early_stopping_rounds=10,
    eval_metric="error",
    verbosity=1,
    objective='binary:logistic',
    n_estimators=10,
)
sklearn_model_new2.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
)

[0]	validation_0-error:0.09640
[1]	validation_0-error:0.08960
[2]	validation_0-error:0.08800
[3]	validation_0-error:0.08800
[4]	validation_0-error:0.08760
[5]	validation_0-error:0.08200
[6]	validation_0-error:0.08200
[7]	validation_0-error:0.08080
[8]	validation_0-error:0.08080
[9]	validation_0-error:0.08080


In [101]:
# Score the tuned sklearn-wrapper model on the held-out split.
# The labels come straight from y_test rather than from the native-API
# DMatrix (dtest), so this cell does not depend on the native-API section.
pred_test_new = sklearn_model_new2.predict(X_test)
print(accuracy_score(y_test, pred_test_new))

0.9192
