In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
import matplotlib.pylab as plt
from xgboost.sklearn import XGBClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn import metrics

%matplotlib inline

## 数据加载

In [2]:
# 数据加载
df = pd.read_csv('https://query.data.world/s/4ee2mcqmzj55nta6nhj7nu7mmyifob', sep=';')
df['quality'] = df['quality'].apply(lambda item: 0 if item<6 else 1)

# 切分训练与测试数据集
X_train, X_test, y_train, y_test = train_test_split(df.iloc[:,0:11], df['quality'], test_size=0.2, random_state=666)

In [3]:
features = ['fixed acidity','volatile acidity','citric acid','residual sugar','chlorides','free sulfur dioxide',
            'total sulfur dioxide','density','pH','sulphates','alcohol']
label = 'quality'

## 贝叶斯调参

In [4]:
def xgb_optimize(learning_rate, n_estimators, min_child_weight, colsample_bytree, max_depth, subsample, gamma, alpha):
    dtrain = xgb.DMatrix(df[features].values, df[label].values)
    params = {}
    params['learning_rate'] = float(learning_rate)
    params['min_child_weight'] = int(min_child_weight)
    params['colsample_bytree'] = max(min(colsample_bytree, 1), 0)
    params['max_depth'] = int(max_depth)
    params['subsample'] = max(min(subsample, 1), 0)
    params['gamma'] = max(gamma, 0)
    params['alpha'] = max(alpha, 0)   
    params['objective'] = 'binary:logistic'

    cv_result = xgb.cv(params, dtrain, num_boost_round=int(n_estimators), nfold=5, seed=666, metrics=['auc'])
    return cv_result['test-auc-mean'].iloc[-1]

In [None]:
from bayes_opt import BayesianOptimization

xgb_opt = BayesianOptimization(xgb_optimize, {
    'learning_rate': (0.05, 0.5),
    'n_estimators': (50, 500),    
    'min_child_weight': (1, 10),        
    'colsample_bytree': (0.5, 1),            
    'max_depth': (4, 10),                
    'subsample': (0.5, 1),                    
    'gamma': (0, 10),                        
    'alpha': (0, 10)
})
dtrain = xgb.DMatrix(X_train.values, y_train.values)
xgb_opt.maximize(init_points=5, n_iter=30)

|   iter    |  target   |   alpha   | colsam... |   gamma   | learni... | max_depth | min_ch... | n_esti... | subsample |
-------------------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m 0.8204  [0m | [0m 8.304   [0m | [0m 0.995   [0m | [0m 8.982   [0m | [0m 0.279   [0m | [0m 9.932   [0m | [0m 3.552   [0m | [0m 441.2   [0m | [0m 0.7249  [0m |
| [95m 2       [0m | [95m 0.8629  [0m | [95m 0.9033  [0m | [95m 0.9047  [0m | [95m 2.632   [0m | [95m 0.08732 [0m | [95m 9.096   [0m | [95m 1.149   [0m | [95m 494.3   [0m | [95m 0.7033  [0m |
| [0m 3       [0m | [0m 0.8502  [0m | [0m 5.219   [0m | [0m 0.6283  [0m | [0m 1.895   [0m | [0m 0.343   [0m | [0m 8.636   [0m | [0m 3.109   [0m | [0m 369.6   [0m | [0m 0.8599  [0m |


In [None]:
xgb_opt.max

## 训练模型

In [None]:
def model_fit_for_bayesian(bst, X_train, X_test, y_train, y_test):    
    # 训练
    bst.fit(X_train, y_train, eval_metric=['auc'])

    # 评估训练集
    train_predict = bst.predict(X_train)
    train_auc = metrics.roc_auc_score(y_train, train_predict)
    print('train AUC: ', train_auc)
    
    # 评估测试集
    test_predict = bst.predict(X_test)
    test_auc = metrics.roc_auc_score(y_test, test_predict)
    print('test AUC: ', test_auc)

In [None]:
bst = XGBClassifier(
    learning_rate=0.1488,
    n_estimators=1000,
    max_depth=9,
    min_child_weight=1.1505,
    objective='binary:logistic',    
    subsample=0.8915,
    colsample_bytree=0.5033,
    alpha=0.2085,
    gamma=0.6196, 
    nthread=8,
    scale_pos_weight=1,
    seed=666
)
model_fit_for_bayesian(bst, X_train, X_test, y_train, y_test)