In [7]:
# 导入工具库
import numpy as np
import scipy.sparse
import pickle
import xgboost as xgb

In [8]:
# 皮马印第安人糖尿病数据集 包含很多字段：怀孕次数 口服葡萄糖耐量试验中血浆葡萄糖浓度 舒张压(mm Hg) 三头肌组织褶厚度(mm) 
# 2小时血清胰岛素(μU/ ml) 体重指数(kg/(身高(m)^2) 糖尿病系统功能 年龄(岁)
import pandas as pd
data = pd.read_csv('diabetes.csv')
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [9]:
from sklearn.model_selection import train_test_split
# 做数据切分
train, test = train_test_split(data, random_state=42) # 魔法数字42
# 转换成Dmatrix格式
feature_columns = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']
target_column = 'Outcome'
# 取出Dataframe的numpy数组值去初始化DMatrix对象
xgtrain = xgb.DMatrix(train[feature_columns].values, train[target_column].values)
xgtest = xgb.DMatrix(test[feature_columns].values, test[target_column].values)
#参数设定: 树最大深度、学习率、输出信息的数量、使用数据子集的比例、使用特征子集的比例、要优化的目标函数
param = {'max_depth':5, 'eta':0.1, 'silent':1, 'subsample':0.7, 'colsample_bytree':0.7, 'objective':'binary:logistic' }
# 设定watchlist用于查看模型状态
watchlist  = [(xgtest,'eval'), (xgtrain,'train')]
num_round = 10
bst = xgb.train(param, xgtrain, num_round, watchlist)
# 使用模型预测
preds = bst.predict(xgtest)
# 判断准确率
labels = xgtest.get_label()
print('错误类为%f' %  (sum(1 for i in range(len(preds)) if int(preds[i]>0.5)!=labels[i]) /float(len(preds))))
# 模型存储
bst.save_model('0002.model')

[0]	eval-logloss:0.63832	train-logloss:0.61628
[1]	eval-logloss:0.61803	train-logloss:0.58507
[2]	eval-logloss:0.60041	train-logloss:0.55552
[3]	eval-logloss:0.58634	train-logloss:0.53131
[4]	eval-logloss:0.57893	train-logloss:0.51387
[5]	eval-logloss:0.56520	train-logloss:0.49238
[6]	eval-logloss:0.55668	train-logloss:0.47898
[7]	eval-logloss:0.54832	train-logloss:0.46092
[8]	eval-logloss:0.54171	train-logloss:0.44394
[9]	eval-logloss:0.53541	train-logloss:0.42887
错误类为0.239583


Parameters: { "silent" } are not used.



In [12]:
xgb.cv(param, xgtrain, num_round, nfold=5,metrics={'auc'}, seed = 0)

Parameters: { "silent" } are not used.



Unnamed: 0,train-auc-mean,train-auc-std,test-auc-mean,test-auc-std
0,0.825389,0.014544,0.702348,0.09257
1,0.9025,0.011259,0.787829,0.059166
2,0.91791,0.009373,0.804127,0.063083
3,0.928497,0.007524,0.816456,0.057629
4,0.936885,0.008101,0.814271,0.053343
5,0.93921,0.00769,0.824942,0.054605
6,0.944009,0.009373,0.818327,0.055326
7,0.946059,0.007744,0.824808,0.055222
8,0.947799,0.00739,0.826891,0.057141
9,0.948271,0.006772,0.830007,0.058178


In [11]:
# 计算正负样本比，调整样本权重
def fpreproc(dtrain, dtest, param):
    label = dtrain.get_label()
    ratio = float(np.sum(label == 0)) / np.sum(label==1)
    param['scale_pos_weight'] = ratio
    return (dtrain, dtest, param)

# 先做预处理，计算样本权重，再做交叉验证
xgb.cv(param, xgtrain, num_round, nfold=5,
       metrics={'auc'}, seed = 0, fpreproc = fpreproc)

Parameters: { "silent" } are not used.



Unnamed: 0,train-auc-mean,train-auc-std,test-auc-mean,test-auc-std
0,0.827466,0.014757,0.697755,0.072548
1,0.908491,0.009455,0.782986,0.054298
2,0.923983,0.007927,0.81641,0.055886
3,0.935427,0.00828,0.8275,0.047565
4,0.943227,0.007474,0.821902,0.045688
5,0.946287,0.007249,0.833035,0.049085
6,0.951845,0.009267,0.822924,0.054825
7,0.952665,0.008423,0.825252,0.056222
8,0.954767,0.008244,0.827869,0.057538
9,0.95571,0.007549,0.828441,0.057659


In [None]:
print('使用自定义损失函数进行交叉验证')
# 自定义损失函数，需要提供损失函数的一阶导和二阶导
def logregobj(preds, dtrain):
    labels = dtrain.get_label()
    preds = 1.0 / (1.0 + np.exp(-preds))
    grad = preds - labels
    hess = preds * (1.0-preds)
    return grad, hess

# 自定义评估准则，评估预估值和标准答案之间的差距
def evalerror(preds, dtrain):
    labels = dtrain.get_label()
    return 'error', float(sum(labels != (preds > 0.0))) / len(labels)



In [2]:
# 借助sklearn统一的预测器接口进行建模
# 导入工具库
import numpy as np
import pandas as pd
import pickle
import xgboost as xgb
from sklearn.model_selection import train_test_split
# 用pandas读入数据
data = pd.read_csv('diabetes.csv')
# 做数据切分
train, test = train_test_split(data)
# 特征列
feature_columns = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']
# 标签列
target_column = 'Outcome'

In [6]:
from joblib import dump
# 初始化模型
xgb_classifier = xgb.XGBClassifier(n_estimators=20,\
                                   max_depth=4, \
                                   learning_rate=0.1, \
                                   subsample=0.7, \
                                   colsample_bytree=0.7, \
                                   eval_metric='error')
# Dataframe格式数据拟合模型
xgb_classifier.fit(train[feature_columns], train[target_column])
# 使用模型预测
preds = xgb_classifier.predict(test[feature_columns])
# 判断准确率
print('错误类为%f' %((preds!=test[target_column]).sum()/float(test.shape[0])))
# 模型存储
dump(xgb_classifier, '0003.model')

错误类为0.234375


['0003.model']