# 机器学习练习8 集成学习

代码修改并注释：黄海广，haiguang2000@wzu.edu.cn 

In [1]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
from sklearn.model_selection import train_test_split  

## 生成数据
生成12000行的数据，训练集和测试集按照3:1划分

In [2]:
from sklearn.datasets import make_hastie_10_2

data, target = make_hastie_10_2()

In [3]:
X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=123)
X_train.shape, X_test.shape

((9000, 10), (3000, 10))

## 模型对比
对比六大模型，都使用默认参数，因为数据是

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import cross_val_score
import time

clf1 = LogisticRegression()
clf2 = RandomForestClassifier()
clf3 = AdaBoostClassifier()
clf4 = GradientBoostingClassifier()
clf5 = XGBClassifier()
clf6 = LGBMClassifier()

for clf, label in zip([clf1, clf2, clf3, clf4, clf5, clf6], [
        'Logistic Regression', 'Random Forest', 'AdaBoost', 'GBDT', 'XGBoost',
        'LightGBM'
]):
    start = time.time()
    scores = cross_val_score(clf, X_train, y_train, scoring='accuracy', cv=5)
    end = time.time()
    running_time = end - start
    print("Accuracy: %0.8f (+/- %0.2f),耗时%0.2f秒。模型名称[%s]" %
          (scores.mean(), scores.std(), running_time, label))

Accuracy: 0.51200000 (+/- 0.02),耗时0.04秒。模型名称[Logistic Regression]
Accuracy: 0.88222222 (+/- 0.01),耗时15.51秒。模型名称[Random Forest]
Accuracy: 0.87377778 (+/- 0.01),耗时3.11秒。模型名称[AdaBoost]
Accuracy: 0.91444444 (+/- 0.01),耗时12.64秒。模型名称[GBDT]
Accuracy: 0.92400000 (+/- 0.00),耗时3.55秒。模型名称[XGBoost]
Accuracy: 0.93200000 (+/- 0.01),耗时0.67秒。模型名称[LightGBM]


对比了六大模型，可以看出，逻辑回归速度最快，但准确率最低。
而LightGBM，速度快，而且准确率最高，所以，现在处理结构化数据的时候，大部分都是用LightGBM算法。

## XGBoost的使用

### 1.原生XGBoost的使用

In [5]:
import xgboost as xgb
#记录程序运行时间
import time

start_time = time.time()

#xgb矩阵赋值
xgb_train = xgb.DMatrix(X_train, y_train)
xgb_test = xgb.DMatrix(X_test, label=y_test)
##参数
params = {
    'booster': 'gbtree',
    'silent': 1,  #设置成1则没有运行信息输出，最好是设置为0.
    #'nthread':7,# cpu 线程数 默认最大
    'eta': 0.007,  # 如同学习率
    'min_child_weight': 3,
    # 这个参数默认是 1，是每个叶子里面 h 的和至少是多少，对正负样本不均衡时的 0-1 分类而言
    #，假设 h 在 0.01 附近，min_child_weight 为 1 意味着叶子节点中最少需要包含 100 个样本。
    #这个参数非常影响结果，控制叶子节点中二阶导的和的最小值，该参数值越小，越容易 overfitting。
    'max_depth': 6,  # 构建树的深度，越大越容易过拟合
    'gamma': 0.1,  # 树的叶子节点上作进一步分区所需的最小损失减少,越大越保守，一般0.1、0.2这样子。
    'subsample': 0.7,  # 随机采样训练样本
    'colsample_bytree': 0.7,  # 生成树时进行的列采样 
    'lambda': 2,  # 控制模型复杂度的权重值的L2正则化项参数，参数越大，模型越不容易过拟合。
    #'alpha':0, # L1 正则项参数
    #'scale_pos_weight':1, #如果取值大于0的话，在类别样本不平衡的情况下有助于快速收敛。
    #'objective': 'multi:softmax', #多分类的问题
    #'num_class':10, # 类别数，多分类与 multisoftmax 并用
    'seed': 1000,  #随机种子
    #'eval_metric': 'auc'
}
plst = list(params.items())
num_rounds = 500  # 迭代次数
watchlist = [(xgb_train, 'train'), (xgb_test, 'val')]

In [6]:
#训练模型并保存
# early_stopping_rounds 当设置的迭代次数较大时，early_stopping_rounds 可在一定的迭代次数内准确率没有提升就停止训练
model = xgb.train(
    plst,
    xgb_train,
    num_rounds,
    watchlist,
    early_stopping_rounds=100,
)
#model.save_model('./model/xgb.model') # 用于存储训练出的模型
print("best best_ntree_limit", model.best_ntree_limit)
y_pred = model.predict(xgb_test, ntree_limit=model.best_ntree_limit)

# print("Accuracy : %.4g" % metrics.accuracy_score(y_test, y_pred))
print('error=%f' %
      (sum(1
           for i in range(len(y_pred)) if int(y_pred[i] > 0.5) != y_test[i]) /
       float(len(y_pred))))
#输出运行时长
cost_time = time.time() - start_time
print("xgboost success!", '\n', "cost time:", cost_time, "(s)......")

Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	train-rmse:1.11867	val-rmse:1.10739
[1]	train-rmse:1.11594	val-rmse:1.10488
[2]	train-rmse:1.11322	val-rmse:1.10231
[3]	train-rmse:1.11046	val-rmse:1.09976
[4]	train-rmse:1.10784	val-rmse:1.09732
[5]	train-rmse:1.10518	val-rmse:1.09492
[6]	train-rmse:1.10251	val-rmse:1.09247
[7]	train-rmse:1.09984	val-rmse:1.08999
[8]	train-rmse:1.09725	val-rmse:1.08757
[9]	train-rmse:1.09460	val-rmse:1.08517
[10]	train-rmse:1.09209	val-rmse:1.08286
[11]	train-rmse:1.08948	val-rmse:1.08055
[12]	train-rmse:1.08692	val-rmse:1.07824
[13]	train-rmse:1.08438	val-rmse:1.07597
[14]	train-rmse:1.08182	val-rmse:1.07362
[15]	train-rmse:1.07940	val-rmse:1.07142
[16]	train-rmse:1.07695	val-rmse:1.06917
[17]	train-rmse:1.07458	val-rms

[189]	train-rmse:0.81530	val-rmse:0.83966
[190]	train-rmse:0.81438	val-rmse:0.83891
[191]	train-rmse:0.81347	val-rmse:0.83818
[192]	train-rmse:0.81256	val-rmse:0.83742
[193]	train-rmse:0.81163	val-rmse:0.83663
[194]	train-rmse:0.81068	val-rmse:0.83581
[195]	train-rmse:0.80973	val-rmse:0.83504
[196]	train-rmse:0.80876	val-rmse:0.83424
[197]	train-rmse:0.80781	val-rmse:0.83346
[198]	train-rmse:0.80689	val-rmse:0.83273
[199]	train-rmse:0.80593	val-rmse:0.83189
[200]	train-rmse:0.80504	val-rmse:0.83112
[201]	train-rmse:0.80417	val-rmse:0.83036
[202]	train-rmse:0.80327	val-rmse:0.82960
[203]	train-rmse:0.80236	val-rmse:0.82886
[204]	train-rmse:0.80145	val-rmse:0.82817
[205]	train-rmse:0.80051	val-rmse:0.82739
[206]	train-rmse:0.79960	val-rmse:0.82660
[207]	train-rmse:0.79875	val-rmse:0.82595
[208]	train-rmse:0.79785	val-rmse:0.82522
[209]	train-rmse:0.79696	val-rmse:0.82451
[210]	train-rmse:0.79606	val-rmse:0.82382
[211]	train-rmse:0.79513	val-rmse:0.82310
[212]	train-rmse:0.79426	val-rmse:

[385]	train-rmse:0.67733	val-rmse:0.72984
[386]	train-rmse:0.67679	val-rmse:0.72948
[387]	train-rmse:0.67628	val-rmse:0.72911
[388]	train-rmse:0.67576	val-rmse:0.72872
[389]	train-rmse:0.67524	val-rmse:0.72824
[390]	train-rmse:0.67468	val-rmse:0.72786
[391]	train-rmse:0.67415	val-rmse:0.72746
[392]	train-rmse:0.67362	val-rmse:0.72708
[393]	train-rmse:0.67309	val-rmse:0.72671
[394]	train-rmse:0.67255	val-rmse:0.72631
[395]	train-rmse:0.67201	val-rmse:0.72590
[396]	train-rmse:0.67147	val-rmse:0.72547
[397]	train-rmse:0.67093	val-rmse:0.72502
[398]	train-rmse:0.67038	val-rmse:0.72460
[399]	train-rmse:0.66988	val-rmse:0.72420
[400]	train-rmse:0.66935	val-rmse:0.72383
[401]	train-rmse:0.66883	val-rmse:0.72341
[402]	train-rmse:0.66831	val-rmse:0.72299
[403]	train-rmse:0.66777	val-rmse:0.72262
[404]	train-rmse:0.66725	val-rmse:0.72225
[405]	train-rmse:0.66676	val-rmse:0.72184
[406]	train-rmse:0.66622	val-rmse:0.72150
[407]	train-rmse:0.66571	val-rmse:0.72111
[408]	train-rmse:0.66521	val-rmse:

### 2.使用scikit-learn接口
会改变的函数名是：

eta -> learning_rate

lambda -> reg_lambda

alpha -> reg_alpha

In [7]:
from sklearn.model_selection import train_test_split
from sklearn import metrics

from xgboost import XGBClassifier

clf = XGBClassifier(
#     silent=0,  #设置成1则没有运行信息输出，最好是设置为0.是否在运行升级时打印消息。
    #nthread=4,# cpu 线程数 默认最大
    learning_rate=0.3,  # 如同学习率
    min_child_weight=1,
    # 这个参数默认是 1，是每个叶子里面 h 的和至少是多少，对正负样本不均衡时的 0-1 分类而言
    #，假设 h 在 0.01 附近，min_child_weight 为 1 意味着叶子节点中最少需要包含 100 个样本。
    #这个参数非常影响结果，控制叶子节点中二阶导的和的最小值，该参数值越小，越容易 overfitting。
    max_depth=6,  # 构建树的深度，越大越容易过拟合
    gamma=0,  # 树的叶子节点上作进一步分区所需的最小损失减少,越大越保守，一般0.1、0.2这样子。
    subsample=1,  # 随机采样训练样本 训练实例的子采样比
    max_delta_step=0,  #最大增量步长，我们允许每个树的权重估计。
    colsample_bytree=1,  # 生成树时进行的列采样 
    reg_lambda=1,  # 控制模型复杂度的权重值的L2正则化项参数，参数越大，模型越不容易过拟合。
    #reg_alpha=0, # L1 正则项参数
    #scale_pos_weight=1, #如果取值大于0的话，在类别样本不平衡的情况下有助于快速收敛。平衡正负权重
    #objective= 'multi:softmax', #多分类的问题 指定学习任务和相应的学习目标
    #num_class=10, # 类别数，多分类与 multisoftmax 并用
    n_estimators=100,  #树的个数
    seed=1000  #随机种子
    #eval_metric= 'auc'
)
clf.fit(X_train, y_train, eval_metric='auc')
#设置验证集合 verbose=False不打印过程
clf.fit(X_train, y_train)

y_true, y_pred = y_test, clf.predict(X_test)
print("Accuracy : %.4g" % metrics.accuracy_score(y_true, y_pred))

Accuracy : 0.9333


## LIghtGBM的使用
### 1.原生接口

In [8]:
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
# 加载你的数据
# print('Load data...')
# df_train = pd.read_csv('../regression/regression.train', header=None, sep='\t')
# df_test = pd.read_csv('../regression/regression.test', header=None, sep='\t')
#
# y_train = df_train[0].values
# y_test = df_test[0].values
# X_train = df_train.drop(0, axis=1).values
# X_test = df_test.drop(0, axis=1).values

# 创建成lgb特征的数据集格式
lgb_train = lgb.Dataset(X_train, y_train)  # 将数据保存到LightGBM二进制文件将使加载更快
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)  # 创建验证数据

# 将参数写成字典下形式
params = {
    'task': 'train',
    'boosting_type': 'gbdt',  # 设置提升类型
    'objective': 'regression',  # 目标函数
    'metric': {'l2', 'auc'},  # 评估函数
    'num_leaves': 31,  # 叶子节点数
    'learning_rate': 0.05,  # 学习速率
    'feature_fraction': 0.9,  # 建树的特征选择比例
    'bagging_fraction': 0.8,  # 建树的样本采样比例
    'bagging_freq': 5,  # k 意味着每 k 次迭代执行bagging
    'verbose': 1  # <0 显示致命的, =0 显示错误 (警告), >0 显示信息
}

print('Start training...')
# 训练 cv and train
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=500,
                valid_sets=lgb_eval,
                early_stopping_rounds=5)  # 训练数据需要参数列表和数据集

print('Save model...')

gbm.save_model('model.txt')  # 训练后保存模型到文件

print('Start predicting...')
# 预测数据集
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration
                     )  #如果在训练期间启用了早期停止，可以通过best_iteration方式从最佳迭代中获得预测
# 评估模型
print('error=%f' %
      (sum(1
           for i in range(len(y_pred)) if int(y_pred[i] > 0.5) != y_test[i]) /
       float(len(y_pred))))

Start training...
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2550
[LightGBM] [Info] Number of data points in the train set: 9000, number of used features: 10
[LightGBM] [Info] Start training from score -0.007778
[1]	valid_0's l2: 0.963407	valid_0's auc: 0.822306
Training until validation scores don't improve for 5 rounds
[2]	valid_0's l2: 0.929732	valid_0's auc: 0.864549
[3]	valid_0's l2: 0.899021	valid_0's auc: 0.878464
[4]	valid_0's l2: 0.87243	valid_0's auc: 0.882184
[5]	valid_0's l2: 0.84644	valid_0's auc: 0.892078
[6]	valid_0's l2: 0.821495	valid_0's auc: 0.898242
[7]	valid_0's l2: 0.799114	valid_0's auc: 0.902837
[8]	valid_0's l2: 0.777408	valid_0's auc: 0.905669
[9]	valid_0's l2: 0.758056	valid_0's auc: 0.910419
[10]	valid_0's l2: 0.738781	valid_0's auc: 0.915105
[11]	valid_0's l2: 0.720738	valid_0's auc: 0.917576
[12]	valid_0's l2: 0.703566	valid_0's auc: 0.920586
[13]	valid_0's l2: 0.690569	valid_0's auc: 0.921341
[14]	valid_0's l2: 

## 2.scikit-learn接口

In [9]:
from sklearn import metrics
from lightgbm import LGBMClassifier

clf = LGBMClassifier(
    boosting_type='gbdt',  # 提升树的类型 gbdt,dart,goss,rf
    num_leaves=31,  #树的最大叶子数，对比xgboost一般为2^(max_depth)
    max_depth=-1,  #最大树的深度
    learning_rate=0.1,  #学习率
    n_estimators=100,  # 拟合的树的棵树，相当于训练轮数
    subsample_for_bin=200000,
    objective=None,
    class_weight=None,
    min_split_gain=0.0,  # 最小分割增益
    min_child_weight=0.001,  # 分支结点的最小权重
    min_child_samples=20,
    subsample=1.0,  # 训练样本采样率 行
    subsample_freq=0,  # 子样本频率
    colsample_bytree=1.0,  # 训练特征采样率 列
    reg_alpha=0.0,  # L1正则化系数
    reg_lambda=0.0,  # L2正则化系数
    random_state=None,
    n_jobs=-1,
    silent=True,
)
clf.fit(X_train, y_train, eval_metric='auc')
#设置验证集合 verbose=False不打印过程
clf.fit(X_train, y_train)

y_true, y_pred = y_test, clf.predict(X_test)
print("Accuracy : %.4g" % metrics.accuracy_score(y_true, y_pred))

Accuracy : 0.933


## 参考
1.https://xgboost.readthedocs.io/

2.https://lightgbm.readthedocs.io/

3.https://blog.csdn.net/q383700092/article/details/53763328?locationNum=9&fps=1