# 集成学习--集体的智慧

**使用完全不同的算法，这会使他们做出不同种类的错误，这会提高集成的正确率**

In [24]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split


X,y = make_moons()
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.3,
                                                 shuffle=True, # 是否先打乱数据的顺序再划分
                                                 random_state=40)   # 控制将样本随机打乱
log_clg = LogisticRegression()
rnd_clf = RandomForestClassifier()
svm_clf = SVC()
voting_clf = VotingClassifier(estimators=[('lr',log_clg),('rnd',rnd_clf),('svm',svm_clf)],voting='hard')

In [26]:
from sklearn.metrics import accuracy_score
for clf in (log_clg,rnd_clf,svm_clf,voting_clf):
    clf.fit(X_train,y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__,accuracy_score(y_test,y_pred))

<class 'sklearn.linear_model._logistic.LogisticRegression'> 0.9333333333333333
<class 'sklearn.ensemble._forest.RandomForestClassifier'> 1.0
<class 'sklearn.svm._classes.SVC'> 1.0
<class 'sklearn.ensemble._voting.VotingClassifier'> 1.0


# Bagging(装袋) and Pasting(粘贴) 
对每个分类器都使用相同的训练算法，但在不同的训练集上训练

In [28]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
bag_clf = BaggingClassifier(DecisionTreeClassifier(),n_estimators=500,max_samples=10,bootstrap=True,n_jobs=-1)
bag_clf.fit(X_train,y_train)
y_pred_bag = bag_clf.predict(X_test)
print(bag_clf.__class__,accuracy_score(y_test,y_pred))

<class 'sklearn.ensemble._bagging.BaggingClassifier'> 1.0


# 随机森林
+ 决策树的一种集成，通过bagging方法进行训练，max_samples设置训练集大小

In [None]:
#### from sklearn.ensemble import RandomForestClassifier
rndd_clf = RandomForestClassifier(n_estimators=500,max_leaf_nodes=16,n_jobs=-1)
# 类似之前随机森林
# bag_clf = BaggingClassifier(DecisionTreeClassifier(splitter='random',max_leaf_nodes=16),
#                            n_estimators=500,max_samples=1.0,bootstrap=True,n_jobs=-1
#                            )

rndd_clf.fit(X_train,y_train)
rndd_clf.predict(X_test)

**可以使用交叉验证来评估Bagging和Pasting**

# 特征重要度
**重要特征分布在根节点处 不重要的分布在叶子节点，我们可以根据这一特性求除特征的重要度**

#  提升
**Adaboost-适应性提升 Gradient Boosting-梯度提升**

## Adaboost 

In [None]:
#不断迭代更新实例参数 提高效益
from sklearn.ensemble import AdaBoostClassifier
#将决策树定作基分类器
ada_clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),n_estimators=200,algorithm='SAMME.R',learning_rate=0.5)
ada_clf.fit

## GBRT 

In [32]:
from sklearn.tree import DecisionTreeRegressor

# 第一个分类器
tree_reg1 = DecisionTreeRegressor(max_depth=2)
tree_reg1.fit(X, y)

# 在第一个分类器的残差上运行第二个分类器
y2 = y - tree_reg1.predict(X)
tree_reg2 = DecisionTreeRegressor(max_depth=2)
tree_reg2.fit(X, y2)

# 在第二个分类器的残差上运行第三个分类器
y3 = y2 - tree_reg1.predict(X)
tree_reg3 = DecisionTreeRegressor(max_depth=2)
ree_reg3.fit(X, y3)

# 它可以通过集成所有树的预测来在一个新的实例上进行预测。
y_pred = sum(tree.predict(X_new) for tree in (tree_reg1, tree_reg2, tree_reg3))

from sklearn.ensemble import GradientBoostingRegressor

gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=3, learning_rate=1.0)
gbrt.fit(X, y)

NameError: name 'ree_reg3' is not defined

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import GradientBoostingRegressor

X_train, X_val, y_train, y_val = train_test_split(X, y)
gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=120)
gbrt.fit(X_train, y_train)
errors = [mean_squared_error(y_val, y_pred) for y_pred in gbrt.staged_predict(X_val)]

bst_n_estimators = np.argmin(errors) ## np.argmin表示最小值在数组中所在的位置
gbrt_best = GradientBoostingRegressor(max_depth=2,n_estimators=bst_n_estimators)
gbrt_best.fit(X_train, y_train)

In [None]:
gbrt = GradientBoostingRegressor(max_depth=2, warm_start=True)
min_val_error = float("inf")
error_going_up = 0
for n_estimators in range(1, 120):
    gbrt.n_estimators = n_estimators
    gbrt.fit(X_train, y_train)
    y_pred = gbrt.predict(X_val)
    val_error = mean_squared_error(y_val, y_pred)
    if val_error < min_val_error:
        min_val_error = val_error
        error_going_up = 0
    else:
        error_going_up += 1
        if error_going_up == 5:
            break

# Stacking 

另外一个集成方法叫做 Stacking（stacked generalization 的缩写）。
<br>这个算法基于一个简单的想法：不使用琐碎的函数（如硬投票）来聚合集合中所有分类器的预测，而是自己训练一个模型来执行这个聚合。