## 集成学习
基于不同算法的投票决定方式

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets

# Synthetic two-class "moons" data set; the fixed seed keeps the points identical on every run.
X, y = datasets.make_moons(n_samples=500, noise=0.3, random_state=42)

# One scatter call per class label (0, then 1) so each class is drawn in its own default colour.
for label in (0, 1):
    plt.scatter(X[y == label, 0], X[y == label, 1])
plt.show()

<Figure size 640x480 with 1 Axes>

In [2]:
from sklearn.model_selection import train_test_split

# Hold out a test split (scikit-learn's default test share is 25%).
# The fixed seed makes the split, and every score computed from it in later
# cells, reproducible under Restart Kernel -> Run All.
trainX, testX, trainY, testY = train_test_split(X, y, random_state=42)

### 自己实现集成学习

In [3]:
# Baseline 1: logistic regression
from sklearn.linear_model import LogisticRegression

log_clf = LogisticRegression()
log_clf.fit(trainX, trainY)
print("logistic=", log_clf.score(testX, testY))

# Baseline 2: support vector machine
from sklearn.svm import SVC

svm_clf = SVC()
svm_clf.fit(trainX, trainY)
print("svm=", svm_clf.score(testX, testY))

# Baseline 3: decision tree
from sklearn.tree import DecisionTreeClassifier

dt_clf = DecisionTreeClassifier()
dt_clf.fit(trainX, trainY)
print("DecisionTree=", dt_clf.score(testX, testY))

# Hand-rolled hard vote over the three fitted models.
yHat1 = log_clf.predict(testX)
yHat2 = svm_clf.predict(testX)
yHat3 = dt_clf.predict(testX)

# Labels are 0/1, so the sum of the three predictions is the number of votes
# for class 1; a sample is assigned class 1 when at least two models agree.
yHat = np.int32((yHat1 + yHat2 + yHat3) >= 2)

# Accuracy = fraction of predictions that match the true labels.
ratio = np.mean(yHat == testY)
print("集成=", ratio)

logistic= 0.872
svm= 0.92
DecisionTree= 0.864
集成= 0.912


### Hard Voting Classifier(少数服从多数)

In [4]:
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Majority-vote ensemble over three different algorithms. Each entry pairs a
# name with a fresh, unfitted estimator.
voting_clf = VotingClassifier(
    voting='hard',
    estimators=[
        ('log_clf', LogisticRegression()),
        ('svm_clf', SVC()),
        ('dt_clf', DecisionTreeClassifier()),
    ],
)

In [5]:
# Train the hard-voting ensemble, then report its accuracy on the held-out split
# (fit returns the fitted estimator, so the two calls can be chained).
voting_clf.fit(trainX, trainY).score(testX, testY)

  if diff:


0.912

### Soft Voting Classifier(基于概率)

In [6]:
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Probability-based (soft) voting ensemble over the same three algorithms.
voting_clf2 = VotingClassifier(
    voting='soft',
    estimators=[
        ('log_clf', LogisticRegression()),
        # SVC does not compute class probabilities by default; enable them explicitly.
        ('svm_clf', SVC(probability=True)),
        ('dt_clf', DecisionTreeClassifier()),
    ],
)

In [7]:
# Train the soft-voting ensemble, then report its accuracy on the held-out split
# (fit returns the fitted estimator, so the two calls can be chained).
voting_clf2.fit(trainX, trainY).score(testX, testY)

  if diff:


0.904

## 如何创建多个子模型之间的差异性

每个子模型只看样本数据的一部分，每个子模型不需要太高的准确率

### Bagging(放回取样) 和 Pasting(不放回取样)

In [13]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier

# Bagging: many decision trees, each trained on its own random draw of the training set.
bagging_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=500,  # number of sub-models
    max_samples=100,   # number of samples drawn for each sub-model
    bootstrap=True,    # True = draw with replacement
)
bagging_clf.fit(trainX, trainY).score(testX, testY)

0.928

In [12]:
# Same bagging setup with ten times as many sub-models, to compare test accuracy.
bagging_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=5000,  # number of sub-models
    max_samples=100,    # number of samples drawn for each sub-model
    bootstrap=True,     # True = draw with replacement
)
bagging_clf.fit(trainX, trainY).score(testX, testY)

0.928

### out-of-bag
放回取样导致一部分样本很有可能没有取到<br>
不使用测试数据集，而使用这部分没有取到的样本做测试/验证<br>


from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier

# Out-of-bag evaluation: fit on the full data set and score with the samples
# that were left out of the draws, instead of using a separate test split.
bagging_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=500,  # number of sub-models
    max_samples=100,   # number of samples drawn for each sub-model
    bootstrap=True,    # True = draw with replacement
    oob_score=True,    # compute the out-of-bag score during fit
)
bagging_clf.fit(X, y).oob_score_

### n_jobs

In [16]:
%%time
# Timing baseline: 5000 sub-models with n_jobs left at its default.
bagging_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=5000,  # number of sub-models
    max_samples=100,    # number of samples drawn for each sub-model
    bootstrap=True,     # True = draw with replacement
    oob_score=True,     # compute the out-of-bag score during fit
)
bagging_clf.fit(X, y)

Wall time: 4.54 s


In [18]:
%%time
# Same 5000-model fit as the timing baseline, but spread over two parallel jobs.
bagging_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=5000,  # number of sub-models
    max_samples=100,    # number of samples drawn for each sub-model
    bootstrap=True,     # True = draw with replacement
    oob_score=True,     # compute the out-of-bag score during fit
    n_jobs=2,           # number of jobs to run in parallel
)
bagging_clf.fit(X, y)

Wall time: 3.61 s


### bootstrap_features(特征随机取样)

In [35]:
# Bagging with random feature sampling on top of random sample draws.
bagging_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=500,         # number of sub-models
    max_samples=100,          # number of samples drawn for each sub-model
    bootstrap=True,           # True = draw samples with replacement
    oob_score=True,           # compute the out-of-bag score during fit
    max_features=1,           # number of features randomly picked per sub-model
    bootstrap_features=True,  # draw features with replacement as well
)
bagging_clf.fit(X, y).oob_score_

0.852

## 随机森林

In [27]:
from sklearn.ensemble import RandomForestClassifier

# Random forest evaluated through its out-of-bag score on the full data set.
rf_clf = RandomForestClassifier(
    n_estimators=500,
    # NOTE: a binary tree of depth 3 has at most 8 leaves, so with max_depth=3
    # the max_leaf_nodes=16 limit never becomes the binding constraint.
    max_leaf_nodes=16,
    max_depth=3,
    oob_score=True,
    n_jobs=1,
)
rf_clf.fit(X, y).oob_score_

0.88

### 集成学习解决回归问题

In [28]:
from sklearn.ensemble import RandomForestRegressor

## Boosting
1. 集成多个模型
2. 每个模型都在尝试增强整体的效果（增大前一个模型未能正确拟合的样本点的权重系数）

### Ada Boosting

![](http://p9tybni1b.bkt.clouddn.com/adaboost.png)

In [33]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

# AdaBoost over 30 shallow (depth-2) decision trees, scored on the held-out split.
ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=2),
    n_estimators=30,
)
ada_clf.fit(trainX, trainY).score(testX, testY)

0.896

### Gradient Boosting
每个新模型都针对之前模型的预测误差（残差）进行训练，从而逐步修正整体的预测结果。

In [32]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting with 30 depth-2 trees, scored on the held-out split.
gb_clf = GradientBoostingClassifier(
    n_estimators=30,
    max_depth=2,
)
gb_clf.fit(trainX, trainY).score(testX, testY)

0.928

### Boosting解决回归问题

In [34]:
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor

## Stacking

![](http://p9tybni1b.bkt.clouddn.com/Stacking.png)

![](http://p9tybni1b.bkt.clouddn.com/Stacking2.png)