## 集成学习
基于不同算法的投票决定方式

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets

# Generate a noisy two-class "moons" dataset with a fixed seed.
X, y = datasets.make_moons(n_samples=500, noise=0.3, random_state=42)

# One scatter series per class label, drawn in label order (0 first, then 1).
for label in (0, 1):
    mask = y == label
    plt.scatter(X[mask, 0], X[mask, 1])
plt.show()

<Figure size 640x480 with 1 Axes>

In [2]:
from sklearn.model_selection import train_test_split

# Fix the shuffle seed so the train/test partition, and every score computed
# from it in later cells, is identical after Restart Kernel -> Run All.
trainX, testX, trainY, testY = train_test_split(X, y, random_state=42)

### 自己实现集成学习

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Logistic regression
log_clf = LogisticRegression()
log_clf.fit(trainX, trainY)
print("logistic=", log_clf.score(testX, testY))

# SVM
svm_clf = SVC()
svm_clf.fit(trainX, trainY)
print("svm=", svm_clf.score(testX, testY))

# Decision tree. Seeded so that repeated fits on the same data produce the
# same tree (and therefore the same votes below).
dt_clf = DecisionTreeClassifier(random_state=42)
dt_clf.fit(trainX, trainY)
print("DecisionTree=", dt_clf.score(testX, testY))

# Hand-rolled hard voting: each model predicts a 0/1 label per test sample.
yHat1 = log_clf.predict(testX)
yHat2 = svm_clf.predict(testX)
yHat3 = dt_clf.predict(testX)

# With three binary votes, a sum of at least 2 means the majority chose class 1.
yHat = np.int32((yHat1 + yHat2 + yHat3) >= 2)

# Accuracy = fraction of test samples whose majority vote matches the label.
ratio = np.mean(yHat == testY)
print("集成=", ratio)

logistic= 0.872
svm= 0.92
DecisionTree= 0.864
集成= 0.912


### Hard Voting Classifier(少数服从多数)

In [4]:
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Hard-voting ensemble: every base model casts one vote per sample and the
# majority label wins. The tree is seeded so refitting yields the same model.
voting_clf = VotingClassifier(
    estimators=[
        ('log_clf', LogisticRegression()),
        ('svm_clf', SVC()),
        ('dt_clf', DecisionTreeClassifier(random_state=42)),
    ],
    voting='hard',
)

In [5]:
# Train the hard-voting ensemble, then display its accuracy on the held-out set
# (fit returns the estimator itself, so the two steps can be chained).
voting_clf.fit(trainX, trainY).score(testX, testY)

  if diff:


0.912

### Soft Voting Classifier(基于概率)

In [6]:
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Soft-voting ensemble: the predicted class probabilities of the base models
# are combined instead of their hard labels. Stochastic members are seeded so
# refitting yields the same ensemble.
voting_clf2 = VotingClassifier(
    estimators=[
        ('log_clf', LogisticRegression()),
        # SVC does not compute probabilities by default; probability=True
        # enables them, and random_state fixes the shuffling used to fit them.
        ('svm_clf', SVC(probability=True, random_state=42)),
        ('dt_clf', DecisionTreeClassifier(random_state=42)),
    ],
    voting='soft',
)

In [7]:
# Train the soft-voting ensemble, then display its accuracy on the held-out set
# (fit returns the estimator itself, so the two steps can be chained).
voting_clf2.fit(trainX, trainY).score(testX, testY)

  if diff:


0.904

## 如何创建多个子模型之间的差异性

每个子模型只看样本数据的一部分