# Ensemble Learning

In [None]:
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Dataset: synthetic two-class "moons" points, split into train and test parts.
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split

# 10,000 samples generated with noise=0.5.
X, y = make_moons(
    n_samples=10000,
    noise=0.5,
)
# Hold out 20% of the samples for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
)

# Voting Classifiers

In [None]:
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Base learners. They are bound to names so the VotingClassifier below can
# reference them (the names were previously used without being defined).
log_clf = LogisticRegression(solver="lbfgs")
rnd_clf = RandomForestClassifier(n_estimators=20)
# SVC only provides predict_proba when built with probability=True, which is
# required for soft voting; hard voting (used here) does not need it.
svm_clf = SVC(gamma="scale")

# voting="hard": predict the class that receives the most votes.
# voting="soft": average the class probabilities of all classifiers (via
# predict_proba) and predict the class with the highest mean probability.
voting_clf = VotingClassifier(
    estimators=[("log", log_clf), ("rnd", rnd_clf), ("svm", svm_clf)],
    voting="hard",
    n_jobs=-1,
)

# Fit every individual learner and the ensemble, then report test accuracy.
clfs = [log_clf, rnd_clf, svm_clf, voting_clf]
for clf in clfs:
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    # accuracy_score expects (y_true, y_pred).
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))

# Bagging and Pasting

In [None]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
# bootstrap=True selects bagging; bootstrap=False selects pasting.
# The base learner is passed positionally: the keyword was renamed from
# `base_estimator` to `estimator` in newer scikit-learn releases, and the
# positional form works with both spellings.
bag_clf = BaggingClassifier(DecisionTreeClassifier(),
                            n_estimators=500, max_samples=200, n_jobs=-1, bootstrap=True)
bag_clf.fit(X_train, y_train)
y_pred = bag_clf.predict(X_test)
# accuracy_score expects (y_true, y_pred).
print(bag_clf.__class__.__name__, accuracy_score(y_test, y_pred))

In [None]:
# Out-of-Bag Evaluation
# With oob_score=True the fitted ensemble exposes `oob_score_`, an accuracy
# estimate computed from the training samples each learner did not see.
# The base learner is passed positionally so the call works whether the
# installed scikit-learn names the parameter `base_estimator` or `estimator`.
bag_clf = BaggingClassifier(DecisionTreeClassifier(),
                            n_estimators=500, max_samples=200, n_jobs=-1, bootstrap=True, oob_score=True)
bag_clf.fit(X_train, y_train)
print("oob_score", bag_clf.oob_score_)
y_pred = bag_clf.predict(X_test)
# accuracy_score expects (y_true, y_pred).
print("test", accuracy_score(y_test, y_pred))
# The test accuracy and the OOB score are expected to be close to each other.

# Random Patches and Random Subspaces

In [None]:
# Besides sampling training instances, an ensemble can also sample features
# to build its sub-classifiers (the max_features and bootstrap_features
# parameters). Feature sampling makes the learners more diverse, trading a
# little more bias for lower variance.
# Sampling both instances and features is called Random Patches; sampling
# only features while keeping all instances is called Random Subspaces.
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.datasets import load_iris

iris = load_iris()
rnd_clf = RandomForestClassifier(
    n_estimators=500,
    max_leaf_nodes=16,
    max_features=3,
    n_jobs=-1,
)
rnd_clf.fit(iris["data"], iris["target"])

# Feature importances: one "<feature name> <importance>" line per feature.
for feature_and_importance in zip(iris["feature_names"], rnd_clf.feature_importances_):
    print(*feature_and_importance)

In [None]:
# fetch_mldata was removed from scikit-learn (and mldata.org is offline);
# fetch_openml serves the same 70,000 x 784 MNIST data set.
from sklearn.datasets import fetch_openml
# as_frame=False returns plain NumPy arrays rather than a pandas DataFrame.
mnist = fetch_openml("mnist_784", version=1, as_frame=False)
rnd_clf = RandomForestClassifier(n_estimators=500, n_jobs=-1)
rnd_clf.fit(mnist["data"], mnist["target"])
# Feature importances: one value per pixel, reshaped back to the 28x28 image
# grid and drawn as a heat map.
image = rnd_clf.feature_importances_.reshape(28,28)
plt.imshow(image, cmap=plt.cm.hot)
plt.colorbar()

# Boosting

In [None]:
# AdaBoostClassifier: boosting by re-weighting the training instances.
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
# The base learner is passed positionally so the call works whether the
# installed scikit-learn names the parameter `base_estimator` or `estimator`.
# This cell originally requested algorithm="SAMME.R" (the variant based on
# class probabilities, which was the library default). Recent scikit-learn
# releases deprecated and then removed that option, so the argument is
# omitted and the library default is used instead.
ada_clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2),
                            n_estimators=500, learning_rate=0.5)
# SVMs are slow to train and unstable, so they are a poor choice of base
# estimator for AdaBoost.
# If the ensemble overfits, reduce the number of estimators.
# A drawback of AdaBoost is that it cannot be parallelized, because every
# learner depends on the previous one.

In [None]:
# Gradient Boosting: each new learner is fitted to the residuals.
# Data: 200 points with X in [-0.5, 0.5) and y = 3 * X^2 plus Gaussian noise.
X = np.random.rand(200, 1) - 0.5
noise = 0.05 * np.random.randn(200, 1)
y = 3 * X ** 2 + noise
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
)
# 4 x 2 grid of axes used by the plots below.
fig, ax = plt.subplots(nrows=4, ncols=2, figsize=(20, 15))
def display(regs, ax, X, y):
    """Scatter the points (X, y) on ``ax`` and overlay the ensemble prediction.

    The prediction curve is the sum of ``predict`` over every regressor in
    ``regs``, evaluated on 1000 evenly spaced inputs in [-0.5, 0.5] and
    drawn as a red dashed line.
    """
    ax.scatter(X, y, marker="+")
    grid = np.linspace(-0.5, 0.5, 1000)
    # Regressors receive the grid as a single-feature column.
    column = grid.reshape(-1, 1)
    # sum() starts from 0, so an empty ``regs`` yields the scalar 0.
    combined = sum(model.predict(column) for model in regs)
    ax.plot(grid, combined, "r--")
    
# Hand-rolled gradient boosting: three shallow trees, each fitted to the
# residual left by the tree before it.
from sklearn.tree import DecisionTreeRegressor
regs = []
y_temp = y.copy()
for stage in range(3):
    if stage > 0:
        # Subtract the previous tree's prediction to get the new residual.
        previous_fit = regs[-1].predict(X)
        y_temp -= previous_fit.reshape(-1, 1)
    tree = DecisionTreeRegressor(max_depth=2)
    regs.append(tree)
    tree.fit(X, y_temp)
    # Left column: the newest tree against the target it was fitted to.
    display([tree], ax[stage, 0], X, y_temp)
    # Right column: the sum of all trees so far against the original data.
    display(regs, ax[stage, 1], X, y)

# The same idea using scikit-learn's GradientBoostingRegressor, with 100 and
# 200 estimators, drawn in the last row of the figure.
from sklearn.ensemble import GradientBoostingRegressor
regs = []
for column, n_trees in enumerate((100, 200)):
    booster = GradientBoostingRegressor(learning_rate=0.1, n_estimators=n_trees, max_depth=2)
    regs.append(booster)
    booster.fit(X.reshape(-1, 1), y.ravel())
    display([booster], ax[3, column], X, y)

In [None]:
# Early stopping can be used to find the best number of estimators.
# staged_predict() yields the ensemble's predictions after each additional
# estimator, so the error of every ensemble size is measured from one fit.
from sklearn.metrics import mean_squared_error
gb_reg = GradientBoostingRegressor(max_depth=2, n_estimators=120)
gb_reg.fit(X_train, y_train.ravel())
errors = [mean_squared_error(y_test, y_pred) for y_pred in gb_reg.staged_predict(X_test)]
# errors[k] belongs to the ensemble with k + 1 estimators, so the 0-based
# argmin index is shifted by one (index 0 would otherwise request zero
# estimators, which is invalid).
best_n_estimator = int(np.argmin(errors)) + 1
print(best_n_estimator)
gb_reg_best = GradientBoostingRegressor(max_depth=2, n_estimators=best_n_estimator)
gb_reg_best.fit(X_train, y_train.ravel())

# With warm_start=True, fit() keeps the existing trees and only adds new ones,
# so training can stop as soon as the error stops improving instead of
# training the maximum number of estimators and then searching for the minimum.
gb_reg = GradientBoostingRegressor(max_depth=2, warm_start=True, learning_rate=0.1)
min_val_error = float("inf")
error_going_up = 0
best_warm_start_n = 0
for i in range(1, 120):
    gb_reg.n_estimators = i
    gb_reg.fit(X_train, y_train.ravel())
    val_error = mean_squared_error(y_test, gb_reg.predict(X_test))
    if min_val_error > val_error:
        min_val_error = val_error
        error_going_up = 0
        # Remember the ensemble size that produced the lowest error so far.
        best_warm_start_n = i
    else:
        error_going_up += 1
        # Stop after five consecutive rounds without improvement.
        if error_going_up == 5:
            break
# n_estimators at the moment training stopped (past the best size when the
# early-stop condition fired).
print(gb_reg.n_estimators)
print("best n_estimators", best_warm_start_n)