# Ensemble Learning


In [3]:
from sklearn.datasets import make_moons
data = make_moons(n_samples=10000, noise=0.4)

In [4]:
from sklearn.model_selection import train_test_split
X, y = data[0], data[1]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

## Voting

In [5]:
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

log_clf = LogisticRegression()
rnd_clf = RandomForestClassifier()
svc_clf = SVC()


In [6]:
voting_clf = VotingClassifier(
    estimators=[('lr', log_clf), ('rnd', rnd_clf), ('svc', svc_clf)],
    voting='hard'
)

In [7]:
voting_clf.fit(X_train, y_train)



VotingClassifier(estimators=[('lr',
                              LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='warn',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=None,
                                                 solver='warn', tol=0.0001,
                                                 verbose=0, warm_start=False)),
                             ('rnd',
                              RandomForestClassifier(bootstrap=True,
                                                     class_weight=None,
                                                     criterion='gini'...
                                        

In [8]:
from sklearn.metrics import accuracy_score
for clf in (log_clf, rnd_clf, svc_clf, voting_clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))




LogisticRegression 0.8272
RandomForestClassifier 0.8424
SVC 0.8628




VotingClassifier 0.8576


## Bagging and Pasting


In [9]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

bag_clf = BaggingClassifier(
        DecisionTreeClassifier(), n_estimators=500,
        max_samples=100, bootstrap=True, n_jobs=-1
    )
bag_clf.fit(X_train, y_train)
y_pred = bag_clf.predict(X_test)

In [10]:
print(bag_clf.__class__.__name__, accuracy_score(y_test, y_pred))

BaggingClassifier 0.866


### Evaluating Out of Bag 

In [11]:
bag_clf = BaggingClassifier(
        DecisionTreeClassifier(), n_estimators=500,
        max_samples=100, bootstrap=True, n_jobs=-1, oob_score=True
    )
bag_clf.fit(X_train, y_train)
bag_clf.oob_score_

0.8584

In [12]:
print('accuracy was 0.85')

accuracy was 0.85


In [13]:
bag_clf.oob_decision_function_

array([[0.05668016, 0.94331984],
       [0.93292683, 0.06707317],
       [0.98170732, 0.01829268],
       ...,
       [0.95102041, 0.04897959],
       [0.60685484, 0.39314516],
       [0.16666667, 0.83333333]])

* We can set `max_features` and `bootstrap_feature` hyper-parameters, and these two pramater help model to select a random subset of original features in training. It is particularly useful when you have a big set of features.


## Random Forest
Instead of searching for best feature in set of features, it searches in for the best features in a random set of features.

In [14]:
from sklearn.ensemble import RandomForestClassifier
rf_clf = RandomForestClassifier()
rf_clf.fit(X_train, y_train)
y_pred = rf_clf.predict(X_test)
print(rf_clf.__class__.__name__, accuracy_score(y_test, y_pred))

RandomForestClassifier 0.8388




### Feature Importance

In [15]:
rf_clf.feature_importances_

array([0.46111378, 0.53888622])

## Extra Trees
We can set a random threshold for each set of features and it got more randomly. Extra Tree uses this method.

In [16]:
from sklearn.tree import ExtraTreeClassifier
ext_clf = ExtraTreeClassifier()
ext_clf.fit(X_train, y_train)
y_pred = ext_clf.predict(X_test)
print(ext_clf.__class__.__name__, accuracy_score(y_test, y_pred))

ExtraTreeClassifier 0.806


Here random forest got better accuracy. We can not say which one is better for a training set. The only good way is try both and select one of them.

## Boosting 


### Ada Boosting
This kind of boosting try to make last predictor better with changing weight of instances train set.


In [17]:
dt_clf = DecisionTreeClassifier(max_depth=1)
dt_clf.fit(X_train, y_train)
y_pred = dt_clf.predict(X_test)
print(dt_clf.__class__.__name__, accuracy_score(y_test, y_pred))

DecisionTreeClassifier 0.77


In [18]:
from sklearn.ensemble import AdaBoostClassifier
ada_clf = AdaBoostClassifier(
        DecisionTreeClassifier(max_depth=1), n_estimators=200,
        algorithm="SAMME.R", learning_rate=0.5
    )
ada_clf.fit(X_train, y_train)


AdaBoostClassifier(algorithm='SAMME.R',
                   base_estimator=DecisionTreeClassifier(class_weight=None,
                                                         criterion='gini',
                                                         max_depth=1,
                                                         max_features=None,
                                                         max_leaf_nodes=None,
                                                         min_impurity_decrease=0.0,
                                                         min_impurity_split=None,
                                                         min_samples_leaf=1,
                                                         min_samples_split=2,
                                                         min_weight_fraction_leaf=0.0,
                                                         presort=False,
                                                         random_state=None,
                             

In [19]:
y_pred = ada_clf.predict(X_test)
print(ada_clf.__class__.__name__, accuracy_score(y_test, y_pred))

AdaBoostClassifier 0.86


### Gradient Boosting 
This method try to train model on residual errors made by the previous pridector.

In [20]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

tree_reg1 = DecisionTreeRegressor(max_depth=2)
tree_reg1.fit(X_train, y_train)

y2 = y_train - tree_reg1.predict(X_train)
tree_reg2 = DecisionTreeRegressor(max_depth=2)
tree_reg2.fit(X_train, y2)

y3 = y2 - tree_reg2.predict(X_train)
tree_reg3 = DecisionTreeRegressor(max_depth=2)
tree_reg3.fit(X_train, y3)

y_pred = sum(tree.predict(X_test) for tree in (tree_reg1, tree_reg2, tree_reg3))
print (mean_squared_error(y_test, y_pred), mean_squared_error(y_test, tree_reg1.predict(X_test)))

0.11397517248263994 0.12477792234344702


We can use also sklearn GradientBoostingRegressor. It is like random forest algorithm and also have that algorithm hyper-parameters.

In [46]:
from sklearn.base import TransformerMixin, BaseEstimator
class GradientBoostingOtherRegressor(TransformerMixin, BaseEstimator):
    def __init__(self, estimator, n_estimates = 20):
        self.estimator = estimator
        self.estimators = []
        self.n_estimates = n_estimates
        self.last_estimator = self.estimator
    def fit(self, X, y_train=None):
        self.last_estimator = self.estimator
        self.last_estimator.fit(X, y_train)
        y = y_train
        self.estimators = [self.last_estimator]
        for i in range(self.n_estimates):
            y = y - self.last_estimator.predict(X)
            new_estimator = clone(self.estimator)
            new_estimator.fit(X, y)
            self.last_estimator = new_estimator
            self.estimators.append(self.last_estimator)
#         print(self.estimators)
        return self
    def predict(self, X_test):
        
        y_pred = sum(tree.predict(X_test) for tree in self.estimators)
        return y_pred

In [47]:
gbor = GradientBoostingOtherRegressor( DecisionTreeRegressor(max_depth=2))
gbor.fit(X_train, y_train)
y_pred = gbor.predict(X_test)
print (mean_squared_error(y_test, y_pred))


0.10482449867048392


In [52]:
from sklearn.ensemble import GradientBoostingRegressor
gradient_reg = GradientBoostingRegressor(max_depth=2, n_estimators=1000, learning_rate=0.1)
gradient_reg.fit(X_train, y_train)
print(mean_squared_error(y_test, gradient_reg.predict(X_test)))

0.10813776790066852


#### Selecting Best Gradient Booster By Early Stopping


In [43]:
import numpy as np
import seaborn as sns
sns.set()

In [54]:
gradient_reg = GradientBoostingRegressor(max_depth=2, n_estimators=1000)
gradient_reg.fit(X_train, y_train)

errors = [mean_squared_error(y_test, y_pred) for y_pred in gradient_reg.staged_predict(X_test)]
best_n_estimators = np.argmin(errors)



gradient_best = GradientBoostingRegressor(max_depth=2, n_estimators=best_n_estimators)
gradient_best.fit(X_train, y_train)


GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
                          learning_rate=0.1, loss='ls', max_depth=2,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=111,
                          n_iter_no_change=None, presort='auto',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

In [55]:
print(mean_squared_error(y_test, gradient_best.predict(X_test)))

0.10373061664731406
