In [None]:
# The following code creates and trains a voting classifier in Scikit-Learn, composed of three diverse
# classifiers:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Three different model families are combined into one voting ensemble.
log_clf = LogisticRegression()
rnd_clf = RandomForestClassifier()
svm_clf = SVC()

# voting='hard' selects majority-rule voting over the predicted class labels.
voting_clf = VotingClassifier(
    estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],
    voting='hard')

# Fit every individual classifier and the ensemble, then print each test accuracy.
# The loop fits voting_clf as its last item, so a separate voting_clf.fit(...)
# call beforehand would only train the whole ensemble a second time.
# NOTE(review): X_train, y_train, X_test and y_test are not defined in the
# visible part of this notebook -- confirm an earlier cell creates them.
for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))

In [None]:
# Bagging and Pasting in Scikit-Learn

# Scikit-Learn offers a simple API for both bagging and pasting with the BaggingClassifier class (or
# BaggingRegressor for regression). The following code trains an ensemble of 500 Decision Tree
# classifiers, each trained on 100 training instances randomly sampled from the training set with
# replacement (this is an example of bagging, but if you want to use pasting instead, just set
# bootstrap=False ). The n_jobs parameter tells Scikit-Learn the number of CPU cores to use for training
# and predictions (–1 tells Scikit-Learn to use all available cores):

from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
# Bagging ensemble of 500 decision trees; each tree is fitted on 100 instances
# drawn with replacement (bootstrap=True), using all CPU cores (n_jobs=-1).
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=100,
    bootstrap=True,
    n_jobs=-1,
)
# fit() returns the fitted estimator itself, so training and prediction chain.
y_pred = bag_clf.fit(X_train, y_train).predict(X_test)
# The ensemble’s predictions will likely generalize much better than a single Decision Tree’s
# predictions: the ensemble has a comparable bias but a smaller variance.

In [None]:
# Out-of-Bag Evaluation


# With bagging, some instances may be sampled several times for any given predictor, while others may not
# be sampled at all. By default a BaggingClassifier samples m training instances with replacement
# ( bootstrap=True ), where m is the size of the training set. This means that only about 63% of the training
# instances are sampled on average for each predictor. The remaining 37% of the training instances that are
# not sampled are called out-of-bag (oob) instances. Note that they are not the same 37% for all predictors.
# Since a predictor never sees the oob instances during training, it can be evaluated on these instances,
# without the need for a separate validation set or cross-validation. You can evaluate the ensemble itself by
# averaging out the oob evaluations of each predictor.
# In Scikit-Learn, you can set oob_score=True when creating a BaggingClassifier to request an
# automatic oob evaluation after training. The following code demonstrates this. The resulting evaluation
# score is available through the oob_score_ variable:
# Same bagging ensemble as before, but with oob_score=True so that an
# out-of-bag evaluation is computed automatically after training.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=500,
    bootstrap=True,
    n_jobs=-1,
    oob_score=True,
)
bag_clf.fit(X_train, y_train)
bag_clf.oob_score_
# 0.90133333333333332

# According to this oob evaluation, this BaggingClassifier is likely to achieve about 90.1% accuracy on
# the test set. Let’s verify this:
from sklearn.metrics import accuracy_score
y_pred = bag_clf.predict(X_test)
accuracy_score(y_test, y_pred)
# 0.91200000000000003
# 0.91200000000000003

In [None]:
# Random Patches and Random Subspaces


# The BaggingClassifier class supports sampling the features as well. This is controlled by two
# hyperparameters: max_features and bootstrap_features . They work the same way as max_samples
# and bootstrap , but for feature sampling instead of instance sampling. Thus, each predictor will be
# trained on a random subset of the input features.
# This is particularly useful when you are dealing with high-dimensional inputs (such as images). Sampling
# both training instances and features is called the Random Patches method. Keeping all training instances
# (i.e., bootstrap=False and max_samples=1.0 ) but sampling features (i.e., bootstrap_features=True
# and/or max_features smaller than 1.0) is called the Random Subspaces method.
# Sampling features results in even more predictor diversity, trading a bit more bias for a lower variance.

In [1]:
# As we have discussed, a Random Forest is an ensemble of Decision Trees, generally trained via the
# bagging method (or sometimes pasting), typically with max_samples set to the size of the training set.

# The following code trains a
# Random Forest classifier with 500 trees (each limited to a maximum of 16 leaf nodes), using all available CPU
# cores:
from sklearn.ensemble import RandomForestClassifier
# Random Forest with 500 trees, each capped at 16 leaf nodes, trained on all cores.
rnd_clf = RandomForestClassifier(
    n_estimators=500,
    max_leaf_nodes=16,
    n_jobs=-1,
)
rnd_clf.fit(X_train, y_train)
y_pred_rf = rnd_clf.predict(X_test)
# With a few exceptions, a RandomForestClassifier has all the hyperparameters of a
# DecisionTreeClassifier (to control how trees are grown), plus all the hyperparameters of a
# BaggingClassifier to control the ensemble itself.
# The Random Forest algorithm introduces extra randomness when growing trees; instead of searching for
# the very best feature when splitting a node (see Chapter 6), it searches for the best feature among a
# random subset of features. This results in a greater tree diversity, which (once again) trades a higher bias
# for a lower variance, generally yielding an overall better model. The following BaggingClassifier is
# roughly equivalent to the previous RandomForestClassifier :
# Bagging counterpart of the Random Forest above: each tree uses the random
# splitter and the same 16-leaf cap, and every bootstrap sample is as large as
# the training set (max_samples=1.0).
# The original call ended with the invalid token `-1_` and had no closing
# parenthesis, so the cell could not be parsed.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(splitter="random", max_leaf_nodes=16),
    n_estimators=500, max_samples=1.0, bootstrap=True, n_jobs=-1)

In [None]:
# FEATURE IMPORTANCE

# Scikit-Learn measures a feature’s importance by looking at how much the tree nodes that use
# that feature reduce impurity on average (across all trees in the forest). Scikit-Learn computes this
# score automatically for each feature after training, then it scales the results so
# that the sum of all importances is equal to 1. You can access the result using the feature_importances_
# variable.

from sklearn.datasets import load_iris
# Train a 500-tree Random Forest on the iris dataset and print the importance
# score of each feature next to its name.
iris = load_iris()
rnd_clf = RandomForestClassifier(n_estimators=500, n_jobs=-1)
rnd_clf.fit(iris["data"], iris["target"])
for name, score in zip(iris["feature_names"], rnd_clf.feature_importances_):
    # The loop body must be indented; at column 0 it raised IndentationError.
    print(name, score)
# sepal length (cm) 0.112492250999
# sepal width (cm) 0.0231192882825
# petal length (cm) 0.441030464364
# petal width (cm) 0.423357996355

In [None]:
# Boosting


# Boosting (originally called hypothesis boosting) refers to any Ensemble method that can combine several
# weak learners into a strong learner.

# AdaBoost

# One way for a new predictor to correct its predecessor is to pay a bit more attention to the training
# instances that the predecessor underfitted. This results in new predictors focusing more and more on the
# hard cases. This is the technique used by AdaBoost.
from sklearn.ensemble import AdaBoostClassifier
# AdaBoost over 200 decision stumps (max_depth=1 trees) with learning_rate=0.5.
# The explicit algorithm="SAMME.R" argument is intentionally omitted: it was the
# default on older scikit-learn releases, was deprecated in 1.4 and is no longer
# accepted from 1.6, so leaving it out keeps this cell runnable across versions.
# NOTE(review): on releases without SAMME.R the ensemble is trained with SAMME,
# so the fitted model can differ from the original book output.
ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1),
    n_estimators=200,
    learning_rate=0.5)
ada_clf.fit(X_train, y_train)

In [None]:
# Gradient Boosted Regression Trees (GBRT)
#
# A simpler way to train GBRT ensembles is to use Scikit-Learn’s GradientBoostingRegressor class.
# Much like the RandomForestRegressor class, it has hyperparameters to control the growth of Decision
# Trees (e.g., max_depth , min_samples_leaf , and so on), as well as hyperparameters to control the
# ensemble training, such as the number of trees ( n_estimators ).

from sklearn.ensemble import GradientBoostingRegressor
# Small GBRT ensemble: 3 trees of depth 2, with learning_rate=1.0.
# NOTE(review): X and y are not defined in the visible part of this notebook.
gbrt = GradientBoostingRegressor(
    max_depth=2,
    n_estimators=3,
    learning_rate=1.0,
)
gbrt.fit(X, y)

In [None]:
# The following code trains a GBRT ensemble with 120 trees, then measures the validation error at each stage of training to find the
# optimal number of trees, and finally trains another GBRT ensemble using the optimal number of trees:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
# Hold out a validation set, train a 120-tree GBRT, then retrain with the
# number of trees that gave the lowest validation error.
# NOTE(review): this rebinds X_train / y_train, replacing whatever earlier
# cells stored under those names.
X_train, X_val, y_train, y_val = train_test_split(X, y)

gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=120)
gbrt.fit(X_train, y_train)

# staged_predict yields the ensemble's predictions after 1, 2, ..., 120 trees,
# so errors[i] is the validation MSE of the ensemble made of i + 1 trees.
errors = [mean_squared_error(y_val, y_pred)
          for y_pred in gbrt.staged_predict(X_val)]

# argmin is a zero-based index; add 1 to turn it into a tree count. Without the
# +1 the retrained model has one tree too few, and asks for 0 trees (an error)
# when the first stage happens to be the best one.
bst_n_estimators = int(np.argmin(errors)) + 1

gbrt_best = GradientBoostingRegressor(max_depth=2, n_estimators=bst_n_estimators)
gbrt_best.fit(X_train, y_train)

In [None]:
# It is also possible to implement early stopping by actually stopping training early (instead of training a
# large number of trees first and then looking back to find the optimal number). You can do so by setting
# warm_start=True , which makes Scikit-Learn keep existing trees when the fit() method is called,
# allowing incremental training. The following code stops training when the validation error does not
# improve for five iterations in a row:
# Early stopping by incremental training: with warm_start=True each fit() call
# keeps the existing trees, so raising n_estimators by one adds a single tree.
gbrt = GradientBoostingRegressor(max_depth=2, warm_start=True)

min_val_error = float("inf")
error_going_up = 0  # consecutive iterations without a new best validation error
for n_estimators in range(1, 120):
    # This assignment belongs inside the loop; at column 0 it raised
    # IndentationError and the loop could not run.
    gbrt.n_estimators = n_estimators
    gbrt.fit(X_train, y_train)
    y_pred = gbrt.predict(X_val)
    val_error = mean_squared_error(y_val, y_pred)
    if val_error < min_val_error:
        min_val_error = val_error
        error_going_up = 0
    else:
        error_going_up += 1
    if error_going_up == 5:
        break  # early stopping
# NOTE(review): when the loop breaks, gbrt still contains the trees added
# during the five non-improving iterations.
# The GradientBoostingRegressor class also supports a subsample hyperparameter, which specifies
# the fraction of training instances to be used for training each tree. For example, if subsample=0.25 , then
# each tree is trained on 25% of the training instances, selected randomly. As you can probably guess by
# now, this trades a higher bias for a lower variance. It also speeds up training considerably. This technique
# is called Stochastic Gradient Boosting.



In [None]:
#BLENDER

#Just read the section

# you can use an open source implementation
# such as brew (available at https://github.com/viisar/brew)