# Ensemble Learning and Random Forests

In [1]:
# Create and train a voting classifier w/ three diverse classifiers
import numpy as np
import os
%matplotlib inline
np.random.seed(42)



from sklearn.model_selection import train_test_split
from sklearn.datasets import make_moons

X, y = make_moons(n_samples=500, noise=0.30, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)



In [2]:
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

log_clf = LogisticRegression()
rnd_clf = RandomForestClassifier()
svm_clf = SVC(probability=True)

voting_clf = VotingClassifier(
    estimators = [('lr', log_clf),
                  ('rf', rnd_clf),
                  ('svc', svm_clf)],
    voting = 'soft')
voting_clf.fit(X_train, y_train)

VotingClassifier(estimators=[('lr', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)), ('rf', RandomF...',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False))],
         flatten_transform=None, n_jobs=1, voting='soft', weights=None)

In [3]:
# Let's look at each classifier's accuracy on the test set
from sklearn.metrics import accuracy_score
for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))

LogisticRegression 0.864
RandomForestClassifier 0.904
SVC 0.888
VotingClassifier 0.912


  if diff:


## Bagging and Pasting in Scikit-Learn

In [4]:
# Train on an ensemble of 500 Decision Tree classifiers each trained on 100 training instances randomly
# sampled from the training set with replacement

from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

bag_clf = BaggingClassifier(
    DecisionTreeClassifier(), n_estimators = 500,
    max_samples = 100, bootstrap = True, n_jobs = -1)
bag_clf.fit(X_train, y_train)
y_pred = bag_clf.predict(X_test)

In [5]:
accuracy_score(y_test, y_pred)

0.904

## Out-of-Bag Evaluation

In [6]:
# set oob_score = True to request an automatic oob evaluation after training
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(), n_estimators = 500,
    bootstrap = True, n_jobs = -1, oob_score = True)
bag_clf.fit(X_train, y_train)
bag_clf.oob_score_

0.8986666666666666

In [7]:
# So this is saying this BaggingClassifier is likely to achieve about 89.9% accuracy.
# Let's see what happens:

y_pred = bag_clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.912

In [8]:
bag_clf.oob_decision_function_

array([[0.45728643, 0.54271357],
       [0.40697674, 0.59302326],
       [1.        , 0.        ],
       [0.        , 1.        ],
       [0.        , 1.        ],
       [0.05527638, 0.94472362],
       [0.39053254, 0.60946746],
       [0.00980392, 0.99019608],
       [0.99408284, 0.00591716],
       [0.97368421, 0.02631579],
       [0.75879397, 0.24120603],
       [0.00574713, 0.99425287],
       [0.77222222, 0.22777778],
       [0.80446927, 0.19553073],
       [0.97752809, 0.02247191],
       [0.06077348, 0.93922652],
       [0.00520833, 0.99479167],
       [0.98369565, 0.01630435],
       [0.89552239, 0.10447761],
       [1.        , 0.        ],
       [0.01685393, 0.98314607],
       [0.32571429, 0.67428571],
       [0.92021277, 0.07978723],
       [1.        , 0.        ],
       [0.96650718, 0.03349282],
       [0.        , 1.        ],
       [1.        , 0.        ],
       [1.        , 0.        ],
       [0.0052356 , 0.9947644 ],
       [0.63541667, 0.36458333],
       [0.

# Random Forests

In [9]:
# Train a Random Forest classifier with 500 trees (each limited to maximum 16 nodes), using all available CPU cores
from sklearn.ensemble import RandomForestClassifier

rnd_clf = RandomForestClassifier(n_estimators = 500, max_leaf_nodes = 16, n_jobs = -1)
rnd_clf.fit(X_train, y_train)

y_pred_rf = rnd_clf.predict(X_test)

In [10]:
# This BaggingClassifier is roughly the same as the above cell
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(splitter = 'random', max_leaf_nodes = 16),
    n_estimators = 500, max_samples = 1.0, bootstrap = True, n_jobs = -1)

## Feature Importance

In [11]:
# Train a RandomForestClassifier on the iris dataset and output each feature's importance
from sklearn.datasets import load_iris
iris = load_iris()
rnd_clf = RandomForestClassifier(n_estimators = 500, n_jobs = -1)
rnd_clf.fit(iris['data'], iris['target'])
for name, score in zip(iris['feature_names'], rnd_clf.feature_importances_):
    print(name, score)

sepal length (cm) 0.10525185076034302
sepal width (cm) 0.025126722068359905
petal length (cm) 0.4236915209767608
petal width (cm) 0.44592990619453604


# Boosting
## AdaBoost

In [12]:
# Train an AdaBoost classifier based on 200 Decision Stumps
from sklearn.ensemble import AdaBoostClassifier

ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth = 1), n_estimators = 200,
    algorithm = "SAMME.R", learning_rate = 0.5)
ada_clf.fit(X_train, y_train)

AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=1,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
          learning_rate=0.5, n_estimators=200, random_state=None)

## Gradient Boosting

In [13]:
# Fit a DecisionTreeRegressor to the training set
np.random.seed(42)
X = np.random.rand(100, 1) - 0.5
y = 3*X[:, 0]**2 + 0.05 * np.random.randn(100)

In [14]:
from sklearn.tree import DecisionTreeRegressor

tree_reg1 = DecisionTreeRegressor(max_depth = 2)
tree_reg1.fit(X, y)

DecisionTreeRegressor(criterion='mse', max_depth=2, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

In [15]:
# Train a second DecisionTreeRegressor on the residual errors made by the predicotr
y2 = y - tree_reg1.predict(X)
tree_reg2 = DecisionTreeRegressor(max_depth = 2)
tree_reg2.fit(X, y2)

DecisionTreeRegressor(criterion='mse', max_depth=2, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

In [16]:
# Train a third regressor on the residual errors made by the second predictor
y3 = y2 - tree_reg2.predict(X)
tree_reg3 = DecisionTreeRegressor(max_depth = 2)
tree_reg3.fit(X, y3)

DecisionTreeRegressor(criterion='mse', max_depth=2, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

In [17]:
# This gives us an ensemble, make predictions by adding up the predictions of all three


X_new = np.array([[0.8]])


y_pred = sum(tree.predict(X_new) for tree in (tree_reg1, tree_reg2, tree_reg3))

In [18]:
y_pred

array([0.75026781])

In [19]:
# This creates the same ensemble as above:
from sklearn.ensemble import GradientBoostingRegressor

gbrt = GradientBoostingRegressor(max_depth = 2, n_estimators = 3, learning_rate = 1.0)
gbrt.fit(X, y)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=1.0, loss='ls', max_depth=2, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=3, presort='auto', random_state=None,
             subsample=1.0, verbose=0, warm_start=False)

In [20]:
# Train a GBRT ensemble with 120 trees, measure the validation error at each stage of training to find the
# optimal number of trees, and train another GBRT ensemble using the optimal number of trees
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

X_train, X_val, y_train, y_val = train_test_split(X, y)

gbrt = GradientBoostingRegressor(max_depth = 2, n_estimators = 120)
gbrt.fit(X_train, y_train)

errors = [mean_squared_error(y_val, y_pred)
          for y_pred in gbrt.staged_predict(X_val)]
bst_n_estimators = np.argmin(errors)

gbrt_best = GradientBoostingRegressor(max_depth = 2, n_estimators = bst_n_estimators)
gbrt_best.fit(X_train, y_train)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=2, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=84, presort='auto', random_state=None,
             subsample=1.0, verbose=0, warm_start=False)

In [21]:
# This will stop training when the validation error doesn't improve for 5 iterations in a row
gbrt = GradientBoostingRegressor(max_depth = 2, warm_start = True)

min_val_error = float('inf')
error_going_up = 0
for n_estimators in range(1, 120):
    gbrt.n_estimators = n_estimators
    gbrt.fit(X_train, y_train)
    y_pred = gbrt.predict(X_val)
    val_error = mean_squared_error(y_val, y_pred)
    if val_error < min_val_error:
        min_val_error = val_error
        error_going_up = 0
    else:
        error_going_up += 1
        if error_going_up == 5:
            break   #early stop

# Exercises
## #8

In [22]:
from sklearn.datasets import fetch_mldata
mnist = fetch_mldata('MNIST original')

In [41]:
X, y = mnist["data"], mnist["target"]
y.shape

(70000,)

In [42]:
# Create train, test, val sets
X_train, X_test, X_val = X[:50000], X[50000:60000], X[60000:]
y_train, y_test, y_val = y[:50000], y[50000:60000], y[60000:]
X_train.shape

(50000, 784)

In [43]:
# Shuffle the training set
import numpy as np

shuffle_index = np.random.permutation(50000)
X_train, y_train = X_train[shuffle_index], y_train[shuffle_index]

In [44]:
# Train various models
from sklearn.ensemble import RandomForestClassifier
rf_clf = RandomForestClassifier(max_depth = 2, random_state = 0)
rf_clf.fit(X_train, y_train)


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=2, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [50]:
from sklearn.ensemble import ExtraTreesClassifier
ext_clf = ExtraTreesClassifier(n_estimators=200, max_depth=None, min_samples_split=10, random_state=0)
ext_clf.fit(X_train, y_train)

ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=10,
           min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=1,
           oob_score=False, random_state=0, verbose=0, warm_start=False)

In [55]:
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
svm_clf = SVC(kernel = 'poly', degree = 3, coef0=1, C=5)
svm_clf.fit(X_train, y_train)

SVC(C=5, cache_size=200, class_weight=None, coef0=1,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='poly',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [56]:
# Check how they perform individually on the validation set
from sklearn.metrics import f1_score
rf_pred = rf_clf.predict(X_val)
rf_f1 = f1_score(rf_pred, y_val, average = 'weighted')
print("Random Forest F1:", rf_f1)

ext_pred = ext_clf.predict(X_val)
ext_f1 = f1_score(ext_pred, y_val, average = 'weighted')
print("Extra-trees F1:", ext_f1)

svm_pred = poly_kernel_svm_clf.predict(X_val)
svm_f1 = f1_score(svm_pred, y_val, average = 'weighted')
print("SVM F1:", svm_f1)


  'recall', 'true', average, warn_for)


Random Forest F1: 0.6248312541060128
Extra-trees F1: 0.9074312955719895




SVM F1: 0.9213049678300277


In [52]:
# Now create the ensemble
from sklearn.ensemble import VotingClassifier
ens_clf1 = RandomForestClassifier(max_depth = 2, random_state = 0)
ens_clf2 = ExtraTreesClassifier(n_estimators=200, max_depth=None, min_samples_split=10, random_state=0)
ens_clf3 = SVC(kernel = 'poly', degree = 3, coef0=1, C=5)

voting_clf = VotingClassifier(
    estimators = [('rfc', ens_clf1), ('etc', ens_clf2), ('svm', ens_clf3)],
    voting = 'soft')
voting_clf.fit(X_train, y_train)



VotingClassifier(estimators=[('rfc', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=2, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weig... max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))]))],
         flatten_transform=None, n_jobs=1, voting='hard', weights=None)

In [54]:
vote_pred = voting_clf.predict(X_val)
vote_f1 = f1_score(vote_pred, y_val, average = 'weighted')
print(vote_f1)



0.9083653753767718


  if diff:
  'recall', 'true', average, warn_for)


In [57]:
# Check on the test set
vote_pred = voting_clf.predict(X_test)
print("Voting F1: ", f1_score(vote_pred, y_test, average = 'weighted'))
rf_pred = rf_clf.predict(X_test)
print("Random Forest F1: ", f1_score(rf_pred, y_test, average = 'weighted'))
ext_pred = ext_clf.predict(X_test)
print("Extra Trees F1: ", f1_score(ext_pred, y_test, average = 'weighted'))
svm_pred = svm_clf.predict(X_test)
print("SVM F1: ", f1_score(svm_pred, y_test, average = 'weighted'))


  if diff:
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


Voting F1:  0.3143734009881159
Random Forest F1:  0.0
Extra Trees F1:  0.32647258107039284
SVM F1:  0.3724525436408978


In [58]:
rf_pred

array([3., 6., 1., ..., 7., 3., 3.])

In [59]:
y_test

array([8., 8., 8., ..., 9., 9., 9.])

## #9

In [65]:
vectors = list(zip(rf_clf.predict(X_val), ext_clf.predict(X_val), svm_clf.predict(X_val)))

In [76]:
vectors[9000]

(1.0, 4.0, 4.0)

In [None]:
blend_clf = SVC(kernel = 'poly', degree = 3, coef0=1, C=5)
blend_clf.fit(vectors, y_val)

In [None]:
test_vec = list(zip(rf_clf.predict(X_test), ext_clf.predict(X_test), svm_clf.predict(X_test)))
test_pred = blend_clf.predict(test_vec)
print("Blender F1: ", f1_score(test_pred, y_test, average = 'weighted'))