In [17]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "ensembles"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

### *Voting Classifier*

***Hard Voting***

In [18]:
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split

In [25]:
X, y = make_moons(n_samples = 500, noise = 0.30, random_state = 42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

In [26]:
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

In [27]:
reg_clf = LogisticRegression(solver="lbfgs", random_state = 42)
rf_clf = RandomForestClassifier(n_estimators = 100, random_state = 42)
svm_clf = SVC(gamma = "scale", random_state = 42)

In [28]:
vot_clf = VotingClassifier(
            estimators = [('lr', reg_clf), ('rf', rf_clf), ('svc', svm_clf)],
            voting = 'hard')

In [29]:
vot_clf.fit(X_train, y_train)

VotingClassifier(estimators=[('lr', LogisticRegression(random_state=42)),
                             ('rf', RandomForestClassifier(random_state=42)),
                             ('svc', SVC(random_state=42))])

In [31]:
from sklearn.metrics import accuracy_score

for clf in (reg_clf, rf_clf, svm_clf, vot_clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))

LogisticRegression 0.864
RandomForestClassifier 0.896
SVC 0.896
VotingClassifier 0.912


***Soft Voting***

In [33]:
ref_clf = LogisticRegression(solver = 'lbfgs', random_state = 42)
rf_clf = RandomForestClassifier(n_estimators = 100, random_state = 42)
svm_clf = SVC(gamma = 'scale', probability = True, random_state = 42)

In [34]:
vot_clf = VotingClassifier(
            estimators = [('lr', ref_clf), ('rf', rf_clf), ('svm', svm_clf)],
            voting = 'soft')

In [35]:
vot_clf.fit(X_train, y_train)

VotingClassifier(estimators=[('lr', LogisticRegression(random_state=42)),
                             ('rf', RandomForestClassifier(random_state=42)),
                             ('svm', SVC(probability=True, random_state=42))],
                 voting='soft')

In [36]:
for clf in (ref_clf, rf_clf, svm_clf, vot_clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))

LogisticRegression 0.864
RandomForestClassifier 0.896
SVC 0.896
VotingClassifier 0.92


### *Bagging and Pasting*

In [48]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier

bag_clf = BaggingClassifier(DecisionTreeClassifier(), 
                            n_estimators = 500, 
                            max_samples = 100, 
                            bootstrap = True, # for bagging (not pasting)
                            n_jobs = -1)

In [39]:
bag_clf.fit(X_train, y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(), max_samples=100,
                  n_estimators=500, n_jobs=-1)

In [40]:
y_pred = bag_clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.904

In [41]:
oob_clf = BaggingClassifier(DecisionTreeClassifier(), 
                            n_estimators = 500, 
                            max_samples = 100, 
                            bootstrap = True, 
                            n_jobs = -1, 
                            oob_score = True)

In [42]:
oob_clf.fit(X_train, y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(), max_samples=100,
                  n_estimators=500, n_jobs=-1, oob_score=True)

In [43]:
y_pred = oob_clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.904

In [44]:
oob_clf.oob_score_

0.9226666666666666

In [45]:
oob_clf.oob_decision_function_

array([[0.35074627, 0.64925373],
       [0.35658915, 0.64341085],
       [1.        , 0.        ],
       [0.01312336, 0.98687664],
       [0.02525253, 0.97474747],
       [0.11363636, 0.88636364],
       [0.40106952, 0.59893048],
       [0.06887755, 0.93112245],
       [0.93139842, 0.06860158],
       [0.84168865, 0.15831135],
       [0.50518135, 0.49481865],
       [0.04477612, 0.95522388],
       [0.70437018, 0.29562982],
       [0.82716049, 0.17283951],
       [0.93076923, 0.06923077],
       [0.09399478, 0.90600522],
       [0.03896104, 0.96103896],
       [0.92119565, 0.07880435],
       [0.64736842, 0.35263158],
       [0.96482412, 0.03517588],
       [0.03217158, 0.96782842],
       [0.23306233, 0.76693767],
       [0.88717949, 0.11282051],
       [0.98200514, 0.01799486],
       [0.95618557, 0.04381443],
       [0.00257732, 0.99742268],
       [0.97855228, 0.02144772],
       [1.        , 0.        ],
       [0.02849741, 0.97150259],
       [0.71276596, 0.28723404],
       [0.

### *Random Forests*

In [47]:
from sklearn.ensemble import RandomForestClassifier

In [51]:
rand_clf = RandomForestClassifier(n_estimators = 500, max_leaf_nodes = 16, n_jobs = -1)
rand_clf.fit(X_train, y_train)

RandomForestClassifier(max_leaf_nodes=16, n_estimators=500, n_jobs=-1)

In [52]:
y_pred = rand_clf.predict(X_test)

In [53]:
accuracy_score(y_test, y_pred)

0.92

In [54]:
from sklearn.datasets import load_iris

iris = load_iris()
X = iris["data"]
y = iris["target"]

In [56]:
rand_clf = RandomForestClassifier(n_estimators = 500, n_jobs = -1)
rand_clf.fit(X, y)

RandomForestClassifier(n_estimators=500, n_jobs=-1)

In [59]:
for name, score in zip(iris["feature_names"], rand_clf.feature_importances_):
    print(name, score)

sepal length (cm) 0.10599880324508258
sepal width (cm) 0.026481285908525984
petal length (cm) 0.42442825580521776
petal width (cm) 0.44309165504117376


### *Boosting*

*AdaBoost*

In [60]:
from sklearn.ensemble import AdaBoostClassifier
ada_clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth = 3), 
                             n_estimators = 200, 
                             algorithm = 'SAMME.R', 
                             learning_rate = 0.5)

In [61]:
ada_clf.fit(X_train, y_train)
y_pred = ada_clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.88

*Gradient Boosting*

In [75]:
# with early stopping

from sklearn.ensemble import GradientBoostingRegressor
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

from sklearn.datasets import make_moons

X, y = make_moons(n_samples = 500, noise = 0.30, random_state = 42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

gbrt = GradientBoostingRegressor(max_depth = 2, warm_start = True)

In [76]:
min_val_error = float('inf')
error_going_up = 0

for n_estimators in range(1, 120):
    gbrt.n_estimators = n_estimators
    gbrt.fit(X_train, y_train)
    y_pred = gbrt.predict(X_test)
    val_error = mean_squared_error(y_test, y_pred)
    if val_error < min_val_error:
        min_val_error = val_error
        error_going_up = 0
    else:
        error_going_up += 1
        if error_going_up == 5:
            break
            
            
print(gbrt.n_estimators)
print(min_val_error)

54
0.07118698905492556


#### *Question 8*

In [94]:
from sklearn.datasets import fetch_openml

mnist = fetch_openml('mnist_784', version=1)
mnist.target = mnist.target.astype(np.uint8)

from sklearn.model_selection import train_test_split

In [95]:
X, y = mnist.data, mnist.target

X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size = 10000, random_state = 42)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size = 10000, random_state = 42)

In [None]:
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

rand_clf = RandomForestClassifier(n_estimators = 500, max_leaf_nodes = 16, n_jobs = -1, random_state = 42)
ext_clf = ExtraTreesClassifier(n_estimators = 500, max_leaf_nodes = 16, n_jobs = -1, random_state = 42)
svm_clf = SVC(kernel = "rbf", gamma = 5, C = 0.001, random_state = 42)

val_error = []
estimators = [rand_clf, ext_clf, svm_clf]

for clf in estimators:
    print(clf)
    clf.fit(X_train, y_train)
#     y_val_predict = clf.predict(X_val)
#     val_error.append(accuracy_score(y_val, y_pred))

In [None]:
[clf.score(X_val, y_val) for clf in estimators]

In [None]:
from sklearn.ensemble import VotingClassifier

estimators = [
    ('random_forest', rand_clf),
    ('extra_random_forest', ext_clf),
    ('svm', svm_clf)
]

voting_clf = VotinClassifier(estimators)

In [None]:
voting_clf.fit(X_train, y_train)

In [None]:
voing_clf.score(X_val, y_val)

In [None]:
[clf.score(X_val, y_val) for clf in voting_clf.estimators_]

In [None]:
voting_clf.voting = "soft"
voting_clf.score(X_val, y_val)

In [None]:
voting_clf.voting = "hard"
voting_clf.score(X_test, y_test)

In [None]:
[clf.score(X_test, y_test) for clf in voting_clf.estimators_]

#### *Question 9*