In [1]:
import copy

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy as sp
import seaborn as sns

# Notebook-wide pandas display settings: no limit on column width or on the
# number of columns shown, and four digits of floating-point precision.
for option_name, option_value in (
    ('display.max_colwidth', None),
    ('display.max_columns', None),
    ('display.precision', 4),
):
    pd.set_option(option_name, option_value)

# Default figure geometry and font sizes for every plot in this notebook,
# applied in a single rcParams update.
plt.rcParams.update({
    'figure.figsize': (6, 6),
    'axes.labelsize': 12,
    'xtick.labelsize': 10,
    'ytick.labelsize': 10,
    'figure.titlesize': 14,
    'axes.titlesize': 12,
})

%matplotlib inline

### Exercise 7. Voting classifier

In [2]:
from sklearn.datasets import fetch_openml

# Download the MNIST dataset from OpenML (dataset name 'mnist_784', version 1).
mnist = fetch_openml(name='mnist_784', version=1)
# Cast the labels to unsigned 8-bit integers so that downstream code works
# with numeric targets.
mnist['target'] = mnist['target'].astype(np.uint8)

In [3]:
from sklearn.model_selection import train_test_split

In [4]:
# Both splits carve off 10,000 samples with the same seed: first the test
# set from the full data, then the validation set from what remains.
holdout_kwargs = dict(test_size=10000, random_state=42)

X_train_val, X_test, y_train_val, y_test = train_test_split(
    mnist.data, mnist.target, **holdout_kwargs)

X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, **holdout_kwargs)

In [30]:
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

In [12]:
# The two tree ensembles are configured identically (100 trees, depth capped
# at 10, four parallel jobs, fixed seed).
forest_kwargs = dict(n_estimators=100, max_depth=10, n_jobs=4, random_state=42)
rf_clf = RandomForestClassifier(**forest_kwargs)
et_clf = ExtraTreesClassifier(**forest_kwargs)

# NOTE(review): saga runs with its default iteration budget and no feature
# scaling step is visible in this notebook -- confirm that it converges.
lr_clf = LogisticRegression(solver='saga', random_state=42)
nn_clf = MLPClassifier(random_state=42)

# Majority-vote ensemble over the four base models defined above.
hard_voting = VotingClassifier(
    estimators=[('rf', rf_clf), ('et', et_clf), ('lr', lr_clf), ('nn', nn_clf)],
    voting='hard',
)

In [13]:
# Fit every base model and the hard-voting ensemble on the training split,
# then record and print its accuracy, keyed by the estimator's class name.
# NOTE(review): accuracy is measured on X_test rather than X_val, so any
# model choice driven by these numbers has already looked at the test set.
accuracy_output = {}

for estimator in (rf_clf, et_clf, lr_clf, nn_clf, hard_voting):
    est_name = type(estimator).__name__
    y_pred = estimator.fit(X_train, y_train).predict(X_test)
    accuracy_output[est_name] = accuracy_score(y_test, y_pred)
    print(est_name, ": ",  accuracy_output[est_name])

RandomForestClassifier :  0.9432
ExtraTreesClassifier :  0.9353




LogisticRegression :  0.9199
MLPClassifier :  0.9633




VotingClassifier :  0.9516


In [14]:
# Accuracy of the fitted voting ensemble on the held-out test split.
accuracy_score(y_test, hard_voting.predict(X_test))

0.9516

In [15]:
# Evaluate soft voting without refitting and without mutating `hard_voting`.
# A shallow copy shares the already fitted sub-estimators, so the two
# ensemble objects differ only in their `voting` flag. Flipping
# `hard_voting.voting` in place would leave a "hard" ensemble that silently
# votes softly, and re-running the hard-voting score cell would then report
# the soft-voting result.
soft_voting = copy.copy(hard_voting)
soft_voting.voting = "soft"
soft_voting.score(X_test, y_test)

0.9644

In [16]:
# Test-set score of each fitted ensemble member, in the order the members
# were passed to the VotingClassifier.
# NOTE(review): this compares the members' predictions with the raw y_test
# labels; presumably the ensemble's internal label encoding coincides with
# the 0-9 digit labels -- confirm.
[
    fitted_member.score(X_test, y_test)
    for fitted_member in hard_voting.estimators_
]

[0.9432, 0.9353, 0.9199, 0.9633]

### Exercise 8. Implement stacking

In [54]:
# Number of labels in the training, validation and test splits.
tuple(labels.size for labels in (y_train, y_val, y_test))

(50000, 10000, 10000)

In [65]:
# First stacking layer: the four base models fitted earlier.
layer1_estimators = [rf_clf, et_clf, lr_clf, nn_clf]

# One column of predicted labels per base model, computed on the validation
# split; these columns become the input features of the blender.
layer1_output = [
    pd.Series(base_clf.predict(X_val)) for base_clf in layer1_estimators
]

layer2_input = pd.concat(layer1_output, axis=1)
layer2_input.shape

(10000, 4)

In [66]:
# Grid of tree depths and forest sizes to try for the blender.
blender_params = [{"max_depth": [10, 20, 50], 'n_estimators': [100, 200]}]

# The blender is a random forest with out-of-bag scoring enabled, tuned by a
# 4-fold cross-validated accuracy search over the grid above.
blender_base = RandomForestClassifier(random_state=42, oob_score=True)
blender_cv = GridSearchCV(
    estimator=blender_base,
    param_grid=blender_params,
    scoring='accuracy',
    cv=4,
    n_jobs=4,
    verbose=2,
)

# Train on the layer-1 validation predictions against the validation labels.
blender_cv.fit(layer2_input, y_val)

Fitting 4 folds for each of 6 candidates, totalling 24 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  24 out of  24 | elapsed:    3.9s finished


GridSearchCV(cv=4,
             estimator=RandomForestClassifier(oob_score=True, random_state=42),
             n_jobs=4,
             param_grid=[{'max_depth': [10, 20, 50],
                          'n_estimators': [100, 200]}],
             scoring='accuracy', verbose=2)

In [67]:
# Hyper-parameter combination selected by the accuracy-scored grid search.
blender_cv.best_params_

{'max_depth': 10, 'n_estimators': 200}

In [68]:
# Out-of-bag score of the best blender found by the search (available
# because the forest was constructed with oob_score=True).
blender_cv.best_estimator_.oob_score_

0.9609

In [70]:
# Push the test split through the same two layers: base-model predictions
# become the blender's input columns, and the blender is scored on y_test.
layer1_output_test = [
    pd.Series(base_clf.predict(X_test)) for base_clf in layer1_estimators
]

layer2_input_test = pd.concat(layer1_output_test, axis=1)
blender_cv.best_estimator_.score(layer2_input_test, y_test)

0.9625