# Exercises 8 and 9 (Chapter 7)

8. Load the MNIST data (introduced in Chapter 3), and:
- split it into a training set, a validation set, and a test set (e.g., use 50,000 instances for training, 10,000 for validation, and 10,000 for testing)
- Then train various classifiers, such as a Random Forest classifier, an Extra-Trees classifier, and an SVM classifier.
- Next, try to combine them into an ensemble that outperforms each individual classifier on the validation set, using soft or hard voting.
- Once you have found one, try it on the test set. How much better does it perform compared to the individual classifiers?

In [1]:
# Python ≥3.5 is required
import re
import sys
assert sys.version_info >= (3, 5)

# Is this notebook running on Colab or Kaggle?
# (True when the corresponding module has already been imported in this session.)
IS_COLAB = "google.colab" in sys.modules
IS_KAGGLE = "kaggle_secrets" in sys.modules

# Scikit-Learn ≥0.20 is required
import sklearn
# Compare (major, minor) as integers: comparing the raw version strings is
# lexicographic, so e.g. "0.3" >= "0.20" would wrongly pass the check.
_sk_version = re.match(r"(\d+)\.(\d+)", sklearn.__version__)
assert _sk_version is not None
assert tuple(int(part) for part in _sk_version.groups()) >= (0, 20)

# Common imports
import numpy as np
import os
import pandas as pd

In [2]:
# For saving the models after they're trained
import pickle
from joblib import dump, load

In [3]:
from sklearn.datasets import fetch_openml

# Download (or read from the local scikit-learn cache) version 1 of the
# 'mnist_784' dataset; as_frame=False asks for NumPy arrays, not DataFrames.
mnist = fetch_openml(
    name='mnist_784',
    version=1,
    as_frame=False,
)

In [4]:
# Feature matrix and label vector. The labels are kept exactly as loaded;
# the prediction output further down shows them as strings such as '6'.
X = mnist["data"]
y = mnist["target"]

In [5]:
from sklearn.model_selection import train_test_split

# The first 60,000 rows are kept for training/validation; the remaining rows
# form the test set.
X_train_full, y_train_full = X[:60000], y[:60000]
X_test, y_test = X[60000:], y[60000:]

# Hold out 10,000 of those 60,000 rows as the validation set (fixed seed).
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full,
    y_train_full,
    test_size=10000,
    random_state=1989,
)

In [6]:
# Train RandomForest, ExtraTrees and SVM
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [7]:
import os
from os.path import exists

# Reuse the cached grid search when it exists: the fit below is expensive.
if exists('outputs/grid_rf_ch7.joblib'):
    grid_rf = load('outputs/grid_rf_ch7.joblib')
else:
    rnd_clf = RandomForestClassifier(n_estimators=500,   # number of trees
                                     n_jobs=-1,          # use all the cores
                                     random_state=1989)  # same seed as the other models, for reproducibility

    # Candidate tree sizes: 2, 22, ..., 482 leaves, plus None (no limit)
    max_leaf_nodes_params = list(range(2, 500, 20))
    max_leaf_nodes_params.append(None)

    # Kept under its own name so `grid_rf` only ever refers to the search object
    param_grid_rf = {
        'max_leaf_nodes': max_leaf_nodes_params
    }

    grid_rf = GridSearchCV(rnd_clf, param_grid_rf, cv=3, scoring='accuracy')

    # Create the cache directory *before* the long fit, so that a missing
    # folder cannot make dump() fail once training has finished.
    os.makedirs('outputs', exist_ok=True)

    grid_rf.fit(X_train, y_train)

    dump(grid_rf, 'outputs/grid_rf_ch7.joblib')

In [8]:
# Best cross-validation score found by the Random Forest grid search
# (the search above is configured with cv=3 and scoring='accuracy')
grid_rf.best_score_

0.9656199975724475

In [9]:
# Validation-set accuracy of the Random Forest grid search
from sklearn.metrics import accuracy_score

# The predictions are kept in a variable because they are reused later as
# input features for the blender.
y_pred_rf = grid_rf.predict(X_val)
accuracy_score(y_true=y_val, y_pred=y_pred_rf)

0.9696

In [10]:
# Now it's the turn of an ExtraTrees classifier
from sklearn.ensemble import ExtraTreesClassifier

In [11]:
import os

# Reuse the cached grid search when it exists: the fit below is expensive.
if exists('outputs/gridsearch_xt_ch7.joblib'):
    gridsearch_xt = load('outputs/gridsearch_xt_ch7.joblib')
else:
    xt_clf = ExtraTreesClassifier(n_jobs=-1,
                                  random_state=1989)

    # Search over max_leaf_nodes and n_estimators.
    # Candidate tree sizes: 2, 37, ..., 492 leaves, plus None (no limit)
    max_leaf_nodes_params = list(range(2, 500, 35))
    max_leaf_nodes_params.append(None)

    grid_xt = {
        'max_leaf_nodes': max_leaf_nodes_params,
        'n_estimators': [250, 500, 750]
    }

    gridsearch_xt = GridSearchCV(xt_clf,
                                 grid_xt,
                                 cv=3,
                                 scoring='accuracy')

    # Create the cache directory *before* the long fit, so that a missing
    # folder cannot make dump() fail once training has finished.
    os.makedirs('outputs', exist_ok=True)

    gridsearch_xt.fit(X_train, y_train)

    dump(gridsearch_xt, 'outputs/gridsearch_xt_ch7.joblib')

In [12]:
# Best cross-validation score found by the Extra-Trees grid search
# (the search above is configured with cv=3 and scoring='accuracy')
gridsearch_xt.best_score_

0.9695399971755755

In [32]:
# Hyper-parameter combination selected by the Extra-Trees grid search
gridsearch_xt.best_params_

{'max_leaf_nodes': None, 'n_estimators': 750}

In [13]:
# Validation-set accuracy of the Extra-Trees grid search; the predictions are
# kept because they are reused later as input features for the blender.
y_pred_xt = gridsearch_xt.predict(X_val)
accuracy_score(y_true=y_val, y_pred=y_pred_xt)

0.973

In [14]:
# AND FINALLY! THE SVM PREDICTOR
# Look at the SVM chapter exercises
# For this I have to scale and center the data
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [15]:
import os

# Reuse the cached pipeline when it exists: the fit below is expensive.
if exists('outputs/svm_clf_ch7.joblib'):
    svm_clf = load('outputs/svm_clf_ch7.joblib')
else:
    # Scale the features first, then fit an RBF-kernel SVM.
    # probability=True enables predict_proba, which soft voting needs; its
    # internal calibration is randomized, so the seed is fixed like the
    # other models' seeds.
    svm_clf = Pipeline([
            ("scaler", StandardScaler()),
            ("svm_clf", SVC(kernel="rbf", probability=True, random_state=1989))
        ])

    # Create the cache directory *before* the long fit, so that a missing
    # folder cannot make dump() fail once training has finished.
    os.makedirs('outputs', exist_ok=True)

    svm_clf.fit(X_train, y_train)

    dump(svm_clf, 'outputs/svm_clf_ch7.joblib')

In [16]:
# Validation-set accuracy of the SVM pipeline; the predictions are kept
# because they are reused later as input features for the blender.
y_pred_svm = svm_clf.predict(X_val)
accuracy_score(y_true=y_val, y_pred=y_pred_svm)

0.9652

Now I'm going to combine them in a voting classifier.
Note that scikit-learn's default `VotingClassifier` class would retrain all the models when it is fitted. Here I want to compare the performance of the already-trained models on their own with that of an ensemble that combines them, so I need a voting classifier that reuses the trained models as they are.

In [17]:
# Solution using mlxtend
from mlxtend.classifier import EnsembleVoteClassifier
import copy  # NOTE(review): not used in any of the visible cells
# Hard-voting ensemble built from the three models trained above.
# fit_base_estimators=False and use_clones=False are passed with the intent
# that the ensemble uses these exact, already-fitted objects rather than
# refitting copies of them (confirm against the mlxtend documentation).
eclf_hard = EnsembleVoteClassifier(clfs=[grid_rf, gridsearch_xt, svm_clf],
                                   voting='hard',
                                   fit_base_estimators=False,
                                   use_clones=False)

# fit() is still called on the ensemble itself; with the flags above it is not
# expected to retrain the base models (TODO confirm).
eclf_hard.fit(X_train, y_train)

# Accuracy of the hard-voting ensemble on the validation set
y_pred_vot_hard = eclf_hard.predict(X_val)
accuracy_score(y_val, y_pred_vot_hard)



0.9728

In [20]:
# Soft-voting ensemble built from the same three fitted models.
# The SVC inside svm_clf was created with probability=True, which soft voting
# presumably relies on for class probabilities (confirm against mlxtend docs).
eclf_soft = EnsembleVoteClassifier(clfs=[grid_rf, gridsearch_xt, svm_clf],
                                   voting='soft',
                                   fit_base_estimators=False,
                                   use_clones=False)

# As for the hard-voting ensemble, the flags above are meant to keep the base
# models from being retrained by this call (TODO confirm).
eclf_soft.fit(X_train, y_train)

# Accuracy of the soft-voting ensemble on the validation set
y_pred_vot_soft = eclf_soft.predict(X_val)
accuracy_score(y_val, y_pred_vot_soft)

0.9751

Now it's time to check the accuracy of the base models against the best voting classifier *on the test data* (not the validation data).

In [22]:
labels = ['Random Forest', 'ExtraTrees', 'SVM', 'Voting Classifier']
fitted_models = [grid_rf, gridsearch_xt, svm_clf, eclf_soft]

# Print the test-set accuracy of each base model and of the soft-voting
# ensemble, one line per model.
for label, clf in zip(labels, fitted_models):
    y_pred = clf.predict(X_test)
    score = accuracy_score(y_test, y_pred)
    print("Accuracy: %0.4f [%s]" % (score, label))


Accuracy: 0.9701 [Random Forest]
Accuracy: 0.9732 [ExtraTrees]
Accuracy: 0.9645 [SVM]
Accuracy: 0.9733 [Voting Classifier]


9. 

- Run the individual classifiers from the previous exercise to make predictions on the validation set
- create a new training set with the resulting predictions: each training instance is a vector containing the set of predictions from all your classifiers for an image, and the target is the image’s class

In [27]:
# Create columns of predictions from each model on validation set
# Shape check of the three validation-prediction vectors. Only the value of
# the last expression is shown as the cell output.
y_pred_rf.shape
y_pred_xt.shape
y_pred_svm.shape

(10000,)

In [31]:
# Blender training matrix: one row per validation image, one column per base
# model (Random Forest, Extra-Trees, SVM), holding that model's predicted label.
new_X_train = np.column_stack((y_pred_rf, y_pred_xt, y_pred_svm))

# Peek at the first five rows
new_X_train[:5]

array([['6', '6', '6'],
       ['1', '1', '1'],
       ['5', '5', '5'],
       ['0', '0', '0'],
       ['3', '3', '3']], dtype=object)

- Train a classifier on this new training set. Congratulations, you have just trained a blender, and together with the classifiers it forms a stacking ensemble!

In [39]:
# The blender is an Extra-Trees classifier tuned with a cross-validated grid
# search, like the base models.
blender_xt = ExtraTreesClassifier(random_state=1989, n_jobs=-1)

# Candidate tree sizes: 2 to 19 leaves, plus None (no limit)
max_leaf_nodes_blender = [*range(2, 20), None]

grid_blender = {
    'max_leaf_nodes': max_leaf_nodes_blender,
    'n_estimators': [20, 30, 40, 50, 60, 70, 75, 80, 90, 100],
}

gridsearch_blender = GridSearchCV(
    estimator=blender_xt,
    param_grid=grid_blender,
    scoring='accuracy',
    cv=3,
)

# The blender learns from the base models' validation predictions; the targets
# are the true validation labels.
gridsearch_blender.fit(new_X_train, y_val)

GridSearchCV(cv=3, estimator=ExtraTreesClassifier(n_jobs=-1, random_state=1989),
             param_grid={'max_leaf_nodes': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                            13, 14, 15, 16, 17, 18, 19, None],
                         'n_estimators': [20, 30, 40, 50, 60, 70, 75, 80, 90,
                                          100]},
             scoring='accuracy')

In [40]:
# Best cross-validation score found by the blender grid search
# (the search above is configured with cv=3 and scoring='accuracy')
gridsearch_blender.best_score_

0.9729000494530556

In [41]:
# Hyper-parameter combination selected by the blender grid search
gridsearch_blender.best_params_

{'max_leaf_nodes': 6, 'n_estimators': 75}

- Now evaluate the ensemble on the test set. For each image in the test set, make predictions with all your classifiers, then feed the predictions to the blender to get the ensemble’s predictions. How does it compare to the voting classifier you trained earlier?

In [42]:
# Test-set predictions of each base model
y_pred_test_rf = grid_rf.predict(X_test)
y_pred_test_xt = gridsearch_xt.predict(X_test)
y_pred_test_svm = svm_clf.predict(X_test)

# Blender input for the test set, with the same column order as new_X_train:
# Random Forest, Extra-Trees, SVM.
new_X_test = np.column_stack(
    (y_pred_test_rf, y_pred_test_xt, y_pred_test_svm)
)

In [43]:
# Feed the base models' test predictions to the blender and score the
# stacking ensemble on the test labels.
y_pred_test_blender = gridsearch_blender.predict(new_X_test)
accuracy_score(y_true=y_test, y_pred=y_pred_test_blender)

0.9729