In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()

from sklearn.datasets import fetch_openml 

from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

# Seed NumPy's global RNG: scikit-learn estimators created without an explicit
# random_state draw from it, so a fresh Restart & Run All repeats the same results.
np.random.seed(42)

In [None]:
import matplotlib as mpl

# Default figure size (width, height) for plots created after this cell.
mpl.rc('figure', figsize=(9, 6))

# Etude 1 - Ensemble Learning

Load the MNIST data and split it into a training set, a validation set, and a test set (e.g., use the first 50,000 instances for training, the next 10,000 for validation, and the last 10,000 for testing). Then train various classifiers, such as a Random Forest classifier, an Extra-Trees classifier, and an SVM. Next, try to combine them into an ensemble that outperforms them all on the validation set, using a soft or hard voting classifier. Once you have found one, try it on the test set. How much better does it perform compared to the individual classifiers?

In [None]:
# Fetch the MNIST digits from OpenML (70,000 images of 784 = 28x28 pixels each).
mnist = fetch_openml('mnist_784', version=1)
# Cast the labels to int64 (they presumably arrive as strings/categories from OpenML).
mnist["target"] = mnist["target"].astype(np.int64)

In [None]:
# Feature matrix (one flattened image per row) and the matching label vector.
X = mnist["data"]
y = mnist["target"]

In [None]:
X.shape  # (number of images, pixels per image)

(70000, 784)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=50000, random_state=42)

In [None]:
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, train_size=10000, random_state=42)

Let's train 4 classifiers: Random Forest classifier, Extra-Trees classifier, SVM, and KNN. Then let's look at their accuracy.

In [None]:
# Fix random_state on every stochastic estimator so each model trains identically
# no matter in which order (or how often) the cells are executed; relying only on the
# global NumPy seed makes results depend on how many random draws happened before.
rforest = RandomForestClassifier(n_jobs=-1, random_state=42)
etree = ExtraTreesClassifier(n_jobs=-1, random_state=42)
# NOTE(review): LinearSVC is fed unscaled pixel values here - it may hit max_iter before converging.
svc = LinearSVC(max_iter=1500, random_state=42)
# k-nearest neighbours has no random component, hence no random_state.
knn = KNeighborsClassifier(n_jobs=-1)

In [None]:
classifiers = [rforest, etree, svc, knn]
# Train each model on the training set, printing its repr first as a progress marker.
for classifier in classifiers:
    print(classifier)
    classifier.fit(X_train, y_train)  

RandomForestClassifier(n_jobs=-1)
ExtraTreesClassifier(n_jobs=-1)
LinearSVC(max_iter=1500)




KNeighborsClassifier(n_jobs=-1)


Let's view the score for each particular classifier. 

In [None]:
# Mean accuracy of each standalone classifier on the validation set, in the order of `classifiers`.
[classifier.score(X_val, y_val) for classifier in classifiers]    

[0.9672, 0.9678, 0.8612, 0.9675]

Now we combine these classifiers into a VotingClassifier ensemble:

In [None]:
# Majority-vote ("hard") ensemble over the four models; `voting='hard'` is the
# default and is spelled out only for contrast with the soft-voting variant used later.
vote_clf = VotingClassifier(
    estimators=[
        ('rforest', rforest),
        ('etree', etree),
        ('svc', svc),
        ('knn', knn),
    ],
    voting='hard',
    n_jobs=-1,
)

In [None]:
# Fits a fresh clone of every listed estimator (stored in `estimators_`); the models trained earlier are not reused.
vote_clf.fit(X_train, y_train)

VotingClassifier(estimators=[('rforest', RandomForestClassifier(n_jobs=-1)),
                             ('etree', ExtraTreesClassifier(n_jobs=-1)),
                             ('svc', LinearSVC(max_iter=1500)),
                             ('knn', KNeighborsClassifier(n_jobs=-1))],
                 n_jobs=-1)

In [None]:
# Validation accuracy of the hard-voting ensemble with all four estimators.
vote_clf.score(X_val, y_val)

0.9683

Let's view the score for each estimator in the voting classifier:

In [None]:
# Score the ensemble's own fitted clones one by one. VotingClassifier trains them on
# label-encoded targets, which for the digit classes 0-9 coincide with the original
# labels, so comparing their predictions against y_val is valid.
[estimator.score(X_val, y_val) for estimator in vote_clf.estimators_]

[0.9667, 0.9699, 0.8371, 0.9675]

The third classifier (LinearSVC) produces by far the lowest score. Let's remove it from the voting classifier.

In [None]:
# Drop the fitted LinearSVC clone (index 2) so prediction ignores it. This edits only the
# fitted list: the `estimators` parameter still names 'svc', so calling fit() again would bring it back.
del vote_clf.estimators_[2]

In [None]:
# Confirm that only the three remaining fitted estimators are left.
vote_clf.estimators_

[RandomForestClassifier(n_jobs=-1),
 ExtraTreesClassifier(n_jobs=-1),
 KNeighborsClassifier(n_jobs=-1)]

In [None]:
# Validation accuracy of hard voting over the three remaining estimators.
vote_clf.score(X_val, y_val)

0.971

The score indeed improved a bit. Now let's use the same voting classifier, but with "soft" voting.

In [None]:
# Switch to probability-averaging ("soft") voting; nothing is refitted here, the already-fitted estimators are reused.
vote_clf.set_params(voting='soft')

VotingClassifier(estimators=[('rforest', RandomForestClassifier(n_jobs=-1)),
                             ('etree', ExtraTreesClassifier(n_jobs=-1)),
                             ('svc', LinearSVC(max_iter=1500)),
                             ('knn', KNeighborsClassifier(n_jobs=-1))],
                 n_jobs=-1, voting='soft')

Note: If we refit the ensemble without removing LinearSVC from it, soft voting would fail with an error, since this classifier cannot return probabilities (it has no `predict_proba()` method).

In [None]:
# The fitted list is untouched by set_params: still the three estimators without LinearSVC.
vote_clf.estimators_

[RandomForestClassifier(n_jobs=-1),
 ExtraTreesClassifier(n_jobs=-1),
 KNeighborsClassifier(n_jobs=-1)]

Let's estimate its accuracy on the validation set.

In [None]:
# Validation accuracy with soft voting over the three remaining estimators.
vote_clf.score(X_val, y_val)

0.9733

We obtained a slightly higher result than with hard voting. Now let's estimate this classifier's accuracy on the test set.

In [None]:
# Final check on the test set, which played no part in training or in choosing the ensemble.
vote_clf.score(X_test, y_test)

0.9744

Note: you can get the same result by running predict() on the X_test and then using the accuracy_score() function.

# Etude 2 - Blender

Run the individual classifiers from the previous exercise to make predictions on the validation set, and create a new training set with the resulting predictions:
each training instance is a vector containing the set of predictions from all your classifiers for an image, and the target is the image’s class. Train a classifier on this new training set. Congratulations, you have just trained a blender, and together with the classifiers they form a stacking ensemble! Now let’s evaluate the ensemble on the test set. For each image in the test set, make predictions with all your classifiers, then feed the predictions to the blender to get the ensemble’s predictions. How does it compare to the voting classifier you trained earlier?

In [None]:
def blender(X, classifiers):
    """Stack the predictions of several fitted classifiers column by column.

    Parameters
    ----------
    X : sequence or array
        Instances to predict; only ``len(X)`` and ``predict(X)`` are used.
    classifiers : sequence
        Objects exposing ``predict(X)``; one output column is produced per
        classifier, in the given order.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(X), len(classifiers))`` whose column ``j``
        holds ``classifiers[j].predict(X)``.
    """
    stacked = np.empty((len(X), len(classifiers)))
    # Iterating over the transpose yields writable views of the columns.
    for column, model in zip(stacked.T, classifiers):
        column[:] = model.predict(X)
    return stacked

Now we train the blender using the validation set (we should use the set that was not used in any way to train any of the predictors).

In [None]:
# Meta-learner ("blender"): a random forest trained on the base classifiers'
# validation-set predictions. random_state pins its result regardless of
# how many random draws earlier cells consumed from the global RNG.
blender_rf = RandomForestClassifier(n_jobs=-1, random_state=42)
blender_rf.fit(blender(X_val, classifiers), y_val)

RandomForestClassifier(n_jobs=-1)

Now let's test our blender on the test set.

In [None]:
# Stack the base classifiers' test-set predictions, then score the blender on them.
X_test_blend = blender(X_test, classifiers)
blender_rf.score(X_test_blend, y_test)

0.972