# Dependencies

In [2]:
from sklearn.svm import SVC
from sklearn.datasets import fetch_openml
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, VotingClassifier, StackingClassifier

Load the MNIST dataset and split it into a training set (the first 50,000 images),
a validation set (the next 10,000), and a test set (the last 10,000).

In [3]:
# Download MNIST as plain NumPy arrays rather than a pandas DataFrame.
mnist = fetch_openml('mnist_784', as_frame=False)
X, y = mnist.data, mnist.target

# Slice boundaries: [0, TRAIN_END) -> train, [TRAIN_END, VAL_END) -> validation,
# everything from VAL_END onwards -> test.
TRAIN_END, VAL_END = 50000, 60000

X_train, y_train = X[:TRAIN_END], y[:TRAIN_END]
X_val, y_val = X[TRAIN_END:VAL_END], y[TRAIN_END:VAL_END]
X_test, y_test = X[VAL_END:], y[VAL_END:]

# Sanity check: report the shape of every split.
for label, split in (
    ('X_train', X_train), ('y_train', y_train),
    ('X_val', X_val), ('y_val', y_val),
    ('X_test', X_test), ('y_test', y_test),
):
    print(f'{label}: {split.shape}')

X_train: (50000, 784)
y_train: (50000,)
X_val: (10000, 784)
y_val: (10000,)
X_test: (10000, 784)
y_test: (10000,)


# Single Classifiers vs Ensemble

Train various classifiers, such as a random forest classifier, an extra-trees
classifier, and an SVM classifier. Combine them into an
ensemble that outperforms each individual classifier on the validation
set, using soft or hard voting.

In [4]:
# Hard voting: every base model casts one vote per image and the class
# predicted most often wins.
hard_voting_members = [
    ('RandomForests', RandomForestClassifier(random_state=1)),
    ('Extra-Trees', ExtraTreesClassifier(random_state=1)),
    ('Support Vector Machine', SVC(random_state=1)),
]
hardvoting_clf = VotingClassifier(estimators=hard_voting_members, voting='hard')

# Kept as the cell's last expression so the fitted ensemble is displayed.
hardvoting_clf.fit(X_train, y_train)

In [5]:
# Soft voting: the class with the largest average predicted probability wins.
# SVC is built with probability=True so that it can supply class probabilities.
soft_voting_members = [
    ('RandomForests', RandomForestClassifier(random_state=2)),
    ('Extra-Trees', ExtraTreesClassifier(random_state=2)),
    ('Support Vector Machine', SVC(probability=True, random_state=2)),
]
softvoting_clf = VotingClassifier(estimators=soft_voting_members, voting='soft')

# Kept as the cell's last expression so the fitted ensemble is displayed.
softvoting_clf.fit(X_train, y_train)

In [8]:
# Score every base model on its own so the ensembles have a baseline to beat.
#
# The hard-voting ensemble already holds fitted copies of exactly these
# estimators (same classes, same random_state=1, same training data), so they
# are reused here instead of being rebuilt, cloned and trained a second time.
#
# VotingClassifier fits its members on label-encoded targets, so the validation
# labels have to go through the same encoder before a member is scored directly.
y_val_encoded = hardvoting_clf.le_.transform(y_val)

for name, fitted_model in hardvoting_clf.named_estimators_.items():
    print(name, "=", fitted_model.score(X_val, y_val_encoded))

RandomForests = 0.9732
Extra-Trees = 0.9743
Support Vector Machine = 0.9802


In [9]:
# Validation accuracy of both voting ensembles, one heading per ensemble.
voting_reports = (
    ("Hard Voting Classifier:", hardvoting_clf),
    ("\nSoft Voting Classifier:", softvoting_clf),
)

for heading, voting_clf in voting_reports:
    print(heading)
    print(voting_clf.score(X_val, y_val))

Hard Voting Classifier:
0.9772

Soft Voting Classifier:
0.9803


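The soft voting classifier achieves the highest validation accuracy (0.9803). It is clearly ahead of the hard voting classifier (0.9772), but only marginally better than the best individual model, the SVM (0.9802). Note that the hard voting ensemble does not outperform the SVM on its own.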

# Blender

Run the individual classifiers to make predictions on the validation set, and create a new training set with the resulting predictions: each training instance is a vector containing the set of predictions from all classifiers for an image, and the target is the image’s class. Train a classifier on this new training set to get a blender. Evaluate the ensemble on the test set.
Try again using a StackingClassifier.

In [None]:
import numpy as np

In [None]:
# First-level models for the hand-made blender, all trained on the training split.
rf_clf = RandomForestClassifier(random_state=3)   # random forest
et_clf = ExtraTreesClassifier(random_state=3)     # extra-trees
svm_clf = SVC(random_state=3)                     # support vector machine

for tree_ensemble in (rf_clf, et_clf):
    tree_ensemble.fit(X_train, y_train)

# Kept as the cell's last expression so the fitted SVM is displayed.
svm_clf.fit(X_train, y_train)

In [None]:
# Predict the validation set with every base model and build a new training
# set out of it: one row per image, one column per base model's prediction.
rf_pred, et_pred, svm_pred = (
    first_level_clf.predict(X_val) for first_level_clf in (rf_clf, et_clf, svm_clf)
)

new_X = np.column_stack([rf_pred, et_pred, svm_pred])

In [None]:
# The blender: a second-level random forest whose features are the three base
# predictions for an image and whose target is the image's true class.
rf2 = RandomForestClassifier(random_state=42)
rf2.fit(X=new_X, y=y_val)

In [None]:
# Test-set evaluation of the hand-made ensemble: the base models predict first,
# then the blender turns their three predictions into the final answer.
rf_test, et_test, svm_test = (
    first_level_clf.predict(X_test) for first_level_clf in (rf_clf, et_clf, svm_clf)
)
new_X_test = np.column_stack([rf_test, et_test, svm_test])

rf2_pred = rf2.predict(new_X_test)
blender_accuracy = round(accuracy_score(y_test, rf2_pred), 4)
print(f'Accuracy score: {blender_accuracy}')

Accuracy score: 0.9751


In [None]:
# Same idea as the hand-made blender, this time delegated to scikit-learn's
# StackingClassifier: three first-level models plus a random-forest blender.
stacking_members = [
    ('RandomForests', RandomForestClassifier(random_state=4)),
    ('Extra-Trees', ExtraTreesClassifier(random_state=4)),
    ('Support Vector Machine', SVC(probability=True, random_state=4)),
]
stacking_blender = RandomForestClassifier(random_state=5)

stack_clf = StackingClassifier(
    estimators=stacking_members,
    final_estimator=stacking_blender,
)

# Kept as the cell's last expression so the fitted ensemble is displayed.
stack_clf.fit(X_train, y_train)

In [None]:
# Accuracy of the stacking ensemble on the validation split ...
stack_pred_val = stack_clf.predict(X_val)
val_accuracy = round(accuracy_score(y_val, stack_pred_val), 4)
print(f'Validation score: {val_accuracy}')

# ... and on the held-out test split.
stack_pred_test = stack_clf.predict(X_test)
test_accuracy = round(accuracy_score(y_test, stack_pred_test), 4)
print(f'Test Accuracy score: {test_accuracy}')

Validation score: 0.9818
Test Accuracy score: 0.9799


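The StackingClassifier outperforms the hand-made blender on the test set (0.9799 vs. 0.9751). Two differences likely explain the gap. First, because every base estimator here exposes `predict_proba`, the StackingClassifier feeds the final estimator class probabilities instead of hard label predictions, which carry more information. Second, it builds the blender's training set from cross-validated (out-of-fold) predictions over the whole 50,000-image training set, whereas the hand-made blender was trained only on predictions for the 10,000 validation images.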