# Testing MNIST dataset on different classifiers with an ensemble

In [1]:
import numpy as np

In [2]:
from sklearn.datasets import fetch_mldata

In [3]:
mnist = fetch_mldata('MNIST original')



In [4]:
mnist

{'DESCR': 'mldata.org dataset: mnist-original',
 'COL_NAMES': ['label', 'data'],
 'target': array([0., 0., 0., ..., 9., 9., 9.]),
 'data': array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=uint8)}

In [5]:
X, y = mnist["data"[:50000]], mnist["target"[:50000]]
#Xval, yval = mnist["data"[50000:]], mnist["target"[50000:]]

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [8]:
shuffle_index=np.random.permutation(50000)
X_train, y_train = X_train[shuffle_index], y_train[shuffle_index]

# Random Forest Classifier Training and prediction

In [9]:
from sklearn.ensemble import RandomForestClassifier

In [10]:
rnd_clf = RandomForestClassifier(n_estimators=1000, max_leaf_nodes=32, n_jobs=-1)
#rnd_clf.fit(X_train, y_train)

In [11]:
#y_pred_rf = rnd_clf.predict(X_test)

# Extra Trees Classifier Training and Prediction

In [12]:
from sklearn.ensemble import ExtraTreesClassifier

In [13]:
extrees_clf = ExtraTreesClassifier(n_estimators=1000, max_leaf_nodes=32, n_jobs=-1)
#extrees_clf.fit(X_train, y_train)

In [14]:
#y_pred_et = extrees_clf.predict(X_test)

# Support Vector Machine Training and Prediction

In [16]:
from sklearn.svm import LinearSVC
svm_clf = LinearSVC(random_state=42)

svm_clf = SVC(C=1, probability=True)
svm_clf.fit(X_train, y_train)

y_pred_svc = svm_clf.predict(X_test)

# Testing original classifiers, creation of a voting classifier, testing the various classifiers

In [17]:
from sklearn.metrics import accuracy_score
from sklearn.ensemble import VotingClassifier
voting_clf = VotingClassifier(
    estimators=[('rf', rnd_clf), ('svc', svm_clf),('et', extrees_clf)], voting='hard')

In [18]:
for clf in (extrees_clf, rnd_clf, svm_clf, voting_clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))

ExtraTreesClassifier 0.8547142857142858
RandomForestClassifier 0.8670714285714286




LinearSVC 0.8454285714285714




VotingClassifier 0.8682142857142857


# Removing SVM to see if performance can improve for VotingClassifier, also looking at soft classification

In [21]:
voting2_clf = VotingClassifier(
    estimators=[('rf', rnd_clf), ('et', extrees_clf)], voting='soft')
voting2_clf.fit(X_train, y_train)

VotingClassifier(estimators=[('rf', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=32,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weig...ators=1000, n_jobs=-1,
           oob_score=False, random_state=None, verbose=0, warm_start=False))],
         flatten_transform=None, n_jobs=None, voting='soft', weights=None)

In [22]:
y_pred_voting2 = voting2_clf.predict(X_test)

In [24]:
print(accuracy_score(y_test, y_pred_voting2))

0.864


# Achieving slightly lower accuracy on classification than the use of SVC and hard classification. Would need to change one or the other to find the sole attribution.

# Creation of a blender to make predictions

In [27]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)
# Note, this should be done before model training to ensure that data is 
# setup properly.

In [29]:
estimators=[rnd_clf, svm_clf, extrees_clf]
X_val_predictions = np.empty((len(X_val), len(estimators)), dtype=np.float32)

In [30]:
for index, estimator in enumerate(estimators):
    X_val_predictions[:, index] = estimator.predict(X_val)

In [31]:
X_val_predictions

array([[2., 8., 2.],
       [1., 1., 1.],
       [5., 5., 5.],
       ...,
       [7., 4., 7.],
       [7., 7., 7.],
       [2., 2., 2.]], dtype=float32)

# Here comes the fun part where we get to design, build, and implement a blender

In [32]:
rnd_forest_blender = RandomForestClassifier(n_estimators=200, oob_score=True, random_state=42)
rnd_forest_blender.fit(X_val_predictions, y_val)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=None,
            oob_score=True, random_state=42, verbose=0, warm_start=False)

In [33]:
rnd_forest_blender.oob_score_

0.9006428571428572

# The blender performs better on this data than any individual classifier, and the ensemble

#### Let's test it on the test data and feed multiple layers of the blender.

In [34]:
X_test_predictions = np.empty((len(X_test), len(estimators)), dtype=np.float32)
for index, estimator in enumerate(estimators):
    X_test_predictions[:, index] = estimator.predict(X_test)

In [35]:
y_pred = rnd_forest_blender.predict(X_test_predictions)

In [36]:
accuracy_score(y_test, y_pred)

0.892

# Best overall accuracy for all tested methods so far.