# MNIST CLASSIFIER

In [13]:
# loads common libraries
import sklearn
import numpy as np
import pandas as pd

# to plot pretty figures
#%matplotlib inline
import matplotlib.pyplot as plt

import matplotlib as mpl
# Notebook-wide matplotlib defaults: axis-label and tick-label sizes,
# plus the "Azeret Mono" font family in bold weight.
mpl.rcParams.update({
    "axes.labelsize": 14,
    "xtick.labelsize": 12,
    "ytick.labelsize": 12,
    "font.family": "Azeret Mono",
    "font.weight": "bold",
})

In [17]:
# Read the MNIST train and test CSV files into two DataFrames.
data_train, data_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "datasets/mnist/mnist_train.csv",
        "datasets/mnist/mnist_test.csv",
    )
)

# Print the (rows, columns) shape of each frame as a quick sanity check.
print(f"Train {data_train.shape} \n Test {data_test.shape}")

Train (60000, 785) 
 Test (10000, 785)


In [19]:
# Show the first five rows to inspect the column layout.
data_train.head(n=5)

Unnamed: 0,label,1x1,1x2,1x3,1x4,1x5,1x6,1x7,1x8,1x9,...,28x19,28x20,28x21,28x22,28x23,28x24,28x25,28x26,28x27,28x28
0,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,9,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
# Training split: the first 50000 rows of the training file.
# Features are every column except "label"; the target is the "label" column.
X_train = data_train[:50000].drop(columns="label")
y_train = data_train["label"][:50000]

In [21]:
# Validation split: rows 50000 through 59999 of the training file.
X_validation = data_train[50000:60000].drop(columns="label")
y_validation = data_train["label"][50000:60000]

# Test split: the whole test file, with "label" separated from the features.
X_test = data_test.drop(columns="label")
y_test = data_test["label"]

## 1. Train various classifiers

In [22]:
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier

# Base classifiers, each constructed with random_state=42.
# The two tree ensembles share identical constructor arguments.
random_forest, extra_trees = (
    tree_ensemble(n_estimators=100, random_state=42)
    for tree_ensemble in (RandomForestClassifier, ExtraTreesClassifier)
)
svm = LinearSVC(dual=True, tol=20, max_iter=100, random_state=42)
mlp = MLPClassifier(random_state=42)

estimators = [random_forest, extra_trees, svm, mlp]

# Fit every model on the training split, printing its repr before it starts.
for estimator in estimators:
    print("Training time", estimator)
    estimator.fit(X_train, y_train)

# Validation score of each fitted model, in the same order as `estimators`.
[fitted.score(X_validation, y_validation) for fitted in estimators]

Training time RandomForestClassifier(random_state=42)
Training time ExtraTreesClassifier(random_state=42)
Training time LinearSVC(dual=True, max_iter=100, random_state=42, tol=20)
Training time MLPClassifier(random_state=42)


[0.9736, 0.9743, 0.8662, 0.9635]

## 2. Combine the classifiers into an ensemble that outperforms them all

In [None]:
from sklearn.ensemble import VotingClassifier

# Pair each base classifier with the name the ensemble will refer to it by.
named_estimators = list(zip(
    ("random forest", "extra trees", "SVM", "MLP"),
    (random_forest, extra_trees, svm, mlp),
))

# Build the voting ensemble with its default settings and fit it on the
# training split.
voting_classifier = VotingClassifier(estimators=named_estimators)
voting_classifier.fit(X_train, y_train)

# Score of the ensemble on the validation split (displayed as the cell output).
voting_classifier.score(X_validation, y_validation)
