# Accurate MNIST

The goal of this notebook is to train a model (using scikit-learn) to achieve at least 97% accuracy on the MNIST dataset. This is my solution to Ch3, exercise 1 in Hands-on ML using scikit-learn and tensorflow

In [1]:
import numpy as np
from sklearn.datasets import fetch_mldata

mnist = fetch_mldata('MNIST original')
mnist

{'COL_NAMES': ['label', 'data'],
 'DESCR': 'mldata.org dataset: mnist-original',
 'data': array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ..., 
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
 'target': array([ 0.,  0.,  0., ...,  9.,  9.,  9.])}

In [2]:
X = mnist.data   # (70000, 784)
y = mnist.target # (70000,)
X_train, X_test = X[:60000], X[60000:]
y_train, y_test = y[:60000], y[60000:]

In [3]:
from sklearn.ensemble import RandomForestClassifier

rfc_clf = RandomForestClassifier() # default params to start

In [4]:
from sklearn.metrics import accuracy_score
# first, let's see how the default params perform
rfc_clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [5]:
pred = rfc_clf.predict(X_test)

In [6]:
score = accuracy_score(y_test, pred)
print(score) # initial score = 94.94; not a bad starting point

0.9496


# Running a Grid Search on the RandomForestClassifier

In [7]:
from sklearn.model_selection import GridSearchCV

params = {'n_estimators' : [10, 20, 30],
          'max_features' : [None, 'sqrt', 'log2'],
          'max_depth' : [None, 10]
         }
grid_search = GridSearchCV(rfc_clf, params, cv=5, verbose=3)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 27 candidates, totalling 135 fits
[CV] max_depth=None, max_features=None, n_estimators=10 ..............
[CV]  max_depth=None, max_features=None, n_estimators=10, score=0.939192, total= 1.4min
[CV] max_depth=None, max_features=None, n_estimators=10 ..............


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.4min remaining:    0.0s


[CV]  max_depth=None, max_features=None, n_estimators=10, score=0.938594, total= 1.4min
[CV] max_depth=None, max_features=None, n_estimators=10 ..............


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  2.8min remaining:    0.0s


[CV]  max_depth=None, max_features=None, n_estimators=10, score=0.937750, total= 1.8min
[CV] max_depth=None, max_features=None, n_estimators=10 ..............
[CV]  max_depth=None, max_features=None, n_estimators=10, score=0.930316, total= 1.7min
[CV] max_depth=None, max_features=None, n_estimators=10 ..............
[CV]  max_depth=None, max_features=None, n_estimators=10, score=0.940480, total= 1.7min
[CV] max_depth=None, max_features=None, n_estimators=20 ..............
[CV]  max_depth=None, max_features=None, n_estimators=20, score=0.948272, total= 3.3min
[CV] max_depth=None, max_features=None, n_estimators=20 ..............
[CV]  max_depth=None, max_features=None, n_estimators=20, score=0.941926, total= 3.4min
[CV] max_depth=None, max_features=None, n_estimators=20 ..............
[CV]  max_depth=None, max_features=None, n_estimators=20, score=0.945000, total= 2.9min
[CV] max_depth=None, max_features=None, n_estimators=20 ..............
[CV]  max_depth=None, max_features=None, n_est

[CV]  max_depth=10, max_features=None, n_estimators=20, score=0.931977, total= 1.8min
[CV] max_depth=10, max_features=None, n_estimators=30 ................
[CV]  max_depth=10, max_features=None, n_estimators=30, score=0.931445, total= 2.4min
[CV] max_depth=10, max_features=None, n_estimators=30 ................
[CV]  max_depth=10, max_features=None, n_estimators=30, score=0.930428, total= 2.3min
[CV] max_depth=10, max_features=None, n_estimators=30 ................
[CV]  max_depth=10, max_features=None, n_estimators=30, score=0.915833, total= 2.4min
[CV] max_depth=10, max_features=None, n_estimators=30 ................
[CV]  max_depth=10, max_features=None, n_estimators=30, score=0.926898, total= 2.3min
[CV] max_depth=10, max_features=None, n_estimators=30 ................
[CV]  max_depth=10, max_features=None, n_estimators=30, score=0.933228, total= 2.4min
[CV] max_depth=10, max_features=sqrt, n_estimators=10 ................
[CV]  max_depth=10, max_features=sqrt, n_estimators=10, sc

[CV]  max_depth=5, max_features=sqrt, n_estimators=10, score=0.788250, total=   1.3s
[CV] max_depth=5, max_features=sqrt, n_estimators=10 .................
[CV]  max_depth=5, max_features=sqrt, n_estimators=10, score=0.812787, total=   1.2s
[CV] max_depth=5, max_features=sqrt, n_estimators=10 .................
[CV]  max_depth=5, max_features=sqrt, n_estimators=10, score=0.820857, total=   1.3s
[CV] max_depth=5, max_features=sqrt, n_estimators=20 .................
[CV]  max_depth=5, max_features=sqrt, n_estimators=20, score=0.839150, total=   2.4s
[CV] max_depth=5, max_features=sqrt, n_estimators=20 .................
[CV]  max_depth=5, max_features=sqrt, n_estimators=20, score=0.811031, total=   2.5s
[CV] max_depth=5, max_features=sqrt, n_estimators=20 .................
[CV]  max_depth=5, max_features=sqrt, n_estimators=20, score=0.826083, total=   2.4s
[CV] max_depth=5, max_features=sqrt, n_estimators=20 .................
[CV]  max_depth=5, max_features=sqrt, n_estimators=20, score=0.8

[Parallel(n_jobs=1)]: Done 135 out of 135 | elapsed: 207.9min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': [10, 20, 30], 'max_features': [None, 'sqrt', 'log2'], 'max_depth': [None, 10, 5]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=3)

In [13]:
rfc_best = grid_search.best_estimator_
pred = rfc_best.predict(X_test)
score = accuracy_score(y_test, pred)
score

# 96.31% test accuracy - not bad, but still not 97%

0.96309999999999996

In [15]:
# seems like the most important parameter is n_estimators
# we'll run another grid search with different n_estimators values
rfc_clf = RandomForestClassifier() # starting over with a new blank classifier
grid_search = GridSearchCV(rfc_clf, {'n_estimators' : [30, 35, 40, 45]},
                           cv=5, verbose=3)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV] n_estimators=30 .................................................
[CV] .................. n_estimators=30, score=0.961100, total=   9.1s
[CV] n_estimators=30 .................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    9.8s remaining:    0.0s


[CV] .................. n_estimators=30, score=0.957007, total=   8.8s
[CV] n_estimators=30 .................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   19.3s remaining:    0.0s


[CV] .................. n_estimators=30, score=0.959917, total=   9.2s
[CV] n_estimators=30 .................................................
[CV] .................. n_estimators=30, score=0.957656, total=   9.1s
[CV] n_estimators=30 .................................................
[CV] .................. n_estimators=30, score=0.966155, total=   8.8s
[CV] n_estimators=35 .................................................
[CV] .................. n_estimators=35, score=0.963099, total=  10.5s
[CV] n_estimators=35 .................................................
[CV] .................. n_estimators=35, score=0.961256, total=  10.4s
[CV] n_estimators=35 .................................................
[CV] .................. n_estimators=35, score=0.959250, total=  10.4s
[CV] n_estimators=35 .................................................
[CV] .................. n_estimators=35, score=0.958156, total=  10.1s
[CV] n_estimators=35 .................................................
[CV] .

[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:  4.1min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': [30, 35, 40, 45]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=3)

In [18]:
grid_search.best_params_

{'n_estimators': 45}

In [16]:
rfc_clf_best = grid_search.best_estimator_
pred = rfc_clf_best.predict(X_test)
score = accuracy_score(y_test, pred)
score

# 96.70% test accuracy - so close I can taste it!!

0.96699999999999997

# Gradient Tree Boosting

