# Accurate MNIST

The goal of this notebook is to train a model (using scikit-learn) that achieves at least 97% accuracy on the MNIST test set. This is my solution to Chapter 3, Exercise 1 of *Hands-On Machine Learning with Scikit-Learn and TensorFlow*.

In [None]:
import numpy as np
from sklearn.datasets import fetch_openml

# fetch_mldata was removed from scikit-learn (0.22) and mldata.org is offline,
# so MNIST is pulled from OpenML instead. as_frame=False keeps `data` and
# `target` as plain NumPy arrays so positional slicing works downstream.
mnist = fetch_openml('mnist_784', version=1, as_frame=False)

# OpenML serves the digit labels as strings ('0'..'9'); convert them to
# integers so they behave like the numeric labels used in the rest of the
# notebook.
mnist.target = mnist.target.astype(np.uint8)

# Show only the array shapes rather than dumping the whole dataset object.
mnist.data.shape, mnist.target.shape

In [None]:
# Pixel matrix (70000, 784) and label vector (70000,) straight from the bunch.
X, y = mnist.data, mnist.target

# Cut both arrays at row 60000: everything before it is used for training,
# everything from it onward is held out for testing.
X_train, X_test = np.split(X, [60000])
y_train, y_test = np.split(y, [60000])

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Baseline forest to start from.
# - n_estimators=10 is written out explicitly: it was the library default in
#   scikit-learn releases before 0.22 (newer releases default to 100), so
#   pinning it keeps the baseline the same regardless of installed version.
# - random_state makes the fitted forest, and therefore its score, repeatable.
# - n_jobs=-1 builds the trees on all available cores; it does not change
#   the resulting model.
rfc_clf = RandomForestClassifier(n_estimators=10, random_state=42, n_jobs=-1)

In [None]:
from sklearn.metrics import accuracy_score
# First, let's see how the baseline classifier performs: fit it on the
# training split (X_train, y_train) before tuning anything.
rfc_clf.fit(X_train, y_train)

In [None]:
# Predict a label for every held-out test sample with the fitted baseline.
pred = rfc_clf.predict(X_test)

In [None]:
# Fraction of test samples whose predicted label matches the true label.
# Recorded baseline result: 0.9494 (94.94%) - not a bad starting point.
score = accuracy_score(y_test, pred)
# Bare expression so the value is shown as the cell output, matching the
# other evaluation cells in this notebook.
score

# Running a Grid Search on the RandomForestClassifier

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate hyper-parameters: 3 forest sizes x 3 feature-sampling rules
# x 2 depth limits = 18 combinations.
params = dict(
    n_estimators=[10, 20, 30],
    max_features=[None, 'sqrt', 'log2'],
    max_depth=[None, 10],
)

# Exhaustive search over the grid with 5-fold cross-validation on the
# training split; verbose=2 logs progress while the fits run.
grid_search = GridSearchCV(estimator=rfc_clf, param_grid=params, cv=5, verbose=2)
grid_search.fit(X_train, y_train)

In [None]:
# Take the best model found by the grid search and measure its accuracy
# on the held-out test split.
rfc_best = grid_search.best_estimator_
pred = rfc_best.predict(X_test)
score = accuracy_score(y_true=y_test, y_pred=pred)
score

# 96.31% test accuracy - not bad, but still not 97%

In [None]:
# n_estimators appears to be the most influential parameter, so run a second
# grid search that varies only the number of trees.
#
# Start over with a fresh classifier. random_state is fixed so that the
# cross-validation results (and the accuracy quoted afterwards) are
# repeatable from run to run; n_jobs=-1 builds the trees on all available
# cores and does not change the resulting model.
rfc_clf = RandomForestClassifier(random_state=42, n_jobs=-1)
grid_search = GridSearchCV(rfc_clf, {'n_estimators' : [30, 35, 40, 45]},
                           cv=5, verbose=3)
grid_search.fit(X_train, y_train)

In [None]:
# Show which parameter setting the most recent grid search selected.
grid_search.best_params_

In [None]:
# Evaluate the winner of the n_estimators-only search on the test split.
rfc_clf_best = grid_search.best_estimator_
pred = rfc_clf_best.predict(X_test)
score = accuracy_score(y_true=y_test, y_pred=pred)
score

# 96.70% test accuracy - so close I can taste it!!

# Gradient Tree Boosting

