# MNIST handwritten digit classification with Random Forest

In [1]:
#import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_openml
from sklearn.utils import check_random_state

Load the dataset - note that each element of the dataset is a vector of 784 values:

In [2]:
# Load data from https://www.openml.org/d/554
X, y = fetch_openml('mnist_784', version=1, return_X_y=True)

Split the dataset into 60k training samples, 10k test samples

In [3]:
# Split data into train and test subsets
random_state = check_random_state(9)
permutation = random_state.permutation(X.shape[0])
X = X[permutation]
y = y[permutation]
X = X.reshape((X.shape[0], -1))
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=60000, test_size=10000, shuffle=True)

Create the RF classifier with 'n_estimator' trees and fit to the training data and labels:

In [4]:
rf=RandomForestClassifier(n_estimators=200)
rf.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

Make predictions using the test samples:

In [5]:
predictions=rf.predict(X_test)

Check the predictions against the test labels:

In [6]:
correct = 0
wrong = 0
for i in range(len(predictions)):
    if predictions[i] == y_test[i]:
        correct += 1
    else:
        wrong += 1
        
print ('Correct:',correct,' Wrong:',wrong,' Accuracy:',(correct/len(X_test))*100,'%')

Correct: 9710  Wrong: 290  Accuracy: 97.1 %
