In [38]:
# import libraries and load dataset

import scipy.io
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score, precision_score, recall_score

from keras.datasets import mnist


In [39]:
# Split data into training, testing and development sets
(x_train, y_train), (x_remaining, y_remaining) = mnist.load_data()
x_valid, x_test, y_valid, y_test = train_test_split(x_remaining, y_remaining, test_size = 0.5)

In [40]:
'''Training set is used for training the model being used, it is creating a model to use to learn from.
   Test set is used for testing the model, to identify how accurate the model is.
   Development / validation set is used during training to help the model become more accurate.  The test set is not involved with
   this process until you are happy with the results from the validation set'''

'Training set is used for training the model being used, it is creating a model to use to learn from.\n   Test set is used for testing the model, to identify how accurate the model is.\n   Development / validation set is used during training to help the model become more accurate.  The test set is not involved with\n   this process until you are happy with the results from the validation set'

In [41]:
# display training and test shapes
print("x training shape: ", x_train.shape)
print("y training shape: ", y_train.shape)
print("x testing shape: ", x_test.shape)
print("y testing shape: ", y_test.shape)
print("x validation shape: ", x_valid.shape)
print("y validation shape: ", y_valid.shape)

x training shape:  (60000, 28, 28)
y training shape:  (60000,)
x testing shape:  (5000, 28, 28)
y testing shape:  (5000,)
x validation shape:  (5000, 28, 28)
y validation shape:  (5000,)


In [42]:
# Convert x training set into 2d array
nsamples, nx, ny = x_train.shape
x_train = x_train.reshape((nsamples,nx*ny))

In [43]:
# Convert x test set into 2d array
nsamples, nx, ny = x_test.shape
x_test = x_test.reshape((nsamples,nx*ny))

In [44]:
# Create a random forest
forest = RandomForestClassifier(max_depth = 9)
forest.fit(x_train, y_train)

In [45]:
'''I have chosen to use the max_depth parameter to tune because as this allows the tree to expand further thus providing finer 
   results
   I have decided to use the value 10 on the parameter as this appears to have a better result based on the confusion matrix'''

'I have chosen to use the max_depth parameter to tune because as this allows the tree to expand further thus providing finer \n   results\n   I have decided to use the value 10 on the parameter as this appears to have a better result based on the confusion matrix'

In [46]:
# Create the predictions
y_pred = forest.predict(x_test).reshape(-1,1)

In [47]:
# Confusion matrix
conf = confusion_matrix(y_test, y_pred)
matrix = pd.DataFrame(conf)
matrix

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,495,0,0,1,0,0,3,1,1,1
1,0,540,2,2,0,1,3,1,3,0
2,6,1,465,6,7,0,2,13,2,2
3,1,0,11,464,2,8,1,5,9,4
4,1,1,2,0,431,0,6,1,3,25
5,3,2,1,8,4,444,8,2,1,7
6,2,1,1,0,1,4,464,0,1,0
7,1,3,15,2,0,0,0,451,3,15
8,2,2,2,4,1,4,4,2,466,16
9,3,5,2,10,13,1,1,4,3,478


In [48]:
'''The model struggles with the identifying 9 the most, followed by 2 and 5. '''

'The model struggles with the identifying 9 the most, followed by 2 and 5. '

In [49]:
# Accuracy score
score = forest.score(x_test, y_test)

print(f"The accuracy score is: {score} ")

The accuracy score is: 0.9396 


In [50]:
# Precision and recall score
precision = precision_score(y_test, y_pred, average = "micro" )
recall = recall_score(y_test, y_pred, average = "micro" )

print(f"The precision score is: {precision} ")
print(f"The recall score is: {recall} ")

The precision score is: 0.9396 
The recall score is: 0.9396 


In [51]:
# f1 score
f1 = f1_score(y_test, y_pred, average = None)

print(f"The f1 score is: {f1} ")

The f1 score is: [0.97440945 0.97560976 0.92537313 0.9261477  0.92787944 0.94267516
 0.96066253 0.92989691 0.93668342 0.89513109] 
