In [23]:
import numpy as np
from tensorflow.keras.datasets import mnist

In [24]:
(trainX, trainY), (testX, testY) = mnist.load_data()

# Flatten every image into a 28*28 = 784 element float32 row and divide by 255.
flat_shape = (-1, 28 * 28)
trainX = trainX.reshape(flat_shape).astype("float32") / 255.0
testX = testX.reshape(flat_shape).astype("float32") / 255.0

# The task calls for a per-class split: 75% of each image class for training and
# the remainder for testing. The default MNIST train/test split uses a different
# ratio, so both parts are merged here and re-split later, class by class.
dataX = np.concatenate([trainX, testX], axis=0)
dataY = np.concatenate([trainY, testY], axis=0)
print(dataX.shape)

(70000, 784)


In [25]:
# now we group into different image classes

In [26]:
# One bucket per digit class (0-9); bucket k collects every image labelled k.
img_cls = [[] for _ in range(10)]

# Iterate over the samples themselves instead of a hard-coded count of 70000,
# so the grouping stays correct whatever the size of dataX / dataY.
for image, label in zip(dataX, dataY):
    img_cls[int(label)].append(image)

In [27]:
# Split every image class into 75% train / 25% test, keeping the order in which
# the images were grouped: the first 75% of each class goes to the train set.
train_fraction = 0.75

train_images, test_images = [], []
train_labels, test_labels = [], []

for digit, images in enumerate(img_cls):
    n_train = int(len(images) * train_fraction)
    n_test = len(images) - n_train
    train_images += images[:n_train]
    test_images += images[n_train:]
    # Class labels are categorical, so build them as integers rather than the
    # float64 values that np.ones(n) * digit would produce.
    train_labels.append(np.full(n_train, digit, dtype=int))
    test_labels.append(np.full(n_test, digit, dtype=int))

trainX = np.array(train_images)
trainY = np.concatenate(train_labels)
testX = np.array(test_images)
testY = np.concatenate(test_labels)

In [28]:
from sklearn import metrics

In [29]:
from sklearn.tree import DecisionTreeClassifier
# Decision tree models: each one is fitted on the train split and scored on the
# test split. random_state is fixed so that the fitted trees, and therefore the
# reported accuracies, are the same on every run of the notebook.

# first, a decision tree of depth 20
model20 = DecisionTreeClassifier(max_depth=20, random_state=42).fit(trainX, trainY)
predicted_digit = model20.predict(testX)
print("Accuracy for decision tree of depth 20 : ", metrics.accuracy_score(testY, predicted_digit))

# second, a decision tree of depth 50
model50 = DecisionTreeClassifier(max_depth=50, random_state=42).fit(trainX, trainY)
predicted_digit = model50.predict(testX)
print("Accuracy for decision tree of depth 50 : ", metrics.accuracy_score(testY, predicted_digit))

Accuracy for decision tree of depth 20 :  0.8837475007140817
Accuracy for decision tree of depth 50 :  0.8810625535561268


In [30]:
from sklearn.ensemble import RandomForestClassifier
# Random forest classification: fitted on the train split, scored on the test
# split. random_state is fixed so that the forest, and therefore the reported
# accuracy, is the same on every run of the notebook.

modelRf = RandomForestClassifier(random_state=42).fit(trainX, trainY)
predicted_digit = modelRf.predict(testX)
print("Accuracy for random forest : ", metrics.accuracy_score(testY, predicted_digit))

Accuracy for random forest :  0.9722936303913168


In [31]:
from sklearn.model_selection import cross_val_score

In [32]:
# 10-fold cross-validation of the depth-20 decision tree on the merged dataset
accuracies = cross_val_score(model20, dataX, dataY, cv=10)
print(
    "----Cross-Validating Decision Tree of Depth 20----",
    "Mean Accuracy = " + str(accuracies.mean()),
    "Standard Deviation = " + str(accuracies.std()),
    sep="\n",
)

----Cross-Validating Decision Tree of Depth 20----
Mean Accuracy = 0.8765857142857142
Standard Deviation = 0.006307575620163579


In [33]:
# 10-fold cross-validation of the depth-50 decision tree on the merged dataset
accuracies = cross_val_score(model50, dataX, dataY, cv=10)
print(
    "----Cross-Validating Decision Tree of Depth 50----",
    "Mean Accuracy = " + str(accuracies.mean()),
    "Standard Deviation = " + str(accuracies.std()),
    sep="\n",
)

----Cross-Validating Decision Tree of Depth 50----
Mean Accuracy = 0.8733285714285716
Standard Deviation = 0.006086468766108667


In [34]:
# 10-fold cross-validation of the random forest on the merged dataset
accuracies = cross_val_score(modelRf, dataX, dataY, cv=10)
print(
    "----Cross-Validating Random Forest----",
    "Mean Accuracy = " + str(accuracies.mean()),
    "Standard Deviation = " + str(accuracies.std()),
    sep="\n",
)

----Cross-Validating Random Forest----
Mean Accuracy = 0.9694285714285715
Standard Deviation = 0.0034309515548064163
