# Introduction

###### In this notebook, I try out different models to find the one that performs best at identifying handwritten digits. The dataset I use for training and testing the models is the MNIST dataset. I compare each model against a human-level performance of 87.5%.

#### Use Case

###### Cognitive Assessments
###### Such a model can be used for evaluating handwriting patterns for cognitive or motor disorders (e.g., Parkinson’s).

#### Domain: Medical diagnostics, neurology.

# Importing Packages

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Reading the Train and Test datasets

#### Read and View Train Data

In [None]:
# Load the MNIST training split from its CSV export and echo it.
# NOTE(review): the path looks like a Google Drive location inside Colab;
# adjust it when running anywhere else.
train_csv_path = '/content/drive/MyDrive/Colab Notebooks/MNIST_train.csv'
mnist_train = pd.read_csv(train_csv_path)
mnist_train

Unnamed: 0.1,Unnamed: 0,index,labels,0,1,2,3,4,5,6,...,774,775,776,777,778,779,780,781,782,783
0,0,0,5,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,2,4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,3,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,4,9,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59995,59995,59995,8,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
59996,59996,59996,3,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
59997,59997,59997,5,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
59998,59998,59998,6,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Remove the bookkeeping columns written by earlier CSV exports; what is
# left is the `labels` column followed by the pixel columns.
mnist_train = mnist_train.drop(columns=['Unnamed: 0', 'index'])
mnist_train

Unnamed: 0,labels,0,1,2,3,4,5,6,7,8,...,774,775,776,777,778,779,780,781,782,783
0,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,9,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59995,8,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
59996,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
59997,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
59998,6,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Read and View Test Data

In [None]:
# Load the MNIST test split from its CSV export and echo it.
# NOTE(review): the path looks like a Google Drive location inside Colab;
# adjust it when running anywhere else.
test_csv_path = '/content/drive/MyDrive/Colab Notebooks/MNIST_test.csv'
mnist_test = pd.read_csv(test_csv_path)
mnist_test

Unnamed: 0.1,Unnamed: 0,index,labels,0,1,2,3,4,5,6,...,774,775,776,777,778,779,780,781,782,783
0,0,0,7,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,1,2,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,2,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,4,4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9995,9995,2,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9996,9996,9996,3,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9997,9997,9997,4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9998,9998,9998,5,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Remove the bookkeeping columns written by earlier CSV exports; what is
# left is the `labels` column followed by the pixel columns.
mnist_test = mnist_test.drop(columns=['Unnamed: 0', 'index'])
mnist_test

Unnamed: 0,labels,0,1,2,3,4,5,6,7,8,...,774,775,776,777,778,779,780,781,782,783
0,7,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9996,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9997,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9998,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Convert The Datasets To Numpy Arrays

In [None]:
mnist_train = mnist_train.to_numpy()
mnist_train

array([[5, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [4, 0, 0, ..., 0, 0, 0],
       ...,
       [5, 0, 0, ..., 0, 0, 0],
       [6, 0, 0, ..., 0, 0, 0],
       [8, 0, 0, ..., 0, 0, 0]])

##### Get the Training Features

In [None]:
X_train = mnist_train[:, 1:]
X_train

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

##### Get the Training Labels

In [None]:
y_train = mnist_train[:, 0]
y_train

array([5, 0, 4, ..., 5, 6, 8])

In [None]:
print('The shape of X_train is :', X_train.shape)
print()
print('The shape of y_train is :', y_train.shape)

The shape of X_train is : (60000, 784)

The shape of y_train is : (60000,)


# PCA

In [None]:
# import pandas as pd
# from sklearn.decomposition import PCA
# from sklearn.preprocessing import StandardScaler

In [None]:
# # 1. Standardize the features
# scaler = StandardScaler()
# X_train_scaled = scaler.fit_transform(X_train)

# # 2. Apply PCA
# pca = PCA(n_components=700)
# principal_components = pca.fit_transform(X_train_scaled)

# # 3. Convert result to DataFrame
# # pca_df = pd.DataFrame(principal_components, columns=['PC1', 'PC2'])

# # 4. View the first two principal components
# # print(pca_df)
# principal_components

In [None]:
# PC = pca_df.to_numpy()
# PC

In [None]:
# plt.figure(figsize= (12,8))
# plt.scatter(PC[:, 0], PC[:, 1], c=y_train, alpha = 0.5, s = 10)

# Naive Bayes Classifier

###### Below, we code the Naive Bayes classifier as a class with *fit* and *predict* methods.
###### Note that Naive Bayes assumes the predictor variables are conditionally independent given the class, so the covariances between predictors are zero. Therefore, we are only interested in the variances of the predictors.

In [None]:
class GaussNB():
  """Gaussian Naive Bayes classifier.

  Each class is modelled by a Gaussian with a diagonal covariance, i.e. the
  features are treated as independent within a class, so only per-feature
  means and variances are estimated.

  `predict` evaluates densities through `mvn` (scipy's
  `multivariate_normal`), which must be importable in the module namespace
  by the time `predict` is called.
  """

  def fit(self, X, y, epsilon = 1e-3):
    """Estimate per-class Gaussian parameters and class priors.

    Args:
      X: 2-D array of shape (N, D) holding one sample per row.
      y: 1-D array of N class labels; labels are cast to int.
      epsilon: constant added to every variance so that features that are
        constant within a class do not produce a zero variance.
    """
    # Parameters of the class-conditional likelihoods, keyed by class label:
    # {"mean": per-feature mean, "cov": per-feature (smoothed) variance}.
    self.likelihoods = dict()
    # Prior probability of each class, keyed by class label.
    self.priors = dict()

    labels = np.asarray(y).astype(int)
    # Unique class labels, kept for later use.
    self.K = set(labels)

    for k in self.K:
      # Rows belonging to class k. The mask uses the int-cast labels so it is
      # consistent with the keys stored in self.K.
      x_k = X[labels == k]
      # Independence assumption: only the variances are needed; a 1-D "cov"
      # is interpreted by scipy as the diagonal of the covariance matrix.
      self.likelihoods[k] = {"mean": x_k.mean(axis = 0),
                             "cov": x_k.var(axis = 0) + epsilon}
      # Fraction of the training samples that belong to class k.
      self.priors[k] = len(x_k)/len(X)

  def predict(self, X):
    """Return the most probable class label for each row of X.

    Scores are log-likelihood + log-prior. The evidence term log p(X) of
    Bayes' formula is omitted because it is the same for every class and
    therefore does not change which class attains the maximum.

    Args:
      X: 2-D array of shape (N, D).

    Returns:
      1-D integer array of N predicted class labels.
    """
    N, _ = X.shape
    # Fixed, sorted class order. Scores are stored by column *position*, not
    # by label value, so labels need not be 0..K-1.
    classes = sorted(self.likelihoods)
    P_hat = np.zeros((N, len(classes)))

    for col, k in enumerate(classes):
      params = self.likelihoods[k]
      P_hat[:, col] = mvn.logpdf(X, params["mean"], params["cov"]) + np.log(self.priors[k])

    # Map the winning column back to the actual class label.
    return np.asarray(classes)[P_hat.argmax(axis = 1)]



In [None]:
from scipy.stats import multivariate_normal as mvn

#### Define a function to check for model accuracy.

In [None]:
def accuracy(y, y_hat):
  """Return the fraction of predictions that equal the true labels.

  Args:
    y: array-like of true labels.
    y_hat: array-like of predicted labels, same shape as `y`.

  Returns:
    The mean of the element-wise equality between `y` and `y_hat`.

  Raises:
    ValueError: if `y` and `y_hat` have different shapes.
  """
  # Convert first: comparing two plain Python lists with == yields a single
  # bool for the whole list instead of an element-wise comparison.
  y = np.asarray(y)
  y_hat = np.asarray(y_hat)
  # Guard against silent broadcasting, e.g. (N, 1) vs (N,) comparing every
  # label with every prediction.
  if y.shape != y_hat.shape:
    raise ValueError(
      f"y and y_hat must have the same shape, got {y.shape} and {y_hat.shape}")
  return np.mean(y == y_hat)

##### Train the Model On The Training Dataset

In [None]:
# Initialize
gnb = GaussNB()

In [None]:
# Fit Model
gnb.fit(X_train, y_train)

In [None]:
# Predict on the training dataset
y_hat_train = gnb.predict(X_train)
y_hat_train

array([3, 0, 4, ..., 8, 6, 8])

#### Testing on Training data




In [None]:
accuracy(y_train, y_hat_train)

np.float64(0.5938)

###### From the above we see that the training accuracy is 59.38%. I proceed to check the accuracy on the test dataset, which is what we are mainly concerned about since the model is expected to be used to predict unseen data (i.e. new data points the model has not seen before).

#### Testing on Test data

###### Convert Test Data To A Numpy Array

In [None]:
mnist_test = mnist_test.to_numpy()
mnist_test

array([[7, 0, 0, ..., 0, 0, 0],
       [2, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       ...,
       [4, 0, 0, ..., 0, 0, 0],
       [5, 0, 0, ..., 0, 0, 0],
       [6, 0, 0, ..., 0, 0, 0]])

###### Get the Test Features

In [None]:
X_test = mnist_test[:, 1:]
X_test

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

###### Get The Test Labels

In [None]:
y_test = mnist_test[:,0]
y_test

array([7, 2, 1, ..., 4, 5, 6])

###### Predict Using The Test Features

In [None]:
y_hat_test = gnb.predict(X_test)
y_hat_test

array([9, 2, 1, ..., 9, 8, 6])

###### Test The Results With The Test Labels for Accuracy

In [None]:
accuracy(y_test, y_hat_test)

np.float64(0.5878)

###### From the above, it can be seen that the Gaussian Naive Bayes achieves a 58.78% accuracy on the test data. This is far lower than the human-level accuracy of 87.5%.

###### I am going to try the non-naive Gaussian Bayes classifier on the dataset and see how it performs. Recall that in Naive Bayes we assume that the features are independent of each other, which is why the covariances are zero in our Naive Bayes code. I now modify the code to include non-zero covariances, as the non-naive Bayes classifier does not assume independence of the features.

# Non-Naive Bayes Classifier (Gauss Bayes)

In [None]:
class GaussBayes():
  """Gaussian Bayes classifier with a full covariance matrix per class.

  Unlike Naive Bayes, the features are not assumed independent: each class
  is modelled by a multivariate Gaussian whose full D x D covariance is
  estimated from the training samples of that class.

  `predict` evaluates densities through `mvn` (scipy's
  `multivariate_normal`), which must be importable in the module namespace
  by the time `predict` is called.
  """

  def fit(self, X, y, epsilon = 1e-3):
    """Estimate per-class means, covariances and class priors.

    Args:
      X: 2-D array of shape (N, D) holding one sample per row.
      y: 1-D array of N class labels; labels are cast to int.
      epsilon: multiple of the identity added to every covariance matrix so
        that it stays invertible.
    """
    # Parameters of the class-conditional likelihoods, keyed by class label:
    # {"mean": (D,) mean vector, "cov": (D, D) regularised covariance}.
    self.likelihoods = dict()
    # Prior probability of each class, keyed by class label.
    self.priors = dict()

    labels = np.asarray(y).astype(int)
    # Unique class labels, kept for later use.
    self.K = set(labels)

    for k in self.K:
      # Rows belonging to class k. The mask uses the int-cast labels so it is
      # consistent with the keys stored in self.K.
      x_k = X[labels == k, :]
      N_k, D = x_k.shape
      mu_k = x_k.mean(axis = 0)
      centered = x_k - mu_k
      # Unbiased estimate divides by N_k - 1; a class with a single sample
      # would divide by zero, so fall back to 1 (the scatter matrix is all
      # zeros in that case and only the epsilon ridge remains).
      dof = max(N_k - 1, 1)
      self.likelihoods[k] = {"mean": mu_k,
                             "cov": (1/dof)*np.matmul(centered.T, centered) + epsilon*np.identity(D)}
      # Fraction of the training samples that belong to class k.
      self.priors[k] = len(x_k)/len(X)

  def predict(self, X):
    """Return the most probable class label for each row of X.

    Scores are log-likelihood + log-prior. The evidence term log p(X) of
    Bayes' formula is omitted because it is the same for every class and
    therefore does not change which class attains the maximum.

    Args:
      X: 2-D array of shape (N, D).

    Returns:
      1-D integer array of N predicted class labels.
    """
    N, _ = X.shape
    # Fixed, sorted class order. Scores are stored by column *position*, not
    # by label value, so labels need not be 0..K-1.
    classes = sorted(self.likelihoods)
    P_hat = np.zeros((N, len(classes)))

    for col, k in enumerate(classes):
      params = self.likelihoods[k]
      P_hat[:, col] = mvn.logpdf(X, params["mean"], params["cov"]) + np.log(self.priors[k])

    # Map the winning column back to the actual class label.
    return np.asarray(classes)[P_hat.argmax(axis = 1)]


##### Train the Model On The Training Dataset

In [None]:
# Initialize
g_non_naive = GaussBayes()

In [None]:
# Fit the Model
g_non_naive.fit(X_train, y_train)

#### Testing on Training data

In [None]:
# Predict on Train Data
y_hat_non_naive_train = g_non_naive.predict(X_train)
y_hat_non_naive_train

array([3, 0, 4, ..., 8, 6, 8])

In [None]:
# Check Accuracy on Train Data
accuracy(y_train, y_hat_non_naive_train)

np.float64(0.78565)

#### Testing on Test data

###### Predict Using Test Data

In [None]:
y_hat_non_naive_test = g_non_naive.predict(X_test)
y_hat_non_naive_test

array([7, 2, 1, ..., 9, 5, 6])

###### Accuracy on Test Data

In [None]:
accuracy(y_test, y_hat_non_naive_test)

np.float64(0.7532)

###### It can be seen from the accuracy that the Non-Naive Bayes classifier achieves an accuracy of 75.32% on the test data, which is better than the performance of the Naive Bayes classifier. However, this performance is still lower than the human-level performance of 87.5%.
###### I can try other models to see if performance can be improved.

# K Nearest Neighbor Classifier

#### K Nearest Neighbor Class

In [None]:
class KNNClassifier():
  """Distance-weighted K-nearest-neighbour classifier.

  `fit` only memorises the training data. `predict` finds, for every query
  row, the K training rows with the smallest squared Euclidean distance and
  lets them vote with weight 1 / sqrt(distance^2 + epsilon).
  """

  def fit(self, X, y):
    """Store the training features and labels.

    Args:
      X: 2-D array of shape (N_train, D).
      y: 1-D array of N_train class labels.
    """
    self.X = X # Training features, used by predict
    self.y = y # Training labels, used by predict
    # Encode labels as 0..C-1 so voting works for any label values (negative,
    # non-contiguous, non-integer); classes_ maps codes back to labels.
    self.classes_, self._y_codes = np.unique(np.asarray(y), return_inverse=True)

  def predict(self, X, K, epsilon = 1e-3):
    """Predict a class label for every row of X.

    Args:
      X: 2-D array of shape (N, D) of query samples.
      K: number of nearest neighbours that vote; must be at least 1. Values
        larger than the training set size use every training sample.
      epsilon: added to the squared distance before the square root so that
        an exact match (distance 0) does not produce an infinite weight.

    Returns:
      1-D array of N predicted labels with the dtype of the training labels.

    Raises:
      ValueError: if K is smaller than 1.
    """
    if K < 1:
      raise ValueError("K must be at least 1")

    X = np.asarray(X)
    n_classes = len(self.classes_)
    y_hat = np.empty(len(X), dtype=self.classes_.dtype) # Placeholder for the predicted labels

    for i, row in enumerate(X):
      # Subtract in float64: with unsigned integer pixel data (e.g. uint8) an
      # integer subtraction/square would wrap around and corrupt distances.
      diff = self.X - row.astype(np.float64)
      dist2 = np.sum(diff**2, axis = 1) # Squared distance to every training row
      nearest = np.argsort(dist2)[:K] # Indices of the K smallest squared distances
      # Inverse-distance weights put more weight on the closest neighbours.
      weights = 1/(np.sqrt(dist2[nearest] + epsilon))
      # Weighted vote per class; ties resolve to the smallest label.
      votes = np.bincount(self._y_codes[nearest], weights = weights, minlength = n_classes)
      y_hat[i] = self.classes_[votes.argmax()]

    return y_hat



In [None]:
knn_instance = KNNClassifier()

In [None]:
knn_instance.fit(X_train, y_train)

###### Testing on the Training Data

In [None]:
# # Predict on train Data
# y_hat_knn_train = knn_instance.predict(X_train, K=200)
# y_hat_knn_train

In [None]:
# # Accuracy on train data
# accuracy(y_train, y_hat_knn_train)

###### Testing on the Test Data

In [None]:
# Predict on Test Data
y_hat_knn = knn_instance.predict(X_test, K=200)
y_hat_knn

array([7., 2., 1., ..., 4., 5., 6.])

In [None]:
accuracy(y_test, y_hat_knn)

np.float64(0.9303)

###### It can be seen from the above that the K Nearest Neighbor classifier achieves 93.03% accuracy on the test data, which is higher than the human-level performance of 87.5%.

###### We prefer the K Nearest Neighbor model, as it improves on the human-level performance, and we can use it for the use case described above.