This program is to write a python code that uses Naive Bayes algorithm to classify the provided images 

**Import Libraries**

In [1]:
import numpy as np
import pandas as pd
import math
import collections
from urllib.request import *

## Data Preprocessing

In [2]:
def readImageFile(pathToFile, reshape):
    '''
    Description:
        Opens the image present in the given path and reads each line of the file into a list 
        List holds 28x28 for each of the 5000 image values 
        
    Input:
        pathToFile: Path to file to be opened
        reshape: size to reshape the values
    Output:
        list of image values
    '''
    retArr = []
    with urlopen(pathToFile) as imageFile:
        for line in imageFile:
            line = line.decode('utf-8').rstrip("\n\r")
            retArr.append(list(line))
    retArr = np.array(retArr)
    retArr = np.reshape(retArr,(reshape[0],reshape[1],reshape[2]))
    return retArr.tolist()

In [3]:
def readLabelsFile(pathToFile):
    '''
    Description:
        Opens the labels files present in the given path and reads each line of the file and converts into a list
        
    Input:
        pathToFile: Path to file to be opened
        
    Output:
        list of labels
    '''
    with urlopen(pathToFile) as labelFile:
        lines = labelFile.read().decode('utf-8').replace('\n', '')
        labels = []                                                                  
        for i in range(len(lines)):                                                    
            labels.append(lines[i])  
    labels = [int(i) for i in labels] 
    return labels

### Train Data Processing

**Read Training Data and Labels**

In [4]:
trainLabels = readLabelsFile('https://raw.githubusercontent.com/sandeepmundru/CSCI-680/master/traininglabels.txt')
trainImages = readImageFile('https://raw.githubusercontent.com/sandeepmundru/CSCI-680/master/trainingimages.txt',[len(trainLabels),28,28])

**3D list to calculate probability and applying Laplace smoothing to the categorical data**

In [5]:
def laplaceSmoothing(freqencies, matrix):
    '''
    Description:
        Applying laplace smoothing to categorical data
        
    Input:
        frequencies: the dictionary that contains count of occurances of each label in train data
        matrix: Image matrix to which smoothing is to be applied
        
    Output:
        matrix on which laplace smoothing is applied
    '''
    for index in freqencies.keys():
        for xaxis in range(0,28):
            for yaxis in range(0,28):
                matrix[index][xaxis][yaxis] = (0.5 , freqencies[index] , freqencies[index]/5000)      
    
    return matrix

In [6]:
# 3D matrix
matrix = [[[0 for _ in range(0,28)] for _ in range( 0,28 ) ] for _ in range( 0,10 )]

#applying laplace smoothing
laplaceMatrix = laplaceSmoothing(collections.Counter(trainLabels),matrix)

**Calculating the feature values**

In [7]:
for image in range(0,len(trainImages)):
    for index1 in range(0,28):
        for index2 in range(0,28):
            if trainImages[image][index1][index2] is not ' ':
                zero = (laplaceMatrix[int(trainLabels[image])][index1][index2][0]+1)
                one = (laplaceMatrix[int(trainLabels[image])][index1][index2][1])
                two = (laplaceMatrix[int(trainLabels[image])][index1][index2][2])                                                                                                    

                laplaceMatrix[int(trainLabels[image])][index1][index2] = (zero, one, two)

### Test Data Processing

**Read Testing Data and Labels**

In [8]:
testLabels = readLabelsFile('https://raw.githubusercontent.com/sandeepmundru/CSCI-680/master/testlabels.txt')
testImages = readImageFile('https://raw.githubusercontent.com/sandeepmundru/CSCI-680/master/testimages.txt',[len(testLabels),28,28])

## Naive Bayes Approach

In [9]:
class naiveBayes:
    '''
    Description:
        This class contains the methods Naive Bayes classifier
    '''
    def __init__(self):
        '''
            constructor/initialization method
        '''
        pass
    
    def fit(self, xTrain, yTrain, laplacedMat):
        '''
        Description:
            Train Naive Bayes with train data i.e. Store data for Naive Bayes Classifier
        Input:
            xTrain: training data
            yTrain: labels of training data set
            laplacedMat: matrix on which Laplace smoothing is applied

        Output:
            None
        '''
        self.trainData = xTrain
        self.trainLabels = yTrain
        self.laplacedMat = laplacedMat

        if len(self.trainLabels) != len(self.trainData):
            raise ValueError("Unequal Length of labels and data")
        
        
        # Count the occurence of each digit (label) in training data
        self.frequencies = collections.Counter(self.trainLabels)

    def predict(self, xTest):
        '''
        Description:
            Apply Naive Bayes on test data
        Input:
            xTest: testing data
        Output:
            predicted labels 
        '''
        self.testData = xTest
        predictions = []
        for image in range(0, len(self.testData)):
            naiveBayesPred = []
            for value in range(0,len(set(self.trainLabels))):
                naiveBayesVal = 0
                for index1 in range(0, 28):
                    for index2 in range(0,28):
                        if self.testData[image][index1][index2] is not ' ':
                            zero = self.laplacedMat[value][index1][index2][0]
                            one = self.laplacedMat[value][index1][index2][1]                            
                            two = self.laplacedMat[value][index1][index2][2]
                            naiveBayesVal = naiveBayesVal + math.log(zero / one * two)
                    naiveBayesVal = naiveBayesVal + math.log(self.frequencies[value]/len(self.testData))
                naiveBayesPred.append([value, naiveBayesVal])
            nbmaxval = []

            for elem in range(0,len(set(self.trainLabels))):
                nbmaxval.append(naiveBayesPred[elem][1])
            maximVl = max(nbmaxval)
            
            for elem in range(0,len(set(self.trainLabels))):
                if maximVl == naiveBayesPred[elem][1]:
                    predictions.append(naiveBayesPred[elem][0])
        
        return predictions
    
    

In [10]:
nbc = naiveBayes()

In [11]:
nbc.fit(trainImages, trainLabels, laplaceMatrix)

In [12]:
predictions = nbc.predict(testImages)

In [13]:
def printMetrics(actual, predicted):
    '''
    Description:
        Calculate the metrics and confusion matrix
    Input:
        actual: actual labels for test data
        predicted: predicted labels for test data
    Output:
        prints confusion matrix and accuracy metric
    '''
    
    #build confusion matrix
    confusionMatrix = pd.crosstab(pd.Series(actual, name = 'Actual'), pd.Series(predicted, name = 'Predicted'))

    # normalize confusion matrix
    normalizedCM = confusionMatrix / confusionMatrix.sum(axis=1)
    normalizedCM = normalizedCM.round(2)
    print('*'*20 + ' Confusion Matrix '+ '*'*20)
    print(normalizedCM)
    
    #calculate accuracy
    print('*'*20 + ' Accuracy '+ '*'*20)
    accuracy = np.sum(normalizedCM.values.diagonal(0))
    print('\nAccuracy: {}% \n'.format(accuracy*10))
    
    #print actual vs predcted labels
    print('*'*20 + ' Actual vs Predicted labels '+ '*'*20)
    print('\nSNO \t Actual label \t Predicted label')
    print('----\t-------------\t-----------------')
    for index in range(1,len(actual)+1):
        print(index, '\t\t',actual[index-1], '\t\t', predicted[index-1])

In [14]:
printMetrics(testLabels, predictions)

******************** Confusion Matrix ********************
Predicted     0     1     2     3     4     5     6     7     8     9
Actual                                                               
0          0.98  0.00  0.00  0.00  0.00  0.00  0.00  0.00  0.02  0.00
1          0.00  0.77  0.01  0.03  0.00  0.00  0.01  0.00  0.19  0.00
2          0.07  0.00  0.81  0.04  0.01  0.00  0.07  0.00  0.03  0.00
3          0.01  0.00  0.01  0.85  0.00  0.00  0.01  0.04  0.03  0.05
4          0.00  0.00  0.02  0.00  0.79  0.00  0.05  0.00  0.08  0.08
5          0.16  0.00  0.01  0.26  0.05  0.03  0.01  0.01  0.35  0.05
6          0.07  0.00  0.08  0.00  0.01  0.00  0.77  0.00  0.06  0.00
7          0.03  0.01  0.03  0.01  0.04  0.00  0.00  0.74  0.10  0.06
8          0.03  0.00  0.03  0.16  0.03  0.00  0.00  0.03  0.72  0.01
9          0.02  0.00  0.01  0.04  0.17  0.00  0.00  0.05  0.03  0.67
******************** Accuracy ********************

Accuracy: 71.3% 

******************** Actual vs 

682 		 3 		 7
683 		 2 		 2
684 		 3 		 3
685 		 5 		 8
686 		 6 		 6
687 		 8 		 3
688 		 8 		 8
689 		 6 		 6
690 		 2 		 2
691 		 3 		 3
692 		 1 		 3
693 		 0 		 0
694 		 5 		 8
695 		 8 		 3
696 		 9 		 7
697 		 2 		 6
698 		 9 		 9
699 		 6 		 6
700 		 7 		 7
701 		 0 		 0
702 		 4 		 4
703 		 8 		 8
704 		 7 		 7
705 		 1 		 1
706 		 7 		 7
707 		 4 		 4
708 		 1 		 8
709 		 0 		 0
710 		 9 		 3
711 		 7 		 7
712 		 2 		 2
713 		 0 		 0
714 		 0 		 0
715 		 9 		 9
716 		 1 		 1
717 		 7 		 8
718 		 8 		 0
719 		 7 		 4
720 		 8 		 8
721 		 4 		 4
722 		 7 		 9
723 		 2 		 6
724 		 0 		 0
725 		 4 		 4
726 		 6 		 6
727 		 0 		 0
728 		 3 		 8
729 		 1 		 1
730 		 1 		 1
731 		 3 		 3
732 		 3 		 3
733 		 9 		 9
734 		 6 		 2
735 		 7 		 7
736 		 4 		 4
737 		 1 		 1
738 		 5 		 2
739 		 3 		 3
740 		 0 		 0
741 		 8 		 8
742 		 7 		 8
743 		 3 		 3
744 		 9 		 9
745 		 6 		 6
746 		 9 		 9
747 		 3 		 3
748 		 5 		 8
749 		 0 		 0
750 		 2 		 3
751 		 7 		 7
752 		 4 		 2
753 		