In [2]:
import numpy as np
import pandas as pd
import numpy.linalg as linal

### **Initializing the different Datasets**

In [3]:
# Every split (train / test / val) is read from two headerless CSV files,
# `<split>_data.csv` and `<split>_label.csv`, and converted to NumPy arrays.
# After this cell `train`, `test` and `val` are bound to the label DataFrames.
# NOTE(review): the paths use Windows-style separators relative to the
# working directory.
xtrain = pd.read_csv(r'.\train_data.csv', header=None).to_numpy()
train  = pd.read_csv(r'.\train_label.csv', header=None)
ytrain = train.to_numpy()

xtest = pd.read_csv(r'.\test_data.csv', header=None).to_numpy()
test  = pd.read_csv(r'.\test_label.csv', header=None)
ytest = test.to_numpy()

xval = pd.read_csv(r'.\val_data.csv', header=None).to_numpy()
val  = pd.read_csv(r'.\val_label.csv', header=None)
yval = val.to_numpy()

### **K-Nearest Neighbours Classification algorithm**

In [4]:
# assigns class for every datapoint 

def knn (x,y,point,output,k) :
    """Predict the class of ``point`` by majority vote of its k nearest rows of ``x``.

    Parameters
    ----------
    x : ndarray, shape (n, d)
        Reference samples, one per row.
    y : ndarray, shape (n, 1) or (n,)
        Label of each reference sample. Labels are cast to int and must be
        non-negative because the vote is counted with ``np.bincount``.
    point : ndarray, shape (d,)
        Sample to classify.
    output : object
        Unused; kept so existing call sites keep working.
    k : int
        Number of neighbours that vote. Values larger than ``n`` are clamped
        to ``n``.

    Returns
    -------
    numpy integer
        The most frequent label among the k nearest neighbours (the smallest
        label wins a tie).

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1 or ``x`` has no rows.
    """
    n_samples = len(x)
    if k < 1 or n_samples == 0:
        raise ValueError("knn needs k >= 1 and at least one reference sample")

    diff = x - point
    # Row-wise squared Euclidean distance. This is the diagonal of
    # diff @ diff.T without materialising the full n-by-n matrix.
    sq_dist = np.einsum('ij,ij->i', diff, diff)

    k = min(k, n_samples)
    # Partitioning around position k-1 leaves the k smallest distances in the
    # first k slots (in arbitrary order) and stays valid when k == n.
    nearest = np.argpartition(sq_dist, k - 1)[:k]

    labels = np.asarray(y)
    if labels.ndim > 1:
        labels = labels[:, 0]
    votes = labels[nearest].astype(int)

    return np.bincount(votes).argmax()

In [5]:
# determining the class for every datapoint using the training data as reference 

def classifier(x,y,k) :
    """Classify every row of ``x`` with k-NN and print the confusion matrix and accuracy.

    Neighbours are always searched in the module-level ``xtrain`` / ``ytrain``
    arrays, whatever split is passed in.

    Parameters
    ----------
    x : ndarray, shape (n, d)
        Samples to classify, one per row.
    y : ndarray, shape (n, 1)
        Actual label of each sample. The confusion matrix is fixed at 5x5,
        so labels are expected to lie in 0..4.
    k : int
        Number of neighbours passed on to ``knn``.

    Returns
    -------
    None
        The confusion matrix (rows: predicted class, columns: actual class)
        and the accuracy in percent are printed, not returned.
    """
    confusion_matrix = np.zeros((5, 5), dtype=int)

    for j in range(len(y)):
        actual_class = int(y[j, 0])
        # The fourth argument is ignored by knn; it is passed for signature compatibility.
        pred_class = knn(xtrain, ytrain, x[j, :], y[j, 0], k)
        confusion_matrix[pred_class, actual_class] += 1

    # Correct predictions sit on the diagonal.
    accuracy = np.trace(confusion_matrix) / np.sum(confusion_matrix) * 100

    print( confusion_matrix )
    print(f'classification accuracy : {accuracy:.2f} %' )

In [6]:
# Evaluate the k-NN classifier on the training, validation and test splits
# for each neighbourhood size.
for k in [1,7,15] :
    runs = (
        (f'k = {k} : training results :', xtrain, ytrain),
        (f'k = {k} : validation results :', xval, yval),
        (f'k = {k} : test data results :', xtest, ytest),
    )
    for header, features, labels in runs :
        print(header)
        classifier(features, labels, k)
    print('\n')

k = 1 : training results :
[[700   0   0   0   0]
 [  0 700   0   0   0]
 [  0   0 700   0   0]
 [  0   0   0 700   0]
 [  0   0   0   0 700]]
classification accuracy : 100.00 %
k = 1 : validation results :
[[ 85  13   8   7  32]
 [ 24  72  28  28  14]
 [ 20  66 111  56  32]
 [ 12  35  39 106  12]
 [ 59  14  14   3 110]]
classification accuracy : 48.40 %
k = 1 : test data results :
[[31  8  3  3 21]
 [ 9 31 14 10  8]
 [14 38 54 29 13]
 [12 16 23 51  6]
 [34  7  6  7 52]]
classification accuracy : 43.80 %


k = 7 : training results :
[[435  46  28  17  81]
 [ 44 400  66  53  40]
 [ 92 179 534 161  71]
 [ 26  57  53 463  23]
 [103  18  19   6 485]]
classification accuracy : 66.20 %
k = 7 : validation results :
[[ 87  20  11   4  30]
 [ 17  70  24  15  12]
 [ 28  80 138  72  33]
 [ 10  24  22 107  10]
 [ 58   6   5   2 115]]
classification accuracy : 51.70 %
k = 7 : test data results :
[[37  9  2  1 20]
 [ 8 38 12  4  5]
 [18 37 71 38 13]
 [ 8 15 11 57  2]
 [29  1  4  0 60]]
classification accuracy : 52.60 %

---
### **Naive Bayes Classifier**
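Estimating the prior probability, mean vector and covariance matrix of each class from the training data. (Each class is modelled with a full covariance matrix, so no feature-independence assumption is made.)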

In [7]:
def calculate_params(var, prior=0.2) :
    """Estimate the Gaussian parameters of one class from its samples.

    Parameters
    ----------
    var : ndarray, shape (n, d)
        Samples of a single class, one per row.
    prior : float, optional
        Prior probability to report for the class. Defaults to 0.2, the
        value this function previously hard-coded.

    Returns
    -------
    tuple
        ``(prior, mean, cov)`` where ``mean`` is the column-wise mean
        (shape ``(d,)``) and ``cov`` is the d-by-d covariance matrix
        normalised by ``n`` (not ``n - 1``).

    Raises
    ------
    ValueError
        If ``var`` contains no samples.
    """
    n_samples = len(var)
    if n_samples == 0:
        raise ValueError("calculate_params needs at least one sample")

    mean = var.mean(axis=0)
    centered = var - mean
    cov = centered.T @ centered / n_samples
    return prior, mean, cov

# Fit one Gaussian per class. Class i is taken to be the i-th consecutive run
# of 700 rows of xtrain (rows 0-699 -> class 0, ..., rows 2800-3499 -> class 4).
(P0, mean0, cov0), (P1, mean1, cov1), (P2, mean2, cov2), (P3, mean3, cov3), (P4, mean4, cov4) = (
    calculate_params(xtrain[start:start + 700, :]) for start in range(0, 3500, 700)
)

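$$P(\bar x | y_i) = \frac{1}{\sqrt{(2\pi)^d \, |C_i|}} \cdot e^{-\frac{1}{2} \cdot \left( \bar x - \bar \mu_i \right)^T C_i^{-1} \left( \bar x - \bar \mu_i \right) }$$

where $d$ is the number of features, and $\bar \mu_i$ and $C_i$ are the mean vector and covariance matrix of class $y_i$.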

In [8]:
def calculate_conditional_probability(x, mean, cov) :
    """Evaluate the multivariate Gaussian density N(mean, cov) at ``x``.

    The dimension is taken from the input instead of being fixed, and the
    value is assembled in log space so the normalisation constant cannot
    overflow or underflow on its own.

    Parameters
    ----------
    x : ndarray, shape (d,)
        Point at which the density is evaluated.
    mean : ndarray, shape (d,)
        Mean vector of the Gaussian.
    cov : ndarray, shape (d, d)
        Covariance matrix of the Gaussian; expected to be positive definite.

    Returns
    -------
    ndarray, shape (1, 1)
        The density value. Callers read it with ``[0, 0]``. NaN is returned
        when the determinant of ``cov`` is not positive.

    Raises
    ------
    numpy.linalg.LinAlgError
        If ``cov`` is exactly singular.
    """
    diff = (np.asarray(x, dtype=float) - np.asarray(mean, dtype=float)).reshape(-1, 1)
    dim = diff.shape[0]

    # Solving cov @ w = diff gives cov^-1 @ diff without forming the inverse.
    mahalanobis = diff.T @ linal.solve(cov, diff)

    sign, log_det = linal.slogdet(cov)
    if sign <= 0:
        # The square root of a non-positive determinant has no valid density.
        return np.full((1, 1), np.nan)

    # log of 1 / sqrt((2*pi)^dim * det(cov))
    log_const = -0.5 * (dim * np.log(2 * np.pi) + log_det)
    return np.exp(log_const - 0.5 * mahalanobis)

In [9]:
def bayes_classifier(X,Y) :
    """Classify each row of ``X`` with the per-class Gaussian models and score the result.

    Each sample is assigned to the class whose Gaussian (module-level
    ``mean0..mean4`` / ``cov0..cov4``) gives the largest density. The
    module-level priors ``P0..P4`` are not applied; as computed in this
    notebook they are all equal, so they would not change the arg-max.

    Parameters
    ----------
    X : ndarray, shape (n, d)
        Samples to classify, one per row.
    Y : ndarray, shape (n, 1)
        Actual label of each sample; expected to lie in 0..4.

    Returns
    -------
    tuple
        ``(confusion_matrix, accuracy)``. The matrix is 5x5 with rows as
        predicted classes and columns as actual classes. ``accuracy`` is the
        percentage of samples on its diagonal.
    """
    class_models = ((mean0, cov0), (mean1, cov1), (mean2, cov2), (mean3, cov3), (mean4, cov4))
    confusion_matrix = np.zeros((5, 5), dtype=int)

    for index in range(len(X)) :
        actual_class = int(Y[index, 0])
        likelihoods = np.array(
            [calculate_conditional_probability(X[index], mean, cov)[0, 0] for mean, cov in class_models],
            dtype=float,
        )
        # A NaN density must never win the comparison.
        likelihoods[np.isnan(likelihoods)] = -np.inf
        # argmax always yields a valid class index (the first maximum on ties),
        # so a sample whose densities are all zero is no longer counted under
        # index -1, i.e. the last row of the matrix.
        pred_class = int(np.argmax(likelihoods))

        confusion_matrix[pred_class, actual_class] += 1

    num = np.trace(confusion_matrix)
    den = np.sum(confusion_matrix)
    accuracy = num*100/den
    return confusion_matrix, accuracy


In [10]:
bayes_classifier(xtrain,ytrain)

(array([[545,  33,  17,   8,  21],
        [ 30, 574,  17,  19,  27],
        [ 17,  38, 617,  32,  15],
        [ 22,  24,  27, 629,  13],
        [ 86,  31,  22,  12, 624]]),
 85.4)

In [11]:
bayes_classifier(xval, yval)

(array([[ 83,  22,  23,   8,  23],
        [ 24, 108,  51,  32,  15],
        [ 18,  34,  88,  37,  13],
        [ 10,  23,  28, 113,   6],
        [ 65,  13,  10,  10, 143]]),
 53.5)

In [12]:
bayes_classifier(xtest,ytest)

(array([[44, 10,  4,  0, 16],
        [18, 63, 21,  9,  6],
        [ 5, 11, 49, 25,  6],
        [ 7, 11, 19, 61,  9],
        [26,  5,  7,  5, 63]]),
 56.0)