In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import scipy.stats as s

In [2]:
# Root folder of the training images; its sub-folder names are used as the class labels.
train_path = "./DevanagariHandwrittenCharacterDataset/Train"

# Root folder of the held-out test images; it is read using the same sub-folder names.
test_path = "./DevanagariHandwrittenCharacterDataset/Test"

# Converting images to a dataset

In [3]:
# Class labels are the sub-folder names of the training directory.
# os.listdir returns names in arbitrary order, and every later step identifies a class by its
# position in this list, so the names are sorted to keep the class indices reproducible.
# Only directories are kept, so a stray file in the dataset root cannot become a "class".
unique_labels = sorted(
    entry for entry in os.listdir(train_path)
    if os.path.isdir(os.path.join(train_path, entry))
)

In [4]:
# This function will return the list of images which are inside a particular folder

def list_of_images(folder_path,folder):
    """Return the names of the entries inside ``folder_path/folder``, sorted by name.

    ``os.listdir`` gives no ordering guarantee. Callers split the returned list
    by position (first part for training, remainder for cross validation), so the
    names are sorted to make that split reproducible between runs and machines.

    Parameters
    ----------
    folder_path : str
        Directory that contains the class folder.
    folder : str
        Name of the class folder inside ``folder_path``.

    Returns
    -------
    list of str
        Entry names (not full paths) in ascending order.
    """
    return sorted(os.listdir(os.path.join(folder_path,folder)))

In [5]:
# This function reads an image and flattens it into a 1-D array of shape (1024,)

def read_image(folder_path,folder,image):
    """Load one image file and return its pixels flattened to one dimension.

    The file is located at ``folder_path/folder/image`` and read with
    ``plt.imread``. The result is reshaped to ``(rows * cols,)`` using the
    first two entries of the loaded array's shape.
    """
    pixels = plt.imread(os.path.join(folder_path, folder, image))
    flat_length = pixels.shape[0] * pixels.shape[1]
    return pixels.reshape(flat_length,)

In [6]:
# This function stacks every image of a folder into NumPy arrays and splits them into a training part and a cross-validation part (used only for the train folder)

def stacking_row_verctors_train(folder_path,folder,train_count=1360):
    """Stack the flattened images of one class folder and split them by position.

    Every image listed by ``list_of_images`` is read with ``read_image`` and the
    resulting row vectors are stacked into one 2-D array (one row per image).
    The first ``train_count`` rows form the training part and the remaining rows
    form the cross-validation part. The split is positional, not random.

    Parameters
    ----------
    folder_path : str
        Directory that contains the class folder.
    folder : str
        Name of the class folder.
    train_count : int, optional
        Number of leading images assigned to the training part. Defaults to
        1360, the value that was previously hard-coded.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``(train_rows, cv_rows)``.
    """
    image_rows = [
        read_image(folder_path,folder,img)
        for img in list_of_images(folder_path,folder)
    ]

    # Stack once and slice the array (instead of slicing the list first) so that an
    # empty cross-validation part keeps its (0, n_pixels) shape and can still be
    # concatenated with the parts of other folders.
    stacked = np.array(image_rows)

    return stacked[:train_count], stacked[train_count:]

In [7]:
# This function stacks every image of a folder into a single NumPy array (used for the test folder)

def stacking_row_vector_test(folder_path,folder):
    """Stack the flattened images of one class folder into a single array.

    Each entry returned by ``list_of_images`` is read with ``read_image``; the
    row vectors are collected in listing order and converted with ``np.array``.
    """
    flattened = [
        read_image(folder_path,folder,file_name)
        for file_name in list_of_images(folder_path,folder)
    ]
    return np.array(flattened)

In [8]:
# Split every class folder once; each entry is a (train_rows, cv_rows) pair,
# kept in the order of unique_labels.
per_class_parts = [
    stacking_row_verctors_train(train_path,label) for label in unique_labels
]

# Stack the per-class pieces row-wise into one training and one CV matrix.
train_data = np.concatenate([train_part for train_part, _ in per_class_parts],axis=0)
cv_data = np.concatenate([cv_part for _, cv_part in per_class_parts],axis=0)

In [9]:
train_data.shape

(62560, 1024)

In [10]:
cv_data.shape

(15640, 1024)

In [11]:
# Read each class folder of the test set and stack the results row-wise,
# following the order of unique_labels.
test_data = np.concatenate(
    [stacking_row_vector_test(test_path,label) for label in unique_labels],
    axis=0,
)

In [12]:
test_data.shape

(13800, 1024)

In [13]:
# Label counts per class for an 80 % / 20 % split of 1700 images,
# computed with integer arithmetic.
images_per_class = 1700
train_label_lenght = (images_per_class * 8) // 10
cv_label_lenght = (images_per_class * 2) // 10

In [15]:
# Repeat every class name once per row it contributes, in the order of unique_labels.
train_labels = [label for label in unique_labels for _ in range(train_label_lenght)]
cv_labels = [label for label in unique_labels for _ in range(cv_label_lenght)]

In [16]:
# Wrap the stacked training pixel matrix in a DataFrame (one row per image, one column per pixel).
train_data = pd.DataFrame(data=train_data)

In [17]:
# Attach the class name of every row. train_labels follows the same folder order as the rows and
# assumes each folder contributed exactly train_label_lenght rows.
train_data['Label'] = train_labels

In [18]:
train_data.shape

(62560, 1025)

In [19]:
# Build the CV frame and append the class-name column as the last column in one expression.
cv_data = pd.DataFrame(data=cv_data).assign(Label=cv_labels)

In [20]:
cv_data.shape

(15640, 1025)

In [22]:
# Each class name is repeated 300 times, in the order of unique_labels.
test_labels = [label for label in unique_labels for _ in range(300)]

In [23]:
# Build the test frame and append the class-name column as the last column in one expression.
test_data = pd.DataFrame(data=test_data).assign(Label=test_labels)

In [24]:
test_data.shape

(13800, 1025)

# Applying PCA

In [25]:
# Stack train, CV and test rows (in that order) into one frame.
# ignore_index=True gives the result a fresh 0..n-1 row index; without it each part keeps its
# own 0-based index, so the same index label would refer to up to three different rows.
data = pd.concat([train_data,cv_data,test_data],ignore_index=True)

In [26]:
data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1015,1016,1017,1018,1019,1020,1021,1022,1023,Label
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,character_10_yna
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,character_10_yna
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,character_10_yna
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,character_10_yna
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,character_10_yna
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13795,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,digit_9
13796,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,digit_9
13797,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,digit_9
13798,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,digit_9


In [27]:
# Copy the class names into a NumPy column vector of shape (n_rows, 1).
label_column = data['Label']
data_labels = np.array(label_column).reshape(-1,1)

In [28]:
data_labels.shape

(92000, 1)

In [55]:
# Keep only the 1024 pixel columns (positions 0-1023); the trailing 'Label' column is left out.
X = data.iloc[:,0:1024]

In [56]:
# X

In [57]:
# Convert the feature frame to a plain NumPy array for the matrix arithmetic that follows.
X = np.array(X)

In [58]:
X

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [59]:
# Column-wise mean: one average value per pixel feature, taken over all rows of X.
mu = np.mean(X,axis=0)

In [60]:
mu.shape

(1024,)

In [61]:
# Turn the mean into a single row vector of shape (1, n_features).
# reshape(1, -1) is idempotent: re-running this cell leaves a (1, n) array unchanged, whereas
# reshape(-1, mu.shape[0]) would flip an already reshaped (1, n) array into (n, 1).
mu = mu.reshape(1,-1)

In [62]:
mu.shape

(1, 1024)

In [63]:
# Centre the data: subtract the per-feature mean from every row (the mean is broadcast across rows).
X_dash = X - np.mean(X,axis=0)

In [64]:
X_dash

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [65]:
X_dash.shape

(92000, 1024)

In [66]:
# Same centring, this time with the stored row vector mu; used only to cross-check X_dash.
X_DASH = X - mu

In [67]:
# Sanity check: prints "true" only when the two centred matrices are element-wise identical.
if (X_dash == X_DASH).all():
    print("true")

true


In [68]:
# Covariance matrix of the features: (1/n) * X_dash^T X_dash, with n = number of rows of data.
# X_dash is mean-centred, so this is the divide-by-n (biased) sample covariance.
# NOTE(review): X was built from the train, CV and test rows combined, so these statistics
# (and the mean used for centring) also include the held-out data.
sigma_hat = (1/data.shape[0])*np.matmul(X_dash.T,X_dash)

In [69]:
sigma_hat.shape

(1024, 1024)

In [70]:
# Applying singular value decomposition to the covariance matrix

In [71]:
# np.linalg.svd returns three factors for sigma_hat: U, the singular values, and Vh.
sigma_hat_decompose = np.linalg.svd(sigma_hat)

In [72]:
len(sigma_hat_decompose)

3

In [73]:
# First SVD factor (U); its columns are used as the projection directions.
Q = sigma_hat_decompose[0]

In [74]:
Q.shape

(1024, 1024)

In [200]:
# Keep the first 100 columns of Q. np.linalg.svd orders singular values from largest to
# smallest, so these are the 100 leading directions.
Q_tilda = Q[:,0:100]

In [201]:
Q_tilda.shape

(1024, 100)

In [202]:
# Project the centred data onto the retained directions: (n_rows, 1024) @ (1024, 100).
X_new = X_dash @ Q_tilda

In [203]:
X_new.shape

(92000, 100)

In [204]:
# Put the projected features into a DataFrame so the class names can be attached next to them.
new_data = pd.DataFrame(data=X_new)

In [205]:
# Re-attach the class names; rows of X_new are in the same order as rows of data.
# data_labels was reshaped to a single-column 2-D array when it was created.
new_data['Label'] = data_labels

In [206]:
new_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,Label
0,1.414902,-1.031828,0.724808,1.792487,1.230461,-1.463575,-0.441179,2.028708,0.246965,-0.639972,...,-0.530323,0.720323,-0.273598,0.014296,0.253026,-0.656136,0.22436,-0.619588,0.203224,character_10_yna
1,2.724355,-0.228752,0.904359,2.460131,2.124608,0.123733,0.113514,1.210024,0.608754,0.112907,...,0.095393,0.051793,0.174786,0.607917,-0.03257,0.45263,0.632624,0.016626,-0.21858,character_10_yna
2,1.750747,0.401532,-2.698068,1.442509,1.538423,-0.314052,-0.370063,1.407778,3.653632,2.662185,...,-0.132746,-0.676613,-0.703744,0.098502,1.530413,0.267359,0.506167,0.313099,0.053379,character_10_yna
3,2.610647,-2.533,-0.971774,0.879262,3.542826,0.043539,-1.996213,0.302775,1.27398,0.678326,...,0.103607,0.525959,0.224528,0.462632,-0.466329,-0.693745,-0.138779,0.5159,0.319045,character_10_yna
4,2.822666,-1.521463,0.974275,1.029498,1.76163,-0.134133,-0.297633,0.947978,2.245895,0.050705,...,-0.152932,-0.258581,0.566438,-0.135783,-0.528918,0.902679,-0.001463,0.31078,0.230328,character_10_yna


In [207]:
new_data.shape

(92000, 101)

In [208]:
# Cut the projected frame back into its three parts by row position:
# training rows first, then CV rows, then test rows.
n_train_rows = train_data.shape[0]
cv_end = n_train_rows + cv_data.shape[0]

new_train_df = new_data.iloc[:n_train_rows,:]
new_cv_df = new_data.iloc[n_train_rows:cv_end,:]
new_test_df = new_data.iloc[cv_end:,:]

In [209]:
new_train_df.shape

(62560, 101)

In [210]:
new_cv_df.shape

(15640, 101)

In [211]:
new_test_df.shape

(13800, 101)

# Let's implement a Gaussian Bayes classifier from scratch on the training dataset (note: each class uses a full covariance matrix, so it is not strictly "naive" Bayes)

In [212]:
# list_mu_sigma holds one [mean vector, covariance matrix] pair per class,
# stored in the same order as unique_labels.
list_mu_sigma = list()

for folder_name in unique_labels:

    # Select the rows of this class once and drop the string 'Label' column explicitly.
    # DataFrame.mean()/cov() raise on non-numeric columns when numeric_only is False
    # (the default from pandas 2.0 on), so the statistics must only see the feature columns.
    class_features = new_train_df.loc[new_train_df['Label'] == folder_name].drop(columns='Label')

    # Per-feature mean of the class, shape (n_features,).
    mu_hat_feature = np.array(class_features.mean())

    # Feature covariance of the class, shape (n_features, n_features).
    sigma_hat_feature = np.array(class_features.cov())

    list_mu_sigma.append([mu_hat_feature, sigma_hat_feature])

In [213]:
# list_mu_sigma

In [214]:
def mock_test(data):
    """Predict a class index for every row of ``data``.

    Each row is scored under every per-class Gaussian stored in the module-level
    ``list_mu_sigma`` (one ``[mean, covariance]`` pair per class). The row is
    assigned to the class with the highest likelihood.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose leading columns are the features; only the first
        ``len(list_mu_sigma[0][0])`` columns are used, any trailing columns
        (such as ``'Label'``) are ignored.

    Returns
    -------
    numpy.ndarray
        One class index per row; indices refer to positions in ``list_mu_sigma``.
    """
    # Take the feature count from the fitted mean vector instead of hard-coding it.
    n_features = len(list_mu_sigma[0][0])
    inputs = np.array(data.iloc[:,0:n_features])

    # Log-densities are used because plain densities underflow to 0.0 in high dimensions,
    # which would make argmax fall back to class 0 whenever every class underflows.
    # The logarithm is monotonic, so the winning class is otherwise unchanged.
    # np.atleast_1d keeps a single-row input working (logpdf returns a scalar then).
    log_likelihoods = [
        np.atleast_1d(s.multivariate_normal.logpdf(inputs,class_mean,class_cov))
        for class_mean, class_cov in list_mu_sigma
    ]

    # Shape (n_rows, n_classes); built once after all classes have been scored.
    class_scores = np.array(log_likelihoods).T

    return np.argmax(class_scores,axis=1)


# Testing the algorithm on cross-validation (CV) data

In [215]:
# Predicted class index for every cross-validation row.
CV_predicted = mock_test(new_cv_df)

In [216]:
CV_predicted

array([ 0,  0,  0, ..., 45, 45, 45], dtype=int64)

In [217]:
# Encode every true label as its position in unique_labels, the same indexing the classifier uses.
# An explicit mapping avoids hard-coding the number of classes and does not rely on
# Series.replace converting a string column to integers.
label_to_index = {label: position for position, label in enumerate(unique_labels)}
CV_actual = new_cv_df['Label'].map(label_to_index)

In [218]:
# Fraction of CV rows whose predicted class index equals the true class index.
CV_accuracy = np.count_nonzero(CV_actual == CV_predicted)/new_cv_df.shape[0]

In [219]:
CV_accuracy

0.8204603580562659

# Calculating the accuracy on the test data

In [220]:
# Predicted class index for every test row.
Test_predicted = mock_test(new_test_df)

In [221]:
Test_predicted.shape

(13800,)

In [222]:
# Encode every true label as its position in unique_labels, the same indexing the classifier uses.
# An explicit mapping avoids hard-coding the number of classes and does not rely on
# Series.replace converting a string column to integers.
label_to_index = {label: position for position, label in enumerate(unique_labels)}
Test_actual = new_test_df['Label'].map(label_to_index)

In [230]:
# Share of test rows predicted correctly, formatted as a percentage string.
test_hit_rate = np.count_nonzero(Test_actual == Test_predicted)/new_test_df.shape[0]
Test_accuracy = str(test_hit_rate*100)+' %'

In [231]:
Test_accuracy

'90.09420289855072  %'