In [1]:
import cv2
import pandas as pd
import numpy as np

def preprocessing(val=True):
    
    data = pd.read_csv('data/x_train_gr_smpl.csv')
    labels = pd.read_csv('data/y_train_smpl.csv')

    if(val):
        ## ---------------- Data preparation ---------------- ##
        X_train = []
        for i in range(data.shape[0]):
            img = np.uint8(data.iloc[i])
            edited = cv2.Canny(img, 10, 30)
            edited = cv2.GaussianBlur(edited, (5, 5), 0)
            X_train.append(edited.reshape((1,-1))[0])

        data = pd.DataFrame(X_train)
        ## -------------------------------------------------- ##

    return data, labels

In [2]:
# Preparing the data without preprocessing

from sklearn.model_selection import train_test_split

data, labels = preprocessing(False)
data['label'] = labels

X = data.iloc[:, :-1]
y = data['label']

# Point 2: Data Randomization
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

In [3]:
# Point 4: Classification with Naive Bayes

from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix, classification_report
gnb = GaussianNB()

y_pred = gnb.fit(X_train, y_train).predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.29      0.16      0.21       462
           1       0.24      0.24      0.24       607
           2       0.09      0.77      0.15       133
           3       0.78      0.36      0.49       442
           4       0.73      0.19      0.30       685
           5       0.49      0.46      0.47       724
           6       0.41      0.61      0.49       258
           7       0.10      0.57      0.17        77
           8       0.70      0.09      0.16       699
           9       0.12      0.23      0.16        91

    accuracy                           0.29      4178
   macro avg       0.40      0.37      0.28      4178
weighted avg       0.51      0.29      0.31      4178



In [4]:
# Preparing the data with preprocessing

from sklearn.model_selection import train_test_split

data, labels = preprocessing()
data['label'] = labels

X = data.iloc[:, :-1]
y = data['label']

# Point 2: Data Randomization
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

In [5]:
# Point 4: Classification with Naive Bayes

from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix, classification_report
gnb = GaussianNB()

y_pred = gnb.fit(X_train, y_train).predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.51      0.38      0.44       479
           1       0.45      0.45      0.45       607
           2       0.22      0.74      0.34       139
           3       0.78      0.85      0.82       463
           4       0.84      0.85      0.85       675
           5       1.00      0.81      0.89       713
           6       0.74      0.82      0.78       245
           7       0.69      0.77      0.73        70
           8       0.96      0.74      0.83       691
           9       0.79      0.83      0.81        96

    accuracy                           0.71      4178
   macro avg       0.70      0.72      0.69      4178
weighted avg       0.76      0.71      0.72      4178



In [6]:
confusion_matrix(y_test, y_pred)

array([[184, 154,  60,  35,   1,   0,  31,  10,   3,   1],
       [127, 274, 117,  23,   9,   0,  38,   6,   2,  11],
       [ 11,   9, 103,   8,   3,   0,   0,   0,   5,   0],
       [  5,  55,   9, 393,   1,   0,   0,   0,   0,   0],
       [  1,  55,  28,  12, 576,   0,   0,   1,   2,   0],
       [  0,  26,  49,   0,  59, 575,   0,   0,   4,   0],
       [ 29,   8,   1,   6,   0,   0, 201,   0,   0,   0],
       [  0,   0,   7,   0,   1,   0,   0,  54,   0,   8],
       [  6,  25,  86,  22,  36,   0,   0,   6, 509,   1],
       [  0,   4,   3,   2,   2,   0,   0,   1,   4,  80]])

In [7]:
# Trying point 4 without randomizing the data

from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report


data, labels = preprocessing()
data['label'] = labels

samples_per_label = int((len(data) * 0.66) / 10)

counters = np.zeros(10)

data = data.sample(frac=1)
selected_train = []
selected_test = []

for _, dat in data.iterrows():
    l = dat['label'] 
    if(counters[l] < samples_per_label):
        selected_train.append(dat)
        counters[l] = counters[l] + 1
    else:
        selected_test.append(dat)
        
df_train = pd.DataFrame(data=selected_train, columns=data.columns)
X_train = df_train.iloc[:, :-1]
y_train = df_train['label']

df_test = pd.DataFrame(data=selected_test, columns=data.columns)
X_test = df_test.iloc[:, :-1]
y_test = df_test['label']

gnb = GaussianNB()
y_pred = gnb.fit(X_train, y_train).predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.49      0.39      0.43       575
           1       0.53      0.50      0.51      1025
           2       0.00      0.00      0.00         0
           3       0.75      0.90      0.82       485
           4       0.86      0.83      0.84      1265
           5       1.00      0.81      0.90      1325
           6       0.00      0.00      0.00         0
           7       0.00      0.00      0.00         0
           8       0.98      0.71      0.82      1235
           9       0.00      0.00      0.00         0

   micro avg       0.71      0.71      0.71      5910
   macro avg       0.46      0.41      0.43      5910
weighted avg       0.81      0.71      0.75      5910

