### Import libraries and data

In [1]:
import csv #needed to import the CSV file
import math #needed for using the square root operation
from datetime import datetime #needed to measure the execution time

# Load the training data from the Kaggle "Digit Recognizer" CSV file
# (https://www.kaggle.com/c/digit-recognizer/data).
# NOTE: the rest of this notebook expects the "label" column to be the LAST
# column, so it has to be moved to the end of the table before loading.
# Every cell is read as a string; row 0 holds the column names.
with open('digit_recognizer-train.csv', newline='') as f:
    reader = csv.reader(f)
    dataset = [record for record in reader]

### Convert dataset element types from string to integer

In [2]:
# Cast every cell of every data row from str to int, in place.
# Row 0 is skipped because it contains the column names, not numbers.
for row in range(1, len(dataset)):
    cells = dataset[row]
    for column, text in enumerate(cells):
        cells[column] = int(text)

### Convert the dataset to a pandas DataFrame to better visualize it and remove redundant columns

In [3]:
from pandas import DataFrame

# Row 0 of `dataset` supplies the column names; the remaining rows are the samples.
df = DataFrame(data=dataset[1:], columns=dataset[0])

# Keep only the columns that contain at least one non-zero value; a column that
# is 0 for every sample carries no information for the distance computation.
# (Original author's note: this brings the column count down from 785 to 662.)
has_nonzero_value = df.ne(0).any(axis=0)
df = df.loc[:, has_nonzero_value]

# Convert the filtered data back to a plain list of rows (the header row is gone now).
dataset = df.values.tolist()

In [4]:
# Show per-column summary statistics (count, mean, std, min, quartiles, max)
# of the filtered DataFrame as a quick sanity check of the loaded data.
df.describe()

Unnamed: 0,pixel12,pixel13,pixel14,pixel15,pixel32,pixel33,pixel34,pixel35,pixel36,pixel37,...,pixel771,pixel772,pixel773,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,label
count,42000.0,42000.0,42000.0,42000.0,42000.0,42000.0,42000.0,42000.0,42000.0,42000.0,...,42000.0,42000.0,42000.0,42000.0,42000.0,42000.0,42000.0,42000.0,42000.0,42000.0
mean,0.003,0.01119,0.005143,0.000214,0.000381,0.00131,0.010548,0.027262,0.050905,0.066405,...,0.60281,0.489238,0.340214,0.219286,0.117095,0.059024,0.02019,0.017238,0.002857,4.456643
std,0.56812,1.626927,1.053972,0.043916,0.078072,0.232634,1.131661,2.310396,3.121847,3.259128,...,10.69603,9.480066,7.950251,6.31289,4.633819,3.274488,1.75987,1.894498,0.414264,2.88773
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0
max,116.0,254.0,216.0,9.0,16.0,47.0,157.0,254.0,255.0,243.0,...,255.0,255.0,255.0,254.0,254.0,253.0,253.0,254.0,62.0,9.0


### Make predictions

In [5]:
def get_eucldistance(point1, point2):
    """Return the Euclidean distance between two data points.

    The last element of ``point1`` is excluded from the calculation (in this
    notebook it holds the label), so only the first ``len(point1) - 1``
    coordinates of the two points are compared.

    Args:
        point1: indexable sequence of numbers followed by one trailing element.
        point2: indexable sequence with at least ``len(point1) - 1`` numbers.

    Returns:
        The distance as a float (0.0 when there are no coordinates to compare).
    """
    feature_count = len(point1) - 1  # ignore the trailing (label) element
    sum_of_squares = 0.0
    for idx in range(feature_count):
        sum_of_squares += (point1[idx] - point2[idx]) ** 2
    return math.sqrt(sum_of_squares)

def get_kneighbors(datapoint, dataset, kvalue):
    """Find the ``kvalue`` rows of ``dataset`` that are closest to ``datapoint``.

    Distances are computed with ``get_eucldistance``.

    Args:
        datapoint: the row whose neighbours are wanted.
        dataset: iterable of candidate rows.
        kvalue: number of neighbours to return.

    Returns:
        A list of ``(row, distance)`` tuples sorted by ascending distance,
        truncated to the first ``kvalue`` entries. Rows with equal distance
        keep their original relative order (the sort is stable).
    """
    scored_rows = [
        (candidate, get_eucldistance(datapoint, candidate))
        for candidate in dataset
    ]
    ranked = sorted(scored_rows, key=lambda pair: pair[1])
    return ranked[:kvalue]

def get_commonlabel(kneighbors):
    """Return the most frequent label among the given neighbours.

    Args:
        kneighbors: sequence of ``(row, distance)`` pairs, as returned by
            ``get_kneighbors``. The label of each neighbour is the last
            element of its ``row``.

    Returns:
        The label with the highest number of occurrences. When several labels
        share the highest count, the one that appears first in ``kneighbors``
        wins. Because ``get_kneighbors`` returns its result sorted by
        ascending distance, a tie is therefore decided by the closest
        neighbour instead of by arbitrary set ordering.

    Raises:
        ValueError: if ``kneighbors`` is empty.
    """
    # A dict keeps insertion order, so labels are stored in the order in which
    # they are first seen (i.e. nearest neighbour first).
    label_counts = {}
    for row in kneighbors:
        label = row[0][-1]
        label_counts[label] = label_counts.get(label, 0) + 1
    if not label_counts:
        raise ValueError("get_commonlabel() needs at least one neighbor")
    # max() returns the first maximal key it encounters, which gives the
    # "first seen wins" tie-break described above.
    return max(label_counts, key=label_counts.get)

def performpredictions(dataset, k_neighbors, validationsetlength, positive_label=6):
    """Evaluate the KNN classifier on a hold-out split with one-vs-rest counts.

    The first ``validationsetlength`` rows of ``dataset`` are used as the
    validation set and the remaining rows as the reference ("training") set.
    For every validation row the predicted label is the most common label
    among its ``k_neighbors`` nearest reference rows. The label of a row is
    its last element.

    Outcomes are counted with ``positive_label`` as the positive class:

    * TP - actual is positive and the prediction is positive
    * FN - actual is positive but something else was predicted
    * FP - actual is not positive but positive was predicted
    * TN - actual is not positive and the prediction is not positive
      (even if the predicted digit differs from the actual one)

    Args:
        dataset: list of rows (features followed by the label).
        k_neighbors: number of neighbours that vote on the prediction.
        validationsetlength: number of leading rows used for validation.
        positive_label: label treated as the positive class (default 6).

    Returns:
        ``[TP, TN, FP, FN, 1]``. The trailing ``1`` is the number of
        evaluation runs that were aggregated; it is kept so the shape of the
        result stays the same for existing callers.
    """
    validationdata = dataset[:validationsetlength]
    trainingdata = dataset[validationsetlength:]
    TP = TN = FP = FN = 0
    for i, row in enumerate(validationdata, start=1):
        actual = row[-1]
        prediction = get_commonlabel(get_kneighbors(row, trainingdata, k_neighbors))
        if actual == positive_label:
            if prediction == positive_label:
                TP += 1
            else:
                # a positive sample that was missed is a false negative
                FN += 1
        else:
            if prediction == positive_label:
                # a negative sample flagged as positive is a false positive
                FP += 1
            else:
                TN += 1
        # report progress only after the row has really been processed
        if i % 100 == 0:
            print(f"Now completed line: {i} at {datetime.now()}")
    return [TP, TN, FP, FN, 1]

# --- Experiment configuration ---
validationpercent = 0.1          # fraction of the kept samples used for validation
numberofsamplestokeep = 20000    # only the first N samples are used to limit run time
k_neighbors = 4                  # number of neighbours that vote on each prediction

dataset = dataset[:numberofsamplestokeep]
validationsetlength = int(validationpercent*len(dataset))


def _format_ratio(numerator, denominator):
    """Return numerator/denominator rounded to 2 decimals as text, or "n/a" if the denominator is 0."""
    if denominator == 0:
        return "n/a"
    return str(round(numerator / denominator, 2))


# --- Run the classifier and measure how long it takes ---
tabledata = list()
print(f"For k={k_neighbors}")
print(f"KNN classifier started at {datetime.now()} with K = {k_neighbors}")
start = datetime.now()
metrics = performpredictions(dataset, k_neighbors, validationsetlength)
end = datetime.now()

# performpredictions returns [TP, TN, FP, FN, ...]
tp, tn, fp, fn = metrics[:4]
tabledata.append((
    "Euclidean",
    str(round((end - start).total_seconds(), 2)),
    tp,
    tn,
    fp,
    fn,
    _format_ratio(tp + tn, tp + tn + fp + fn),  # accuracy
    _format_ratio(tp, tp + fn),                 # sensitivity (true positive rate)
    _format_ratio(tn, tn + fp),                 # specificity (true negative rate)
))

# --- Print the results as a right-aligned table, 12 characters per column ---
tableheader = ["Dist. Metric", "Duration(s)", "TP", "TN", "FP", "FN", "Accuracy", "Sensitivity", "Specificity"]

format_row = "{:>12}" * (len(tableheader))
print(format_row.format(*tableheader))

for row in tabledata:
    print(format_row.format(*row))


For k=4
KNN classifier started at 2021-03-05 20:37:27.448534 with K = 4
Now completed line: 100 at 2021-03-05 20:45:24.166071
Now completed line: 200 at 2021-03-05 20:53:40.543481
Now completed line: 300 at 2021-03-05 21:02:01.847858
Now completed line: 400 at 2021-03-05 21:10:21.163435
Now completed line: 500 at 2021-03-05 21:18:47.867542
Now completed line: 600 at 2021-03-05 21:27:08.277224
Now completed line: 700 at 2021-03-05 21:35:20.464059
Now completed line: 800 at 2021-03-05 21:43:42.329221
Now completed line: 900 at 2021-03-05 21:51:45.429341
Now completed line: 1000 at 2021-03-05 21:59:41.828718
Now completed line: 1100 at 2021-03-05 22:07:51.463504
Now completed line: 1200 at 2021-03-05 22:16:08.981513
Now completed line: 1300 at 2021-03-05 22:24:16.730789
Now completed line: 1400 at 2021-03-05 22:32:26.272118
Now completed line: 1500 at 2021-03-05 22:40:29.907565
Now completed line: 1600 at 2021-03-05 22:48:27.348284
Now completed line: 1700 at 2021-03-05 22:56:28.500010
No