In [None]:
# Copyright (c) 2022 Chloe Woolwine

A K-Nearest Neighbor Classifier using a Diabetic Retinopathy data set. Classifies into likely to have (1) versus unlikely to have (0)

In [2]:

import pandas as pd
import numpy as np
import time

In [4]:
def get_data(filename):
    data = pd.read_csv(filename, header=None)
    
    return data

print("\n\nTest KNN: ")

train_set = get_data("sample_train_KNN.txt")
test_set = get_data("sample_test_KNN.txt")

print("train_data shape=", train_set.shape)
print("test_data shape=", test_set.shape)

print("\nQ1.\nValues before norm:")
print("Train:", list(train_set.iloc[0,:]))
print("Test:", list(test_set.iloc[0,:]))



Test KNN: 
train_data shape= (25, 20)
test_data shape= (1, 20)

Q1.
Values before norm:
Train: [0.0, 1.0, 56.0, 53.0, 50.0, 48.0, 44.0, 34.0, 48.811069, 23.4443, 11.447292, 2.439939, 0.173514, 0.0, 0.0, 0.0, 0.487553, 0.087525, 0.0, 0.0]
Test: [1.0, 1.0, 40.0, 38.0, 33.0, 25.0, 20.0, 12.0, 73.082699, 23.121256, 13.093588, 5.437382, 3.845287, 2.028783, 0.518568, 0.10715, 0.527112, 0.105129, 1.0, 1.0]


Function to print a confusion matrix for easy data analysis


In [5]:
def print_confusion_matrix(TP, FN, FP, TN):
    table_data = [[TP,FN],[FP,TN]]
    df = pd.DataFrame(table_data, columns =['Predicted 1','Predicted 0'])
    df = df.rename(index={0: 'Actual 1', 1: 'Actual 0'})
    display(df)


Normalizes data so each feature is between [0-1]

In [6]:
def normalize_data(train, test):
    train_norm = (train - train.min())/(train.max() - train.min())
    test_norm = (test - train.min())/(train.max() - train.min())
    return train_norm, test_norm

train_set = get_data("sample_train_KNN.txt")
test_set = get_data("sample_test_KNN.txt")


#normalize the data
train_set, test_set = normalize_data(train_set, test_set)

print("\nValues after norm:")
print("Train:", list(train_set.iloc[0,:]))
print("Test:", list(test_set.iloc[0,:]))


Values after norm:
Train: [0.0, 1.0, 0.5161290322580645, 0.569620253164557, 0.6470588235294118, 0.65625, 0.6060606060606061, 0.5, 0.1860328709289841, 0.4204396404200586, 0.3544116090682406, 0.21814933392604677, 0.04763746830716687, 0.0, 0.0, 0.0, 0.07254881377283225, 0.37045307443365716, 0.0, 0.0]
Test: [1.0, 1.0, 0.34408602150537637, 0.379746835443038, 0.39705882352941174, 0.296875, 0.24242424242424243, 0.14516129032258066, 0.28531683293159166, 0.4145704766038485, 0.40562660799714984, 0.4861438181862236, 1.055705808144938, 1.1130298772683358, 0.4579189276300395, 0.34138011233842985, 0.48782280075582624, 0.6553074433656959, 1.0, 1.0]



Euclidean distance calculation. Utilizes the functionality of a DataFrame to vectorize the calculation for speed. 

In [9]:
def get_distances(point, df):
    nolabel = df.loc[:, df.columns != 19]
    out = (nolabel[:] - point[:19]).pow(2).sum(1).pow(.5)
    df['distance'] = out
    return df

# show distance from 0th test record to each of the 25 training records
dist = get_distances(list(test_set.iloc[0,:]), train_set)
print(dist['distance'])
#print(dist)

0      54.587466
1      68.178668
2      64.764976
3      70.110928
4      77.023575
5      70.425542
6      64.642334
7      42.529562
8      93.458977
9      16.170888
10     61.211608
11     70.363062
12     53.948781
13     82.947397
14    182.018275
15     32.691960
16     36.760323
17     35.379534
18     72.648410
19    122.491586
20     82.804374
21     80.976125
22     61.606595
23     60.032295
24    109.108442
Name: distance, dtype: float64



Takes in training set, test set, and K and puts everything together to run the KNN algorithm. Returns the predicted labels as a list. 

In [10]:
def run_knn(train_set, test_set, k):
    train, test = normalize_data(train_set,test_set)
    preds = []

    for i in range (len(test_set)):
        distances = get_distances(test.iloc[i,:], train)
        leastK = distances.nsmallest(k,'distance')
        leastK = leastK[[19, 'distance']]
        vals = leastK[19].value_counts()
        #print(vals.keys()[0])
        if(len(vals.keys()) == 1):
            preds.append(vals.keys()[0])
        else:
            pos = vals.loc[1]
            neg = vals.loc[0]
            #print("positive: ", pos, " negative", neg)
            if(pos >= neg):
                preds.append(1)
            else:
                preds.append(0)



    return preds

# Re-read data (to un-do normalization)
train_set = get_data("sample_train_KNN.txt")
test_set = get_data("sample_test_KNN.txt")

# classify one test record with k=3
#one_test_record = pd.DataFrame(test_set.iloc[0]).transpose()
preds = run_knn(train_set, test_set, k=3)
# need to un-comment printouts in run_knn() to see results
#print(preds[19].value_counts().count())
print(preds)

[0]


Inner loop of a nested cross validation to find the best value of K between 1 and 10. Uses 5-fold cross validation to evaluate accuracy. Each fold is divided into training and validation. 

In [12]:
def calc_accuracy(preds, actual):
    #print(preds)
    #print(actual)     
    correct = 0
    for i in range(len(preds)):
        if preds[i] == actual.iloc[i]:
            correct += 1
    return correct / len(preds)

def partition_data(data, testfold, totalfolds):
    train_set = None
    test_set = None

    foldsize = int(len(data)/totalfolds)
    start = foldsize * testfold
    end = start + foldsize

    test_set = data.iloc[start:end,:]
    train_set = data.drop(range(start,end))

    test_set.index = range(len(test_set.index))
    train_set.index = range(len(train_set.index))
    #print ('Training set size:', len(train_set))
    #print ('Test set size    :', len(test_set))
    return train_set, test_set

def find_best_k(data):
    numfolds = 5
    best_accuracy = 0
    best_k = 0
    for k in range(1, 11, 2):
        accuracy_sum = 0
        for i in range(0, numfolds):
            train_set, test_set = partition_data(data, i, numfolds)
            preds = run_knn(train_set, test_set, k)
          #  print("k = ", k, " iteration = ", i, " preds = ")
            accuracy = calc_accuracy(preds, test_set[19])
            accuracy_sum += accuracy

        accuracy_sum = (accuracy_sum/numfolds)
        print("average accuracy for ", k, " is ", accuracy_sum)
        if accuracy_sum > best_accuracy:
            best_accuracy = accuracy_sum
            best_k = k

    print("the best k is ", best_k)
    return best_k


# Re-read data (to un-do normalization)
train_set = get_data("sample_train_KNN.txt")
test_set = get_data("sample_test_KNN.txt")

# find the best k on the training set
k = find_best_k(train_set) 
# need to un-comment printouts in find_best_k() to see results

average accuracy for  1  is  0.6
average accuracy for  3  is  0.44000000000000006
average accuracy for  5  is  0.56
average accuracy for  7  is  0.5599999999999999
average accuracy for  9  is  0.56
the best k is  1


Measures the accuracy of entire classifier using 5-fold cross validation. Final accuracy is obtained by averaging accuracy of each fold


In [None]:
def calc_accuracy_and_confusion(preds, actual):
    #print(preds)
    #print(actual)     
    correct = 0
    truepos = 0
    falsepos = 0
    trueneg = 0
    falseneg = 0
    for i in range(len(preds)):
        if preds[i] == actual.iloc[i]:
            correct += 1
            if preds[i] == 0:
                trueneg += 1
            else:
                truepos += 1
        else:
            if preds[i] == 0:
                falseneg += 1
            else:
                falsepos +=1
    return correct / len(preds), truepos, falsepos, trueneg, falseneg

# read in data
data = get_data('messidor_features.txt')
numfolds = 5
start_time = time.time()

accuracy_sum = 0
truepos = 0
falsepos = 0
trueneg = 0
falseneg = 0

for i in range(5):
    print("fold: ", i)
    train_set, test_set = partition_data(data, i, numfolds)
    k = find_best_k(train_set)
    preds = run_knn(train_set, test_set, k)
    accuracy, tp, fp, tn, fn= calc_accuracy_and_confusion(preds, test_set[19])
    truepos += tp
    falsepos += fp
    trueneg += tn
    falseneg += fn
    accuracy_sum += accuracy
    print("the accuracy for this fold is ", accuracy)

accuracy_sum = accuracy_sum/numfolds
end_time = time.time()
print('\nTotal time:', end_time - start_time)
print("accuracy = ", accuracy_sum)
print("percision = ", truepos / (truepos+falsepos))
print("recall = ", truepos / (truepos+falseneg))
print_confusion_matrix(truepos, falseneg, falsepos, trueneg)


fold:  0
average accuracy for  1  is  0.6152173913043478
average accuracy for  3  is  0.6282608695652174
average accuracy for  5  is  0.6163043478260869
average accuracy for  7  is  0.6402173913043476
average accuracy for  9  is  0.6260869565217391
the best k is  7
the accuracy for this fold is  0.6173913043478261
fold:  1
average accuracy for  1  is  0.5989130434782609
average accuracy for  3  is  0.6152173913043477
average accuracy for  5  is  0.6347826086956522
average accuracy for  7  is  0.6304347826086956
average accuracy for  9  is  0.6217391304347826
the best k is  5
the accuracy for this fold is  0.6217391304347826
fold:  2
average accuracy for  1  is  0.5978260869565217
average accuracy for  3  is  0.5967391304347827
average accuracy for  5  is  0.6076086956521739
average accuracy for  7  is  0.6163043478260869
average accuracy for  9  is  0.6228260869565216
the best k is  9
the accuracy for this fold is  0.6434782608695652
fold:  3
average accuracy for  1  is  0.611956521739

Unnamed: 0,Predicted 1,Predicted 0
Actual 1,363,248
Actual 0,188,351
