KNN Classification
Created by Dylan Bozada

In [1]:
# Importing numpy with alias of np
import numpy as np

# Bringing in default_rng to make data to work with 
from numpy.random import default_rng

# Alias for linalg
from numpy import linalg as LA

In [2]:
# Code to be ran with following numerical parameters
TOTAL_SAMPLE_SIZE = 10
n = 7
d = 3
K = 3
m = 5
s = 2

In [3]:
# For loop to produce two runs with two different seeds
for i in range(20):
    rng = np.random.default_rng(np.random.randint(1, high=10))
    
    print(f'Run {i+1}:\n')
    
    # Creating randomly generated labeled data points with size of TOTAL_SAMPLE_SIZE
    SAMPLE_DATA = rng.normal(loc=m, scale=s, size=(TOTAL_SAMPLE_SIZE,d))
    LABELS = np.array(['GOOD','BAD','UGLY'])
    CLASSES = rng.choice(LABELS, (TOTAL_SAMPLE_SIZE,1))
    SAMPLE_DATA_CLASSES = np.concatenate((SAMPLE_DATA, CLASSES),axis = 1)

    # Data structures holding all the training data and labels
    x_train = SAMPLE_DATA_CLASSES[:n, 0:d].astype('f')
    y_train = SAMPLE_DATA_CLASSES[:n, d]

    # Printing to screen the training data, and training labels for output
    print('Training Data:\n', x_train)
    print('\nTraining Labels:\n', y_train)

    # Data structures holding all the test data and labels
    x_test = SAMPLE_DATA_CLASSES[n:, 0:d].astype('f')
    y_test = SAMPLE_DATA_CLASSES[n:, d]

    # Printing to screen the test data, and predicted test labels for output
    print('\nTest Data:\n', x_test)
    print('\nPredicted Test Labels:\n', y_test)

    # Function for weight of distances
    def weight(x):
        return 1 / (0.001 + x)

    # Creating array of distances to calculate KNN
    dist = np.array([LA.norm(np.subtract(x_test[0], x_train[:]), axis = 1)])

    for i in range(1, len(x_test)):
        dist = np.append(dist, [LA.norm(np.subtract(x_test[i], x_train[:]), axis=1)], axis = 0)

    # Applying weight function to array of distances
    wght_dist = weight(dist)

    # Function to find the labels of the KNN's
    def findKNN(arr):
        return y_train[np.argsort(arr)[:K]]

    # Function to find labels that appear the most among the KNN's
    def findLabel(arr):
        values, counts = np.unique(arr, return_counts = True)
        return values[np.argmax(counts)]

    # Creating count to increment by 1 to calculate success rate
    count = 0

    # For loop that will change the label of test data if predicted incorrectly
    for i in range(len(y_test)):     
        if findLabel(findKNN(wght_dist[i])) != y_test[i]:
            y_test[i] = findLabel(findKNN(wght_dist[i]))
            count += 1
        else:
            continue

    # Printing to screen the known labels of test data
    print('\nActual Test Labels:\n', y_test)

    if count == (TOTAL_SAMPLE_SIZE - n):
        print('\nSuccess Rate: 0%\n')
    else:
        print(f"\nSuccess Rate: {(count/len(y_test)):.2f}%\n")
        
    print('****************************************************************')

Run 1:

Training Data:
 [[5.0024605 5.5974913 4.451724 ]
 [3.2188163 4.0906587 3.016707 ]
 [5.1202874 7.6804304 4.015587 ]
 [3.7590501 5.979684  5.713774 ]
 [5.2108283 3.1390638 4.9414964]
 [6.3906064 2.311571  4.0847683]
 [1.1975545 2.4209244 1.3165299]]

Training Labels:
 ['GOOD' 'BAD' 'BAD' 'GOOD' 'UGLY' 'GOOD' 'GOOD']

Test Data:
 [[ 4.5298176   2.465107    5.5425286 ]
 [ 5.3135023   4.626138   -0.03351942]
 [ 3.922614    4.902998    5.226618  ]]

Predicted Test Labels:
 ['GOOD' 'UGLY' 'UGLY']

Actual Test Labels:
 ['GOOD' 'BAD' 'GOOD']

Success Rate: 0.67%

****************************************************************
Run 2:

Training Data:
 [[3.3961372 2.3512821 4.503277 ]
 [5.8408904 7.2720933 5.219413 ]
 [3.8947053 3.4304392 6.4974914]
 [8.269567  5.5455375 2.5333426]
 [3.0834696 8.200038  5.405765 ]
 [1.5357304 4.8326077 2.673548 ]
 [3.7414238 4.0239882 3.5733733]]

Training Labels:
 ['UGLY' 'UGLY' 'UGLY' 'GOOD' 'GOOD' 'BAD' 'BAD']

Test Data:
 [[6.106757  4.873828  3.82113