You should use the training and test files from the classification folder in data.zip for Part 2.<br>

The objective of this section is to investigate the performance of a k-NN and a distance weighted variant and to write a report documenting your findings.
<br>

#### (a) You should implement a distance-weighted variant of the k-NN algorithm you developed in part 1. In your report include the performance achieved by the distance-weighted k-NN for k=10.


In [546]:
import numpy as np
import math

In [547]:
# Read both CSV files into 2-D float arrays.
train_dataset = np.genfromtxt('trainingData_classification.csv', delimiter=',')
test_dataset = np.genfromtxt('testData_classification.csv', delimiter=',')

# Column 10 is used as the class label of each row.
train_class = train_dataset[:, 10]
test_class = test_dataset[:, 10]

# Every column except column 10 forms the feature matrix.
train_data = np.delete(train_dataset, 10, axis=1)
test_data = np.delete(test_dataset, 10, axis=1)

In [548]:
def calculateDistances(query_instance, feature_list):
    """Measure how far one query point lies from every stored point.

    Args:
        query_instance: 1-D array of feature values for the query.
        feature_list: 2-D array with one stored instance per row.

    Returns:
        A pair ``(distances, order)``: the Euclidean distance from the query
        to each row of ``feature_list``, and the row indices arranged from
        the nearest row to the farthest.
    """
    offsets = feature_list - query_instance
    squared_sums = (offsets ** 2).sum(axis=1)
    distances = np.sqrt(squared_sums)
    nearest_first = np.argsort(distances)
    return distances, nearest_first

In [549]:
k = 10

In [550]:
results = np.apply_along_axis(calculateDistances,1, test_data,train_data)

In [551]:
# calculateDistances returns (distances, argsort indices) for each test row, so
# results[:, 0] holds the distances and results[:, 1] the training-row indices
# ordered nearest first. The indices are stored alongside the float distances
# in one array, hence the cast back to integers before they are used to index.
sorted_indicies = results[:,1].astype('int32') 
distances = results[:,0]

In [552]:
# Indices (into the training set) of the k nearest neighbours of each test
# instance; despite the name, this holds indices, not distances.
minimum_dist = sorted_indicies[:,:k]

In [553]:
sorted_indicies

array([[ 792, 1037, 1518, ..., 2979,  534, 2068],
       [3337, 2946, 3417, ..., 2702, 2979, 2068],
       [1378,  836, 3230, ...,  799, 2068, 2979],
       ...,
       [3659, 1186,  531, ...,  799, 3891, 2068],
       [2655, 3191,  466, ...,  534, 2979, 2068],
       [3666,  137,   66, ..., 2068, 2023, 1458]])

In [554]:
distances

array([[0.45730647, 0.40561773, 0.43496813, ..., 0.32210837, 0.46737854,
        0.86729724],
       [0.54662896, 0.4939321 , 0.64804863, ..., 0.43982632, 0.34213352,
        0.73020761],
       [0.39262847, 0.58098335, 0.5475406 , ..., 0.29384476, 0.5340471 ,
        0.77977997],
       ...,
       [0.58173703, 0.41195646, 0.52525442, ..., 0.52916584, 0.5955721 ,
        0.75141248],
       [0.45165227, 0.40357961, 0.5646753 , ..., 0.33826429, 0.30237948,
        0.76469201],
       [0.31835764, 0.62750192, 0.49928755, ..., 0.40841963, 0.52525653,
        0.61031134]])

In [555]:
test_dist = distances.copy()

In [556]:
# Sort every row ascending in place so that column j holds the distance to the
# j-th nearest neighbour, lining up with column j of sorted_indicies.
test_dist.sort(axis=1)

In [557]:
test_dist

array([[0.13526273, 0.13935928, 0.17230542, ..., 0.95497102, 0.96026204,
        1.19418367],
       [0.17259924, 0.18883668, 0.20437515, ..., 1.01247705, 1.0559992 ,
        1.18839194],
       [0.15332181, 0.15790306, 0.16751189, ..., 1.00494285, 1.00698529,
        1.03300068],
       ...,
       [0.18176714, 0.22278254, 0.23930868, ..., 0.9768479 , 1.00623819,
        1.11600928],
       [0.14034546, 0.15750501, 0.16695484, ..., 0.97279772, 1.00863146,
        1.15867376],
       [0.17756937, 0.18082225, 0.18135623, ..., 0.94800775, 1.00158902,
        1.01078266]])

In [567]:
my_dist = test_dist[:,:k]  

In [568]:
my_dist

array([[0.13526273, 0.13935928, 0.17230542, ..., 0.19052778, 0.19057041,
        0.19785218],
       [0.17259924, 0.18883668, 0.20437515, ..., 0.22015966, 0.22141403,
        0.2246004 ],
       [0.15332181, 0.15790306, 0.16751189, ..., 0.18922303, 0.19033234,
        0.20156609],
       ...,
       [0.18176714, 0.22278254, 0.23930868, ..., 0.25140438, 0.25279925,
        0.25451836],
       [0.14034546, 0.15750501, 0.16695484, ..., 0.18820183, 0.19785646,
        0.19809569],
       [0.17756937, 0.18082225, 0.18135623, ..., 0.2046299 , 0.20964571,
        0.21528287]])

In [560]:
# Class labels of the k nearest training neighbours of each test instance
# (one row per test instance); these are the votes, not yet the final prediction.
prediction = train_class[minimum_dist].astype('int32') 

In [574]:
prediction

array([[0, 0, 0, ..., 0, 0, 2],
       [0, 0, 0, ..., 0, 0, 0],
       [2, 2, 2, ..., 2, 2, 2],
       ...,
       [0, 0, 0, ..., 0, 2, 0],
       [0, 0, 2, ..., 0, 0, 0],
       [2, 1, 1, ..., 1, 1, 2]])

In [614]:
# Distance-weighted k-NN vote: every one of the k nearest neighbours votes for
# its own class with weight 1 / distance**n, and the class with the largest
# total weight becomes the prediction for that test instance.
n = 3  # exponent applied to the distance in the inverse-distance weight

# Labels and distances of the k nearest neighbours, one row per test instance.
neighbour_classes = prediction[:, :k]
neighbour_distances = my_dist[:, :k]

# A neighbour at distance 0 receives an infinite weight, so an exact match
# dominates the vote; the divide-by-zero warning this triggers is silenced.
with np.errstate(divide='ignore'):
    vote_weights = 1.0 / np.power(neighbour_distances, n)

# Total vote weight per class: one column for every label that actually
# occurs among the neighbours (no longer hard-coded to 0, 1 and 2).
class_labels = np.unique(neighbour_classes)
class_scores = np.stack(
    [np.where(neighbour_classes == label, vote_weights, 0.0).sum(axis=1)
     for label in class_labels],
    axis=1,
)

# argmax always selects one of the top-scoring classes (the smallest label on
# an exact tie) instead of silently leaving the prediction at 0, and the
# result is sized from the data rather than a fixed 1000 rows.
zz = class_labels[class_scores.argmax(axis=1)].astype(float)
    

In [617]:
# Calculating the count of correct predictions
correct_prediction = np.count_nonzero(test_class == zz)

In [618]:
# The percentage of correct prediction
percentage =( correct_prediction/len(test_dataset) ) *100

print(f'The model has an accuracy of {percentage} %')

The model has an accuracy of 92.80000000000001 %


#### Part B

### Using scaling on the KNN model to see the improvement in results 

#### On Basic Model

In [398]:

scaling_train_data = train_data.copy()
scaling_test_data = test_data.copy()


In [405]:
scaling_train_data

array([[0.5301304 , 0.62645837, 0.44524917, ..., 0.40806674, 0.46013734,
        0.61033772],
       [0.43333892, 0.46062429, 0.56937923, ..., 0.6451305 , 0.33103777,
        0.35859095],
       [0.60879428, 0.72451976, 0.2661216 , ..., 0.38850336, 0.41685737,
        0.57226228],
       ...,
       [0.43723375, 0.5618769 , 0.56685036, ..., 0.37932588, 0.4918483 ,
        0.4908792 ],
       [0.62399617, 0.35529236, 0.49406175, ..., 0.46282374, 0.42130978,
        0.46383661],
       [0.37743885, 0.39775421, 0.52465763, ..., 0.53548199, 0.38003063,
        0.40968938]])

Compute the min and max of each feature from the training data only, then scale every query (test) instance with those same training values: scaled = (x - min) / (max - min).
Example:
For feature F1, calculate the min and max on the training data,
and when scaling the query instance use that same min and max of F1.

In [411]:
# Per-feature (column-wise) minimum and maximum, taken from the training data only.
min_features_train = np.amin(scaling_train_data, axis=0)
max_features_train = np.amax(scaling_train_data, axis=0)

In [412]:
# min_features_test = np.amin(scaling_test_data, axis=0)
# max_features_test = np.amax(scaling_test_data, axis=0)

In [418]:
scaling_train_data - min_features_train

array([[0.5301304 , 0.57767742, 0.44524917, ..., 0.40806674, 0.42023715,
        0.61033772],
       [0.43333892, 0.41184334, 0.56937923, ..., 0.6451305 , 0.29113757,
        0.35859095],
       [0.60879428, 0.67573881, 0.2661216 , ..., 0.38850336, 0.37695717,
        0.57226228],
       ...,
       [0.43723375, 0.51309595, 0.56685036, ..., 0.37932588, 0.4519481 ,
        0.4908792 ],
       [0.62399617, 0.30651141, 0.49406175, ..., 0.46282374, 0.38140959,
        0.46383661],
       [0.37743885, 0.34897326, 0.52465763, ..., 0.53548199, 0.34013043,
        0.40968938]])

In [421]:
min_features_train

array([0.        , 0.04878095, 0.        , 0.04237391, 0.        ,
       0.        , 0.07190014, 0.        , 0.0399002 , 0.        ])

In [422]:
max_features_train

array([1.        , 1.        , 0.99150153, 1.        , 1.        ,
       1.        , 0.98149185, 0.85121676, 1.        , 1.        ])

In [426]:
scaling_train_data - min_features_train

array([[0.5301304 , 0.57767742, 0.44524917, ..., 0.40806674, 0.42023715,
        0.61033772],
       [0.43333892, 0.41184334, 0.56937923, ..., 0.6451305 , 0.29113757,
        0.35859095],
       [0.60879428, 0.67573881, 0.2661216 , ..., 0.38850336, 0.37695717,
        0.57226228],
       ...,
       [0.43723375, 0.51309595, 0.56685036, ..., 0.37932588, 0.4519481 ,
        0.4908792 ],
       [0.62399617, 0.30651141, 0.49406175, ..., 0.46282374, 0.38140959,
        0.46383661],
       [0.37743885, 0.34897326, 0.52465763, ..., 0.53548199, 0.34013043,
        0.40968938]])

In [425]:
max_features_train-min_features_train

array([1.        , 0.95121905, 0.99150153, 0.95762609, 1.        ,
       1.        , 0.9095917 , 0.85121676, 0.9600998 , 1.        ])

In [419]:
(scaling_train_data - min_features_train)/(max_features_train-min_features_train)

array([[0.5301304 , 0.6073022 , 0.44906554, ..., 0.47939228, 0.43770152,
        0.61033772],
       [0.43333892, 0.43296372, 0.57425956, ..., 0.75789215, 0.30323678,
        0.35859095],
       [0.60879428, 0.71039243, 0.26840261, ..., 0.45640944, 0.3926229 ,
        0.57226228],
       ...,
       [0.43723375, 0.53940882, 0.57170901, ..., 0.44562784, 0.47073034,
        0.4908792 ],
       [0.62399617, 0.3222301 , 0.49829651, ..., 0.5437202 , 0.39726035,
        0.46383661],
       [0.37743885, 0.3668695 , 0.52915463, ..., 0.6290783 , 0.3542657 ,
        0.40968938]])

In [427]:
# Min-max scale every feature: (x - min) / (max - min), using the training min/max.
# NOTE(review): a feature whose max equals its min would divide by zero here.
scaled_train_data = (scaling_train_data - min_features_train)/(max_features_train-min_features_train)

In [435]:
# Contains the scaled training data.

scaled_train_data

array([[0.5301304 , 0.6073022 , 0.44906554, ..., 0.47939228, 0.43770152,
        0.61033772],
       [0.43333892, 0.43296372, 0.57425956, ..., 0.75789215, 0.30323678,
        0.35859095],
       [0.60879428, 0.71039243, 0.26840261, ..., 0.45640944, 0.3926229 ,
        0.57226228],
       ...,
       [0.43723375, 0.53940882, 0.57170901, ..., 0.44562784, 0.47073034,
        0.4908792 ],
       [0.62399617, 0.3222301 , 0.49829651, ..., 0.5437202 , 0.39726035,
        0.46383661],
       [0.37743885, 0.3668695 , 0.52915463, ..., 0.6290783 , 0.3542657 ,
        0.40968938]])

In [430]:
scaling_test_data

array([[0.54131721, 0.52305685, 0.42921551, ..., 0.37434461, 0.52591475,
        0.36184407],
       [0.53091097, 0.3000469 , 0.39346106, ..., 0.29261769, 0.3806552 ,
        0.33904193],
       [0.29331853, 0.44518117, 0.41390863, ..., 0.2510257 , 0.50481932,
        0.43607184],
       ...,
       [0.6123325 , 0.53096305, 0.44993697, ..., 0.49420631, 0.42408399,
        0.49770085],
       [0.58123698, 0.40641692, 0.44801209, ..., 0.37437079, 0.39057652,
        0.32998854],
       [0.64804457, 0.60038784, 0.5537455 , ..., 0.37413601, 0.4213437 ,
        0.49818765]])

In [433]:
# Scale the test data with the training min/max (not the test set's own), so
# both sets share one scale; test values outside the training range therefore
# end up outside [0, 1].
# NOTE(review): a feature whose training max equals its min would divide by zero here.
scaled_test_data = (scaling_test_data - min_features_train)/(max_features_train-min_features_train)

In [436]:
scaled_test_data

array([[0.54131721, 0.49859798, 0.43289445, ..., 0.43977589, 0.50621254,
        0.36184407],
       [0.53091097, 0.26415151, 0.39683354, ..., 0.34376402, 0.35491623,
        0.33904193],
       [0.29331853, 0.41672864, 0.41745638, ..., 0.2949022 , 0.48424041,
        0.43607184],
       ...,
       [0.6123325 , 0.50690963, 0.45379353, ..., 0.58058809, 0.40014985,
        0.49770085],
       [0.58123698, 0.37597646, 0.45185214, ..., 0.43980665, 0.36524987,
        0.32998854],
       [0.64804457, 0.57989471, 0.55849182, ..., 0.43953083, 0.39729568,
        0.49818765]])

#### Now that we have the scaled training and test data, we can apply the k-NN model

In [496]:
def calculateDistances(query_instance, feature_list):
    """Compute Euclidean distances from a query to each row of a matrix.

    Args:
        query_instance: 1-D array holding the query's feature values.
        feature_list: 2-D array, one instance per row, compared to the query.

    Returns:
        Tuple of (distance to every row, row indices sorted by increasing
        distance).
    """
    squared_gap = np.square(feature_list - query_instance)
    distance_to_each = np.sqrt(squared_gap.sum(axis=1))
    return distance_to_each, distance_to_each.argsort()

In [497]:
results = np.apply_along_axis(calculateDistances,1, scaled_test_data,scaled_train_data)

In [498]:
sorted_indicies = results[:,1].astype('int32') 

In [499]:
k = int(input())

10


In [500]:
def knn_vote(prediction):
    """Pick the class that occurs most often among the neighbour labels.

    Args:
        prediction: 1-D array of non-negative integer class labels.

    Returns:
        The label with the highest count; when several labels share the
        highest count, the smallest of them.
    """
    votes_per_class = np.bincount(prediction)
    return np.argmax(votes_per_class)

In [501]:
# Training-set row indices of the k closest neighbours of every test instance
minimum_dist = sorted_indicies[:, :k]

# Class label of each of those neighbours (one row per test instance)
prediction = train_class[minimum_dist].astype(np.int32)

# Majority vote across each row of neighbour labels
find_res = np.apply_along_axis(knn_vote, axis=1, arr=prediction)

# Number of test instances whose voted class equals the true class
correct_prediction = np.count_nonzero(find_res == test_class)

# Share of correct predictions, expressed as a percentage
percentage = (correct_prediction / test_dataset.shape[0]) * 100

print(f'The model has an accuracy of {percentage} %')

The model has an accuracy of 92.0 %


#### On Weighted Model

In [502]:
# Split the stacked results again: results[:, 1] holds the training-row indices
# ordered nearest first (cast back to integers for indexing) and results[:, 0]
# the distances in training-row order (not yet sorted).
sorted_indicies = results[:,1].astype('int32') 
distances = results[:,0]
# Indices (into the training set) of the k nearest neighbours of each test instance
minimum_dist = sorted_indicies[:,:k]

In [503]:
# Work on a copy so `distances` keeps its original order, sort each row
# ascending in place, then keep the k smallest values: column j of my_dist is
# the distance to the neighbour found at column j of minimum_dist.
test_dist = distances.copy()
test_dist.sort(axis=1)
my_dist = test_dist[:,:k] 

In [504]:
# Class labels of the k nearest training neighbours of each test instance
# (one row per test instance); these are the votes, not yet the final prediction.
prediction = train_class[minimum_dist].astype('int32') 

In [505]:
prediction_array = np.zeros(shape=(len(scaled_test_data),))

In [506]:
# Distance-weighted k-NN vote on the scaled data: every one of the k nearest
# neighbours votes for its own class with weight 1 / distance, and the class
# with the largest total weight becomes the prediction.

# Labels of the k nearest neighbours, one row per test instance.
neighbour_classes = prediction[:, :k]

# A neighbour at distance 0 receives an infinite weight, so an exact match
# dominates the vote; the divide-by-zero warning this triggers is silenced.
with np.errstate(divide='ignore'):
    vote_weights = 1.0 / my_dist[:, :k]

# Total vote weight per class: one column for every label that actually
# occurs among the neighbours (no longer hard-coded to 0, 1 and 2).
class_labels = np.unique(neighbour_classes)
class_scores = np.stack(
    [np.where(neighbour_classes == label, vote_weights, 0.0).sum(axis=1)
     for label in class_labels],
    axis=1,
)

# argmax always selects one of the top-scoring classes (the smallest label on
# an exact tie) instead of silently leaving the entry at its initial 0.
# The preallocated prediction_array is filled in place.
prediction_array[:] = class_labels[class_scores.argmax(axis=1)]
    

In [507]:
# Calculating the count of correct predictions
correct_prediction = np.count_nonzero(test_class == prediction_array)
# The percentage of correct prediction
percentage =( correct_prediction/len(test_dataset) ) *100

print(f'The model has an accuracy of {percentage} %')

The model has an accuracy of 92.80000000000001 %
