In [1]:
import numpy as np
import pandas as pd
import math

In [2]:
# Load the three CSV inputs used throughout the notebook:
#   a_train -> labelled rows the sample points are classified against
#   a_test  -> sample points to classify for each k
#   c       -> labelled rows scored in the accuracy cell
a_train, a_test, c = map(pd.read_csv, ("Train.csv", "Sample_test.csv", "Test.csv"))

In [3]:
# k values (neighbourhood sizes) to evaluate
ks = [1, 3, 5]

# converting to numpy array; np.asarray returns an existing ndarray unchanged,
# so re-running this cell is harmless (a second `.values` call would raise
# AttributeError because an ndarray has no such attribute)
a_test = np.asarray(a_test)

In [4]:
# function to calculate Euclidean distance
def e_dist(x, y):
    """Return the Euclidean distance between ``x`` and each point in ``y``.

    Parameters
    ----------
    x : array-like
        A single point of shape ``(n_features,)`` (anything that broadcasts
        against ``y`` also works).
    y : array-like
        Either one point of shape ``(n_features,)`` or a batch of points of
        shape ``(n_points, n_features)``.

    Returns
    -------
    numpy.ndarray or numpy.float64
        Distances reduced over the last (feature) axis: shape ``(n_points,)``
        for a 2-D ``y``, a scalar when both inputs are 1-D.

    Raises
    ------
    ValueError
        If an input cannot be converted to float or the shapes do not
        broadcast.
    """
    # Coerce to float so object-dtype input (e.g. a slice of a mixed-type
    # pandas row) goes through the numeric ufunc loops instead of
    # per-element Python arithmetic.
    diff = np.asarray(x, dtype=float) - np.asarray(y, dtype=float)

    # Reduce over the last axis rather than a hard-coded axis=1 so that a
    # 1-D vs 1-D comparison works as well as the 1-D vs 2-D case.
    return np.sqrt(np.sum(np.square(diff), axis=-1))

In [5]:
# performs classification using k nearest neighbours on a list of sorted labels
def g_csfy(k, slab):
    """Vote between the ' M ' and ' W ' labels of the k nearest neighbours.

    Parameters
    ----------
    k : int
        Number of leading labels to consult; must be at least 1.
    slab : array-like of str
        Labels already ordered from nearest to farthest neighbour. Labels are
        matched exactly, including the surrounding spaces.

    Returns
    -------
    str
        ' M ' when it has strictly more votes than ' W ' among the first
        ``k`` labels, otherwise ' W ' (so ties and unrecognised labels fall
        to ' W ').

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1; a zero or negative ``k`` would otherwise
        slice the wrong neighbours and still return a label.
    """
    if k < 1:
        raise ValueError("k must be a positive integer, got {!r}".format(k))

    # np.asarray keeps the `==` below element-wise even for a plain Python
    # list; comparing a list to a string yields a single False, which would
    # make every vote count zero.
    kns = np.asarray(slab)[:k]  # first k of the distance-sorted labels
    m_s = np.count_nonzero(kns == ' M ')  # number of 'M' labels
    w_s = np.count_nonzero(kns == ' W ')  # number of 'W' labels

    return ' M ' if m_s > w_s else ' W '

In [6]:
# main KNN function
def knn_main(x, k, a_train, a_drop):
    """Classify one query point by a k-nearest-neighbour vote over ``a_train``.

    Parameters
    ----------
    x : array-like
        Feature values of the query point, in the same order as the feature
        columns kept from ``a_train`` (every column except 'g', and also
        except 'a' when ``a_drop`` is true).
    k : int
        Number of nearest neighbours that take part in the vote.
    a_train : pandas.DataFrame
        Labelled reference rows. Must contain the label column 'g' and, when
        ``a_drop`` is true, the column 'a'.
    a_drop : bool
        When true, column 'a' is left out of the distance computation.

    Returns
    -------
    str
        The label chosen by ``g_csfy`` from the ``k`` closest rows.
    """
    excluded = ['a', 'g'] if a_drop else ['g']
    y = a_train.drop(excluded, axis=1).values
    labels = np.asarray(a_train["g"])

    # Euclidean distance from x to every reference row, forced to a float
    # vector so the sort below is numeric.
    e_d = np.asarray(e_dist(x, y), dtype=float)

    # Sort the labels through the distance ordering instead of stacking
    # distances and labels into one mixed array: that only works while the
    # labels are object dtype (a fixed-width string array would turn the
    # distances into text and sort them lexicographically). The stable sort
    # keeps equidistant rows in their original order, so results are
    # reproducible.
    slab = labels[np.argsort(e_d, kind="stable")]

    return g_csfy(k, slab)

In [7]:
# Classify each sample test point once per neighbourhood size in `ks`,
# measuring distance over every feature column of the training data.
if __name__ == '__main__':

    for x in a_test:
        print(f"test_point{x}")
        for k in ks:
            prediction_1 = knn_main(x, k, a_train, a_drop=False)
            print(f"\tOutput is {prediction_1} for k={k}")
    print()

test_point[ 1.86951158 76.13764299 32.        ]
	Output is  M  for k=1
	Output is  M  for k=3
	Output is  M  for k=5
test_point[ 1.44964692 60.1069934  24.        ]
	Output is  W  for k=1
	Output is  W  for k=3
	Output is  W  for k=5
test_point[ 1.50062171 70.4236467  28.        ]
	Output is  W  for k=1
	Output is  W  for k=3
	Output is  W  for k=5
test_point[ 1.64928523 65.99131108 28.        ]
	Output is  W  for k=1
	Output is  W  for k=3
	Output is  W  for k=5



In [8]:
# leave-one-out evaluation on the labelled test data: every row of `c` is
# classified from all the *other* rows of `c`, and accuracy is the share of
# rows whose predicted label matches the recorded one
if __name__ == '__main__':

    total = c.shape[0]

    for k in ks:
        # p_a -> correct predictions using all features
        # p_e -> correct predictions using features excluding 'age'
        p_a, p_e = 0, 0

        # tesp -> test sample
        for index, tesp in c.iterrows():
            # separating features and label by column name, mirroring the
            # name-based drops inside knn_main, so the split stays aligned
            # with the reference columns whatever the CSV column order is;
            # the cast to float avoids object-dtype arithmetic on the row
            sp1 = tesp.drop(['g']).values.astype(float)
            sp2 = tesp.drop(['a', 'g']).values.astype(float)
            tgt = tesp['g']

            # every row except the one being classified; built once and
            # shared by both predictions
            rest = c.drop(index)

            prediction = knn_main(sp1, k, rest, a_drop=False)
            p_a += 1 if tgt == prediction else 0

            prediction = knn_main(sp2, k, rest, a_drop=True)
            p_e += 1 if tgt == prediction else 0

        # accuracy is undefined for an empty frame; report NaN instead of
        # failing with ZeroDivisionError
        ac1 = p_a / total if total else float('nan')
        ac2 = p_e / total if total else float('nan')

        print("For k={}".format(k))
        print("{}/{} Predictions/Total for all features".format(p_a, total))
        print("accuracy for all features = {}".format(ac1))
        print("{}/{} Predictions/Total excluding age".format(p_e, total))
        print("accuracy for excluding age = {}".format(ac2))
        print()

For k=1
59/120 Predictions/Total for all features
accuracy for all features = 0.49166666666666664
67/120 Predictions/Total excluding age
accuracy for excluding age = 0.5583333333333333

For k=3
57/120 Predictions/Total for all features
accuracy for all features = 0.475
73/120 Predictions/Total excluding age
accuracy for excluding age = 0.6083333333333333

For k=5
66/120 Predictions/Total for all features
accuracy for all features = 0.55
66/120 Predictions/Total excluding age
accuracy for excluding age = 0.55

