In [1]:
import numpy as np
import math

#### Reading training dataset

In [2]:

# Load the raw training table (features plus class column) from CSV.
train_dataset = np.genfromtxt(
    'trainingData_classification.csv',
    delimiter=',',
)

#### Creating Training Dataset

In [3]:
# Training feature matrix: a copy of train_dataset without column index 10,
# the column that the next cell reads as the class label.
train_data = np.delete(arr=train_dataset, obj=10, axis=1)

In [4]:
# Class label of every training row, taken from column index 10 of the raw
# training table (the same column that was dropped to build train_data).
train_class = train_dataset[:,10]

In [5]:
# Quick look at the training labels to confirm they loaded as expected.
train_class

array([2., 0., 2., ..., 2., 1., 2.])

#### Reading Test Dataset

In [6]:
# Load the raw test table (features plus class column) from CSV.
test_dataset = np.genfromtxt(
    'testData_classification.csv',
    delimiter=',',
)

#### Creating test dataset

In [7]:
# Test feature matrix: a copy of test_dataset without column index 10,
# the column that the next cell reads as the class label.
test_data = np.delete(arr=test_dataset, obj=10, axis=1)

In [8]:
# True class label of every test row, taken from column index 10 of the raw
# test table; used later as the ground truth when scoring predictions.
test_class = test_dataset[:,10]

1. We now have `train_data` and `test_data`, each with 10 columns, all of which are features.

2. We have `train_class`, which holds the class of every row of `train_data`, and `test_class`, which holds the class of every row of `test_data`.

Now let's define the function that calculates the Euclidean distance between the features of the test data and the features of the training data.

In [9]:
def calculateDistances(feature_list, query_instance):
    '''
    Compute the Euclidean distance between query point(s) and every training point.

    Input Parameters:

    arg 1 (feature_list): a single NumPy 2D array of shape (n_train, n_features)
        containing all the feature training data
    arg 2 (query_instance): either a 1D NumPy array of shape (n_features,)
        containing a single query instance, or a 2D NumPy array of shape
        (n_queries, n_features) containing several query instances

    Returns:

    A tuple (euclidean_distance, sorted_distance_index).
        * 1D query: both arrays have shape (n_train,).
        * 2D query: both arrays have shape (n_queries, n_train); row i belongs
          to query i.
    sorted_distance_index holds the training-row indices ordered from the
    smallest distance to the largest.

    Note: for a 2D query the intermediate difference array has shape
    (n_queries, n_train, n_features), so memory use grows with the product
    of the three sizes.
    '''
    feature_list = np.asarray(feature_list)
    query_instance = np.asarray(query_instance)

    # Insert the broadcast axis directly in front of the feature axis. For a
    # 2D query this is the same as query_instance[:, None]; for a 1D query it
    # yields shape (1, n_features), which the previous [:, None] form could
    # not broadcast against the training matrix.
    feature_difference = feature_list - query_instance[..., None, :]

    # Reduce over the feature axis (always the last one) to get one distance
    # per (query, training row) pair.
    euclidean_distance = np.sqrt(np.sum(np.square(feature_difference), axis=-1))
    sorted_distance_index = np.argsort(euclidean_distance, axis=-1)
    return euclidean_distance, sorted_distance_index

In [10]:
# Distances from every test row to every training row, plus, for each test
# row, the training-row indices ordered from nearest to farthest.
distance, sorted_indicies = calculateDistances(
    feature_list=train_data,
    query_instance=test_data,
)

In [17]:
# Sanity check: one row per query in test_data, one column per row of train_data.
distance.shape

(1000, 4000)

#### Calculating the percentage of correct predictions

In [18]:
# 1-nearest-neighbour evaluation.
# Column 0 of every argsort row is the index of the closest training row.
minimum_dist = sorted_indicies[:, 0]

# Each test row is assigned the class of its closest training row.
prediction = train_class[minimum_dist]

# Number of test rows whose predicted class matches the true class.
correct_prediction = int(np.sum(test_class == prediction))

# Share of correct predictions, expressed as a percentage.
percentage = (correct_prediction / test_dataset.shape[0]) * 100

print(f'The model has an accuracy of {percentage} %')

The model has an accuracy of 89.5 %


### Approach 2

In [145]:
def calculateDistances(query_instance, feature_list):
    '''
    Compute the Euclidean distance from one query instance to every training row.

    NOTE: this cell reuses the name of the earlier
    calculateDistances(feature_list, query_instance) definition and replaces
    it once executed. The parameter order is reversed here: the query comes
    first because np.apply_along_axis passes each 1D slice as the first
    positional argument.

    Input Parameters:

    arg 1 (query_instance): a 1D NumPy array, containing a single query instance
    arg 2 (feature_list): a single NumPy 2D array containing all the feature
        training data, one row per training instance

    Returns:

    A tuple (euclidean_distance, sorted_distance_index): the distance to each
    training row, and the training-row indices ordered from the smallest
    distance to the largest.
    '''
    # Broadcasting subtracts the query from every training row.
    feature_difference = feature_list - query_instance
    euclidean_distance = np.sqrt(np.sum(np.square(feature_difference), axis=1))
    sorted_distance_index = np.argsort(euclidean_distance)
    return euclidean_distance, sorted_distance_index

In [146]:
# Call calculateDistances once per row of test_data: axis 1 hands each row to
# the function as its 1-D first argument, and train_data is forwarded as
# feature_list. The (distances, indices) pair returned per row is stacked
# into one array.
results = np.apply_along_axis(
    calculateDistances,
    1,
    test_data,
    feature_list=train_data,
)

In [147]:
# Position 1 along axis 1 of results holds the argsort indices of each test
# row (position 0 holds the distances); cast them back to integers so they
# can be used for indexing.
sorted_indicies = results[:, 1, :].astype(np.int32)

In [148]:
# 1-nearest-neighbour evaluation using the indices from approach 2.
# Column 0 of every row is the index of the closest training row.
minimum_dist = sorted_indicies[:, 0]

# Each test row is assigned the class of its closest training row.
prediction = train_class[minimum_dist]

# Number of test rows whose predicted class matches the true class.
correct_prediction = int(np.sum(test_class == prediction))

# Share of correct predictions, expressed as a percentage.
percentage = (correct_prediction / test_dataset.shape[0]) * 100

print(f'The model has an accuracy of {percentage} %')

The model has an accuracy of 89.5 %


#### With Varying K value

In [12]:
# Number of nearest neighbours that take part in the majority vote.
# Set in code instead of being read with int(input()), so that a fresh
# "Restart Kernel -> Run All" does not block on stdin and the result does not
# depend on what was typed; 3 is the value entered for the recorded run.
k = 3

3


In [13]:
def knn_vote(prediction):
    """Return the most frequent label in a 1D array of non-negative integer labels.

    np.bincount counts how often each integer label occurs; the position of
    the largest count is the winning label. When several labels share the
    largest count, the smallest of them is returned, because argmax reports
    the first maximum.
    """
    label_counts = np.bincount(prediction)
    return np.argmax(label_counts)

In [14]:
# k-nearest-neighbour evaluation.
# The first k columns of every argsort row are the indices of the k closest
# training rows.
minimum_dist = sorted_indicies[:, :k]

# Classes of those k neighbours, cast to integers because np.bincount (used
# by knn_vote) only accepts integer input.
prediction = train_class[minimum_dist].astype(np.int32)

# Majority vote across the k neighbours of every test row.
find_res = np.apply_along_axis(knn_vote, 1, prediction)

# Number of test rows whose voted class matches the true class.
correct_prediction = int(np.sum(test_class == find_res))

# Share of correct predictions, expressed as a percentage.
percentage = (correct_prediction / test_dataset.shape[0]) * 100

print(f'The model has an accuracy of {percentage} %')

The model has an accuracy of 91.0 %


### Practices

In [377]:
feature_list = train_data
query_instance = test_data[0,:]

In [379]:
query_instance.shape

(10,)

In [52]:
# Scratch call with a single query row (query_instance was set to test_data[0,:]).
# NOTE(review): the output recorded below is a single distance array, whereas
# both calculateDistances definitions in this notebook return a
# (distances, indices) tuple, and the argument order used here
# (features, query) matches only the first definition. This cell appears to
# predate the current definitions; re-run or remove it.
calculateDistances(feature_list,query_instance)

array([0.45730647, 0.40561773, 0.43496813, ..., 0.32210837, 0.46737854,
       0.86729724])

In [27]:
query_instance = test_data

In [28]:
query_instance

array([[0.54131721, 0.52305685, 0.42921551, ..., 0.37434461, 0.52591475,
        0.36184407],
       [0.53091097, 0.3000469 , 0.39346106, ..., 0.29261769, 0.3806552 ,
        0.33904193],
       [0.29331853, 0.44518117, 0.41390863, ..., 0.2510257 , 0.50481932,
        0.43607184],
       ...,
       [0.6123325 , 0.53096305, 0.44993697, ..., 0.49420631, 0.42408399,
        0.49770085],
       [0.58123698, 0.40641692, 0.44801209, ..., 0.37437079, 0.39057652,
        0.32998854],
       [0.64804457, 0.60038784, 0.5537455 , ..., 0.37413601, 0.4213437 ,
        0.49818765]])

Test

In [39]:
feature_list = train_data
query_instance = test_data

In [40]:
# Pairwise differences via broadcasting: the extra middle axis on the queries
# (np.newaxis is None) makes every query row get subtracted from every row of
# feature_list.
a = feature_list - query_instance[:, np.newaxis]

In [41]:
c = np.square(a)


In [42]:
np.sum(c[0],axis=1)

array([0.20912921, 0.16452574, 0.18919727, ..., 0.1037538 , 0.2184427 ,
       0.7522045 ])

In [43]:
np.sum(c,axis=2)

array([[0.20912921, 0.16452574, 0.18919727, ..., 0.1037538 , 0.2184427 ,
        0.7522045 ],
       [0.29880322, 0.24396892, 0.41996703, ..., 0.1934472 , 0.11705535,
        0.53320315],
       [0.15415712, 0.33754166, 0.29980071, ..., 0.08634474, 0.28520631,
        0.6080568 ],
       ...,
       [0.33841798, 0.16970812, 0.27589221, ..., 0.28001649, 0.35470612,
        0.56462072],
       [0.20398977, 0.1628765 , 0.31885819, ..., 0.11442273, 0.09143335,
        0.58475387],
       [0.10135159, 0.39375866, 0.24928805, ..., 0.16680659, 0.27589442,
        0.37247993]])

In [44]:
# Square root of the squared differences summed over axis 2 of c, i.e. one
# Euclidean distance per (query row, training row) pair.
result = np.sqrt(np.sum(c,axis=2))

In [46]:
result

array([[0.45730647, 0.40561773, 0.43496813, ..., 0.32210837, 0.46737854,
        0.86729724],
       [0.54662896, 0.4939321 , 0.64804863, ..., 0.43982632, 0.34213352,
        0.73020761],
       [0.39262847, 0.58098335, 0.5475406 , ..., 0.29384476, 0.5340471 ,
        0.77977997],
       ...,
       [0.58173703, 0.41195646, 0.52525442, ..., 0.52916584, 0.5955721 ,
        0.75141248],
       [0.45165227, 0.40357961, 0.5646753 , ..., 0.33826429, 0.30237948,
        0.76469201],
       [0.31835764, 0.62750192, 0.49928755, ..., 0.40841963, 0.52525653,
        0.61031134]])

In [327]:
# For every row of result, the column indices that order its distances from
# smallest to largest (np.argsort sorts along the last axis by default).
sorted_distance_index = np.argsort(result)

In [326]:
sorted_distance_index

array([[ 792, 1037, 1518, ..., 2979,  534, 2068],
       [3337, 2946, 3417, ..., 2702, 2979, 2068],
       [1378,  836, 3230, ...,  799, 2068, 2979],
       ...,
       [3659, 1186,  531, ...,  799, 3891, 2068],
       [2655, 3191,  466, ...,  534, 2979, 2068],
       [3666,  137,   66, ..., 2068, 2023, 1458]], dtype=int64)

In [312]:
min_dist = sorted_distance_index[:,0]

In [343]:
prediction = train_class[min_dist]

In [351]:
 correct_prediction = np.count_nonzero(test_class == prediction)

In [352]:
percentage =( correct_prediction/len(test_dataset) ) *100

In [353]:
percentage

89.5

In [None]:
train_class[sorted_distance_index[0]]

#### Below Test

In [235]:
feature_list = train_data
query_instance = test_data[0,:]

In [236]:
b = feature_list-query_instance

In [237]:
np.square(b)

array([[1.25144630e-04, 1.06918753e-02, 2.57078215e-04, ...,
        1.13718236e-03, 4.32666755e-03, 6.17490931e-02],
       [1.16593096e-02, 3.89782418e-03, 1.96458697e-02, ...,
        7.33249971e-02, 3.79770405e-02, 1.05827566e-05],
       [4.55315599e-03, 4.05873069e-02, 2.65996234e-02, ...,
        2.00470216e-04, 1.18935135e-02, 4.42758242e-02],
       ...,
       [1.08333666e-02, 1.50699642e-03, 1.89433515e-02, ...,
        2.48130922e-05, 1.16052319e-03, 1.66500658e-02],
       [6.83581150e-03, 2.81449245e-02, 4.20503465e-03, ...,
        7.82855734e-03, 1.09421999e-02, 1.04024784e-02],
       [2.68561149e-02, 1.57007506e-02, 9.10919777e-03, ...,
        2.59652558e-02, 2.12821774e-02, 2.28917353e-03]])

In [242]:
# Sum of the squared per-feature differences for every row of b: the squared
# distance to the query, before the square root is taken.
z =np.sum(np.square(b),axis=1)
z

array([0.20912921, 0.16452574, 0.18919727, ..., 0.1037538 , 0.2184427 ,
       0.7522045 ])

In [244]:
np.sqrt(z)

array([0.45730647, 0.40561773, 0.43496813, ..., 0.32210837, 0.46737854,
       0.86729724])

In [153]:
# x  =np.equal(c[0],z)

In [None]:
# for i in range(0,len(x)):
#     if not all(x[i]):
#         print('mismatch')

Working

In [274]:
feature_list = train_data

In [284]:
query_instance = test_data[0,:]
# query_instance = test_data

In [285]:
# Euclidean distance from the query to every training row: square the
# per-feature differences, add them up along each row, then take the root.
check = np.sqrt(np.square(feature_list - query_instance).sum(axis=1))

In [286]:
check.shape

(4000,)

In [337]:
check

array([0.45730647, 0.40561773, 0.43496813, ..., 0.32210837, 0.46737854,
       0.86729724])

In [302]:
# Training-row indices ordered from the smallest distance to the largest.
# In the recorded output the first value is 792, i.e. the closest training
# row sits at index 792.
sorted_distance_index = check.argsort()
sorted_distance_index

array([ 792, 1037, 1518, ..., 2979,  534, 2068], dtype=int64)

In [290]:
# 1-NN prediction for this query: look up the class stored at the index of
# the closest training row (the first entry of sorted_distance_index).
train_class[sorted_distance_index[0]]

0.0

In [8]:
# Initializing the value of K.
# NOTE(review): the k-NN voting cell reads lowercase `k`, so this uppercase
# `K` is not used by any visible cell; confirm whether it is still needed.

K =1

In [12]:
# NOTE(review): `data` is not assigned anywhere in the visible notebook, so
# this cell would raise NameError on a fresh "Restart Kernel -> Run All".
# The recorded output looks like a feature table with a trailing class
# column (presumably the raw test table); confirm, or remove this cell.
data

array([[0.54131721, 0.52305685, 0.42921551, ..., 0.52591475, 0.36184407,
        0.        ],
       [0.53091097, 0.3000469 , 0.39346106, ..., 0.3806552 , 0.33904193,
        0.        ],
       [0.29331853, 0.44518117, 0.41390863, ..., 0.50481932, 0.43607184,
        2.        ],
       ...,
       [0.6123325 , 0.53096305, 0.44993697, ..., 0.42408399, 0.49770085,
        0.        ],
       [0.58123698, 0.40641692, 0.44801209, ..., 0.39057652, 0.32998854,
        0.        ],
       [0.64804457, 0.60038784, 0.5537455 , ..., 0.4213437 , 0.49818765,
        1.        ]])

In [184]:
# NOTE: this binds `a` to the function np.empty itself (it is not called and
# no shape is given), not to an array; the following type(a) cell accordingly
# reports builtin_function_or_method.
a = np.empty

In [178]:
type(a)

builtin_function_or_method

In [180]:
a = train_class

In [181]:
a

array([2., 0., 2., ..., 2., 1., 2.])

In [182]:
type(a)

numpy.ndarray