In [47]:
import numpy as np
import pandas as pd

In [83]:
# Number of nearest neighbours that vote on every prediction made with knn().
K = 30

# Labelled training data; its rows are the candidate neighbours.
train_df = pd.read_csv('./KNN/KNN_train.csv')
# DataFrame.info() writes its summary itself and returns None, so it is called
# directly; wrapping it in print() only appended a stray "None" line.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 135 entries, 0 to 134
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             135 non-null    int64  
 1   SepalLengthCm  135 non-null    float64
 2   SepalWidthCm   135 non-null    float64
 3   PetalLengthCm  135 non-null    float64
 4   PetalWidthCm   135 non-null    float64
 5   Labels         135 non-null    object 
dtypes: float64(4), int64(1), object(1)
memory usage: 6.5+ KB
None


In [49]:
# Leave out the Id column (position 0): the DataFrame's own index already
# numbers the rows. Every later column, Labels included, is kept.
train_x_df = train_df.iloc[:, 1:train_df.shape[1]]
print(train_x_df)

     SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm          Labels
0              5.0           3.5            1.4          0.20     Iris-setosa
1              4.9           3.1            1.3          0.21     Iris-setosa
2              4.7           3.2            1.3          0.20     Iris-setosa
3              4.6           3.1            1.5          0.22     Iris-setosa
4              5.0           3.6            1.4          0.21     Iris-setosa
..             ...           ...            ...           ...             ...
130            6.7           3.1            5.6          2.40  Iris-virginica
131            6.9           3.1            5.1          2.30  Iris-virginica
132            5.8           2.7            5.1          1.90  Iris-virginica
133            6.8           3.2            5.9          2.30  Iris-virginica
134            6.7           3.3            5.7          2.50  Iris-virginica

[135 rows x 5 columns]


In [50]:
# Straight-line (Euclidean) distance helper
def calc_dist(p, q):
   """Return the Euclidean distance between the points ``p`` and ``q``.

   The difference ``q - p`` is formed first and its norm is then taken with
   ``np.linalg.norm`` using that function's default settings.
   """
   offset = q - p
   return np.linalg.norm(offset)

# K-nearest-neighbours classification of a single query point
def knn(K, x, q):
   """Predict the label of query point ``q`` from its ``K`` nearest rows of ``x``.

   Parameters
   ----------
   K : int
      Number of nearest neighbours that vote; must be at least 1. When ``K``
      exceeds the number of rows in ``x``, every row votes.
   x : pandas.DataFrame
      Labelled reference data. All columns except the last one are used as
      features; the votes are read from the column named ``'Labels'``.
   q : pandas.Series or array-like
      Query point, one value per feature column of ``x``.

   Returns
   -------
   The most frequent ``'Labels'`` value among the selected neighbours. When
   several labels are equally frequent, the greatest of them (as chosen by
   ``max()``) is returned.

   Raises
   ------
   ValueError
      If ``K`` is smaller than 1 or ``x`` has no rows.
   """
   # A negative K would slice from the wrong end and K == 0 would vote over
   # nothing (yielding NaN), so both are rejected up front.
   if K < 1:
      raise ValueError(f"K must be a positive integer, got {K!r}")
   if len(x) == 0:
      raise ValueError("x must contain at least one labelled row")

   # Measure distances on the feature columns only; leaving the string label
   # out keeps each row numeric instead of widening it to object dtype.
   feature_count = x.shape[1] - 1
   features = x.iloc[:, :feature_count]
   neighbor_distances = features.apply(lambda x_i: calc_dist(x_i, q), axis=1)

   # A stable sort keeps equally distant rows in their original order, so the
   # K-th cut-off is deterministic. Rows are then picked by position, which
   # stays correct even if the index of x contains duplicate labels.
   nearest_positions = np.argsort(neighbor_distances.to_numpy(), kind='stable')[:K]
   k_nearest_labels = x['Labels'].iloc[nearest_positions]

   # Majority vote; ties between labels resolve to the greatest label.
   return k_nearest_labels.mode().max()

In [51]:
# Validation split: the feature frame drops the first and last columns, and
# the expected answers are taken from the Labels column.
valid_df = pd.read_csv('./KNN/KNN_valid.csv')
valid_x_df = valid_df.iloc[:, 1:-1]
valid_y_df = valid_df['Labels']
print(valid_x_df)
print(valid_y_df)

   SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm
0            4.8           3.0            1.4           0.3
1            5.1           3.8            1.6           0.2
2            5.7           3.0            4.2           1.2
3            5.7           2.9            4.2           1.3
4            6.7           3.0            5.2           2.3
5            6.3           2.5            5.0           1.9
0        Iris-setosa
1        Iris-setosa
2    Iris-versicolor
3    Iris-versicolor
4     Iris-virginica
5     Iris-virginica
Name: Labels, dtype: object


In [52]:
# Run knn() once per validation row, with the training frame as the neighbour pool
valid_y_exp_df = valid_x_df.apply(
    lambda query_row: knn(K, train_x_df, query_row),
    axis=1,
)
print(valid_y_exp_df)

# Series.equals reports whether the predicted labels match the expected ones
print(valid_y_exp_df.equals(valid_y_df))

0        Iris-setosa
1        Iris-setosa
2    Iris-versicolor
3    Iris-versicolor
4     Iris-virginica
5     Iris-virginica
dtype: object
True


In [53]:
# Test split, read from its own CSV file.
test_df = pd.read_csv('./KNN/KNN_test.csv')
# DataFrame.info() writes its summary itself and returns None, so it is called
# directly; wrapping it in print() only appended a stray "None" line.
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             10 non-null     int64  
 1   SepalLengthCm  10 non-null     float64
 2   SepalWidthCm   10 non-null     float64
 3   PetalLengthCm  10 non-null     float64
 4   PetalWidthCm   10 non-null     float64
 5   Labels         10 non-null     object 
dtypes: float64(4), int64(1), object(1)
memory usage: 608.0+ bytes
None


In [54]:
# Feature columns only: drop the first (Id) and last (Labels) columns.
test_x_df = test_df.iloc[:, 1:-1]
print(test_x_df)

   SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm
0            4.6           3.2            1.4          0.20
1            5.3           3.7            1.5          0.20
2            5.0           3.3            1.4          0.22
3            6.2           2.9            4.3          1.30
4            5.1           2.5            3.0          1.10
5            5.7           2.8            4.1          1.30
6            6.5           3.0            5.2          2.00
7            6.2           3.4            5.4          2.30
8            5.9           3.0            5.1          1.80
9            4.5           3.1            1.3          0.30


In [84]:
# Predict one label per test row, using the training frame as the neighbour pool
test_y_df = test_x_df.apply(
    lambda query_row: knn(K, train_x_df, query_row),
    axis=1,
)
print(test_y_df)

0        Iris-setosa
1        Iris-setosa
2        Iris-setosa
3    Iris-versicolor
4    Iris-versicolor
5    Iris-versicolor
6     Iris-virginica
7     Iris-virginica
8     Iris-virginica
9        Iris-setosa
dtype: object
