# KNN implementation using numpy

In [1]:
import numpy as np
import pandas as pd
from numpy.random import randint
from collections import Counter

In [2]:
class KNN:
    """Exhaustive k-nearest-neighbours classifier for the Iris data set.

    There is no training step: every prediction measures the Euclidean
    distance from the query row to each training row and takes a majority
    vote among the ``k`` closest ones.
    """

    def __init__(self):
        # https://archive.ics.uci.edu/ml/datasets/Iris
        self._columns = ["sepallength", "sepalwidth", "petallength", "petalwidth", "target"]
        # The position of a name in this list is the integer label it maps to.
        self._valid_class = ["Iris-setosa", "Iris-versicolor", "Iris-virginica"]

    def load_data(self, path_to_file="./data/iris.data"):
        """Load the header-less CSV into ``self._df_data`` and print a summary.

        Args:
            path_to_file: Path (or file-like object) handed to
                ``pandas.read_csv``. Defaults to ``./data/iris.data``.
        """
        dtypes = {
            "sepallength": np.float64,
            "sepalwidth": np.float64,
            "petallength": np.float64,
            "petalwidth": np.float64,
            # Builtin ``str``: the ``np.str`` alias no longer exists in
            # current NumPy releases.
            "target": str,
        }
        self._df_data = pd.read_csv(
            path_to_file, header=None, dtype=dtypes, names=self._columns
        )
        # ``info()`` writes the summary itself and returns None, so wrapping
        # it in ``print`` would only add a stray "None" line.
        self._df_data.info()

    def _encode_labels(self, frame, y_col):
        """Map the class names in ``frame[y_col]`` to their integer index."""
        labels = frame[y_col].to_numpy()
        if labels.ndim > 1:
            # A list of column names yields a 2-D array; the label is taken
            # from the first selected column.
            labels = labels[:, 0]
        return np.array([self._valid_class.index(label) for label in labels])

    def split_data(self, train=0.8, test=0.2, X_col=None, y_col=None):
        """Shuffle the loaded rows and split them into train and test sets.

        Args:
            train: Fraction of rows used for training.
            test: Fraction of rows used for testing.
            X_col: Feature column names. ``None`` selects every column
                except the last one in ``self._columns``.
            y_col: Label column name, or a list whose first entry is the
                label column. ``None`` selects the last column in
                ``self._columns``.

        Returns:
            Tuple ``(X_train, y_train, X_test, y_test)`` of numpy arrays,
            with labels encoded as indices into ``self._valid_class``.

        Raises:
            ValueError: If a label is not listed in ``self._valid_class``.
        """
        if X_col is None:
            X_col = self._columns[:-1]
        if y_col is None:
            y_col = [self._columns[-1]]

        total_data = self._df_data.shape[0]
        train_part = int(total_data * train)
        test_part = int(total_data * test)
        print("Splitted %d train and %d test" % (train_part, test_part))

        idxes = np.arange(total_data)
        np.random.shuffle(idxes)

        train_data = self._df_data.iloc[idxes[:train_part]]
        test_data = self._df_data.iloc[idxes[train_part:train_part + test_part]]

        # convert text category into number
        y_train = self._encode_labels(train_data, y_col)
        y_test = self._encode_labels(test_data, y_col)
        return train_data[X_col].to_numpy(), y_train, test_data[X_col].to_numpy(), y_test

    def compute_dist(self, X_train, x_test):
        """Return the Euclidean distance from ``x_test`` to each row of ``X_train``."""
        diff = np.asarray(x_test) - np.asarray(X_train)
        return np.sqrt(np.sum(diff ** 2, axis=1))

    def fit(self, X_train, y_train, X_test, y_test, k=5):
        """Predict a label for every row of ``X_test`` by majority vote.

        Args:
            X_train: Training feature rows.
            y_train: Labels matching ``X_train`` row for row.
            X_test: Rows to classify.
            y_test: Unused; kept so existing callers keep working.
            k: Number of nearest neighbours that vote.

        Returns:
            List with one predicted label per row of ``X_test``.

        Raises:
            ValueError: If ``k`` is smaller than 1 or ``X_train`` is empty.
        """
        if k < 1:
            raise ValueError("k must be a positive integer, got %r" % (k,))
        X_train = np.asarray(X_train)
        y_train = np.asarray(y_train)
        if len(X_train) == 0:
            raise ValueError("X_train must contain at least one sample")

        y_preds = []
        for x_test_row in X_test:
            dist = self.compute_dist(X_train, x_test_row)
            # Stable sort keeps equally distant neighbours in training order.
            nearest = np.argsort(dist, kind="stable")[:k]
            # On a tied vote the label seen first among the neighbours wins.
            label, _ = Counter(y_train[nearest]).most_common(1)[0]
            y_preds.append(label)
        return y_preds

    def score(self, y_test, y_preds):
        """Return the fraction of predictions equal to the actual label.

        Returns ``0.0`` when ``y_test`` is empty instead of dividing by zero.
        """
        total = len(y_test)
        if total == 0:
            return 0.0
        count_correct = sum(
            1 for actual, predicted in zip(y_test, y_preds) if actual == predicted
        )
        return count_correct / total

In [3]:
# Build the classifier and read the Iris measurements from disk.
oj_knn = KNN()
oj_knn.load_data()

# Label column, and the feature columns fed to the distance computation.
target_column = ["target"]
data_columns = ["sepallength", "sepalwidth", "petallength", "petalwidth"]

# Shuffle and split using split_data's default 80/20 ratio.
X_train, y_train, X_test, y_test = oj_knn.split_data(
    X_col=data_columns,
    y_col=target_column,
)

# Classify each test row with the default k, then report the hit rate.
y_pred = oj_knn.fit(X_train, y_train, X_test, y_test)
result = oj_knn.score(y_test, y_pred)
print("Accuracy: %0.2f " % result)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
sepallength    150 non-null float64
sepalwidth     150 non-null float64
petallength    150 non-null float64
petalwidth     150 non-null float64
target         150 non-null object
dtypes: float64(4), object(1)
memory usage: 6.0+ KB
None
Splitted 120 train and 30 test
Accuracy: 0.97 
