In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn import linear_model
from sklearn.model_selection import KFold, cross_val_score, LeaveOneOut
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier

In [2]:
from abc import ABC, abstractmethod

class KNearestNeighbors(ABC):
    ''' Base class for implementation of interface of KNN model

    fit() stores the training set, retrieve_neighbors() performs a
    brute-force nearest-neighbour lookup against it, and subclasses
    supply predict().
    '''

    def __init__(self, n_neighbors=1):
        ''' Create an unfitted model that consults n_neighbors neighbours '''
        self.n_neighbors = n_neighbors
        self.X_train = None
        self.y_train = None

    @abstractmethod
    def predict(self, X_test):
        ''' Return predictions for the samples in X_test (defined by subclasses) '''
        pass

    def set_params(self, n_neighbors=None):
        ''' Update n_neighbors; a falsy value (None, 0) leaves it unchanged '''
        if n_neighbors:
            self.n_neighbors = n_neighbors

    def __find_neighbors(self, x_test):
        ''' Return training-row indices ordered from nearest to farthest

        Distance is np.linalg.norm of (training row - x_test). At most
        n_neighbors indices are returned (fewer if the training set is
        smaller). Rows at equal distance keep their training order.

        Raises Exception if the model is unfitted or x_test is None.
        '''
        if self.X_train is None:
            raise Exception('Training data has not been loaded')

        if x_test is None:
            raise Exception('Test data has not been provided')

        # Coerce the query so a plain list/tuple can be subtracted from a row.
        query = np.asarray(x_test)
        distances = [np.linalg.norm(row - query) for row in self.X_train]

        # sorted() is stable, so ties are resolved by training order.
        ranked = sorted(range(len(distances)), key=distances.__getitem__)

        return ranked[:self.n_neighbors]

    def retrieve_neighbors(self, X_test):
        ''' Public wrapper: indices of the nearest training rows for one query point '''
        if X_test is None:
            raise Exception('Test data has not been provided')

        return self.__find_neighbors(X_test)

    def fit(self, X_train, y_train):
        ''' Store the training features and targets

        Both inputs are converted with np.asarray so that lists and other
        array-likes are iterated row by row and indexed by position.

        Raises Exception if either argument is None, and ValueError if
        the two do not contain the same number of samples.
        '''
        if X_train is None:
            raise Exception('Training feature data has not been provided')

        if y_train is None:
            raise Exception('Training prediction data has not been provided')

        features = np.asarray(X_train)
        targets = np.asarray(y_train)

        # Compare leading dimensions only; shape[:1] is () for 0-d inputs.
        if features.shape[:1] != targets.shape[:1]:
            raise ValueError(
                'X_train and y_train must contain the same number of samples'
            )

        self.X_train = features
        self.y_train = targets

In [3]:
class KNearestNeighborsRegressor(KNearestNeighbors):
    ''' Implementation for regression tasks using KNN algorithm

    The prediction for a sample is the plain average of the targets of
    its nearest training rows.
    '''

    def __init__(self, n_neighbors=1):
        super().__init__(n_neighbors)

    def predict(self, X_test, verbose=True):
        ''' Predict a target for every sample in X_test

        X_test is traversed exactly once, so one-shot iterables work and
        the neighbour search runs a single time per sample.

        verbose: when True (the default) print each test entry followed
        by the neighbours used for it; pass False to silence the trace.

        Returns a list with one averaged value per test sample.
        '''
        predictions = []

        for x_test in X_test:
            if verbose:
                print('Test entry:', x_test)

            neighbors = self.retrieve_neighbors(x_test)

            if verbose:
                for index in neighbors:
                    print('Neighbor:', self.X_train[index], self.y_train[index])

            predictions.append(
                np.average([self.y_train[index] for index in neighbors])
            )

        return predictions

In [4]:
# Two-feature training set: nine points, one row per sample.
X_train = np.array([
    [1, 7], [2, 5], [2, 6],
    [3, 3], [7, 1], [3, 1],
    [4, 2], [5, 4], [5, 6],
])

# One target per row of X_train, in the same order.
y_train = np.array([25, 21, 14, 32, 14, 14, 25, 18, 12])

# Number of neighbours.
k = 6

In [5]:
# Hand-written regressor: fit, then predict on the training points themselves.
model = KNearestNeighborsRegressor(n_neighbors=k)
model.fit(X_train, y_train)
y_pred = model.predict(X_train)

# scikit-learn regressor with the same k, for a side-by-side comparison.
neigh = KNeighborsRegressor(n_neighbors=k).fit(X_train, y_train)

print('Implementation:', y_pred)
print('sklearn:', neigh.predict(X_train))

Test entry: [1 7]
Neighbor: [1 7] 25
Neighbor: [2 6] 14
Neighbor: [2 5] 21
Neighbor: [5 6] 12
Neighbor: [3 3] 32
Neighbor: [5 4] 18
Test entry: [2 5]
Neighbor: [2 5] 21
Neighbor: [2 6] 14
Neighbor: [1 7] 25
Neighbor: [3 3] 32
Neighbor: [5 4] 18
Neighbor: [5 6] 12
Test entry: [2 6]
Neighbor: [2 6] 14
Neighbor: [2 5] 21
Neighbor: [1 7] 25
Neighbor: [5 6] 12
Neighbor: [3 3] 32
Neighbor: [5 4] 18
Test entry: [3 3]
Neighbor: [3 3] 32
Neighbor: [4 2] 25
Neighbor: [3 1] 14
Neighbor: [2 5] 21
Neighbor: [5 4] 18
Neighbor: [2 6] 14
Test entry: [7 1]
Neighbor: [7 1] 14
Neighbor: [4 2] 25
Neighbor: [5 4] 18
Neighbor: [3 1] 14
Neighbor: [3 3] 32
Neighbor: [5 6] 12
Test entry: [3 1]
Neighbor: [3 1] 14
Neighbor: [4 2] 25
Neighbor: [3 3] 32
Neighbor: [5 4] 18
Neighbor: [7 1] 14
Neighbor: [2 5] 21
Test entry: [4 2]
Neighbor: [4 2] 25
Neighbor: [3 3] 32
Neighbor: [3 1] 14
Neighbor: [5 4] 18
Neighbor: [7 1] 14
Neighbor: [2 5] 21
Test entry: [5 4]
Neighbor: [5 4] 18
Neighbor: [5 6] 12
Neighbor: [3 3] 32
N

In [6]:
# Mean squared error of y_pred against y_train.
# The label interpolates k instead of hard-coding "6", so the message
# cannot drift from the value held in k.
print(f'Training error using kNN regressor with k = {k}:', np.power(y_pred - y_train, 2).mean())

Training error using kNN regressor with k = 6: 39.50925925925925


In [7]:
# Fit an ordinary least-squares model on X_train / y_train.
model = linear_model.LinearRegression().fit(X_train, y_train)

# Fitted intercept and coefficients.
print(model.intercept_, model.coef_)

# Mean squared error on the data the model was fitted to.
y_pred = model.predict(X_train)
print('Training error using linear model:', np.mean(np.power(y_pred - y_train, 2)))

30.131075914800654 [-2.03850355 -0.88421628]
Training error using linear model: 31.71864190788276


In [8]:
# Single-feature data set stored as a column; each target is the square
# of its input.
X_train = np.array([-2, 2, 1, -1, 0, 3]).reshape(-1, 1)
y_train = np.array([4, 4, 1, 1, 0, 9])

k = 2  # neighbours for the kNN regressor
K = 3  # cross-validation folds

In [9]:
# K-fold cross-validation of the kNN regressor; folds are taken in order.
model = KNeighborsRegressor(n_neighbors=k)
kf = KFold(n_splits=K, shuffle=False)

# Show which sample indices fall into each train/test split.
for train_index, test_index in kf.split(X_train):
    print("TRAIN:", train_index, "TEST:", test_index)

# Scores come back as negated MSE; negate them to get per-fold MSE.
cv_score = np.negative(
    cross_val_score(
        model,
        X_train,
        y_train,
        cv=kf,
        scoring='neg_mean_squared_error',
    )
)

print(cv_score)
print(cv_score.mean())

TRAIN: [2 3 4 5] TEST: [0 1]
TRAIN: [0 1 4 5] TEST: [2 3]
TRAIN: [0 1 2 3] TEST: [4 5]
[ 6.625  1.    21.625]
9.75


In [10]:
# Leave-one-out cross-validation of a linear model.
model = linear_model.LinearRegression()
model.fit(X_train, y_train)

loo = LeaveOneOut()

# Refit on every leave-one-out training subset and show the fitted line.
for train_index, test_index in loo.split(X_train):
    print("TRAIN:", train_index, "TEST:", test_index)
    # Index whole rows. np.take without an axis works on the flattened
    # array, which only coincides with row selection when X_train has a
    # single column.
    X_train_g = X_train[train_index]
    y_train_g = y_train[train_index].reshape(-1, 1)
    X_test_g = X_train[test_index]
    y_test_g = y_train[test_index].reshape(-1, 1)
    print(X_train_g, y_train_g)
    m = linear_model.LinearRegression()
    m.fit(X_train_g, y_train_g)
    print(m.intercept_, m.coef_)


# Scores come back as negated MSE; negate them to get per-sample squared error.
cv_score = -cross_val_score(model, X_train, y_train, cv=loo, scoring='neg_mean_squared_error')

print(cv_score)
print(cv_score.mean())

TRAIN: [1 2 3 4 5] TEST: [0]
[[ 2]
 [ 1]
 [-1]
 [ 0]
 [ 3]] [[4]
 [1]
 [1]
 [0]
 [9]]
[1.] [[2.]]
TRAIN: [0 2 3 4 5] TEST: [1]
[[-2]
 [ 1]
 [-1]
 [ 0]
 [ 3]] [[4]
 [1]
 [1]
 [0]
 [9]]
[2.78378378] [[1.08108108]]
TRAIN: [0 1 3 4 5] TEST: [2]
[[-2]
 [ 2]
 [-1]
 [ 0]
 [ 3]] [[4]
 [4]
 [1]
 [0]
 [9]]
[3.1627907] [[1.09302326]]
TRAIN: [0 1 2 4 5] TEST: [3]
[[-2]
 [ 2]
 [ 1]
 [ 0]
 [ 3]] [[4]
 [4]
 [1]
 [0]
 [9]]
[2.86486486] [[0.91891892]]
TRAIN: [0 1 2 3 5] TEST: [4]
[[-2]
 [ 2]
 [ 1]
 [-1]
 [ 3]] [[4]
 [4]
 [1]
 [1]
 [9]]
[3.25581395] [[0.90697674]]
TRAIN: [0 1 2 3 4] TEST: [5]
[[-2]
 [ 2]
 [ 1]
 [-1]
 [ 0]] [[4]
 [4]
 [1]
 [1]
 [0]]
[2.] [[-1.40433339e-16]]
[49.          0.89481373 10.6003245   0.89481373 10.6003245  49.        ]
20.165046077460385
