# K Nearest Neighbors Regression

KNN regression predicts the target of a new instance by averaging the target values of its k closest training points.

In [11]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.datasets import load_boston

# Data Preparation

In [16]:
# Load the Boston house-prices dataset and list the fields it exposes.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2 — confirm the environment pins an older release before re-running.
boston = load_boston()
boston.keys()

dict_keys(['data', 'target', 'feature_names', 'DESCR', 'filename'])

In [40]:
# Show only an excerpt of the dataset description. Both offsets are
# hard-coded character positions; in the recorded output the end offset
# stops mid-line, so the attribute list is cut short.
descr_excerpt = slice(20, 1265)
print(boston['DESCR'][descr_excerpt])


Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pupil-teacher ratio by

In [42]:
# Pull the feature matrix and the target vector out of the dataset,
# then report their shapes.
X, y = boston['data'], boston['target']

print(X.shape, y.shape)

(506, 13) (506,)


In [43]:
# Hold out part of the data for testing; random_state is fixed so the
# split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

# Shapes of the four resulting arrays: features first, then targets.
print('X_train:', X_train.shape)
print('X_test:', X_test.shape, end='\n\n')
print('y_train:', y_train.shape)
print('y_test:', y_test.shape)

X_train: (379, 13)
X_test: (127, 13)

y_train: (379,)
y_test: (127,)


# Distance metrics

Minkowski distance is a metric that measures the distance between two n-dimensional points.

For $\vec{x} = \begin{pmatrix} x_1 \\ x_2 \\ \vdots \\ x_n\end{pmatrix} $ and $\vec{y} = \begin{pmatrix} y_1 \\ y_2 \\ \vdots \\ y_n\end{pmatrix}$ the Minkowski distance of order $p \ge 1$  for $\vec{x}$ and $\vec{y}$ is given by:

$$
D(\vec{x}, \vec{y}) = \bigg(\sum_{i=1}^n|x_i - y_i|^p\bigg)^{1/p}
$$

Minkowski distance is a generalization of Manhattan ($p=1$) and Euclidean ($p=2$) distances.

In [73]:
# Find the distance between the given instance
# and all points in the training set
# Pick the first test example as the query point and keep its true target
# for reference.
instance = X_test[0]
target = y_test[0]

print('instance:')
# The loop names are deliberately not `k`/`v`: this notebook uses the global
# `k` for the number of neighbours, and re-running this cell must not
# overwrite it with a feature name.
for feature_name, feature_value in zip(boston['feature_names'], instance):
    print(f'\t{feature_name:8}:{feature_value:10}')

print('\ntarget (MEDV) :', target)

instance:
	CRIM    :   0.09178
	ZN      :       0.0
	INDUS   :      4.05
	CHAS    :       0.0
	NOX     :      0.51
	RM      :     6.416
	AGE     :      84.1
	DIS     :    2.6463
	RAD     :       5.0
	TAX     :     296.0
	PTRATIO :      16.6
	B       :     395.5
	LSTAT   :      9.04

target (MEDV) : 23.6


In [67]:
# minkowski distances for p = 1, 2, 3
def minkowski_distances(points, query, p):
    """Minkowski distance of order ``p`` from ``query`` to every row of ``points``.

    Parameters
    ----------
    points : array of shape (n_points, n_features)
        One point per row.
    query : array of shape (n_features,)
        The point the distances are measured from; broadcast against each row.
    p : int or float
        Order of the distance, ``p >= 1`` (1 = Manhattan, 2 = Euclidean).

    Returns
    -------
    array of shape (n_points,)
        ``(sum_i |points[j, i] - query[i]| ** p) ** (1 / p)`` for each row ``j``.

    Raises
    ------
    ValueError
        If ``p`` is smaller than 1.
    """
    if p < 1:
        raise ValueError('p must be >= 1')
    return (np.abs(points - query) ** p).sum(axis=1) ** (1 / p)


# minkowski distances for p = 1, 2, 3
dist1 = minkowski_distances(X_train, instance, 1)
dist2 = minkowski_distances(X_train, instance, 2)
dist3 = minkowski_distances(X_train, instance, 3)

print('p = 1:', dist1[:5])
print()
print('p = 2:', dist2[:5])
print()
print('p = 3:', dist3[:5])

p = 1: [122.30605 445.36903 139.0655  203.08867 227.72584]

p = 2: [103.45453327 326.03031867  85.37387011 117.99457704 151.17716877]

p = 3: [103.01958254 311.76333109  80.83718433 109.83115029 139.4861913 ]


# The Closest K points

In [75]:
# Number of neighbours that contribute to a prediction.
k = 5

# Training-set indices ordered from the nearest point to the farthest,
# one ordering per distance order p = 1, 2, 3.
dist1_sorted_indx, dist2_sorted_indx, dist3_sorted_indx = (
    np.argsort(distances) for distances in (dist1, dist2, dist3)
)

# Keep only the first k indices of each ordering, i.e. the k nearest points.
closest_k1, closest_k2, closest_k3 = (
    ordering[:k]
    for ordering in (dist1_sorted_indx, dist2_sorted_indx, dist3_sorted_indx)
)

In [76]:
# Distances (p = 1) to the k nearest training points, closest first.
# Indexing with closest_k1 keeps this in step with `k` instead of a
# hard-coded 5.
dist1[closest_k1]

array([14.35973, 16.78836, 18.88991, 21.69842, 22.1781 ])

In [77]:
# Report which training rows are nearest to the query point under each
# distance order.
index_rows = (
    ('p = 1:', closest_k1),
    ('p = 2:', closest_k2),
    ('p = 3:', closest_k3),
)

print('The indices of the closest', k, 'points.\n')
for index_row in index_rows:
    print(*index_row)

The indices of the closest 5 points.

p = 1: [115 220 333 178 346]
p = 2: [346 178 220 115 357]
p = 3: [346 178 357 220 317]


In [79]:
# Look up the target values of the nearest training rows under each
# distance order.
target_rows = (
    ('p = 1:', y_train[closest_k1]),
    ('p = 2:', y_train[closest_k2]),
    ('p = 3:', y_train[closest_k3]),
)

print('The targets of the closest', k, 'points.\n')
for target_row in target_rows:
    print(*target_row)

The targets of the closest 5 points.

p = 1: [24.6 29.9 22.6 23.8 21.6]
p = 2: [21.6 23.8 29.9 24.6 22.8]
p = 3: [21.6 23.8 22.8 29.9 16.2]


# Making predictions

The prediction is just the average of the closest points' targets.

In [82]:
# Average the neighbours' targets to get one prediction per distance order.
pred1 = np.mean(y_train[closest_k1])
pred2 = np.mean(y_train[closest_k2])
pred3 = np.mean(y_train[closest_k3])

prediction_report = f'''
Prediction:

    p = 1: {pred1}
    p = 2: {pred2}
    p = 3: {pred3}
'''
print(prediction_report)



Prediction:

    p = 1: 24.5
    p = 2: 24.54
    p = 3: 22.86

