In [1]:
import numpy as np
import pandas as pd
import knn_classification
import knn_regression
from sklearn.model_selection import train_test_split

# Disclaimer
### This is an example notebook that demonstrates the implemented KNN algorithm on a classification task and a regression task. The values shown are not the best achievable results; they are only examples!

# KNN Classification

In [2]:
# Read the diabetes dataset using pandas.
df = pd.read_csv('../datasets/diabetes.csv')
# Shuffle the rows with a fixed seed so that "Restart Kernel -> Run All"
# always yields the same row order (the previous np.random.permutation call
# drew from the unseeded global RNG). The index is rebuilt as 0..n-1 afterwards.
df = df.sample(frac = 1, random_state = 42).reset_index(drop = True)
display(df)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,2,89,90,30,0,33.5,0.292,42,0
1,8,196,76,29,280,37.5,0.605,57,1
2,1,100,66,29,196,32.0,0.444,42,0
3,2,115,64,22,0,30.8,0.421,21,0
4,7,62,78,0,0,32.6,0.391,41,0
...,...,...,...,...,...,...,...,...,...
763,4,125,70,18,122,28.9,1.144,45,1
764,1,107,50,19,0,28.3,0.181,29,0
765,7,114,66,0,0,32.8,0.258,42,1
766,3,150,76,0,0,21.0,0.207,37,0


In [3]:
# Features: every column except the last one. Target: the last column (Outcome).
X = df[df.columns[:-1]]
y = df[df.columns[-1]]

# Split the data into training (80 %) and testing (20 %) sets.
# A fixed random_state keeps the split -- and therefore every prediction and
# score computed from it further down -- reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

# Show the first rows of each part as a quick sanity check of the split.
display(X_train.head())
display(X_test.head())
display(y_train.to_frame().head())
display(y_test.to_frame().head())

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
6,4,83,86,19,0,29.3,0.317,34
495,1,163,72,0,0,39.0,1.222,33
240,3,80,82,31,70,34.2,1.292,27
629,2,92,62,28,0,31.6,0.13,24
625,7,94,64,25,79,33.3,0.738,41


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
294,6,144,72,27,228,33.9,0.255,40
212,5,88,66,21,23,24.4,0.342,30
698,0,93,60,25,92,28.7,0.532,22
325,7,142,60,33,190,28.8,0.687,61
762,1,168,88,29,0,35.0,0.905,52


Unnamed: 0,Outcome
6,0
495,1
240,1
629,0
625,0


Unnamed: 0,Outcome
294,0
212,0
698,0
325,0
762,1


## Not normalized

In [4]:
# 3-nearest-neighbour classifier with Euclidean distance and weighting disabled.
knn_testing = knn_classification.KNN_self(
    n_neighbors=3,
    distance_metric="euclidean",
    weighting=False,
)

# Fit on the training split; no scaling_method is passed for this
# "Not normalized" run.
knn_testing.fit(X_train, y_train)

In [5]:
# Predict a label for every row of the held-out test split.
predictions = knn_testing.predict(X_test)
# Bare last expression so the notebook renders the returned array.
predictions

array([0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0,
       1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
      dtype=int64)

In [6]:
# Score the predictions against the true test labels with the model's own
# accuracy helper, called as accuracy(predictions, ground_truth).
knn_testing.accuracy(predictions, y_test)

0.6363636363636364

## Normalized

In [7]:
# 3-nearest-neighbour classifier with Euclidean distance for the "Normalized" run.
# NOTE(review): weighting=True here, whereas the "Not normalized" classifier
# uses weighting=False -- the two runs therefore differ in more than scaling.
# Confirm this is intended before comparing their accuracies.
knn_testing_scal = knn_classification.KNN_self(
    n_neighbors=3,
    distance_metric="euclidean",
    weighting=True,
)

# Fit on the training split, requesting the "standardization" scaling method.
knn_testing_scal.fit(X_train, y_train, scaling_method="standardization")

In [8]:
# Predict a label for every row of the test split with the standardized model.
# NOTE(review): the stored output below consists of a few long runs of identical
# labels, which is unexpected for a randomly drawn test set -- verify how
# predict() handles X_test when a scaling_method was given to fit().
predictions_scal = knn_testing_scal.predict(X_test)
# Bare last expression so the notebook renders the returned array.
predictions_scal

array([0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
      dtype=int64)

In [9]:
# Score the standardized model's predictions against the true test labels,
# called as accuracy(predictions, ground_truth).
knn_testing_scal.accuracy(predictions_scal, y_test)

0.5324675324675324

# KNN Regression

In [10]:
#read in the data using pandas
df = pd.read_csv('../datasets/advertising.csv')
display(df)

Unnamed: 0,TV,Radio,Newspaper,Sales
0,230.1,37.8,69.2,22.1
1,44.5,39.3,45.1,10.4
2,17.2,45.9,69.3,12.0
3,151.5,41.3,58.5,16.5
4,180.8,10.8,58.4,17.9
...,...,...,...,...
195,38.2,3.7,13.8,7.6
196,94.2,4.9,8.1,14.0
197,177.0,9.3,6.4,14.8
198,283.6,42.0,66.2,25.5


In [11]:
# Features: every column except the last one. Target: the last column (Sales).
X = df[df.columns[:-1]]
y = df[df.columns[-1]]

# Split the data into training (80 %) and testing (20 %) sets.
# A fixed random_state keeps the split -- and therefore the errors reported
# further down -- reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

# Show the first rows of each part as a quick sanity check of the split.
display(X_train.head())
display(X_test.head())
display(y_train.to_frame().head())
display(y_test.to_frame().head())

Unnamed: 0,TV,Radio,Newspaper
159,131.7,18.4,34.6
194,149.7,35.6,6.0
165,234.5,3.4,84.8
56,7.3,28.1,41.4
124,229.5,32.3,74.2


Unnamed: 0,TV,Radio,Newspaper
95,163.3,31.6,52.9
172,19.6,20.1,17.0
108,13.1,0.4,25.6
19,147.3,23.9,19.1
192,17.2,4.1,31.6


Unnamed: 0,Sales
159,12.9
194,17.3
165,16.9
56,5.5
124,19.7


Unnamed: 0,Sales
95,16.9
172,7.6
108,5.3
19,14.6
192,5.9


## Not normalized

In [12]:
# 3-nearest-neighbour regressor with Euclidean distance and weighting disabled.
knn_testing_reg = knn_regression.KNN_self(
    n_neighbors=3,
    distance_metric="euclidean",
    weighting=False,
)

# Fit on the training split; no scaling_method is passed for this
# "Not normalized" run.
knn_testing_reg.fit(X_train, y_train)

In [13]:
# Predict a target value for every row of the test split.
# NOTE(review): the stored output below shows runs of identical consecutive
# values (e.g. 19.4 repeated many times), which is unusual for a randomly
# drawn test set -- worth verifying predict() in knn_regression.
predictions_reg = knn_testing_reg.predict(X_test)
# Bare last expression so the notebook renders the returned array.
predictions_reg

array([19.4       , 19.4       , 19.4       , 19.4       , 19.4       ,
       19.4       , 19.4       , 19.26666667, 19.26666667, 19.26666667,
       19.26666667, 19.26666667, 19.4       , 19.4       , 19.4       ,
       17.6       , 17.6       , 17.6       , 17.63333333, 17.63333333,
       17.63333333, 17.63333333, 17.9       , 18.33333333, 17.46666667,
       17.46666667, 17.46666667, 16.63333333, 19.4       , 19.4       ,
       19.4       , 19.4       , 19.4       , 19.4       , 15.03333333,
       15.03333333, 16.7       , 16.7       , 15.5       , 13.83333333])

In [14]:
# Evaluate the predictions against the true test targets with the model's own
# error helper, selecting error_type="MSE" (presumably mean squared error).
knn_testing_reg.error(predictions_reg, y_test, error_type="MSE")

45.82972222222223

## Normalized

In [15]:
# Same regressor configuration as the "Not normalized" run (k=3, Euclidean,
# weighting disabled); only the scaling passed to fit() differs.
knn_testing_reg_scale = knn_regression.KNN_self(
    n_neighbors=3,
    distance_metric="euclidean",
    weighting=False,
)

# Fit on the training split, requesting the "standardization" scaling method.
knn_testing_reg_scale.fit(X_train, y_train, scaling_method="standardization")

In [16]:
# Predict a target value for every row of the test split with the standardized model.
# NOTE(review): the stored output below again contains long runs of identical
# consecutive values -- verify predict() in knn_regression.
predictions_reg_scale = knn_testing_reg_scale.predict(X_test)
# Bare last expression so the notebook renders the returned array.
predictions_reg_scale

array([12.86666667, 13.43333333, 17.76666667, 15.4       , 13.7       ,
       15.63333333, 15.83333333, 15.56666667, 15.26666667, 15.8       ,
       15.8       , 15.8       , 15.86666667, 15.86666667, 15.86666667,
       15.86666667, 15.86666667, 15.86666667, 15.03333333, 15.03333333,
       15.03333333, 15.03333333, 15.03333333, 15.03333333, 15.03333333,
       15.86666667, 15.86666667, 15.86666667, 15.86666667, 15.86666667,
       15.86666667, 14.73333333, 14.73333333, 14.73333333, 14.73333333,
       14.73333333, 14.73333333, 14.73333333, 14.73333333, 14.73333333])

In [17]:
# Evaluate the standardized model's predictions against the true test targets,
# selecting error_type="MSE" (presumably mean squared error).
knn_testing_reg_scale.error(predictions_reg_scale, y_test, error_type="MSE")

28.749305555555555