In [118]:
import numpy as np
import pandas as pd
import knn_classification
import knn_regression
from sklearn.model_selection import train_test_split

# Disclaimer
### This is an example notebook in order to show the implemented knn algorithm with a classification and regession task. The shown values are not the best possible values, they are just examples! 

# KNN Classification

In [119]:
#read in the data using pandas
df = pd.read_csv('../datasets/diabetes.csv')
df = df.reindex(np.random.permutation(df.index)).reset_index(drop = True)
display(df)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,7,107,74,0,0,29.6,0.254,31,1
1,1,167,74,17,144,23.4,0.447,33,1
2,1,111,94,0,0,32.8,0.265,45,0
3,3,121,52,0,0,36.0,0.127,25,1
4,4,84,90,23,56,39.5,0.159,25,0
...,...,...,...,...,...,...,...,...,...
763,0,93,100,39,72,43.4,1.021,35,0
764,1,111,62,13,182,24.0,0.138,23,0
765,0,105,68,22,0,20.0,0.236,22,0
766,0,165,90,33,680,52.3,0.427,23,0


In [120]:
X = df[df.columns[:-1]]
y = df[df.columns[-1]]

#split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

display(X_train.head())
display(X_test.head())
display(y_train.to_frame().head())
display(y_test.to_frame().head())

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
656,13,76,60,0,0,32.8,0.18,41
689,4,141,74,0,0,27.6,0.244,40
583,5,155,84,44,545,38.7,0.619,34
391,1,147,94,41,0,49.3,0.358,27
239,5,147,78,0,0,33.7,0.218,65


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
723,2,174,88,37,120,44.5,0.646,24
691,0,180,66,39,0,42.0,1.893,25
517,2,95,54,14,88,26.1,0.748,22
264,5,123,74,40,77,34.1,0.269,28
95,13,106,70,0,0,34.2,0.251,52


Unnamed: 0,Outcome
656,0
689,0
583,0
391,1
239,0


Unnamed: 0,Outcome
723,1
691,1
517,0
264,0
95,0


## Not normalized

In [121]:
knn_testing = knn_classification.KNN_self(n_neighbors = 3, distance_metric="euclidean", weighting=False)

knn_testing.fit(X_train, y_train)

In [122]:
predictions = knn_testing.predict(X_test)
predictions

array([0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0],
      dtype=int64)

In [123]:
knn_testing.accuracy(predictions, y_test)

0.538961038961039

## Normalized

In [124]:
knn_testing_scal = knn_classification.KNN_self(n_neighbors = 3, distance_metric="euclidean", weighting=True)

knn_testing_scal.fit(X_train, y_train, scaling_method="standardization")

In [125]:
predictions_scal = knn_testing_scal.predict(X_test)
predictions_scal

array([0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
      dtype=int64)

In [126]:
knn_testing_scal.accuracy(predictions_scal, y_test)

0.5974025974025974

# KNN Regression

In [127]:
#read in the data using pandas
df = pd.read_csv('../datasets/advertising.csv')
display(df)

Unnamed: 0,TV,Radio,Newspaper,Sales
0,230.1,37.8,69.2,22.1
1,44.5,39.3,45.1,10.4
2,17.2,45.9,69.3,12.0
3,151.5,41.3,58.5,16.5
4,180.8,10.8,58.4,17.9
...,...,...,...,...
195,38.2,3.7,13.8,7.6
196,94.2,4.9,8.1,14.0
197,177.0,9.3,6.4,14.8
198,283.6,42.0,66.2,25.5


In [128]:
X = df[df.columns[:-1]]
y = df[df.columns[-1]]

#split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

display(X_train.head())
display(X_test.head())
display(y_train.to_frame().head())
display(y_test.to_frame().head())

Unnamed: 0,TV,Radio,Newspaper
90,134.3,4.9,9.3
24,62.3,12.6,18.3
42,293.6,27.7,1.8
130,0.7,39.6,8.7
101,296.4,36.3,100.9


Unnamed: 0,TV,Radio,Newspaper
2,17.2,45.9,69.3
104,238.2,34.3,5.3
16,67.8,36.6,114.0
78,5.4,29.9,9.4
187,191.1,28.7,18.2


Unnamed: 0,Sales
90,14.0
24,9.7
42,20.7
130,1.6
101,23.8


Unnamed: 0,Sales
2,12.0
104,20.7
16,12.5
78,5.3
187,17.3


## Not normalized

In [129]:
knn_testing_reg = knn_regression.KNN_self(n_neighbors = 3, distance_metric="euclidean", weighting=False)

knn_testing_reg.fit(X_train, y_train)

In [130]:
predictions_reg = knn_testing_reg.predict(X_test)
predictions_reg

array([22.33333333, 22.33333333, 22.33333333, 20.56666667, 20.56666667,
       17.56666667, 17.56666667, 17.56666667, 18.23333333, 17.4       ,
       17.9       , 17.9       , 15.53333333, 15.53333333, 15.53333333,
       15.53333333, 15.53333333, 11.4       , 10.63333333, 12.9       ,
       12.9       , 14.93333333, 16.3       , 15.36666667, 13.3       ,
       14.06666667, 14.06666667, 13.63333333, 17.43333333, 19.03333333,
       17.7       , 17.33333333, 17.33333333, 17.7       , 20.33333333,
       20.33333333, 20.33333333, 20.33333333, 20.33333333, 18.13333333])

In [131]:
knn_testing_reg.error(predictions_reg, y_test, error_type="MSE")

31.15416666666666

## Normalized

In [132]:
knn_testing_reg_scale = knn_regression.KNN_self(n_neighbors = 3, distance_metric="euclidean", weighting=False)

knn_testing_reg_scale.fit(X_train, y_train, scaling_method="standardization")

In [133]:
predictions_reg_scale = knn_testing_reg_scale.predict(X_test)
predictions_reg_scale

array([12.6       , 13.23333333, 22.9       , 15.13333333, 17.6       ,
       16.23333333, 15.76666667, 15.76666667, 17.2       , 17.2       ,
       17.06666667, 17.06666667, 16.3       , 16.73333333, 16.73333333,
       16.73333333, 16.73333333, 16.73333333, 16.73333333, 16.73333333,
       15.76666667, 15.76666667, 15.76666667, 15.76666667, 15.76666667,
       15.76666667, 15.76666667, 15.76666667, 15.76666667, 15.76666667,
       15.76666667, 15.76666667, 15.76666667, 15.76666667, 15.76666667,
       15.76666667, 15.76666667, 15.76666667, 15.76666667, 15.76666667])

In [134]:
knn_testing_reg_scale.error(predictions_reg_scale, y_test, error_type="MSE")

20.151194444444446