In [2]:
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, r2_score
from utils import gather_data, mean_squared_percentage_error, basis_expansion

# KNN Regressor
The goal of this notebook is to run a KNN regressor and find optimal hyperparameters in respect to number of neighbors and basis expansion.

In [3]:
data, label = gather_data()

Index(['Log GDP per capita', 'Social support',
       'Healthy life expectancy at birth', 'Freedom to make life choices',
       'Generosity', 'Perceptions of corruption', 'Positive affect',
       'Negative affect', 'Confidence in national government'],
      dtype='object')


In [4]:
x_train, x_test, y_train, y_test = train_test_split(data, label, test_size=0.2)
kfold = KFold(n_splits=10, shuffle=True)

In [9]:
results = []
for n in range(1, 11):
    x_train_expanded = basis_expansion(x_train, n)
    for k in [3, 5, 10, 15, 20]:
        result = {
            "n": n,
            "k": k,
            "MSE": [],
            "R2": [],
            "MSPE": []
        }
        for train_index, val_index in kfold.split(x_train_expanded, y_train):
            model = KNeighborsRegressor(n_neighbors=k)
            model.fit(x_train_expanded[train_index], y_train[train_index])
            y_pred = model.predict(x_train_expanded[val_index])
            result["MSE"].append(mean_squared_error(y_train[val_index], y_pred))
            result["R2"].append(r2_score(y_train[val_index], y_pred))
            result["MSPE"].append(mean_squared_percentage_error(y_train[val_index], y_pred))
        result["MSE"] = np.mean(result["MSE"])
        result["R2"] = np.mean(result["R2"])
        result["MSPE"] = np.mean(result["MSPE"])
        results.append(result)

In [10]:
results

[{'n': 1,
  'k': 3,
  'MSE': 0.22816691637967296,
  'R2': 0.8194642565509325,
  'MSPE': 1.1259999396649565},
 {'n': 1,
  'k': 5,
  'MSE': 0.23652938652185718,
  'R2': 0.81360431415762,
  'MSPE': 1.1948346133371843},
 {'n': 1,
  'k': 10,
  'MSE': 0.25082295692051076,
  'R2': 0.8027044120705671,
  'MSPE': 1.2537824153538666},
 {'n': 1,
  'k': 15,
  'MSE': 0.2740265625048083,
  'R2': 0.7846285746444577,
  'MSPE': 1.3598536741871077},
 {'n': 1,
  'k': 20,
  'MSE': 0.2978711394787129,
  'R2': 0.7658743787244379,
  'MSPE': 1.4576874583956791},
 {'n': 2,
  'k': 3,
  'MSE': 0.48472662460095517,
  'R2': 0.6185344476349898,
  'MSPE': 2.1912019268695238},
 {'n': 2,
  'k': 5,
  'MSE': 0.45748576345713793,
  'R2': 0.6403677711690944,
  'MSPE': 2.094467300533363},
 {'n': 2,
  'k': 10,
  'MSE': 0.44457473791643354,
  'R2': 0.6506155618689335,
  'MSPE': 2.0057946273790455},
 {'n': 2,
  'k': 15,
  'MSE': 0.4431475535309669,
  'R2': 0.6517380578513592,
  'MSPE': 1.97747407817216},
 {'n': 2,
  'k': 20,
 

In [12]:
sorted(results, key=lambda x: x["MSE"])

[{'n': 1,
  'k': 3,
  'MSE': 0.22816691637967296,
  'R2': 0.8194642565509325,
  'MSPE': 1.1259999396649565},
 {'n': 1,
  'k': 5,
  'MSE': 0.23652938652185718,
  'R2': 0.81360431415762,
  'MSPE': 1.1948346133371843},
 {'n': 1,
  'k': 10,
  'MSE': 0.25082295692051076,
  'R2': 0.8027044120705671,
  'MSPE': 1.2537824153538666},
 {'n': 1,
  'k': 15,
  'MSE': 0.2740265625048083,
  'R2': 0.7846285746444577,
  'MSPE': 1.3598536741871077},
 {'n': 1,
  'k': 20,
  'MSE': 0.2978711394787129,
  'R2': 0.7658743787244379,
  'MSPE': 1.4576874583956791},
 {'n': 2,
  'k': 15,
  'MSE': 0.4431475535309669,
  'R2': 0.6517380578513592,
  'MSPE': 1.97747407817216},
 {'n': 2,
  'k': 10,
  'MSE': 0.44457473791643354,
  'R2': 0.6506155618689335,
  'MSPE': 2.0057946273790455},
 {'n': 2,
  'k': 20,
  'MSE': 0.446416293038559,
  'R2': 0.6493603941719861,
  'MSPE': 2.001025254499633},
 {'n': 2,
  'k': 5,
  'MSE': 0.45748576345713793,
  'R2': 0.6403677711690944,
  'MSPE': 2.094467300533363},
 {'n': 3,
  'k': 20,
  '

In [14]:
sorted(results, key=lambda x: x["R2"], reverse=True)

[{'n': 1,
  'k': 3,
  'MSE': 0.22816691637967296,
  'R2': 0.8194642565509325,
  'MSPE': 1.1259999396649565},
 {'n': 1,
  'k': 5,
  'MSE': 0.23652938652185718,
  'R2': 0.81360431415762,
  'MSPE': 1.1948346133371843},
 {'n': 1,
  'k': 10,
  'MSE': 0.25082295692051076,
  'R2': 0.8027044120705671,
  'MSPE': 1.2537824153538666},
 {'n': 1,
  'k': 15,
  'MSE': 0.2740265625048083,
  'R2': 0.7846285746444577,
  'MSPE': 1.3598536741871077},
 {'n': 1,
  'k': 20,
  'MSE': 0.2978711394787129,
  'R2': 0.7658743787244379,
  'MSPE': 1.4576874583956791},
 {'n': 2,
  'k': 15,
  'MSE': 0.4431475535309669,
  'R2': 0.6517380578513592,
  'MSPE': 1.97747407817216},
 {'n': 2,
  'k': 10,
  'MSE': 0.44457473791643354,
  'R2': 0.6506155618689335,
  'MSPE': 2.0057946273790455},
 {'n': 2,
  'k': 20,
  'MSE': 0.446416293038559,
  'R2': 0.6493603941719861,
  'MSPE': 2.001025254499633},
 {'n': 2,
  'k': 5,
  'MSE': 0.45748576345713793,
  'R2': 0.6403677711690944,
  'MSPE': 2.094467300533363},
 {'n': 3,
  'k': 20,
  '

Best hyperparameters for KNN is n=1 and k=3.

In [15]:
best_model = KNeighborsRegressor(n_neighbors=3)
best_model.fit(x_train, y_train)
y_pred = best_model.predict(x_test)
print(f"MSE: {mean_squared_error(y_test, y_pred)}")
print(f"R2: {r2_score(y_test, y_pred)}")
print(f"MSPE: {mean_squared_percentage_error(y_test, y_pred)}")

MSE: 0.29770070941630433
R2: 0.7768343104331787
MSPE: 1.5261705047410536
