In [1]:
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, r2_score
from utils import gather_data, mean_squared_percentage_error, basis_expansion

# KNN Regressor
The goal of this notebook is to run a KNN regressor and find optimal hyperparameters in respect to number of neighbors and basis expansion.

In [2]:
data, label = gather_data()

Index(['Log GDP per capita', 'Social support',
       'Healthy life expectancy at birth', 'Freedom to make life choices',
       'Generosity', 'Perceptions of corruption', 'Positive affect',
       'Negative affect', 'Confidence in national government'],
      dtype='object')


In [3]:
x_train, x_test, y_train, y_test = train_test_split(data, label, test_size=0.2)
kfold = KFold(n_splits=10, shuffle=True)

In [4]:
results = []
for n in range(1, 11):
    x_train_expanded = basis_expansion(x_train, n)
    for k in [3, 5, 10, 15, 20]:
        result = {
            "n": n,
            "k": k,
            "MSE": [],
            "R2": [],
            "MSPE": []
        }
        for train_index, val_index in kfold.split(x_train_expanded, y_train):
            model = KNeighborsRegressor(n_neighbors=k)
            model.fit(x_train_expanded[train_index], y_train[train_index])
            y_pred = model.predict(x_train_expanded[val_index])
            result["MSE"].append(mean_squared_error(y_train[val_index], y_pred))
            result["R2"].append(r2_score(y_train[val_index], y_pred))
            result["MSPE"].append(mean_squared_percentage_error(y_train[val_index], y_pred))
        result["MSE"] = np.mean(result["MSE"])
        result["R2"] = np.mean(result["R2"])
        result["MSPE"] = np.mean(result["MSPE"])
        results.append(result)

In [5]:
results

[{'n': 1,
  'k': 3,
  'MSE': 0.18655608742621638,
  'R2': 0.855242526976971,
  'MSPE': 0.9283351819082821},
 {'n': 1,
  'k': 5,
  'MSE': 0.19846652684734295,
  'R2': 0.8484922390225907,
  'MSPE': 1.0301054500236746},
 {'n': 1,
  'k': 10,
  'MSE': 0.21271520572498118,
  'R2': 0.8379435827280322,
  'MSPE': 1.080038047185257},
 {'n': 1,
  'k': 15,
  'MSE': 0.23558677602409114,
  'R2': 0.8186530656936348,
  'MSPE': 1.2242277896140477},
 {'n': 1,
  'k': 20,
  'MSE': 0.2465515252030349,
  'R2': 0.8092590209005707,
  'MSPE': 1.2954023990645305},
 {'n': 2,
  'k': 3,
  'MSE': 0.18413878502979392,
  'R2': 0.8592541778909526,
  'MSPE': 0.9268738261631672},
 {'n': 2,
  'k': 5,
  'MSE': 0.184712438160753,
  'R2': 0.8573649542038005,
  'MSPE': 0.956151976435954},
 {'n': 2,
  'k': 10,
  'MSE': 0.20696210512802793,
  'R2': 0.8417255507631278,
  'MSPE': 1.0525150959382696},
 {'n': 2,
  'k': 15,
  'MSE': 0.2238457442859219,
  'R2': 0.8278917958437016,
  'MSPE': 1.1798734359374001},
 {'n': 2,
  'k': 20,


In [6]:
# Sort by lowest MSE
sorted(results, key=lambda x: x["MSE"])

[{'n': 4,
  'k': 3,
  'MSE': 0.18195418830093021,
  'R2': 0.8611069382288132,
  'MSPE': 0.9167297192136912},
 {'n': 3,
  'k': 3,
  'MSE': 0.18354039406768205,
  'R2': 0.8581230353252494,
  'MSPE': 0.9096318287212058},
 {'n': 2,
  'k': 3,
  'MSE': 0.18413878502979392,
  'R2': 0.8592541778909526,
  'MSPE': 0.9268738261631672},
 {'n': 2,
  'k': 5,
  'MSE': 0.184712438160753,
  'R2': 0.8573649542038005,
  'MSPE': 0.956151976435954},
 {'n': 3,
  'k': 5,
  'MSE': 0.18646451040562767,
  'R2': 0.8562623664488747,
  'MSPE': 0.9742431303001279},
 {'n': 1,
  'k': 3,
  'MSE': 0.18655608742621638,
  'R2': 0.855242526976971,
  'MSPE': 0.9283351819082821},
 {'n': 5,
  'k': 3,
  'MSE': 0.18716117461744908,
  'R2': 0.8560586248306488,
  'MSPE': 0.9429469940388632},
 {'n': 4,
  'k': 5,
  'MSE': 0.18983743793905697,
  'R2': 0.8541676948415399,
  'MSPE': 1.0162770358053788},
 {'n': 6,
  'k': 3,
  'MSE': 0.19210823317727332,
  'R2': 0.8523049198080914,
  'MSPE': 0.9838713526111912},
 {'n': 5,
  'k': 5,
  '

In [7]:
# Sort by highest R2
sorted(results, key=lambda x: x["R2"], reverse=True)

[{'n': 4,
  'k': 3,
  'MSE': 0.18195418830093021,
  'R2': 0.8611069382288132,
  'MSPE': 0.9167297192136912},
 {'n': 2,
  'k': 3,
  'MSE': 0.18413878502979392,
  'R2': 0.8592541778909526,
  'MSPE': 0.9268738261631672},
 {'n': 3,
  'k': 3,
  'MSE': 0.18354039406768205,
  'R2': 0.8581230353252494,
  'MSPE': 0.9096318287212058},
 {'n': 2,
  'k': 5,
  'MSE': 0.184712438160753,
  'R2': 0.8573649542038005,
  'MSPE': 0.956151976435954},
 {'n': 3,
  'k': 5,
  'MSE': 0.18646451040562767,
  'R2': 0.8562623664488747,
  'MSPE': 0.9742431303001279},
 {'n': 5,
  'k': 3,
  'MSE': 0.18716117461744908,
  'R2': 0.8560586248306488,
  'MSPE': 0.9429469940388632},
 {'n': 1,
  'k': 3,
  'MSE': 0.18655608742621638,
  'R2': 0.855242526976971,
  'MSPE': 0.9283351819082821},
 {'n': 4,
  'k': 5,
  'MSE': 0.18983743793905697,
  'R2': 0.8541676948415399,
  'MSPE': 1.0162770358053788},
 {'n': 6,
  'k': 3,
  'MSE': 0.19210823317727332,
  'R2': 0.8523049198080914,
  'MSPE': 0.9838713526111912},
 {'n': 7,
  'k': 3,
  '

Best hyperparameters for KNN is n=4 and k=3.

In [9]:
x_train_expanded = basis_expansion(x_train, 4)
x_test_expanded = basis_expansion(x_test, 4)
best_model = KNeighborsRegressor(n_neighbors=3)
best_model.fit(x_train_expanded, y_train)
y_pred = best_model.predict(x_test_expanded)
print(f"MSE: {mean_squared_error(y_test, y_pred)}")
print(f"R2: {r2_score(y_test, y_pred)}")
print(f"MSPE: {mean_squared_percentage_error(y_test, y_pred)}")

MSE: 0.2124819897024487
R2: 0.8329545921844022
MSPE: 0.9946491621631134
