In [1]:
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, r2_score
from utils import gather_data, mean_squared_percentage_error, basis_expansion

# KNN Regressor
The goal of this notebook is to run a KNN regressor and find optimal hyperparameters in respect to number of neighbors and basis expansion.

In [2]:
data, label = gather_data()

Index(['Log GDP per capita', 'Social support',
       'Healthy life expectancy at birth', 'Freedom to make life choices',
       'Generosity', 'Perceptions of corruption', 'Positive affect',
       'Negative affect', 'Confidence in national government'],
      dtype='object')


In [3]:
x_train, x_test, y_train, y_test = train_test_split(data, label, test_size=0.2)
kfold = KFold(n_splits=10, shuffle=True)

In [4]:
results = []
for n in range(1, 11):
    x_train_expanded = basis_expansion(x_train, n)
    for k in [3, 5, 10, 15, 20]:
        result = {
            "n": n,
            "k": k,
            "MSE": [],
            "R2": [],
            "MSPE": []
        }
        for train_index, val_index in kfold.split(x_train_expanded, y_train):
            model = KNeighborsRegressor(n_neighbors=k)
            model.fit(x_train_expanded[train_index], y_train[train_index])
            y_pred = model.predict(x_train_expanded[val_index])
            result["MSE"].append(mean_squared_error(y_train[val_index], y_pred))
            result["R2"].append(r2_score(y_train[val_index], y_pred))
            result["MSPE"].append(mean_squared_percentage_error(y_train[val_index], y_pred))
        result["MSE"] = np.mean(result["MSE"])
        result["R2"] = np.mean(result["R2"])
        result["MSPE"] = np.mean(result["MSPE"])
        results.append(result)

In [5]:
results

[{'n': 1,
  'k': 3,
  'MSE': 0.24012357022472303,
  'R2': 0.8120133254592006,
  'MSPE': 1.1759925112984821},
 {'n': 1,
  'k': 5,
  'MSE': 0.23341454023539704,
  'R2': 0.8176202487135018,
  'MSPE': 1.1885683393570519},
 {'n': 1,
  'k': 10,
  'MSE': 0.2601411086221729,
  'R2': 0.7956101244260133,
  'MSPE': 1.3133389424944246},
 {'n': 1,
  'k': 15,
  'MSE': 0.27655624934243883,
  'R2': 0.7842195807604335,
  'MSPE': 1.3797210456249922},
 {'n': 1,
  'k': 20,
  'MSE': 0.30250550038214,
  'R2': 0.7633540719208068,
  'MSPE': 1.4826610820265362},
 {'n': 2,
  'k': 3,
  'MSE': 0.49309841574487095,
  'R2': 0.6142784339007059,
  'MSPE': 2.1418479709247342},
 {'n': 2,
  'k': 5,
  'MSE': 0.4761181776681589,
  'R2': 0.6293067381713928,
  'MSPE': 2.1339185294656002},
 {'n': 2,
  'k': 10,
  'MSE': 0.4616429170941472,
  'R2': 0.6360973210128946,
  'MSPE': 2.0654839026537415},
 {'n': 2,
  'k': 15,
  'MSE': 0.4607317173385182,
  'R2': 0.6385166483445129,
  'MSPE': 2.1049494895551923},
 {'n': 2,
  'k': 20,


In [6]:
# Sort by lowest MSE
sorted(results, key=lambda x: x["MSE"])

[{'n': 1,
  'k': 5,
  'MSE': 0.23341454023539704,
  'R2': 0.8176202487135018,
  'MSPE': 1.1885683393570519},
 {'n': 1,
  'k': 3,
  'MSE': 0.24012357022472303,
  'R2': 0.8120133254592006,
  'MSPE': 1.1759925112984821},
 {'n': 1,
  'k': 10,
  'MSE': 0.2601411086221729,
  'R2': 0.7956101244260133,
  'MSPE': 1.3133389424944246},
 {'n': 1,
  'k': 15,
  'MSE': 0.27655624934243883,
  'R2': 0.7842195807604335,
  'MSPE': 1.3797210456249922},
 {'n': 1,
  'k': 20,
  'MSE': 0.30250550038214,
  'R2': 0.7633540719208068,
  'MSPE': 1.4826610820265362},
 {'n': 2,
  'k': 20,
  'MSE': 0.4581326553218673,
  'R2': 0.6417417279288424,
  'MSPE': 2.1095767339202007},
 {'n': 2,
  'k': 15,
  'MSE': 0.4607317173385182,
  'R2': 0.6385166483445129,
  'MSPE': 2.1049494895551923},
 {'n': 2,
  'k': 10,
  'MSE': 0.4616429170941472,
  'R2': 0.6360973210128946,
  'MSPE': 2.0654839026537415},
 {'n': 2,
  'k': 5,
  'MSE': 0.4761181776681589,
  'R2': 0.6293067381713928,
  'MSPE': 2.1339185294656002},
 {'n': 3,
  'k': 20,


In [7]:
# Sort by highest R2
sorted(results, key=lambda x: x["R2"], reverse=True)

[{'n': 1,
  'k': 5,
  'MSE': 0.23341454023539704,
  'R2': 0.8176202487135018,
  'MSPE': 1.1885683393570519},
 {'n': 1,
  'k': 3,
  'MSE': 0.24012357022472303,
  'R2': 0.8120133254592006,
  'MSPE': 1.1759925112984821},
 {'n': 1,
  'k': 10,
  'MSE': 0.2601411086221729,
  'R2': 0.7956101244260133,
  'MSPE': 1.3133389424944246},
 {'n': 1,
  'k': 15,
  'MSE': 0.27655624934243883,
  'R2': 0.7842195807604335,
  'MSPE': 1.3797210456249922},
 {'n': 1,
  'k': 20,
  'MSE': 0.30250550038214,
  'R2': 0.7633540719208068,
  'MSPE': 1.4826610820265362},
 {'n': 2,
  'k': 20,
  'MSE': 0.4581326553218673,
  'R2': 0.6417417279288424,
  'MSPE': 2.1095767339202007},
 {'n': 2,
  'k': 15,
  'MSE': 0.4607317173385182,
  'R2': 0.6385166483445129,
  'MSPE': 2.1049494895551923},
 {'n': 2,
  'k': 10,
  'MSE': 0.4616429170941472,
  'R2': 0.6360973210128946,
  'MSPE': 2.0654839026537415},
 {'n': 2,
  'k': 5,
  'MSE': 0.4761181776681589,
  'R2': 0.6293067381713928,
  'MSPE': 2.1339185294656002},
 {'n': 3,
  'k': 20,


Best hyperparameters for KNN is n=1 and k=3.

In [8]:
best_model = KNeighborsRegressor(n_neighbors=3)
best_model.fit(x_train, y_train)
y_pred = best_model.predict(x_test)
print(f"MSE: {mean_squared_error(y_test, y_pred)}")
print(f"R2: {r2_score(y_test, y_pred)}")
print(f"MSPE: {mean_squared_percentage_error(y_test, y_pred)}")

MSE: 0.28680421413975793
R2: 0.792093187599672
MSPE: 1.6900356412577118
