In [1]:
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, r2_score
from utils import gather_data, mean_squared_percentage_error, basis_expansion

# KNN Regressor
The goal of this notebook is to run a KNN regressor and find optimal hyperparameters in respect to number of neighbors and basis expansion.

In [2]:
x_train, x_test, y_train, y_test = gather_data()
kfold = KFold(n_splits=10, shuffle=True)

Index(['Log GDP per capita', 'Social support',
       'Healthy life expectancy at birth', 'Freedom to make life choices',
       'Generosity', 'Perceptions of corruption', 'Positive affect',
       'Negative affect', 'Confidence in national government'],
      dtype='object')


In [3]:
results = []
for n in range(1, 11):
    x_train_expanded = basis_expansion(x_train, n)
    for k in [3, 5, 10, 15, 20]:
        result = {
            "n": n,
            "k": k,
            "MSE": [],
            "R2": [],
            "MSPE": []
        }
        for train_index, val_index in kfold.split(x_train_expanded, y_train):
            model = KNeighborsRegressor(n_neighbors=k)
            model.fit(x_train_expanded[train_index], y_train[train_index])
            y_pred = model.predict(x_train_expanded[val_index])
            result["MSE"].append(mean_squared_error(y_train[val_index], y_pred))
            result["R2"].append(r2_score(y_train[val_index], y_pred))
            result["MSPE"].append(mean_squared_percentage_error(y_train[val_index], y_pred))
        result["MSE"] = np.mean(result["MSE"])
        result["R2"] = np.mean(result["R2"])
        result["MSPE"] = np.mean(result["MSPE"])
        results.append(result)

In [4]:
results

[{'n': 1,
  'k': 3,
  'MSE': 0.20478196551417635,
  'R2': 0.8389690911066454,
  'MSPE': 1.0083780191284943},
 {'n': 1,
  'k': 5,
  'MSE': 0.21129766461222496,
  'R2': 0.8349802899880718,
  'MSPE': 1.054732304413414},
 {'n': 1,
  'k': 10,
  'MSE': 0.22198933234181556,
  'R2': 0.8239313052655,
  'MSPE': 1.1117174019606335},
 {'n': 1,
  'k': 15,
  'MSE': 0.23554450552020093,
  'R2': 0.814204797463189,
  'MSPE': 1.1990715637433835},
 {'n': 1,
  'k': 20,
  'MSE': 0.25138355435031706,
  'R2': 0.8030841600374569,
  'MSPE': 1.292430362158249},
 {'n': 2,
  'k': 3,
  'MSE': 0.19897762105869407,
  'R2': 0.8445967326168546,
  'MSPE': 0.994148253088715},
 {'n': 2,
  'k': 5,
  'MSE': 0.20133559825350714,
  'R2': 0.8418307757095093,
  'MSPE': 1.0318519840074114},
 {'n': 2,
  'k': 10,
  'MSE': 0.212953547856337,
  'R2': 0.8324390060144184,
  'MSPE': 1.0889491294679663},
 {'n': 2,
  'k': 15,
  'MSE': 0.23139469838686777,
  'R2': 0.8190214944583998,
  'MSPE': 1.1931079051935645},
 {'n': 2,
  'k': 20,
  

In [5]:
# Sort by lowest MSE
sorted(results, key=lambda x: x["MSE"])

[{'n': 3,
  'k': 3,
  'MSE': 0.19419549937742892,
  'R2': 0.8475722373502077,
  'MSPE': 0.9699415926783278},
 {'n': 4,
  'k': 3,
  'MSE': 0.19796727631069022,
  'R2': 0.8450869369327174,
  'MSPE': 1.0014495436300215},
 {'n': 3,
  'k': 5,
  'MSE': 0.1984642642501284,
  'R2': 0.843401525031533,
  'MSPE': 1.0306839351451906},
 {'n': 2,
  'k': 3,
  'MSE': 0.19897762105869407,
  'R2': 0.8445967326168546,
  'MSPE': 0.994148253088715},
 {'n': 6,
  'k': 3,
  'MSE': 0.20094497148806165,
  'R2': 0.8420810401116071,
  'MSPE': 1.033103453947096},
 {'n': 2,
  'k': 5,
  'MSE': 0.20133559825350714,
  'R2': 0.8418307757095093,
  'MSPE': 1.0318519840074114},
 {'n': 4,
  'k': 5,
  'MSE': 0.20162419399640114,
  'R2': 0.84187527545979,
  'MSPE': 1.0337565940709184},
 {'n': 8,
  'k': 3,
  'MSE': 0.2026969304174604,
  'R2': 0.8402292407271166,
  'MSPE': 0.9942914838455292},
 {'n': 1,
  'k': 3,
  'MSE': 0.20478196551417635,
  'R2': 0.8389690911066454,
  'MSPE': 1.0083780191284943},
 {'n': 5,
  'k': 3,
  'MSE

In [6]:
# Sort by highest R2
sorted(results, key=lambda x: x["R2"], reverse=True)

[{'n': 3,
  'k': 3,
  'MSE': 0.19419549937742892,
  'R2': 0.8475722373502077,
  'MSPE': 0.9699415926783278},
 {'n': 4,
  'k': 3,
  'MSE': 0.19796727631069022,
  'R2': 0.8450869369327174,
  'MSPE': 1.0014495436300215},
 {'n': 2,
  'k': 3,
  'MSE': 0.19897762105869407,
  'R2': 0.8445967326168546,
  'MSPE': 0.994148253088715},
 {'n': 3,
  'k': 5,
  'MSE': 0.1984642642501284,
  'R2': 0.843401525031533,
  'MSPE': 1.0306839351451906},
 {'n': 6,
  'k': 3,
  'MSE': 0.20094497148806165,
  'R2': 0.8420810401116071,
  'MSPE': 1.033103453947096},
 {'n': 4,
  'k': 5,
  'MSE': 0.20162419399640114,
  'R2': 0.84187527545979,
  'MSPE': 1.0337565940709184},
 {'n': 2,
  'k': 5,
  'MSE': 0.20133559825350714,
  'R2': 0.8418307757095093,
  'MSPE': 1.0318519840074114},
 {'n': 8,
  'k': 3,
  'MSE': 0.2026969304174604,
  'R2': 0.8402292407271166,
  'MSPE': 0.9942914838455292},
 {'n': 1,
  'k': 3,
  'MSE': 0.20478196551417635,
  'R2': 0.8389690911066454,
  'MSPE': 1.0083780191284943},
 {'n': 5,
  'k': 3,
  'MSE

Best hyperparameters for KNN is n=3 and k=3.

In [8]:
x_train_expanded = basis_expansion(x_train, 3)
x_test_expanded = basis_expansion(x_test, 3)
best_model = KNeighborsRegressor(n_neighbors=3)
best_model.fit(x_train_expanded, y_train)
y_pred_train = best_model.predict(x_train_expanded)
y_pred_test = best_model.predict(x_test_expanded)
print(f"Train MSE: {mean_squared_error(y_train, y_pred_train)}")
print(f"Train R2: {r2_score(y_train, y_pred_train)}")
print(f"Train MSPE: {mean_squared_percentage_error(y_train, y_pred_train)}")
print(f"Test MSE: {mean_squared_error(y_test, y_pred_test)}")
print(f"Test R2: {r2_score(y_test, y_pred_test)}")
print(f"Test MSPE: {mean_squared_percentage_error(y_test, y_pred_test)}")

Train MSE: 0.08988862399822883
Train R2: 0.930100351362769
Train MSPE: 0.44197032904742894
Test MSE: 0.14554669066466192
Test R2: 0.8955269488640698
Test MSPE: 0.7365798333801727
