In [1]:
# K-Nearest Neighbors Evaluation Notebook
#
# Setup cell: imports the libraries, makes the project pipeline module
# importable, and loads the train/validation/test splits it returns.

# Import standard packages
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error

# Import pipeline
import sys
import os

# Add the parent of the current working directory (the project root, per the
# original note) to the import path. The membership check keeps this cell
# idempotent: re-running it no longer appends a duplicate entry each time.
project_root = os.path.abspath('..')
if project_root not in sys.path:
    sys.path.append(project_root)

import pipeline_ex_KNN
print(pipeline_ex_KNN.__file__)  # ✅ Check that the correct file is loaded
from pipeline_ex_KNN import get_train_val_test_scaled

# Load data: the pipeline returns six objects, unpacked here as the feature
# matrices and targets for the train, validation and test splits.
# NOTE(review): names suggest the features are already scaled — confirm in the pipeline.
X_train_scaled, X_val_scaled, X_test_scaled, y_train, y_val, y_test = get_train_val_test_scaled()


C:\Users\er\ML_group_project\pipeline_ex_KNN.py
✅ After fillna:
NaNs in X_train (before scaling): 0
NaNs in X_val (before scaling): 0
NaNs in X_test (before scaling): 0
NaNs in X_train_scaled: 0
NaNs in X_val_scaled: 0
NaNs in X_test_scaled: 0


  db['odds_hw'] = db[home_win_cols].mean(axis=1)
  db['odds_d']  = db[draw_cols].mean(axis=1)
  db['odds_aw'] = db[away_win_cols].mean(axis=1)


In [2]:
import warnings

import numpy as np


def _align_targets(features, targets, split_name):
    """Flatten `targets` to 1-D and make its length match the rows of `features`.

    Surplus trailing targets are dropped (the original behaviour), but a
    warning is now emitted, because dropping the tail only keeps rows and
    labels paired if the surplus really is at the end. Too few targets cannot
    be repaired by truncation, so that case raises immediately instead of
    failing later inside the model fit.

    Args:
        features: array-like exposing `.shape[0]` as its row count.
        targets: array-like of target values (any shape; flattened here).
        split_name: label used in the warning / error message.

    Returns:
        A 1-D NumPy array with at most `features.shape[0]` entries.

    Raises:
        ValueError: if there are fewer targets than feature rows.
    """
    targets = np.ravel(targets)  # flatten in case they're 2D
    n_rows = features.shape[0]
    n_targets = targets.shape[0]
    if n_targets < n_rows:
        raise ValueError(
            f"{split_name}: {n_targets} targets for {n_rows} feature rows; "
            "cannot align by truncation."
        )
    if n_targets > n_rows:
        warnings.warn(
            f"{split_name}: truncating targets from {n_targets} to {n_rows} "
            "to match the feature matrix; check the pipeline split alignment.",
            stacklevel=2,
        )
        targets = targets[:n_rows]
    return targets


# Flatten the targets and truncate them if needed
y_train = _align_targets(X_train_scaled, y_train, "train")
y_val   = _align_targets(X_val_scaled, y_val, "validation")

# Check if any rows exist before moving on
print("Training set shape:", X_train_scaled.shape)
print("Validation set shape:", X_val_scaled.shape)






Training set shape: (2410, 181)
Validation set shape: (301, 181)


In [3]:
# Count missing values in the train and validation feature matrices;
# both counts should be 0 before any model is fitted.
nan_count_train = np.isnan(X_train_scaled).sum()
nan_count_val = np.isnan(X_val_scaled).sum()

print("NaNs in X_train_scaled:", nan_count_train)
print("NaNs in X_val_scaled:", nan_count_val)


NaNs in X_train_scaled: 0
NaNs in X_val_scaled: 0


## What the NaNs mean:

- During preprocessing, we checked the feature matrices (`X_train_scaled` and `X_val_scaled`) for missing values. Initially, a large number of NaNs were detected, caused by inconsistent one-hot encoding across the train/val/test splits.

To fix this, we:

- Reindexed all one-hot encoded DataFrames to match the training-set columns
- Filled all remaining NaNs with 0 before scaling
- Verified that no missing values remained after preprocessing

With every NaN count now at zero, we have confirmed that the feature matrices contain no missing values and are safe to use for model training and evaluation, which prevents errors or skewed results caused by missing data.

In [4]:
from sklearn.impute import SimpleImputer

# Safety-net imputation: replace any remaining NaN with the per-column mean
# learned from the training split only, then apply those same training means
# to the validation and test splits.

# An empty training set makes every later cell meaningless, so stop here with
# an explicit error instead of printing a message and carrying on.
if X_train_scaled.shape[0] == 0:
    raise ValueError("Error: No samples in training set — cannot fit imputer.")

imputer = SimpleImputer(strategy="mean")
X_train_scaled = imputer.fit_transform(X_train_scaled)
X_val_scaled   = imputer.transform(X_val_scaled)
X_test_scaled  = imputer.transform(X_test_scaled)


In [5]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# Baseline: a KNN regressor with the library's default hyperparameters,
# fitted on the training split only (`fit` returns the estimator itself).
knn = KNeighborsRegressor().fit(X_train_scaled, y_train)

# Predict the validation split and compute the error metrics from it.
y_val_pred = knn.predict(X_val_scaled)
mse, mae, r2 = (
    metric(y_val, y_val_pred)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)
rmse = np.sqrt(mse)  # back on the scale of the target

print(
    f"Validation RMSE: {rmse:.4f}",
    f"Validation MAE:  {mae:.4f}",
    f"Validation R²:   {r2:.4f}",
    sep="\n",
)




Validation RMSE: 1.4748
Validation MAE:  1.1595
Validation R²:   -0.2375


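- The results show that KNN with its default settings does not generalize well: the negative R² means it predicts worse than simply predicting the mean of the validation targets. The model is likely a poor fit as configured (possibly underfitting) or is using an unsuitable value of k.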

## Hyperparameter Tuning

In [6]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsRegressor


In [7]:
# Initial hyperparameter search space for the KNN regressor.
param_grid = {
    'n_neighbors': list(range(3, 12, 2)),  # odd k from 3 through 11
    'weights': ['uniform', 'distance'],
    'p': [1, 2],  # 1 = Manhattan, 2 = Euclidean
}


In [8]:
# Cross-validated search over every combination in `param_grid`,
# run on the training split only. The score is negative MSE, so values
# closer to zero are better.
knn = KNeighborsRegressor()

grid_search = GridSearchCV(
    estimator=knn,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',  # or use 'r2', 'neg_mean_absolute_error'
    cv=5,          # 5-fold cross-validation
    verbose=2,     # log progress for each fit
    n_jobs=-1,     # use all available cores
)

grid_search.fit(X_train_scaled, y_train)


Fitting 5 folds for each of 20 candidates, totalling 100 fits


In [9]:
# Report the winning hyperparameter combination and its mean cross-validated
# score (negative MSE, so closer to zero is better).
best_params = grid_search.best_params_
best_neg_mse = grid_search.best_score_

print("Best parameters:", best_params)
print("Best score (negative MSE):", best_neg_mse)


Best parameters: {'n_neighbors': 11, 'p': 2, 'weights': 'uniform'}
Best score (negative MSE): -1.7562394979596037


In [10]:
# Score the best estimator found by the grid search on the validation split,
# using the same three metrics as the baseline so the numbers are comparable.
best_knn = grid_search.best_estimator_
y_val_pred = best_knn.predict(X_val_scaled)

rmse, mae, r2 = (
    np.sqrt(mean_squared_error(y_val, y_val_pred)),
    mean_absolute_error(y_val, y_val_pred),
    r2_score(y_val, y_val_pred),
)

print(
    f"Validation RMSE: {rmse:.4f}",
    f"Validation MAE:  {mae:.4f}",
    f"Validation R²:   {r2:.4f}",
    sep="\n",
)


Validation RMSE: 1.4388
Validation MAE:  1.1401
Validation R²:   -0.1779


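- With the tuned model, all three metrics improved: R² rose from -0.2375 to -0.1779 (still negative, so still worse than predicting the mean), while RMSE fell from 1.4748 to 1.4388 and MAE from 1.1595 to 1.1401. RMSE and MAE measure how far predictions are from the true values, so lower is better.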

## Optimizing Grid

In [11]:
# Second search space: a wider range of neighbor counts.
param_grid = {
    'n_neighbors': list(range(5, 31, 5)),  # k = 5, 10, ..., 30
    'weights': ['uniform', 'distance'],
    'p': [1, 2],  # Manhattan vs. Euclidean distance
}


In [12]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsRegressor

# Repeat the cross-validated search with the current `param_grid`,
# again fitting on the training split only.
knn = KNeighborsRegressor()

grid_search = GridSearchCV(
    estimator=knn,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',  # we’ll still use RMSE later
    cv=5,          # 5-fold cross-validation
    verbose=2,     # log progress for each fit
    n_jobs=-1,     # use all available cores
)

grid_search.fit(X_train_scaled, y_train)


Fitting 5 folds for each of 24 candidates, totalling 120 fits


In [13]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# Score the best estimator from the most recent grid search on the
# validation split and report it together with its hyperparameters.
best_knn = grid_search.best_estimator_
y_val_pred = best_knn.predict(X_val_scaled)

rmse, mae, r2 = (
    np.sqrt(mean_squared_error(y_val, y_val_pred)),
    mean_absolute_error(y_val, y_val_pred),
    r2_score(y_val, y_val_pred),
)

print("Best params:", grid_search.best_params_)
print(
    f"Validation RMSE: {rmse:.4f}",
    f"Validation MAE:  {mae:.4f}",
    f"Validation R²:   {r2:.4f}",
    sep="\n",
)


Best params: {'n_neighbors': 30, 'p': 2, 'weights': 'uniform'}
Validation RMSE: 1.3789
Validation MAE:  1.0981
Validation R²:   -0.0818


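- Optimizing the grid gave slightly better results than the previous search (RMSE 1.4388 → 1.3789, MAE 1.1401 → 1.0981, R² -0.1779 → -0.0818). R² is still negative, so the model still does not beat predicting the mean. The best `n_neighbors` (30) is the largest value in the grid, so even larger values of k may be worth trying.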