In [16]:
# K-Nearest Neighbors Evaluation Notebook

# Import standard packages
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error

# Import pipeline
import sys
import os
sys.path.append(os.path.abspath('..'))  # add project root to path
import pipeline_ex_KNN
print(pipeline_ex_KNN.__file__)  # the correct file is loaded
from pipeline_ex_KNN import get_train_val_test_scaled



# Fetch the pre-scaled feature splits and their matching target splits from the pipeline
(
    X_train_scaled, X_val_scaled, X_test_scaled,
    y_train_multi, y_val_multi, y_test_multi,
) = get_train_val_test_scaled()

# Coerce every target split to a NumPy array so it can be sliced by column later on
y_train_multi, y_val_multi, y_test_multi = (
    np.array(targets) for targets in (y_train_multi, y_val_multi, y_test_multi)
)

C:\Users\er\ML_group_project\pipeline_ex_KNN.py
After fillna:
NaNs in X_train (before scaling): 0
NaNs in X_val (before scaling): 0
NaNs in X_test (before scaling): 0
NaNs in X_train_scaled: 0
NaNs in X_val_scaled: 0
NaNs in X_test_scaled: 0


  db['odds_hw'] = db[home_win_cols].mean(axis=1)
  db['odds_d']  = db[draw_cols].mean(axis=1)
  db['odds_aw'] = db[away_win_cols].mean(axis=1)


In [17]:
import warnings

import numpy as np


def _truncate_targets_to_features(X, y, split_name):
    """Return `y` with the same number of rows as `X`.

    Equal row counts: `y` is returned unchanged.
    More target rows than feature rows: the surplus trailing rows are dropped
    (the notebook's original behaviour), but a warning is emitted because a
    mismatch usually means the pipeline dropped or reordered rows.
    Fewer target rows than feature rows: truncation cannot repair this, so a
    ValueError is raised here instead of failing later inside `fit`.
    """
    n_feature_rows = X.shape[0]
    n_target_rows = y.shape[0]
    if n_target_rows == n_feature_rows:
        return y
    if n_target_rows < n_feature_rows:
        raise ValueError(
            f"{split_name}: only {n_target_rows} target rows for "
            f"{n_feature_rows} feature rows; targets cannot be aligned."
        )
    warnings.warn(
        f"{split_name}: truncating targets from {n_target_rows} to "
        f"{n_feature_rows} rows to match the features; check the pipeline "
        "for a row misalignment.",
        stacklevel=2,
    )
    return y[:n_feature_rows]


y_train_multi = np.array(y_train_multi)
y_val_multi = np.array(y_val_multi)

# Truncate y if needed (now reported loudly instead of silently)
y_train_multi = _truncate_targets_to_features(X_train_scaled, y_train_multi, "train")
y_val_multi = _truncate_targets_to_features(X_val_scaled, y_val_multi, "validation")

# Show the shapes so an empty or mismatched split is visible before moving on
print("Training set shape:", X_train_scaled.shape)
print("Validation set shape:", X_val_scaled.shape)
print("y_train_multi shape:", y_train_multi.shape)
print("y_val_multi shape:", y_val_multi.shape)







Training set shape: (2410, 181)
Validation set shape: (301, 181)
y_train_multi shape: (2410, 2)
y_val_multi shape: (301, 2)


In [18]:
# Count missing values in the train and validation feature matrices
for _split_name, _features in (
    ("X_train_scaled", X_train_scaled),
    ("X_val_scaled", X_val_scaled),
):
    print(f"NaNs in {_split_name}:", np.isnan(_features).sum())


NaNs in X_train_scaled: 0
NaNs in X_val_scaled: 0


## What the NaNs mean:

- During preprocessing, we checked for missing values in the feature matrices (X_train_scaled and X_val_scaled). Initially, a large number of NaNs were detected due to inconsistent one-hot encoding across train/val/test splits.

To fix this, we:

- Ensured all one-hot encoded DataFrames were reindexed to match the training set columns

- Filled all remaining NaNs with 0 before scaling

- Verified that no missing values remained after preprocessing

With the NaN counts now at zero, we have confirmed that our dataset is clean and safe to use for model training and evaluation, preventing any errors or skewed results from missing data.

In [19]:
from sklearn.impute import SimpleImputer

# An imputer cannot be fitted on zero rows. Stop here with a clear error
# instead of printing a message and letting later cells fail on unimputed data.
if X_train_scaled.shape[0] == 0:
    raise ValueError("Error: No samples in training set — cannot fit imputer.")

# Column means are learned from the training split only and then reused for
# the validation and test splits.
imputer = SimpleImputer(strategy="mean")
X_train_scaled = imputer.fit_transform(X_train_scaled)
X_val_scaled   = imputer.transform(X_val_scaled)
X_test_scaled  = imputer.transform(X_test_scaled)


In [20]:
from sklearn.multioutput import MultiOutputRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# Baseline model: MultiOutputRegressor fits one KNN regressor per target column.
# k=30 is just a starting point to try; uniform weights, Euclidean distance (p=2).
multi_knn = MultiOutputRegressor(
    KNeighborsRegressor(n_neighbors=30, weights='uniform', p=2)
)

# Fit on both targets at once, then predict the validation split
multi_knn.fit(X_train_scaled, y_train_multi)
y_val_pred_multi = multi_knn.predict(X_val_scaled)

# Column 0 is treated as FTHG and column 1 as FTAG
y_val_fthg, y_val_ftag = y_val_multi[:, 0], y_val_multi[:, 1]
y_pred_fthg, y_pred_ftag = y_val_pred_multi[:, 0], y_val_pred_multi[:, 1]

# Report RMSE / MAE / R² for each target
for _title, _actual, _predicted in (
    ("FTHG Validation Metrics:", y_val_fthg, y_pred_fthg),
    ("\nFTAG Validation Metrics:", y_val_ftag, y_pred_ftag),
):
    print(_title)
    print(f"RMSE: {np.sqrt(mean_squared_error(_actual, _predicted)):.4f}")
    print(f"MAE:  {mean_absolute_error(_actual, _predicted):.4f}")
    print(f"R²:   {r2_score(_actual, _predicted):.4f}")





FTHG Validation Metrics:
RMSE: 1.1789
MAE:  0.9474
R²:   0.2357

FTAG Validation Metrics:
RMSE: 1.1497
MAE:  0.8744
R²:   0.1801


- The results show that KNN in its basic configuration is far from optimal and does not generalize well; it is also likely underfitting or using a poorly chosen value of k.

## Hyperparameter Tuning

In [21]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.multioutput import MultiOutputRegressor

# Estimator to tune: a default KNN regressor wrapped for multi-target output
model = MultiOutputRegressor(KNeighborsRegressor())

# The estimator__ prefix routes each setting to the wrapped KNN regressor
param_grid = dict(
    estimator__n_neighbors=[3, 5, 7, 9, 11],
    estimator__weights=['uniform', 'distance'],
    estimator__p=[1, 2],  # 1 = Manhattan, 2 = Euclidean
)



In [22]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.multioutput import MultiOutputRegressor

# Fresh, unfitted multi-target KNN to be tuned (rebinds `multi_knn`)
multi_knn = MultiOutputRegressor(KNeighborsRegressor())

# Build the grid; the estimator__ prefix addresses the wrapped KNN's parameters
param_grid = {
    f"estimator__{_name}": _values
    for _name, _values in (
        ("n_neighbors", [3, 5, 7, 9, 11]),
        ("weights", ['uniform', 'distance']),
        ("p", [1, 2]),
    )
}

# 5-fold cross-validated search scored by negated MSE, using all CPU cores
grid_search = GridSearchCV(
    estimator=multi_knn,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
    verbose=2,
)

grid_search.fit(X_train_scaled, y_train_multi)



Fitting 5 folds for each of 20 candidates, totalling 100 fits


In [23]:
# The search maximises negated MSE, so flip the sign to report a positive MSE
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best MSE (positive): {-grid_search.best_score_}")



Best parameters: {'estimator__n_neighbors': 11, 'estimator__p': 1, 'estimator__weights': 'distance'}
Best MSE (positive): 1.193260056519291


In [24]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# Take the estimator selected by the grid search and predict the validation split
best_knn = grid_search.best_estimator_
y_val_pred = best_knn.predict(X_val_scaled)

# Column 0 is treated as FTHG and column 1 as FTAG, for targets and predictions alike
y_val_fthg, y_pred_fthg = y_val_multi[:, 0], y_val_pred[:, 0]
y_val_ftag, y_pred_ftag = y_val_multi[:, 1], y_val_pred[:, 1]

# Report RMSE / MAE / R² for each target
for _title, _actual, _predicted in (
    ("FTHG Validation Metrics:", y_val_fthg, y_pred_fthg),
    ("\nFTAG Validation Metrics:", y_val_ftag, y_pred_ftag),
):
    print(_title)
    print(f"RMSE: {np.sqrt(mean_squared_error(_actual, _predicted)):.4f}")
    print(f"MAE:  {mean_absolute_error(_actual, _predicted):.4f}")
    print(f"R²:   {r2_score(_actual, _predicted):.4f}")



FTHG Validation Metrics:
RMSE: 1.1440
MAE:  0.9077
R²:   0.2802

FTAG Validation Metrics:
RMSE: 1.1092
MAE:  0.8611
R²:   0.2367


- Based on the new values, R² increased and both RMSE and MAE decreased (RMSE and MAE measure how far predictions are from the true values, so lower is better).

In [25]:
# Bind the estimator selected by the grid search; this repeats the assignment
# already made in the validation-metrics cell above.
best_knn = grid_search.best_estimator_



In [26]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# Predict the held-out test split with the tuned model
y_test_pred = best_knn.predict(X_test_scaled)

# Column 0 is treated as FTHG and column 1 as FTAG.
# Note: y_pred_fthg / y_pred_ftag now hold test predictions, replacing the
# validation predictions stored under the same names earlier.
y_test_fthg, y_pred_fthg = y_test_multi[:, 0], y_test_pred[:, 0]
y_test_ftag, y_pred_ftag = y_test_multi[:, 1], y_test_pred[:, 1]

# Report RMSE / MAE / R² for each target
for _title, _actual, _predicted in (
    ("FTHG Test Metrics:", y_test_fthg, y_pred_fthg),
    ("\nFTAG Test Metrics:", y_test_ftag, y_pred_ftag),
):
    print(_title)
    print(f"RMSE: {np.sqrt(mean_squared_error(_actual, _predicted)):.4f}")
    print(f"MAE:  {mean_absolute_error(_actual, _predicted):.4f}")
    print(f"R²:   {r2_score(_actual, _predicted):.4f}")



FTHG Test Metrics:
RMSE: 1.0987
MAE:  0.8617
R²:   0.2959

FTAG Test Metrics:
RMSE: 1.0593
MAE:  0.8190
R²:   0.2303
