In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn import metrics 
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import make_scorer

# Use a 12 pt base font size for every figure drawn in this notebook.
plt.rcParams['font.size'] = 12

## CUP KNN

In [None]:
#importing data
path=r'/home/ludovico/ML-project/data/cup/ML-CUP23-'
# The first 7 lines of the file are skipped and no header row is expected
# (presumably they are comment lines -- confirm against the CSV).
# Every column is read as text here and converted to float right below.
train_set = pd.read_csv(path+'TR.csv',skiprows=7, header=None, delimiter=',', dtype=str)

# Column 0 is left out of the features (presumably a pattern id -- confirm);
# the last three columns are the regression targets.
# The conversion to float64 is done once, BEFORE splitting, so that the
# held-out test split is numeric too and not only the design split.
# NOTE: `input` shadows the Python builtin of the same name; the name is kept
# so that any other cell referring to it keeps working.
input=train_set[train_set.columns[1:-3]].astype(np.float64)
target=train_set[train_set.columns[-3:]].astype(np.float64)

#splitting design set from test set (test set will be used only for the final model assessment)
#the random seed is fixed to use the same design set for all the models
x_train, x_test, y_train, y_test = train_test_split(input, target, test_size=0.2, random_state=0, shuffle=True)

# Mean Euclidean Error (MEE): the metric used to evaluate the models.
def MEE(x, y):
    """Return the mean Euclidean distance between matching rows of x and y.

    Both arguments must be 2-D and have the same shape (one sample per row);
    the L2 norm of every row of ``x - y`` is taken and the norms are averaged.
    """
    residuals = x - y
    row_norms = np.linalg.norm(residuals, ord=2, axis=1)
    return np.mean(row_norms)

### Validation error (MEE) as a function of the hyperparameters, used to choose the best configuration
For each of three distance metrics (L1, cosine and Euclidean) we compare distance-weighted and uniformly weighted (i.e. unweighted) neighbours while varying the number of neighbours K.

In [None]:
# Validation MEE as a function of K: one panel per distance metric and one
# series per neighbour-weighting scheme.
n_neigh=np.arange(1,40,1)
w=['distance', 'uniform']
mis=['l1','cosine','euclidean']

# One row of panels created up front; the spacing is set once instead of on
# every loop iteration.
fig, axes = plt.subplots(1, len(mis), figsize=(12,4))
fig.subplots_adjust(wspace=0)

for weights in w:
    for i,(ax,metric) in enumerate(zip(axes,mis)):
        parameters_KNN = {
            'n_neighbors': n_neigh,
            'weights': [weights],
            'metric': [metric],
        }
        # Train scores are not requested and verbose logging is off: only the
        # validation scores are plotted, and scoring the training folds of
        # every split is expensive for a KNN model.
        grid_search_KNN = GridSearchCV(
            estimator=KNeighborsRegressor(),
            param_grid=parameters_KNN,
            refit=True,
            # seeded splitter: every configuration is evaluated on the same folds
            cv=RepeatedKFold(n_splits=5, n_repeats=10, random_state=0),
            n_jobs=-1,
            # MEE is a loss, so the scorer returns its negative
            scoring=make_scorer(MEE, greater_is_better=False),
        )

        KNN=grid_search_KNN.fit(x_train, y_train)
        cv_results_df = pd.DataFrame(grid_search_KNN.cv_results_)
        # K is read from the results table itself rather than assuming that
        # the rows come back in the same order as n_neigh.
        k_values=cv_results_df['param_n_neighbors'].astype(int).to_numpy()
        error=cv_results_df['mean_test_score'].values
        # standard deviation of the validation score over the CV splits
        error_std=cv_results_df['std_test_score'].values

        if i==0:
            ax.set_ylabel('MEE validation')
        else:
            # panels are adjacent (wspace=0): only the leftmost keeps y ticks
            ax.set_yticks([])
        ax.set_title('metric='+metric)
        # the scores are negated MEE, hence the sign flip; yerr draws the
        # fold-to-fold spread as actual error bars
        ax.errorbar(k_values,-error,yerr=error_std,label='weights='+weights,marker='.',linestyle='')
        ax.set_xlabel('K')
        ax.set_ylim(2.5,6)
        ax.set_xticks([0,10,20,30])
        ax.legend()

plt.show()

## Final grid search for KNN

In [None]:
n_neigh=np.arange(1,20,1)
# `p` is the power of the Minkowski distance and has no effect with the cosine
# metric. Crossing it with 'cosine' would cross-validate every cosine
# configuration once per value of p and could report a meaningless `p` in
# best_params_, so the grid is split into two sub-grids.
parameters_KNN = [
    {
        'n_neighbors': n_neigh,
        'weights': ('uniform','distance'),
        'metric': ('cosine',),
    },
    {
        'n_neighbors': n_neigh,
        'weights': ('uniform','distance'),
        'metric': ('minkowski',),
        'p':[1,2,3,5,7],
    },
]

grid_search_KNN = GridSearchCV(
    estimator=KNeighborsRegressor(),
    param_grid=parameters_KNN,
    # refit the best configuration on the whole design set
    refit=True,
    # seeded splitter: every configuration is evaluated on the same folds
    cv=RepeatedKFold(n_splits=5, n_repeats=5, random_state=0),
    n_jobs=-1,
    # train scores are read back when the selected model is evaluated
    return_train_score = True,
    # MEE is a loss, so the scorer returns its negative
    scoring=make_scorer(MEE, greater_is_better=False),
)
KNN=grid_search_KNN.fit(x_train, y_train)

## Evaluating the model

In [None]:
# Summary of the cross-validation results for the selected configuration.
cv_results_df = pd.DataFrame(KNN.cv_results_)
best_model_index=KNN.best_index_

print('best params',KNN.best_params_) 

# The scorer was built with greater_is_better=False, so the stored scores are
# the NEGATIVE of the MEE: the sign is flipped to report positive losses.
# The standard deviations are unaffected by the sign change.
val_loss=-cv_results_df.loc[best_model_index,'mean_test_score']
val_std=cv_results_df.loc[best_model_index,'std_test_score']
train_loss=-cv_results_df.loc[best_model_index,'mean_train_score']
train_std=cv_results_df.loc[best_model_index,'std_train_score']
print('Train loss:',train_loss,'+/-', train_std)
print('Validation loss:',val_loss,'+/-', val_std)

cv_results_df