# KNN Regressor, ML-CUP

## Import Libraries

In [None]:
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score, train_test_split
import pandas as pd
import sklearn
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sn

In [None]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn import metrics
from sklearn.metrics import make_scorer,mean_squared_error


### Function to compute the Mean Euclidean Error (MEE)

In [None]:
def mean_euclidean_error(y_true, y_pred):
    """Return the Mean Euclidean Error (MEE) between targets and predictions.

    For every sample the Euclidean (L2) norm of the residual vector is taken
    across the output dimensions; the result is the mean of those norms.

    Args:
        y_true: Array-like of shape (n_samples, n_outputs) or (n_samples,).
        y_pred: Array-like with the same shape as ``y_true``.

    Returns:
        The mean per-sample Euclidean distance.
    """
    # Work on plain arrays: subtracting two pandas objects aligns them on
    # index/column labels, which silently produces NaN rows (later summed as
    # 0) when the labels differ even though the rows match positionally.
    residuals = np.asarray(y_true, dtype=float) - np.asarray(y_pred, dtype=float)
    if residuals.ndim == 1:
        # Single-output targets: one residual per sample.
        residuals = residuals.reshape(-1, 1)
    per_sample = np.sqrt(np.sum(residuals ** 2, axis=1))
    return np.mean(per_sample)

In [None]:
# Wrap MEE as a scikit-learn scorer. MEE is a loss (lower is better), so
# greater_is_better=False makes the scorer report the negated value.
scoring = make_scorer(
    mean_euclidean_error,
    greater_is_better=False,
)

## Read the dataset

In [None]:
def ReadFile(s):
    """Load a comma-separated ML-CUP file into a DataFrame indexed by ``Id``.

    The first 7 lines of the file are skipped; the remaining rows are
    labelled ``Id``, ``i1``..``i10``, ``Y1``..``Y3``.

    Args:
        s: Path (or other source accepted by ``pandas.read_csv``) to read.

    Returns:
        DataFrame whose index is ``Id`` and whose columns are the 10 inputs
        followed by the 3 targets.
    """
    input_names = [f'i{n}' for n in range(1, 11)]
    target_names = [f'Y{n}' for n in range(1, 4)]
    header = ['Id', *input_names, *target_names]
    frame = pd.read_csv(s, sep=",", names=header, skiprows=7)
    return frame.set_index('Id')


In [None]:
# Load the training set; the path is relative to the working directory.
data=ReadFile("Dataset_Cup/ML-CUP23-TR.csv")

In [None]:
# After ReadFile sets 'Id' as index, columns 0-9 are the inputs (i1..i10)
# and columns 10-12 are the targets (Y1..Y3).
n_inputs, n_outputs = 10, 3
featureTrain = data.iloc[:, :n_inputs]
TargetTrain = data.iloc[:, n_inputs:n_inputs + n_outputs]

In [None]:
# Display the input-feature table for a quick visual check.
featureTrain

In [None]:
# Display the target table for a quick visual check.
TargetTrain

In [None]:
# Hold out 20% of the labelled data as an internal test set; the fixed seed
# keeps the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    featureTrain,
    TargetTrain,
    test_size=0.2,
    random_state=42,
)

## Initial test  

In [None]:
# Sweep k = 1..9 and estimate each model's MEE with 5-fold cross-validation.
# The scorer yields negated MEE (greater_is_better=False), so the sign is
# flipped back to get a positive error where lower is better.
k_range = list(range(1, 10))
scores = [
    -np.mean(
        cross_val_score(
            KNeighborsRegressor(n_neighbors=k),
            X_train,
            y_train,
            cv=5,
            scoring=scoring,
        )
    )
    for k in k_range
]

# Plot the cross-validated MEE against k.
ax = sn.lineplot(x=k_range, y=scores, marker='o')
ax.set_title("MEE")
ax.set_xlabel("K Values")
ax.set_ylabel("Score")
plt.show()

In [None]:
# Pick the k with the lowest cross-validated MEE and report both values.
best_idx = int(np.argmin(scores))
best_k = k_range[best_idx]
best_score = scores[best_idx]

print(f"Best K value: {best_k}")
print(f"Corresponding Score: {best_score}")

In [None]:
# Baseline model with a fixed neighbor count.
# NOTE(review): 3 is hard-coded; presumably the best_k printed by the sweep
# above -- confirm, or pass best_k directly.
knn = KNeighborsRegressor(n_neighbors=3)

In [None]:
# Fit the baseline model on the full training split.
knn.fit(X_train,y_train)

In [None]:
# Display the raw predictions on the internal test set (inspection only;
# the result is not stored here).
knn.predict(X_test)

# First result on the internal test set

In [None]:
# Predictions of the baseline model on the internal test set.
y_pred=knn.predict(X_test)

In [None]:
# Internal-test metrics of the baseline model.
test_mse = mean_squared_error(y_test, y_pred)
test_mee = mean_euclidean_error(y_test, y_pred)
print('MSE', test_mse)
print('MEE', test_mee)

# Grid Search to find the best parameters

In [None]:

# Hyperparameter grid for the exhaustive search.
# `algorithm` and `leaf_size` only control how the neighbors are looked up
# (speed and memory); they do not change which neighbors are found. Sweeping
# 4 x 3 values of them multiplied the number of fits by 12 without changing
# the scores beyond floating-point noise. They are pinned to a single value
# each (the first of the previously swept values) so `best_params_` still
# reports the same keys.
params = {
    'n_neighbors': np.arange(1, 30),
    'algorithm': ['auto'],
    'leaf_size': [10],
    'weights': ['uniform', 'distance'],
    'p': np.arange(2, 10),  # Minkowski power parameter (2 = Euclidean)
}
# Hyperparameter tuning with grid-search CV, scored with the MEE scorer.
n_jobs_search = -1  # run the fits on all available processors
knr_search = KNeighborsRegressor()
knr_cv = GridSearchCV(estimator=knr_search, param_grid=params, scoring=scoring,
                      cv=5, verbose=4, n_jobs=n_jobs_search)
knr_cv.fit(X_train, y_train)

In [None]:
# Display the parameter combination selected by the grid search.
knr_cv.best_params_

In [None]:
# Display the best mean cross-validated score. The scorer was built with
# greater_is_better=False, so this value is the negated MEE.
knr_cv.best_score_

# Results of the best model on the training, validation and internal test sets

In [None]:
# Final model with explicitly written hyperparameters.
# NOTE(review): presumably copied from knr_cv.best_params_ -- re-check these
# values whenever the grid search is re-run.
knn = KNeighborsRegressor(
    n_neighbors=4,
    weights='distance',
    algorithm="auto",
    leaf_size=10,
    p=2,
)

In [None]:
# Carve a validation set (20%) out of the training split, with a fixed seed
# for reproducibility.
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)


### Train

In [None]:

# Fit on the reduced training split and score the model on that same data.
# NOTE(review): with weights='distance' each training point is presumably
# its own zero-distance neighbor, so this error is expected to be ~0.
knn.fit(X_tr, y_tr)

y_pred = knn.predict(X_tr)
train_mee = mean_euclidean_error(y_tr, y_pred)
train_mse = mean_squared_error(y_tr, y_pred)
print('MEE  on Train ', train_mee)
print('MSE on Train', train_mse)

### Validation

In [None]:

# Score the fitted model on the validation split.
y_pred = knn.predict(X_val)
val_mee = mean_euclidean_error(y_val, y_pred)
val_mse = mean_squared_error(y_val, y_pred)
print('MEE  on Validation ', val_mee)
print('MSE on Validation', val_mse)

### Test

In [None]:
# Score the fitted model on the internal test split.
y_pred = knn.predict(X_test)
test_mee = mean_euclidean_error(y_test, y_pred)
test_mse = mean_squared_error(y_test, y_pred)
print('MEE  on Test ', test_mee)
print('MSE on Test', test_mse)