In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics 
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.preprocessing import OneHotEncoder

## MONK 1

In [None]:
# Common prefix of the MONK-1 files; '.train' / '.test' is appended below.
# NOTE(review): absolute, machine-specific path - consider a configurable data directory.
path=r'/home/ludovico/ML-project/data/monks-1'

# Import training and test set. Every field is read as a string.
# sep=r'\s+' is the non-deprecated equivalent of delim_whitespace=True.
monk1_train = pd.read_csv(path+'.train', header=None, sep=r'\s+', dtype=str)
monk1_test = pd.read_csv(path+'.test', header=None, sep=r'\s+', dtype=str)

# Separate input from target: column 0 is the target, the columns between
# the first and the last one are the inputs (the last column is not used).
feature_columns = monk1_train.columns[1:-1]
y_train=monk1_train[0]
x_train=monk1_train[feature_columns]
y_test=monk1_test[0]
x_test=monk1_test[feature_columns]

# Non encoded data: keep references to the raw inputs before x_train/x_test
# are rebound to their one-hot encoded versions.
x_train_not_enc=x_train
x_test_not_enc=x_test

# Encoded data: the encoder is fitted on the training inputs ONLY and then
# applied to the test inputs, so both matrices share the column layout learned
# from the training set and nothing is learned from the test set.
# handle_unknown='ignore' encodes a category never seen in training as all
# zeros instead of raising.
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
x_train = encoder.fit_transform(x_train_not_enc)
x_test = encoder.transform(x_test_not_enc)

## Model selection for MONK 1

We run two independent grid searches over the KNN hyperparameters: one on the one-hot-encoded data and one on the non-encoded data.

In [None]:
estimator_KNN = KNeighborsClassifier(algorithm='auto')
n_neigh=np.arange(1,50,1)
weight_schemes = ['uniform','distance']

# 'p' is the power of the Minkowski metric and has no effect on the cosine
# metric, so it is only varied in the Minkowski sub-grid. Crossing it with
# 'cosine' would cross-validate every cosine configuration five times.
parameters_KNN = [
    {
        'metric': ['cosine'],
        'n_neighbors': n_neigh,
        'weights': weight_schemes,
    },
    {
        'metric': ['minkowski'],
        'n_neighbors': n_neigh,
        'p': [1,2,3,5,7],
        'weights': weight_schemes,
    },
]

# Settings shared by the two searches (encoded / not encoded data).
grid_search_settings = dict(
    estimator=estimator_KNN,
    param_grid=parameters_KNN,
    scoring='accuracy',
    n_jobs=-1,
    return_train_score=True,
    refit=True,
)

# Each search gets its own splitter, built with the same seed, so both are
# evaluated with identically configured repeated stratified 5-fold CV.
grid_search_KNN = GridSearchCV(
    cv=RepeatedStratifiedKFold(n_splits=5, n_repeats=5, random_state=0),
    **grid_search_settings,
)

grid_search_KNN_2 = GridSearchCV(
    cv=RepeatedStratifiedKFold(n_splits=5, n_repeats=5, random_state=0),
    **grid_search_settings,
)

### Fitting and evaluating the KNN model

In [None]:
# Run the two searches on the same targets: encoded inputs first,
# then the non encoded ones.
KNN_1 = grid_search_KNN.fit(x_train, y_train)
KNN_2 = grid_search_KNN_2.fit(x_train_not_enc, y_train)

print('encoded',KNN_1.best_params_)
print('not encoded',KNN_2.best_params_)

best1, best2 = KNN_1.best_score_, KNN_2.best_score_

print('Best Score Encoded - Validation:', best1 )
print('Best Score not Encoded- Validation:', best2 )

# Keep the search with the strictly higher validation score; on a tie the
# non encoded search wins, and x_test is rebound to the matching raw inputs.
if not best1 > best2:
    KNN = KNN_2
    x_test = x_test_not_enc
else:
    KNN = KNN_1

# Winning model: read its row of the cross-validation results table.
cv_results_df = pd.DataFrame(KNN.cv_results_)
best_model_index = KNN.best_index_

val_acc, val_std, train_acc, train_std = (
    cv_results_df.at[best_model_index, column]
    for column in (
        'mean_test_score',
        'std_test_score',
        'mean_train_score',
        'std_train_score',
    )
)

print('Train accuracy:',train_acc,'+/-', train_std)
print('validation accuracy:',val_acc,'+/-', val_std)

## Model assessment

In [None]:
# Predict on the test inputs with the selected grid search (refit=True, so
# predict() uses the best configuration).
y_pred_KNN = KNN.predict(x_test)

# The printed value is an accuracy (higher is better), so it is labelled as
# such rather than as an error.
print('Accuracy Score - KNN - Test accuracy:', metrics.accuracy_score(y_test, y_pred_KNN))

# Start the report on its own line: printing it right after the heading
# shifts the header row and misaligns the table columns.
print('\nClassification report:\n' + metrics.classification_report(y_test, y_pred_KNN))

# Confusion matrix with rows/columns in the order of the classifier's classes.
cm = metrics.confusion_matrix(y_test, y_pred_KNN, labels=KNN.classes_)
disp = metrics.ConfusionMatrixDisplay(confusion_matrix=cm,
                               display_labels=KNN.classes_)
disp.plot()
disp.ax_.set_title('KNN - MONK 1 - test set confusion matrix')
plt.show()

## MONK 2

In [None]:
# Common prefix of the MONK-2 files; '.train' / '.test' is appended below.
# NOTE(review): absolute, machine-specific path - consider a configurable data directory.
path=r'/home/ludovico/ML-project/data/monks-2'

# Import training and test set. Every field is read as a string.
# sep=r'\s+' is the non-deprecated equivalent of delim_whitespace=True.
monk2_train = pd.read_csv(path+'.train', header=None, sep=r'\s+', dtype=str)
monk2_test = pd.read_csv(path+'.test', header=None, sep=r'\s+', dtype=str)

# Separate input from target: column 0 is the target, the columns between
# the first and the last one are the inputs (the last column is not used).
feature_columns = monk2_train.columns[1:-1]
y_train=monk2_train[0]
x_train=monk2_train[feature_columns]
y_test=monk2_test[0]
x_test=monk2_test[feature_columns]

# Non encoded data: keep references to the raw inputs before x_train/x_test
# are rebound to their one-hot encoded versions.
x_train_not_enc=x_train
x_test_not_enc=x_test

# Encoded data: the encoder is fitted on the training inputs ONLY and then
# applied to the test inputs, so both matrices share the column layout learned
# from the training set and nothing is learned from the test set.
# handle_unknown='ignore' encodes a category never seen in training as all
# zeros instead of raising.
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
x_train = encoder.fit_transform(x_train_not_enc)
x_test = encoder.transform(x_test_not_enc)

## Model selection for MONK 2

In [None]:
estimator_KNN = KNeighborsClassifier(algorithm='auto')
n_neigh=np.arange(1,50,1)
weight_schemes = ['uniform','distance']

# 'p' is the power of the Minkowski metric and has no effect on the cosine
# metric, so it is only varied in the Minkowski sub-grid. Crossing it with
# 'cosine' would cross-validate every cosine configuration five times.
parameters_KNN = [
    {
        'metric': ['cosine'],
        'n_neighbors': n_neigh,
        'weights': weight_schemes,
    },
    {
        'metric': ['minkowski'],
        'n_neighbors': n_neigh,
        'p': [1,2,3,5,7],
        'weights': weight_schemes,
    },
]

# Settings shared by the two searches (encoded / not encoded data).
grid_search_settings = dict(
    estimator=estimator_KNN,
    param_grid=parameters_KNN,
    scoring='accuracy',
    n_jobs=-1,
    return_train_score=True,
    refit=True,
)

# Each search gets its own splitter, built with the same seed, so both are
# evaluated with identically configured repeated stratified 5-fold CV.
grid_search_KNN = GridSearchCV(
    cv=RepeatedStratifiedKFold(n_splits=5, n_repeats=5, random_state=0),
    **grid_search_settings,
)

grid_search_KNN_2 = GridSearchCV(
    cv=RepeatedStratifiedKFold(n_splits=5, n_repeats=5, random_state=0),
    **grid_search_settings,
)

In [None]:
# Run the two searches on the same targets: encoded inputs first,
# then the non encoded ones.
KNN_1 = grid_search_KNN.fit(x_train, y_train)
KNN_2 = grid_search_KNN_2.fit(x_train_not_enc, y_train)

print('encoded',KNN_1.best_params_)
print('not encoded',KNN_2.best_params_)

best1, best2 = KNN_1.best_score_, KNN_2.best_score_

print('Best Score Encoded - Validation:', best1 )
print('Best Score not Encoded- Validation:', best2 )

# Keep the search with the strictly higher validation score; on a tie the
# non encoded search wins, and x_test is rebound to the matching raw inputs.
if not best1 > best2:
    KNN = KNN_2
    x_test = x_test_not_enc
else:
    KNN = KNN_1

# Winning model: read its row of the cross-validation results table.
cv_results_df = pd.DataFrame(KNN.cv_results_)
best_model_index = KNN.best_index_

val_acc, val_std, train_acc, train_std = (
    cv_results_df.at[best_model_index, column]
    for column in (
        'mean_test_score',
        'std_test_score',
        'mean_train_score',
        'std_train_score',
    )
)

print('Train accuracy:',train_acc,'+/-', train_std)
print('validation accuracy:',val_acc,'+/-', val_std)

## Model assessment

In [None]:
# Predict on the test inputs with the selected grid search (refit=True, so
# predict() uses the best configuration).
y_pred_KNN = KNN.predict(x_test)

# The printed value is an accuracy (higher is better), so it is labelled as
# such rather than as an error.
print('Accuracy Score - KNN - Test accuracy:', metrics.accuracy_score(y_test, y_pred_KNN))

# Start the report on its own line: printing it right after the heading
# shifts the header row and misaligns the table columns.
print('\nClassification report:\n' + metrics.classification_report(y_test, y_pred_KNN))

# Confusion matrix with rows/columns in the order of the classifier's classes.
cm = metrics.confusion_matrix(y_test, y_pred_KNN, labels=KNN.classes_)
disp = metrics.ConfusionMatrixDisplay(confusion_matrix=cm,
                               display_labels=KNN.classes_)
disp.plot()
disp.ax_.set_title('KNN - MONK 2 - test set confusion matrix')
plt.show()

## MONK 3

In [None]:
# Common prefix of the MONK-3 files; '.train' / '.test' is appended below.
# NOTE(review): absolute, machine-specific path - consider a configurable data directory.
path=r'/home/ludovico/ML-project/data/monks-3'

# Import training and test set. Every field is read as a string.
# sep=r'\s+' is the non-deprecated equivalent of delim_whitespace=True.
monk3_train = pd.read_csv(path+'.train', header=None, sep=r'\s+', dtype=str)
monk3_test = pd.read_csv(path+'.test', header=None, sep=r'\s+', dtype=str)

# Separate input from target: column 0 is the target, the columns between
# the first and the last one are the inputs (the last column is not used).
feature_columns = monk3_train.columns[1:-1]
y_train=monk3_train[0]
x_train=monk3_train[feature_columns]
y_test=monk3_test[0]
x_test=monk3_test[feature_columns]

# Non encoded data: keep references to the raw inputs before x_train/x_test
# are rebound to their one-hot encoded versions.
x_train_not_enc=x_train
x_test_not_enc=x_test

# Encoded data: the encoder is fitted on the training inputs ONLY and then
# applied to the test inputs, so both matrices share the column layout learned
# from the training set and nothing is learned from the test set.
# handle_unknown='ignore' encodes a category never seen in training as all
# zeros instead of raising.
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
x_train = encoder.fit_transform(x_train_not_enc)
x_test = encoder.transform(x_test_not_enc)

## Model selection for MONK 3

In [None]:
estimator_KNN = KNeighborsClassifier(algorithm='auto')
n_neigh=np.arange(1,50,1)
weight_schemes = ['uniform','distance']

# 'p' is the power of the Minkowski metric and has no effect on the cosine
# metric, so it is only varied in the Minkowski sub-grid. Crossing it with
# 'cosine' would cross-validate every cosine configuration five times.
parameters_KNN = [
    {
        'metric': ['cosine'],
        'n_neighbors': n_neigh,
        'weights': weight_schemes,
    },
    {
        'metric': ['minkowski'],
        'n_neighbors': n_neigh,
        'p': [1,2,3,5,7],
        'weights': weight_schemes,
    },
]

# Settings shared by the two searches (encoded / not encoded data).
grid_search_settings = dict(
    estimator=estimator_KNN,
    param_grid=parameters_KNN,
    scoring='accuracy',
    n_jobs=-1,
    return_train_score=True,
    refit=True,
)

# Each search gets its own splitter, built with the same seed, so both are
# evaluated with identically configured repeated stratified 5-fold CV.
grid_search_KNN = GridSearchCV(
    cv=RepeatedStratifiedKFold(n_splits=5, n_repeats=5, random_state=0),
    **grid_search_settings,
)

grid_search_KNN_2 = GridSearchCV(
    cv=RepeatedStratifiedKFold(n_splits=5, n_repeats=5, random_state=0),
    **grid_search_settings,
)

In [None]:
# Run the two searches on the same targets: encoded inputs first,
# then the non encoded ones.
KNN_1 = grid_search_KNN.fit(x_train, y_train)
KNN_2 = grid_search_KNN_2.fit(x_train_not_enc, y_train)

print('encoded',KNN_1.best_params_)
print('not encoded',KNN_2.best_params_)

best1, best2 = KNN_1.best_score_, KNN_2.best_score_

print('Best Score Encoded - Validation:', best1 )
print('Best Score not Encoded- Validation:', best2 )

# Keep the search with the strictly higher validation score; on a tie the
# non encoded search wins, and x_test is rebound to the matching raw inputs.
if not best1 > best2:
    KNN = KNN_2
    x_test = x_test_not_enc
else:
    KNN = KNN_1

# Winning model: read its row of the cross-validation results table.
cv_results_df = pd.DataFrame(KNN.cv_results_)
best_model_index = KNN.best_index_

val_acc, val_std, train_acc, train_std = (
    cv_results_df.at[best_model_index, column]
    for column in (
        'mean_test_score',
        'std_test_score',
        'mean_train_score',
        'std_train_score',
    )
)

print('Train accuracy:',train_acc,'+/-', train_std)
print('validation accuracy:',val_acc,'+/-', val_std)

## Model assessment

In [None]:
# Predict on the test inputs with the selected grid search (refit=True, so
# predict() uses the best configuration).
y_pred_KNN = KNN.predict(x_test)

# The printed value is an accuracy (higher is better), so it is labelled as
# such rather than as an error.
print('Accuracy Score - KNN - Test accuracy:', metrics.accuracy_score(y_test, y_pred_KNN))

# Start the report on its own line: printing it right after the heading
# shifts the header row and misaligns the table columns.
print('\nClassification report:\n' + metrics.classification_report(y_test, y_pred_KNN))

# Confusion matrix with rows/columns in the order of the classifier's classes.
cm = metrics.confusion_matrix(y_test, y_pred_KNN, labels=KNN.classes_)
disp = metrics.ConfusionMatrixDisplay(confusion_matrix=cm,
                               display_labels=KNN.classes_)
disp.plot()
disp.ax_.set_title('KNN - MONK 3 - test set confusion matrix')
plt.show()