In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV

from sklearn import metrics 
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.svm import SVC
from sklearn.preprocessing import OneHotEncoder

## Monk 2

In [None]:
# Load the MONK-2 train/test splits as strings so every attribute is treated
# as a categorical value rather than a number.
path=r'data/monks-2'
# sep=r'\s+' is the documented replacement for the deprecated delim_whitespace=True.
monk2_train = pd.read_csv(path+'.train', header=None, sep=r'\s+', dtype=str)
monk2_test = pd.read_csv(path+'.test', header=None, sep=r'\s+', dtype=str)

# Column 0 is used as the target; the columns in between are the features and
# the last column is dropped.
# NOTE(review): presumably the last column is the sample identifier of the
# MONK's file format -- confirm against the dataset description.
feature_columns = monk2_train.columns[1:-1]

y_train = monk2_train[0]
x_train_not_enc = monk2_train[feature_columns]

y_test = monk2_test[0]
x_test_not_enc = monk2_test[feature_columns]

# One-hot encode the features. The encoder is fitted on the training data only
# and the learned mapping is reused for the test data, so both matrices share
# the same column layout and nothing is learned from the test set.
encoder = OneHotEncoder(sparse_output=False)
x_train = encoder.fit_transform(x_train_not_enc)
x_test = encoder.transform(x_test_not_enc)

# Accumulators filled by the per-kernel evaluation cells: the winning grid
# search of each kernel and whether it used the encoded data ('Yes'/'No').
best_model_kernel = []
enc = []




## Two independent grid searches, one on one-hot encoded data and one on non-encoded data, with the polynomial kernel

In [None]:
# Base estimator shared by both polynomial-kernel searches.
estimator_SVC = SVC()

# Range explored for each hyperparameter.
C = np.logspace(-3, 3, 4)
gamma = np.logspace(-3, 3, 6)
coef = np.linspace(-10, 10, 5)
deg = np.arange(2, 5, 1)

parameters_SVM_poly = {
    'C': C,
    'kernel': ['poly'],
    'gamma': gamma,
    'coef0': coef,
    'degree': deg,
    'max_iter': [100000],
}

# Grid search for the one-hot encoded data: stratified 5-fold CV, 5 repeats.
grid_search = GridSearchCV(
    estimator=estimator_SVC,
    param_grid=parameters_SVM_poly,
    scoring='accuracy',
    cv=RepeatedStratifiedKFold(n_splits=5, n_repeats=5, random_state=0),
    n_jobs=-1,
    return_train_score=True,
    refit=True,
    verbose=4
)

# Grid search for the non-encoded data: same grid and same CV protocol
# (identical random_state, so both searches see the same splits).
grid_search_2 = GridSearchCV(
    estimator=estimator_SVC,
    param_grid=parameters_SVM_poly,
    scoring='accuracy',
    cv=RepeatedStratifiedKFold(n_splits=5, n_repeats=5, random_state=0),
    n_jobs=-1,
    return_train_score=True,
    refit=True,
    verbose=4
)

# SVM_1 is fitted on the encoded features, SVM_2 on the non-encoded ones.
SVM_1 = grid_search.fit(x_train, y_train)
SVM_2 = grid_search_2.fit(x_train_not_enc, y_train)

## Evaluating the best model with polynomial kernel

In [None]:
# Winning hyperparameters of each search (SVM_1: encoded data, SVM_2: non-encoded data).
print('encoded', SVM_1.best_params_)
print('not encoded', SVM_2.best_params_)

# Mean cross-validated accuracy of each winner.
best1, best2 = SVM_1.best_score_, SVM_2.best_score_

print('Best Score Encoded - Validation:', best1)
print('Best Score not Encoded- Validation:', best2)

# Keep the encoded search only when it is strictly better; ties go to the
# non-encoded one.
encoded_wins = best1 > best2
SVM_poly = SVM_1 if encoded_wins else SVM_2
enc_poly = 'Yes' if encoded_wins else 'No'
print('\n best model: encoded \n' if encoded_wins else '\n best model: not encoded \n')

# Training and validation scores (mean and standard deviation over the CV
# splits) of the selected configuration.
cv_results_df = pd.DataFrame(SVM_poly.cv_results_)
best_model_index = SVM_poly.best_index_

val_acc = cv_results_df.loc[best_model_index, 'mean_test_score']
val_std = cv_results_df.loc[best_model_index, 'std_test_score']
train_acc = cv_results_df.loc[best_model_index, 'mean_train_score']
train_std = cv_results_df.loc[best_model_index, 'std_train_score']

print('Train accuracy:', train_acc, '+/-', train_std)
print('validation accuracy:', val_acc, '+/-', val_std)

# Remember the selected search and its encoding flag for the final model selection.
best_model_kernel.append(SVM_poly)
enc.append(enc_poly)


## Two independent grid searches, one on one-hot encoded data and one on non-encoded data, with the RBF kernel

In [None]:
# Base estimator for the RBF searches. It is created here and passed to both
# grid searches so the cell does not depend on an estimator left over from
# another cell.
estimator_svm = SVC()

# Range explored for each hyperparameter.
C = np.logspace(-3, 5, 10)

# NOTE(review): the gamma grid jumps from 0.1 to 10 and skips 1 -- confirm
# this gap is intentional.
gamma = (0.001, 0.01, 0.1, 10, 100, 1000)

# RBF kernel parameters
parameters_SVM_rbf = {
    'C': C,
    'kernel': ['rbf'],
    'gamma': gamma,
    #'max_iter':[1000000]
}

# Grid search for the one-hot encoded data: stratified 5-fold CV, 5 repeats.
grid_search = GridSearchCV(
    estimator=estimator_svm,
    param_grid=parameters_SVM_rbf,
    scoring='accuracy',
    cv=RepeatedStratifiedKFold(n_splits=5, n_repeats=5, random_state=0),
    n_jobs=-1,
    return_train_score=True,
    refit=True,
    verbose=4
)

# Grid search for the non-encoded data: same grid and same CV protocol.
grid_search_2 = GridSearchCV(
    estimator=estimator_svm,
    param_grid=parameters_SVM_rbf,
    scoring='accuracy',
    cv=RepeatedStratifiedKFold(n_splits=5, n_repeats=5, random_state=0),
    n_jobs=-1,
    return_train_score=True,
    refit=True,
    verbose=4
)

# SVM_1 is fitted on the encoded features, SVM_2 on the non-encoded ones.
SVM_1 = grid_search.fit(x_train, y_train)
SVM_2 = grid_search_2.fit(x_train_not_enc, y_train)



## Evaluating the best model with RBF kernel

In [None]:
# Best hyperparameters found on the encoded (SVM_1) and non-encoded (SVM_2) data.
print('encoded', SVM_1.best_params_)
print('not encoded', SVM_2.best_params_)

# Mean cross-validated accuracy of the best configuration of each search.
best1 = SVM_1.best_score_
best2 = SVM_2.best_score_

print('Best Score Encoded - Validation:', best1)
print('Best Score not Encoded- Validation:', best2)

# Choose between the two searches: the encoded one must be strictly better,
# otherwise the non-encoded one is kept.
if not best1 > best2:
    SVM_rbf, enc_rbf = SVM_2, 'No'
    print('\n best model: not encoded \n')
else:
    SVM_rbf, enc_rbf = SVM_1, 'Yes'
    print('\n best model: encoded \n')

# Training and validation scores (mean and standard deviation over the CV
# splits) of the selected configuration.
cv_results_df = pd.DataFrame(SVM_rbf.cv_results_)
best_model_index = SVM_rbf.best_index_

val_acc = cv_results_df.loc[best_model_index, 'mean_test_score']
val_std = cv_results_df.loc[best_model_index, 'std_test_score']
train_acc = cv_results_df.loc[best_model_index, 'mean_train_score']
train_std = cv_results_df.loc[best_model_index, 'std_train_score']

print('Train accuracy:', train_acc, '+/-', train_std)
print('validation accuracy:', val_acc, '+/-', val_std)

# Store the selected search and its encoding flag for the final model selection.
best_model_kernel.append(SVM_rbf)
enc.append(enc_rbf)

## Two independent grid searches, one on one-hot encoded data and one on non-encoded data, with the sigmoid kernel

In [None]:
# Base estimator shared by both sigmoid-kernel searches.
estimator_SVC = SVC()

# Range explored for each hyperparameter.
C = np.logspace(-4, 4, 20)
gamma = (0.001, 0.01, 0.1, 10, 100, 1000)
coef = np.linspace(-10, 10, 20)

# Sigmoid kernel parameter grid.
parameters_SVM_sigmoid = dict(
    C=C,
    kernel=['sigmoid'],
    gamma=gamma,
    coef0=coef,
)

# Settings common to both searches; each search still gets its own CV
# splitter (stratified 5-fold, 5 repeats, fixed seed).
search_settings = dict(
    estimator=estimator_SVC,
    param_grid=parameters_SVM_sigmoid,
    scoring='accuracy',
    n_jobs=-1,
    return_train_score=True,
    refit=True,
    verbose=4,
)

# Grid search for the encoded data.
grid_search = GridSearchCV(
    cv=RepeatedStratifiedKFold(n_splits=5, n_repeats=5, random_state=0),
    **search_settings
)

# Grid search for the non-encoded data.
grid_search_2 = GridSearchCV(
    cv=RepeatedStratifiedKFold(n_splits=5, n_repeats=5, random_state=0),
    **search_settings
)

# SVM_1 is fitted on the encoded features, SVM_2 on the non-encoded ones.
SVM_1 = grid_search.fit(x_train, y_train)
SVM_2 = grid_search_2.fit(x_train_not_enc, y_train)






## Evaluating the best model with sigmoid kernel

In [None]:
# Best hyperparameters found on the encoded (SVM_1) and non-encoded (SVM_2) data.
print('encoded', SVM_1.best_params_)
print('not encoded', SVM_2.best_params_)

# Mean cross-validated accuracy of the best configuration of each search.
best1, best2 = SVM_1.best_score_, SVM_2.best_score_

print('Best Score Encoded - Validation:', best1)
print('Best Score not Encoded- Validation:', best2)

# The encoded search is selected only when strictly better; otherwise the
# non-encoded search is kept.
encoded_wins = best1 > best2
SVM_sigmoid = SVM_1 if encoded_wins else SVM_2
enc_sigmoid = 'Yes' if encoded_wins else 'No'
if encoded_wins:
    print('\n best model: encoded \n')
else:
    print('\n best model: not encoded \n')

# Training and validation scores (mean and standard deviation over the CV
# splits) of the selected configuration.
cv_results_df = pd.DataFrame(SVM_sigmoid.cv_results_)
best_model_index = SVM_sigmoid.best_index_

val_acc = cv_results_df.loc[best_model_index, 'mean_test_score']
val_std = cv_results_df.loc[best_model_index, 'std_test_score']
train_acc = cv_results_df.loc[best_model_index, 'mean_train_score']
train_std = cv_results_df.loc[best_model_index, 'std_train_score']

print('Train accuracy:', train_acc, '+/-', train_std)
print('validation accuracy:', val_acc, '+/-', val_std)

# Store the selected search and its encoding flag for the final model selection.
best_model_kernel.append(SVM_sigmoid)
enc.append(enc_sigmoid)

## Final model selection

In [None]:
# Validation accuracy of the selected search of every kernel, in insertion order.
val_acc = [model.best_score_ for model in best_model_kernel]

# Position of the highest validation accuracy (the first one in case of ties).
best_position = np.argmax(np.array(val_acc))

# The final model and its encoding flag are the entries at that position.
SVM = best_model_kernel[best_position]
e = enc[best_position]

print('best params:', SVM.best_params_)
print('encoded:', e)

## Model assessment

In [None]:
# When the selected model was trained on non-encoded data, it must be
# evaluated on the non-encoded test features as well.
if e == 'No':
    x_test = x_test_not_enc
y_pred = SVM.predict(x_test)

# Accuracy on the held-out test set.
test_accuracy = metrics.accuracy_score(y_test, y_pred)
print('Accuracy Score - SVM - Test-error:', test_accuracy)

# Per-class precision / recall / F1 summary.
report = metrics.classification_report(y_test, y_pred)
print('\n Classification report', report)

# Confusion matrix of the classifier, with rows/columns ordered as SVM.classes_.
class_labels = SVM.classes_
cm = metrics.confusion_matrix(y_test, y_pred, labels=class_labels)
disp = metrics.ConfusionMatrixDisplay(
    confusion_matrix=cm,
    display_labels=class_labels,
)
disp.plot()
plt.show()