In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.multioutput import MultiOutputRegressor
from sklearn import metrics 
from sklearn.svm import SVR
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import make_scorer

# Use a 12 pt base font for every figure drawn in this notebook.
plt.rcParams['font.size'] = 12

## CUP SVM

In [None]:
# Importing data.
# NOTE(review): this absolute path is machine-specific; adjust it before running elsewhere.
path=r'/home/ludovico/ML-project/data/cup/ML-CUP23-'
# The first 7 lines of the file are skipped and there is no header row,
# so the columns end up with integer labels. Everything is read as text.
train_set = pd.read_csv(path+'TR.csv',skiprows=7, header=None, delimiter=',', dtype=str)

# Column 0 is left out of the inputs (presumably a pattern id - confirm against the file);
# the last three columns are the regression targets.
# NOTE: `input` shadows the Python builtin; the name is kept because other cells may reference it.
input=train_set[train_set.columns[1:-3]]
target=train_set[train_set.columns[-3:]]

# Splitting the design set from the test set (the test set is used only for the final model assessment).
# The random seed is fixed so that every model is developed on the same design set.
x_train, x_test, y_train, y_test = train_test_split(input, target, test_size=0.2, random_state=0, shuffle=True)

# The file was read with dtype=str, so every split must be converted to floats.
# The test split is converted too: otherwise the final assessment would try to
# do arithmetic on strings when computing the error on the held-out data.
x_train=x_train.astype(np.float64)
y_train=y_train.astype(np.float64)
x_test=x_test.astype(np.float64)
y_test=y_test.astype(np.float64)


#we add this metric (Mean Euclidean Error) to evaluate the performance of the model
def MEE(x, y):
    """Return the Mean Euclidean Error between two collections of vectors.

    Each row of ``x`` is compared with the row of ``y`` at the same position:
    the Euclidean (L2) norm of the row-wise difference is computed and the
    norms are averaged over all rows.

    Parameters
    ----------
    x, y : array-like of shape (n_samples, n_outputs)
        Values to compare (e.g. targets and predictions). Lists, NumPy arrays
        and pandas DataFrames are accepted; entries must be convertible to float.

    Returns
    -------
    numpy.float64
        The mean of the per-row Euclidean distances.
    """
    # Convert to plain float arrays first so the subtraction is positional.
    # Subtracting two pandas objects directly aligns them on index/column
    # labels, which yields NaN whenever the labels of the two operands differ.
    x = np.asarray(x, dtype=np.float64)
    y = np.asarray(y, dtype=np.float64)
    return np.mean(np.linalg.norm(x - y, 2, axis=1))

# Collects the fitted grid search of each kernel, so that the best kernel
# for the SVR model can be picked once all of them have been tuned.
best_model_kernel = list()

## SVM RBF

### Plot of the validation error (MEE) as the hyperparameters vary, used to choose the best values
We fix 2 values of epsilon and 3 values of gamma, and vary C over a logarithmically spaced range.

In [None]:
# Exploratory sweep for the RBF kernel: validation MEE as a function of C,
# one panel per gamma value and one series per epsilon value.
plt.figure(1, (12, 4))

C = np.logspace(0, 4, 30)
epsilon = [0.1, 0.5]
g = [0.01, 0.2, 5]

for eps in epsilon:
    for i, gamma in enumerate(g):
        # Only C varies inside each search; gamma and epsilon are pinned
        # to the values of the current panel / series.
        parameters_SVM = {
            'estimator__C': C,
            'estimator__kernel': ['rbf'],
            'estimator__gamma': [gamma],
            'estimator__epsilon': [eps],
            'estimator__max_iter': [1000000],
        }
        grid_search_SVM = GridSearchCV(
            estimator=MultiOutputRegressor(SVR()),
            param_grid=parameters_SVM,
            cv=RepeatedKFold(n_splits=5, n_repeats=5, random_state=0),
            n_jobs=-1,
            return_train_score=True,
            scoring=make_scorer(MEE, greater_is_better=False),
        )

        SVM = grid_search_SVM.fit(x_train, y_train)
        cv_results_df = pd.DataFrame(grid_search_SVM.cv_results_)
        error = cv_results_df['mean_test_score'].values

        plt.subplot(1, len(g), i + 1)
        # Only the left-most panel carries the y-axis label.
        if i == 0:
            plt.ylabel('MEE validation')
        plt.title(r'$\gamma$=' + str(gamma))
        # The scorer was built with greater_is_better=False, hence the sign flip.
        plt.errorbar(C, -error, marker='.', label=r'$\epsilon$=' + str(eps), linestyle='')
        plt.xlabel('C')
        plt.xscale('log')
        plt.yscale('log')
        plt.ylim(0.5, 40)
        plt.legend()

plt.show()

## Final grid search for SVM RBF 

In [None]:

# Hyperparameter grid for the final RBF search.
parameters_SVM = dict(
    estimator__C=[10, 100, 1000, 10000, 20000],
    estimator__kernel=['rbf'],
    estimator__gamma=[0.01, 0.1, 0.2, 1],
    estimator__epsilon=[0.1, 0.3],
    estimator__max_iter=[100000],
)

# Exhaustive search with 5-fold cross-validation repeated 10 times; the
# scorer is MEE wrapped with greater_is_better=False, and the best
# configuration is refitted on the whole design set.
grid_search_SVM = GridSearchCV(
    estimator=MultiOutputRegressor(SVR()),
    param_grid=parameters_SVM,
    refit=True,
    cv=RepeatedKFold(n_splits=5, n_repeats=10, random_state=0),
    n_jobs=-1,
    return_train_score=True,
    scoring=make_scorer(MEE, greater_is_better=False),
)

SVM_rbf = grid_search_SVM.fit(x_train, y_train)
# Keep the fitted search so it can be compared with the other kernels later.
best_model_kernel.append(SVM_rbf)

## Evaluating the model

In [None]:
# Summary of the best RBF configuration found by the grid search.
cv_results_df = pd.DataFrame(SVM_rbf.cv_results_)
best_model_index=SVM_rbf.best_index_

print('best params', SVM_rbf.best_params_)

# The scorer was created with greater_is_better=False, so the mean scores stored
# in cv_results_ are the *negated* MEE. Flip the sign to report the actual MEE
# (the standard deviations are not affected by the sign).
val_loss=-cv_results_df.loc[best_model_index, 'mean_test_score']
val_std=cv_results_df.loc[best_model_index, 'std_test_score']
train_loss=-cv_results_df.loc[best_model_index, 'mean_train_score']
train_std=cv_results_df.loc[best_model_index, 'std_train_score']
print('Train loss:',train_loss,'+/-', train_std)
print('Validation loss:',val_loss,'+/-', val_std)

# Full table of cross-validation results (scores in this table are still negated MEE).
cv_results_df

## SVM poly

## Grid search for SVM poly

In [None]:

# Candidate values for the polynomial-kernel search.
C = np.logspace(0, 4, num=5)
gamma = np.logspace(-2, 2, num=5)
coef = np.linspace(0, 10, num=3)
# NOTE(review): `deg` is computed here but the grid below pins the degree to 5
# and never reads it - confirm whether it was meant to be searched.
deg = np.arange(2, 6, 3)

parameters_SVM = dict(
    estimator__C=C,
    estimator__kernel=['poly'],
    estimator__gamma=gamma,
    estimator__coef0=coef,
    estimator__degree=[5],
    estimator__epsilon=[0.1],
    estimator__max_iter=[1000],
)

# Exhaustive search with 5-fold cross-validation repeated 10 times; the
# scorer is MEE wrapped with greater_is_better=False, and the best
# configuration is refitted on the whole design set.
grid_search_SVM = GridSearchCV(
    estimator=MultiOutputRegressor(SVR()),
    param_grid=parameters_SVM,
    refit=True,
    cv=RepeatedKFold(n_splits=5, n_repeats=10, random_state=0),
    n_jobs=-1,
    return_train_score=True,
    scoring=make_scorer(MEE, greater_is_better=False),
)

SVM_poly = grid_search_SVM.fit(x_train, y_train)
# Keep the fitted search so it can be compared with the other kernels later.
best_model_kernel.append(SVM_poly)

## Evaluating the model

In [None]:
# Summary of the best polynomial-kernel configuration found by the grid search.
cv_results_df = pd.DataFrame(SVM_poly.cv_results_)
best_model_index=SVM_poly.best_index_

print('best params', SVM_poly.best_params_)

# The scorer was created with greater_is_better=False, so the mean scores stored
# in cv_results_ are the *negated* MEE. Flip the sign to report the actual MEE
# (the standard deviations are not affected by the sign).
val_loss=-cv_results_df.loc[best_model_index, 'mean_test_score']
val_std=cv_results_df.loc[best_model_index, 'std_test_score']
train_loss=-cv_results_df.loc[best_model_index, 'mean_train_score']
train_std=cv_results_df.loc[best_model_index, 'std_train_score']
print('Train loss:',train_loss,'+/-', train_std)
print('Validation loss:',val_loss,'+/-', val_std)

# Full table of cross-validation results (scores in this table are still negated MEE).
cv_results_df

## SVM sigmoid

### Plot of the validation error (MEE) as the hyperparameters vary, used to choose the best values
We fix 3 values of beta0 (coef0) and 3 values of beta1 (gamma), and vary C over a logarithmically spaced range.

In [None]:
# Exploratory sweep for the sigmoid kernel: validation MEE as a function of C,
# one panel per gamma (beta1) value and one series per coef0 (beta0) value.
C=np.logspace(-3,4,40)

# Epsilon is fixed explicitly for this sweep. Previously this cell read `eps`,
# a loop variable left over from the RBF plotting cell, so it failed on a fresh
# kernel unless that cell had been run and otherwise silently used its last
# value (0.5). 0.1 matches the epsilon used by the final sigmoid grid search.
eps_sigmoid=0.1

plt.figure(1,(12,4))
coef=[-5,-4,-3]
g=[0.01,0.1,0.5]
for c in coef:
    for i,gamma in enumerate(g):
        # Only C varies inside each search; gamma, coef0 and epsilon are pinned.
        parameters_SVM= {
            'estimator__C':C ,
            'estimator__kernel': ['sigmoid'],
            'estimator__gamma': [gamma],
            'estimator__coef0':[c],
            'estimator__epsilon': [eps_sigmoid],
        }
        grid_search_SVM = GridSearchCV(
            estimator=MultiOutputRegressor(SVR()),
            param_grid=parameters_SVM,
            cv=RepeatedKFold(n_splits=5, n_repeats=1, random_state=0),
            n_jobs=-1,
            return_train_score = True,
            scoring=make_scorer(MEE, greater_is_better=False),
        )

        SVM=grid_search_SVM.fit(x_train, y_train)
        cv_results_df = pd.DataFrame(grid_search_SVM.cv_results_)
        error=cv_results_df['mean_test_score'].values
        plt.subplot(1,len(g),i+1)

        # Only the left-most panel carries the y-axis label.
        if i==0:
            plt.ylabel('MEE validation')
        # beta1 is gamma (one panel each) and beta0 is coef0 (one series each);
        # the two symbols were previously swapped in the title and the legend.
        plt.title(r'$\beta_1$='+str(gamma))
        # The scorer was built with greater_is_better=False, hence the sign flip.
        plt.errorbar(C,-error,marker='.',label=r'$\beta_0$='+str(c),linestyle='')
        plt.xlabel('C')
        plt.yscale('log')
        plt.xscale('log')
        plt.ylim(0.1,10000)
        plt.legend()

plt.show()

## Final grid search for SVM sigmoid

In [None]:

# Candidate coef0 values for the final sigmoid search.
coef = np.linspace(-5, -2, num=4)

parameters_SVM = dict(
    estimator__C=[100, 1000, 10000, 50000],
    estimator__kernel=['sigmoid'],
    estimator__gamma=[0.1, 0.3],
    estimator__coef0=coef,
    estimator__epsilon=[0.1],
)

# Exhaustive search with 5-fold cross-validation repeated 10 times; the
# scorer is MEE wrapped with greater_is_better=False, and the best
# configuration is refitted on the whole design set.
grid_search_SVM = GridSearchCV(
    estimator=MultiOutputRegressor(SVR()),
    param_grid=parameters_SVM,
    refit=True,
    cv=RepeatedKFold(n_splits=5, n_repeats=10, random_state=0),
    n_jobs=-1,
    return_train_score=True,
    scoring=make_scorer(MEE, greater_is_better=False),
)

SVM_sigmoid = grid_search_SVM.fit(x_train, y_train)
# Keep the fitted search so it can be compared with the other kernels later.
best_model_kernel.append(SVM_sigmoid)

## Evaluating the model

In [None]:
# Summary of the best sigmoid-kernel configuration found by the grid search.
cv_results_df = pd.DataFrame(SVM_sigmoid.cv_results_)
best_model_index=SVM_sigmoid.best_index_

print('best params', SVM_sigmoid.best_params_)

# The scorer was created with greater_is_better=False, so the mean scores stored
# in cv_results_ are the *negated* MEE. Flip the sign to report the actual MEE
# (the standard deviations are not affected by the sign).
val_loss=-cv_results_df.loc[best_model_index, 'mean_test_score']
val_std=cv_results_df.loc[best_model_index, 'std_test_score']
train_loss=-cv_results_df.loc[best_model_index, 'mean_train_score']
train_std=cv_results_df.loc[best_model_index, 'std_train_score']
print('Train loss:',train_loss,'+/-', train_std)
print('Validation loss:',val_loss,'+/-', val_std)

# Full table of cross-validation results (scores in this table are still negated MEE).
cv_results_df

## Final model selection 
The best model among the three kernels is the one with the lowest MEE on the validation set.

In [None]:
# One best cross-validation score per kernel. The searches use MEE wrapped with
# greater_is_better=False, so the largest score marks the lowest validation MEE.
val_acc = [model.best_score_ for model in best_model_kernel]

SVM = best_model_kernel[np.argmax(np.array(val_acc))]

print('best params:',SVM.best_params_)