# Data Mining Project
### Giorgio Donati, g.donati24@studenti.unipi.it
### Pietro Francaviglia, p.francaviglia1@studenti.unipi.it
#### A.Y. 2021-2022

## Libraries and datasets

In [1]:
from sklearn.svm import SVC

from t3_constants import *
from t3_utility import *

In [2]:
def _read_pickle(split, part):
    """Unpickle the object stored at ``{PICKLE_FOLDER}{split}{part}.pkl``."""
    # NOTE: pickle executes arbitrary code on load; only use trusted files.
    with open(f'{PICKLE_FOLDER}{split}{part}.pkl', 'rb') as handle:
        return pickle.load(handle)


# Training and test partitions (features X, labels Y).
train_x, train_y = _read_pickle(TRAIN, X), _read_pickle(TRAIN, Y)
test_x, test_y = _read_pickle(TEST, X), _read_pickle(TEST, Y)

# Plain arrays used by the cross-validation cells below; labels are flattened.
x = train_x.values
y = train_y.values.ravel()

## Support Vector Classifier

### RBF kernel

In [8]:
# Grid search over (C, gamma) for the RBF-kernel SVC, scored by cross-validation.
kernel = 'rbf'

C_list = [0.1, 0.5, 1, 10, 50, 100, 500, 1000]
gamma_list = [0.001, 0.005, 0.01, 0.05, 0.1, 0.5]

# Every (C, gamma) combination, C varying slowest.
hyperparams_grid = itertools.product(C_list, gamma_list)

results = []
for C_val, gamma in hyperparams_grid:
    candidate = SVC(kernel=kernel, gamma=gamma, C=C_val)
    mean_score, std_score = cross_validation_summary(candidate, x, y)
    results.append((C_val, gamma, mean_score, std_score))

# One row per hyperparameter combination.
rbf_columns = ['C', 'gamma', 'mean_val_score', 'std_val_score']
rbf_df = pd.DataFrame(results, columns=rbf_columns)

In [9]:
# Rank the RBF configurations from best to worst mean validation score.
rbf_df.sort_values(by='mean_val_score', ascending=False)

Unnamed: 0,C,gamma,mean_val_score,std_val_score
32,1000.0,0.01,0.945638,0.018584
22,10.0,0.1,0.937638,0.019276
21,10.0,0.05,0.932889,0.014021
26,100.0,0.01,0.932863,0.018752
31,1000.0,0.005,0.932851,0.019425
27,100.0,0.05,0.929638,0.028414
16,1.0,0.1,0.924863,0.024146
28,100.0,0.1,0.921663,0.021816
34,1000.0,0.1,0.920063,0.02274
33,1000.0,0.05,0.920051,0.025395


In [10]:
# Average validation mean/std over all gamma values, for each C.
# Select the columns with a list: indexing a GroupBy with a bare tuple of
# keys was deprecated in pandas 1.x and is an error in pandas 2.x.
rbf_df.groupby('C')[['mean_val_score', 'std_val_score']].mean()

  rbf_df.groupby('C')['mean_val_score', 'std_val_score'].mean()


Unnamed: 0_level_0,mean_val_score,std_val_score
C,Unnamed: 1_level_1,Unnamed: 2_level_1
0.1,0.668004,0.020524
0.5,0.812796,0.029629
1.0,0.859651,0.022804
10.0,0.899575,0.02351
100.0,0.910487,0.023552
1000.0,0.917941,0.021717


In [11]:
# Average validation mean/std over all C values, for each gamma.
# Select the columns with a list: indexing a GroupBy with a bare tuple of
# keys was deprecated in pandas 1.x and is an error in pandas 2.x.
rbf_df.groupby('gamma')[['mean_val_score', 'std_val_score']].mean()

  rbf_df.groupby('gamma')['mean_val_score', 'std_val_score'].mean()


Unnamed: 0_level_0,mean_val_score,std_val_score
gamma,Unnamed: 1_level_1,Unnamed: 2_level_1
0.001,0.719363,0.021377
0.005,0.82105,0.02676
0.01,0.865774,0.022372
0.05,0.908362,0.026064
0.1,0.911558,0.024507
0.5,0.842347,0.020656


In [12]:
# Fit an SVC with the chosen hyperparameters on the full training set
# (fit() returns the estimator itself) and report its score on the test set.
# NOTE(review): the kernel is left at the SVC default, presumably 'rbf' - confirm.
# NOTE(review): labels are passed as loaded; if train_y is a single-column
# DataFrame, the flattened `y` may be preferable - confirm.
test_SVC = SVC(C=100, gamma=0.1).fit(train_x, train_y)
test_SVC.score(test_x, test_y)

0.9235668789808917

### Polynomial kernel

In [None]:
# Grid search over (C, coef0, degree) for the polynomial-kernel SVC,
# scored by cross-validation on the training arrays x / y.
kernel = 'poly'

C_list = [0.1, 0.5, 1, 10, 100]
coeff_list = [0.5, 1, 1.5]
degree_list = [2, 3, 4]

hyperparams_grid = itertools.product(C_list, coeff_list, degree_list)

results = []

for C_val, coeff, deg in hyperparams_grid:
    SVC_model = SVC(kernel=kernel, degree=deg, coef0=coeff, C=C_val)
    # Use the feature matrix `x`; `X` is the file-name constant, not the data.
    mean_val_score, std_val_score = cross_validation_summary(SVC_model, x, y)
    # Record this iteration's values, in the same order as `columns` below.
    res = C_val, deg, coeff, mean_val_score, std_val_score
    results.append(res)

columns = ['C', 'degree', 'coeff', 'mean_val_score', 'std_val_score']
df = pd.DataFrame(results, columns=columns)