# Data Mining Project
### Giorgio Donati, g.donati24@studenti.unipi.it
### Pietro Francaviglia, p.francaviglia1@studenti.unipi.it
#### A.Y. 2021-2022

## Libraries and datasets

In [1]:
from sklearn.svm import SVC

from t3_constants import *
from t3_utility import *

In [2]:
def _read_pickle(path):
    """Return the object stored in the pickle file at ``path``."""
    # NOTE(review): unpickling can execute arbitrary code; only load pickle
    # files produced by this project.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Training and test splits, each stored as a separate X and Y pickle file.
train_x = _read_pickle(f'{PICKLE_FOLDER}{TRAIN}{X}.pkl')
train_y = _read_pickle(f'{PICKLE_FOLDER}{TRAIN}{Y}.pkl')
test_x = _read_pickle(f'{PICKLE_FOLDER}{TEST}{X}.pkl')
test_y = _read_pickle(f'{PICKLE_FOLDER}{TEST}{Y}.pkl')

# `.values` of the training split, passed to cross_validation_summary below
# (presumably pandas objects converted to NumPy arrays -- TODO confirm).
x, y = train_x.values, train_y.values

## Support Vector Classifier

### RBF kernel

In [3]:
# Exhaustive grid search over (C, gamma) for an RBF-kernel SVC. Each
# combination is scored with cross_validation_summary on the training arrays
# and stored as one row of `rbf_df`.
kernel = 'rbf'

C_list = [0.1, 0.5, 1, 10, 50, 100, 500, 1000]
gamma_list = [0.001, 0.005, 0.01, 0.05, 0.1, 0.5]

result_columns = ['C', 'gamma', 'mean_val_score', 'std_val_score']

results = []
for c_value, gamma_value in itertools.product(C_list, gamma_list):
    candidate = SVC(kernel=kernel, gamma=gamma_value, C=c_value)
    cv_mean, cv_std = cross_validation_summary(candidate, x, y)
    results.append((c_value, gamma_value, cv_mean, cv_std))

rbf_df = pd.DataFrame(results, columns=result_columns)

In [4]:
# Ten (C, gamma) combinations with the highest mean validation score.
rbf_df.sort_values(by='mean_val_score', ascending=False).iloc[:10]

Unnamed: 0,C,gamma,mean_val_score,std_val_score
44,1000.0,0.01,0.945638,0.018584
38,500.0,0.01,0.937651,0.017174
22,10.0,0.1,0.937638,0.019276
21,10.0,0.05,0.932889,0.014021
32,100.0,0.01,0.932863,0.018752
43,1000.0,0.005,0.932851,0.019425
37,500.0,0.005,0.931263,0.016584
27,50.0,0.05,0.931251,0.021315
33,100.0,0.05,0.929638,0.028414
28,50.0,0.1,0.926438,0.027965


In [5]:
# Mean validation score and mean standard deviation for each value of C,
# averaged over every gamma in the grid.
# The columns are selected with a list: selecting several groupby columns
# with a bare tuple of keys is deprecated and raises ValueError in pandas >= 2.0.
rbf_df.groupby('C')[['mean_val_score', 'std_val_score']].mean()

  rbf_df.groupby('C')['mean_val_score', 'std_val_score'].mean()


Unnamed: 0_level_0,mean_val_score,std_val_score
C,Unnamed: 1_level_1,Unnamed: 2_level_1
0.1,0.668004,0.020524
0.5,0.812796,0.029629
1.0,0.859651,0.022804
10.0,0.899575,0.02351
50.0,0.905422,0.025818
100.0,0.910487,0.023552
500.0,0.91608,0.019984
1000.0,0.917941,0.021717


In [6]:
# Mean validation score and mean standard deviation for each value of gamma,
# averaged over every C in the grid.
# The columns are selected with a list: selecting several groupby columns
# with a bare tuple of keys is deprecated and raises ValueError in pandas >= 2.0.
rbf_df.groupby('gamma')[['mean_val_score', 'std_val_score']].mean()

  rbf_df.groupby('gamma')['mean_val_score', 'std_val_score'].mean()


Unnamed: 0_level_0,mean_val_score,std_val_score
gamma,Unnamed: 1_level_1,Unnamed: 2_level_1
0.001,0.75816,0.022132
0.005,0.844611,0.02402
0.01,0.880541,0.02313
0.05,0.912684,0.025386
0.1,0.914481,0.024718
0.5,0.856989,0.021267


In [7]:
# Fit an RBF-kernel SVC on the full training split, then score it on the
# held-out test split.
# NOTE(review): C=100, gamma=0.1 is not the top-ranked pair in the grid-search
# table above (C=1000, gamma=0.01) -- confirm this choice is intentional.
test_SVC = SVC(kernel='rbf', C=100, gamma=0.1).fit(train_x, train_y)
test_SVC.score(test_x, test_y)

0.9235668789808917

### Polynomial kernel

In [8]:
# Exhaustive grid search over (C, coef0, degree) for a polynomial-kernel SVC.
# Each combination is scored with cross_validation_summary on the training
# arrays and stored as one row of `poly_df`.
kernel = 'poly'

C_list = [0.1, 0.5, 1, 10, 100]
coeff_list = [0.5, 1, 1.5]
degree_list = [2, 3, 4]

hyperparams_grid = itertools.product(C_list, coeff_list, degree_list)

results = []

for C_val, coeff, deg in hyperparams_grid:
    # Use the `kernel` variable set above (as the RBF cell does) instead of
    # repeating the literal, so the kernel is defined in a single place.
    SVC_model = SVC(kernel=kernel, degree=deg, coef0=coeff, C=C_val)
    mean_val_score, std_val_score = cross_validation_summary(SVC_model, x, y)
    res = C_val, coeff, deg, mean_val_score, std_val_score
    results.append(res)

columns = ['C', 'coeff', 'degree', 'mean_val_score', 'std_val_score']
poly_df = pd.DataFrame(results, columns=columns)

In [9]:
# Ten (C, coeff, degree) combinations with the highest mean validation score.
poly_df.sort_values(by='mean_val_score', ascending=False).iloc[:10]

Unnamed: 0,C,coeff,degree,mean_val_score,std_val_score
14,0.5,1.0,4,0.937663,0.018598
23,1.0,1.0,4,0.937651,0.024539
25,1.0,1.5,3,0.936063,0.020935
17,0.5,1.5,4,0.932876,0.016562
28,10.0,0.5,3,0.932863,0.019422
31,10.0,1.0,3,0.931251,0.028055
29,10.0,0.5,4,0.929663,0.025576
26,1.0,1.5,4,0.929663,0.017196
27,10.0,0.5,2,0.928076,0.020949
16,0.5,1.5,3,0.928076,0.01969


In [10]:
# Mean validation score and mean standard deviation for each value of C,
# averaged over every coeff/degree pair in the grid.
# The columns are selected with a list: selecting several groupby columns
# with a bare tuple of keys is deprecated and raises ValueError in pandas >= 2.0.
poly_df.groupby('C')[['mean_val_score', 'std_val_score']].mean()

  poly_df.groupby('C')['mean_val_score', 'std_val_score'].mean()


Unnamed: 0_level_0,mean_val_score,std_val_score
C,Unnamed: 1_level_1,Unnamed: 2_level_1
0.1,0.891707,0.02655
0.5,0.919385,0.020576
1.0,0.924876,0.02108
10.0,0.927173,0.024476
100.0,0.922363,0.029795


In [11]:
# Mean validation score and mean standard deviation for each value of coeff,
# averaged over every C/degree pair in the grid.
# The columns are selected with a list: selecting several groupby columns
# with a bare tuple of keys is deprecated and raises ValueError in pandas >= 2.0.
poly_df.groupby('coeff')[['mean_val_score', 'std_val_score']].mean()

  poly_df.groupby('coeff')['mean_val_score', 'std_val_score'].mean()


Unnamed: 0_level_0,mean_val_score,std_val_score
coeff,Unnamed: 1_level_1,Unnamed: 2_level_1
0.5,0.909977,0.025577
1.0,0.920398,0.024892
1.5,0.920928,0.023017


In [12]:
# Mean validation score and mean standard deviation for each polynomial
# degree, averaged over every C/coeff pair in the grid.
# The columns are selected with a list: selecting several groupby columns
# with a bare tuple of keys is deprecated and raises ValueError in pandas >= 2.0.
poly_df.groupby('degree')[['mean_val_score', 'std_val_score']].mean()

  poly_df.groupby('degree')['mean_val_score', 'std_val_score'].mean()


Unnamed: 0_level_0,mean_val_score,std_val_score
degree,Unnamed: 1_level_1,Unnamed: 2_level_1
2,0.912731,0.023334
3,0.919235,0.024498
4,0.919336,0.025655


In [13]:
# Fit the top-ranked polynomial configuration from the grid search
# (C=0.5, coef0=1, degree=4) on the full training split, then score it on
# the held-out test split.
test_SVC = SVC(kernel='poly', C=0.5, degree=4, coef0=1).fit(train_x, train_y)
test_SVC.score(test_x, test_y)

0.9363057324840764