# Data Mining Project
### Giorgio Donati, g.donati24@studenti.unipi.it
### Pietro Francaviglia, p.francaviglia1@studenti.unipi.it
#### A.Y. 2021-2022

## Libraries and datasets

In [53]:
from sklearn.svm import SVC

from t3_constants import *
from t3_utility import *

# Support Vector Classifier (RBF)
## Training

In [54]:
# Load the pickled training split: train_x and train_y are later passed to
# the classifier's fit() as features and targets respectively.
# NOTE: unpickling can execute arbitrary code; only load trusted files.
train_x_path = f'{PICKLE_FOLDER}{TRAIN}{X}.pkl'
with open(train_x_path, 'rb') as features_file:
    train_x = pickle.load(features_file)

train_y_path = f'{PICKLE_FOLDER}{TRAIN}{Y}.pkl'
with open(train_y_path, 'rb') as targets_file:
    train_y = pickle.load(targets_file)

In [55]:
# Hyper-parameter grid for the RBF Support Vector Classifier grid search.
param_d = {
    CPRM: [0.1, 0.5, 1, 10, 50, 100, 500, 1000],
    GMM: [0.001, 0.005, 0.01, 0.05, 0.1, 0.5],
    KRL: ['rbf']
}
# Fix the order of the grid axes explicitly. The tuple unpacking in the loop
# and the result columns both follow this list, so they can no longer drift
# apart if a key constant is renamed (relying on the alphabetical order of
# the keys only works while they happen to sort as C, gamma, kernel).
param_order = [CPRM, GMM, KRL]
grid = itertools.product(*(param_d[key] for key in param_order))

# Evaluate every (C, gamma, kernel) combination and collect one row each.
results = []
for c_param, gamma, kernel in grid:
    model = SVC(kernel=kernel, gamma=gamma, C=c_param)
    # cross_validation_summary is provided by t3_utility; it returns a pair
    # that is stored as the mean / std validation score
    # (presumably aggregated over CV folds -- confirm in t3_utility).
    mean_val_score, std_val_score = cross_validation_summary(model, train_x.values, train_y.values)
    results.append((c_param, gamma, kernel, mean_val_score, std_val_score))

# Show the 8 best combinations, ranked by mean validation score.
result_df = pd.DataFrame(results, columns=param_order + [MVS, SVS])
result_df.sort_values(MVS, ascending=False).head(8)

Unnamed: 0,c_param,gamma,kernel,mean_val_score,std_val_score
44,1000.0,0.01,rbf,0.945638,0.018584
38,500.0,0.01,rbf,0.937651,0.017174
22,10.0,0.1,rbf,0.937638,0.019276
21,10.0,0.05,rbf,0.932889,0.014021
32,100.0,0.01,rbf,0.932863,0.018752
43,1000.0,0.005,rbf,0.932851,0.019425
37,500.0,0.005,rbf,0.931263,0.016584
27,50.0,0.05,rbf,0.931251,0.021315


In [56]:
# Average the validation mean/std over all grid points sharing the same C,
# best C first. The columns are selected with a list: tuple-style selection
# (`[MVS, SVS]`) on a GroupBy is deprecated and raises in pandas >= 2.0.
result_df.groupby(CPRM)[[MVS, SVS]].mean().sort_values(MVS, ascending=False)

  result_df.groupby(CPRM)[MVS, SVS].mean().sort_values(MVS, ascending=False)


Unnamed: 0_level_0,mean_val_score,std_val_score
c_param,Unnamed: 1_level_1,Unnamed: 2_level_1
1000.0,0.917941,0.021717
500.0,0.91608,0.019984
100.0,0.910487,0.023552
50.0,0.905422,0.025818
10.0,0.899575,0.02351
1.0,0.859651,0.022804
0.5,0.812796,0.029629
0.1,0.668004,0.020524


In [57]:
# Average the validation mean/std over all grid points sharing the same
# gamma, best gamma first. The columns are selected with a list: tuple-style
# selection (`[MVS, SVS]`) on a GroupBy is deprecated and raises in pandas >= 2.0.
result_df.groupby(GMM)[[MVS, SVS]].mean().sort_values(MVS, ascending=False)

  result_df.groupby(GMM)[MVS, SVS].mean().sort_values(MVS, ascending=False)


Unnamed: 0_level_0,mean_val_score,std_val_score
gamma,Unnamed: 1_level_1,Unnamed: 2_level_1
0.1,0.914481,0.024718
0.05,0.912684,0.025386
0.01,0.880541,0.02313
0.5,0.856989,0.021267
0.005,0.844611,0.02402
0.001,0.75816,0.022132


### Test Model

In [58]:
# Ascending sort on the mean validation score puts the best grid point last;
# keep it as a one-row frame.
best_result = result_df.sort_values(MVS).tail(1)

# Pull the winning hyper-parameters out of that single row.
best_kernel = best_result[KRL].iloc[0]
best_gamma = best_result[GMM].iloc[0]
best_c = best_result[CPRM].iloc[0]

# Refit an SVC with those hyper-parameters on the whole training split.
# The result is bound to `_` so the estimator repr is not echoed by the cell.
test_model = SVC(kernel=best_kernel, gamma=best_gamma, C=best_c)
_ = test_model.fit(train_x, train_y)

## Testing

In [59]:
# Load the pickled test split: test_x and test_y are later passed to the
# fitted classifier's score() as features and targets respectively.
# NOTE: unpickling can execute arbitrary code; only load trusted files.
test_x_path = f'{PICKLE_FOLDER}{TEST}{X}.pkl'
with open(test_x_path, 'rb') as features_file:
    test_x = pickle.load(features_file)

test_y_path = f'{PICKLE_FOLDER}{TEST}{Y}.pkl'
with open(test_y_path, 'rb') as targets_file:
    test_y = pickle.load(targets_file)

### Score

In [60]:
# Score the refitted model on the test split (for a scikit-learn classifier,
# score() is the mean accuracy) and display it as the cell output.
test_accuracy = test_model.score(test_x, test_y)
test_accuracy

0.8980891719745223