#  Classification of tumors with Support Vector Machines (Hyperparameter tunning)
### We want to try Support Vector Machines, but we have no idea what hyperparameters we should use. So we define a structure with the values ​​we want to test for each hyperparameter.

In [1]:
from sklearn import datasets
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

import pandas as pd
import numpy as np
from scipy import stats

* Let's get back to the problem with biopsies and breast cancer.

In [2]:
bc = datasets.load_breast_cancer(as_frame=True)
bc.data

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,0.07871,...,25.380,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890
1,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,0.05667,...,24.990,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902
2,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,0.05999,...,23.570,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,0.2597,0.09744,...,14.910,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300
4,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,0.05883,...,22.540,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,0.1726,0.05623,...,25.450,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115
565,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,0.1752,0.05533,...,23.690,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637
566,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,0.1590,0.05648,...,18.980,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820
567,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,0.2397,0.07016,...,25.740,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400


In [3]:
param_grid = [
    {
        "kernel": ["rbf"], 
        "gamma": [1e-2, 1e-3, 1e-4], 
        "C": [1, 10, 100, 1000],
        "decision_function_shape": ["ovo", "ovr"]
    },
    {
        "kernel": ["linear"], 
        "C": [1, 10, 100, 1000],
        "decision_function_shape": ["ovo", "ovr"]
    },
]

* Next, we use the `GridSearchCV` class, which will try all possible combinations of the parameters and find the best model.

* `GridSearchCV', as its name says, also performs $k$-fold validation, where $k = 5$ unless we define something else.

* Also, like other scikit-learn methods that support parallelization, with the `n_jobs` parameter we ask for the number of parallel processes we want to run (-1 means as many as possible).

In [4]:
clf = SVC()
grid_search = GridSearchCV(clf, param_grid=param_grid, n_jobs=-1)
grid_search.fit(bc.data, bc.target)

GridSearchCV(estimator=SVC(), n_jobs=-1,
             param_grid=[{'C': [1, 10, 100, 1000],
                          'decision_function_shape': ['ovo', 'ovr'],
                          'gamma': [0.01, 0.001, 0.0001], 'kernel': ['rbf']},
                         {'C': [1, 10, 100, 1000],
                          'decision_function_shape': ['ovo', 'ovr'],
                          'kernel': ['linear']}])

* Next, we use the `GridSearchCV` class, which will try all possible combinations of the parameters and find the best model.

* `GridSearchCV', as its name says, also performs $k$-fold validation, where $k = 5$ unless we define something else.

* Also, like other scikit-learn methods that support parallelization, with the `n_jobs` parameter we ask for the number of parallel processes we want to run (-1 means as many as possible).

In [5]:
grid_search.best_params_

{'C': 100, 'decision_function_shape': 'ovo', 'kernel': 'linear'}

* If we want to get the best model to use, this is available from the `best_estimator_` property.

In [6]:
grid_search.best_estimator_

SVC(C=100, decision_function_shape='ovo', kernel='linear')

* If we want to see all the results, we can get them with the `cv_results_` property.

In [7]:
grid_search_results = pd.DataFrame(grid_search.cv_results_)
print(grid_search_results.shape)
grid_search_results

(32, 17)


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_decision_function_shape,param_gamma,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.034367,0.002893,0.018493,0.003409,1,ovo,0.01,rbf,"{'C': 1, 'decision_function_shape': 'ovo', 'ga...",0.622807,0.622807,0.631579,0.631579,0.619469,0.625648,0.004993,31
1,0.027807,0.001518,0.012188,0.002316,1,ovo,0.001,rbf,"{'C': 1, 'decision_function_shape': 'ovo', 'ga...",0.929825,0.921053,0.921053,0.947368,0.893805,0.922621,0.017318,17
2,0.01584,0.0014,0.007788,0.000753,1,ovo,0.0001,rbf,"{'C': 1, 'decision_function_shape': 'ovo', 'ga...",0.903509,0.938596,0.938596,0.95614,0.938053,0.934979,0.017169,11
3,0.032065,0.001456,0.014848,0.00078,1,ovr,0.01,rbf,"{'C': 1, 'decision_function_shape': 'ovr', 'ga...",0.622807,0.622807,0.631579,0.631579,0.619469,0.625648,0.004993,31
4,0.026748,0.002187,0.011573,0.001273,1,ovr,0.001,rbf,"{'C': 1, 'decision_function_shape': 'ovr', 'ga...",0.929825,0.921053,0.921053,0.947368,0.893805,0.922621,0.017318,17
5,0.01439,0.00029,0.00798,0.00058,1,ovr,0.0001,rbf,"{'C': 1, 'decision_function_shape': 'ovr', 'ga...",0.903509,0.938596,0.938596,0.95614,0.938053,0.934979,0.017169,11
6,0.032557,0.001228,0.016269,0.00068,10,ovo,0.01,rbf,"{'C': 10, 'decision_function_shape': 'ovo', 'g...",0.622807,0.622807,0.631579,0.640351,0.619469,0.627403,0.007619,25
7,0.027339,0.001334,0.010299,0.001004,10,ovo,0.001,rbf,"{'C': 10, 'decision_function_shape': 'ovo', 'g...",0.894737,0.903509,0.921053,0.938596,0.884956,0.90857,0.019142,19
8,0.013677,0.0013,0.006487,0.001484,10,ovo,0.0001,rbf,"{'C': 10, 'decision_function_shape': 'ovo', 'g...",0.894737,0.938596,0.947368,0.95614,0.920354,0.931439,0.021841,13
9,0.027278,0.000304,0.012535,0.001342,10,ovr,0.01,rbf,"{'C': 10, 'decision_function_shape': 'ovr', 'g...",0.622807,0.622807,0.631579,0.640351,0.619469,0.627403,0.007619,25


* It might be better to sort them in descending order of performance.

In [8]:
grid_search_results.sort_values(by='rank_test_score')

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_decision_function_shape,param_gamma,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
29,13.012354,6.133496,0.001773,0.000196,100,ovr,,linear,"{'C': 100, 'decision_function_shape': 'ovr', '...",0.938596,0.947368,0.973684,0.973684,0.982301,0.963127,0.016974,1
28,13.753024,2.014074,0.002387,0.000601,100,ovo,,linear,"{'C': 100, 'decision_function_shape': 'ovo', '...",0.938596,0.947368,0.973684,0.973684,0.982301,0.963127,0.016974,1
31,11.413814,8.383838,0.002977,0.001363,1000,ovr,,linear,"{'C': 1000, 'decision_function_shape': 'ovr', ...",0.947368,0.947368,0.973684,0.929825,0.973451,0.954339,0.016957,3
30,13.373683,9.776557,0.002199,0.000617,1000,ovo,,linear,"{'C': 1000, 'decision_function_shape': 'ovo', ...",0.947368,0.947368,0.973684,0.929825,0.973451,0.954339,0.016957,3
27,6.902102,1.274604,0.003198,0.000837,10,ovr,,linear,"{'C': 10, 'decision_function_shape': 'ovr', 'k...",0.938596,0.938596,0.973684,0.947368,0.964602,0.952569,0.0142,5
26,6.894012,1.496287,0.002854,0.000383,10,ovo,,linear,"{'C': 10, 'decision_function_shape': 'ovo', 'k...",0.938596,0.938596,0.973684,0.947368,0.964602,0.952569,0.0142,5
24,2.078829,0.843934,0.002588,0.000359,1,ovo,,linear,"{'C': 1, 'decision_function_shape': 'ovo', 'ke...",0.947368,0.929825,0.973684,0.921053,0.955752,0.945536,0.018689,7
25,2.088413,0.840969,0.002672,0.000415,1,ovr,,linear,"{'C': 1, 'decision_function_shape': 'ovr', 'ke...",0.947368,0.929825,0.973684,0.921053,0.955752,0.945536,0.018689,7
17,0.012243,0.000993,0.004748,0.00035,100,ovr,0.0001,rbf,"{'C': 100, 'decision_function_shape': 'ovr', '...",0.947368,0.938596,0.95614,0.938596,0.911504,0.938441,0.014957,9
14,0.012212,0.000722,0.005215,0.00011,100,ovo,0.0001,rbf,"{'C': 100, 'decision_function_shape': 'ovo', '...",0.947368,0.938596,0.95614,0.938596,0.911504,0.938441,0.014957,9


* If we want the best performance directly:

In [9]:
grid_search.best_score_

0.9631268436578171

* As before, we can also test with a `SGDClassifier`.

* We will start with a different approach.

* Instead of exhausting all possible combinations in the search, we will randomly sample between them.

In [10]:
param_dist = {
    "average": [True, False],
    "l1_ratio": stats.uniform(0, 1),
    "alpha": stats.loguniform(1e-2, 1e0),
}

* To do this, we use the `RandomizedSearchCV` class by giving the number of samples we want it to try from the space of possible combinations of the parameters.

In [11]:
clf = SGDClassifier()
random_search = RandomizedSearchCV(clf, param_distributions=param_dist, n_iter=15, n_jobs=-1)
random_search.fit(bc.data, bc.target)

RandomizedSearchCV(estimator=SGDClassifier(), n_iter=15, n_jobs=-1,
                   param_distributions={'alpha': <scipy.stats._distn_infrastructure.rv_frozen object at 0x13577f1c0>,
                                        'average': [True, False],
                                        'l1_ratio': <scipy.stats._distn_infrastructure.rv_frozen object at 0x135758f70>})

* To see what we found:

In [12]:
random_search.best_params_

{'alpha': 0.3664163569482923,
 'average': False,
 'l1_ratio': 0.15413448344948555}

* And the optimal estimator:

In [13]:
random_search.best_estimator_

SGDClassifier(alpha=0.3664163569482923, l1_ratio=0.15413448344948555)

* And how good this is:

In [14]:
random_search.best_score_

0.912125446359261

* If we want (and can) do an exhaustive price search, of course we proceed as before:

In [15]:
param_grid = {
    "average": [True, False],
    "l1_ratio": np.linspace(0, 1, num=10),
    "alpha": np.power(10, np.arange(-2, 1, dtype=float)),
}

* Again we use the `GridSearchCV` class.

In [16]:
clf = SGDClassifier()
grid_search = GridSearchCV(clf, param_grid=param_grid, n_jobs=-1)
grid_search.fit(bc.data, bc.target)

GridSearchCV(estimator=SGDClassifier(), n_jobs=-1,
             param_grid={'alpha': array([0.01, 0.1 , 1.  ]),
                         'average': [True, False],
                         'l1_ratio': array([0.        , 0.11111111, 0.22222222, 0.33333333, 0.44444444,
       0.55555556, 0.66666667, 0.77777778, 0.88888889, 1.        ])})

* To see what we found:

In [17]:
grid_search.best_params_

{'alpha': 0.01, 'average': False, 'l1_ratio': 0.0}

* And the best appraiser:

In [18]:
grid_search.best_estimator_

SGDClassifier(alpha=0.01, l1_ratio=0.0)

* And its performance:

In [19]:
grid_search.best_score_

0.9155721161310355

* But shouldn't we also try normalization?

* So we will make a pipeline.

In [20]:
clf = make_pipeline(StandardScaler(), SGDClassifier())

* We can define parameters to search for each of the steps of a pipeline.

* Here we will only give for the second.

In [21]:
param_grid = {
    "sgdclassifier__average": [True, False],
    "sgdclassifier__l1_ratio": np.linspace(0, 1, num=10),
    "sgdclassifier__alpha": np.power(10, np.arange(-2, 1, dtype=float)),
}

* So we proceed to the search...

In [22]:
grid_search = GridSearchCV(clf, param_grid=param_grid, n_jobs=-1)
grid_search.fit(bc.data, bc.target)

GridSearchCV(estimator=Pipeline(steps=[('standardscaler', StandardScaler()),
                                       ('sgdclassifier', SGDClassifier())]),
             n_jobs=-1,
             param_grid={'sgdclassifier__alpha': array([0.01, 0.1 , 1.  ]),
                         'sgdclassifier__average': [True, False],
                         'sgdclassifier__l1_ratio': array([0.        , 0.11111111, 0.22222222, 0.33333333, 0.44444444,
       0.55555556, 0.66666667, 0.77777778, 0.88888889, 1.        ])})

* And here's what we got:

In [23]:
grid_search.best_params_

{'sgdclassifier__alpha': 0.01,
 'sgdclassifier__average': True,
 'sgdclassifier__l1_ratio': 0.7777777777777777}

* The best estimator:

In [24]:
grid_search.best_estimator_

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('sgdclassifier',
                 SGDClassifier(alpha=0.01, average=True,
                               l1_ratio=0.7777777777777777))])

* And the best performance:

In [25]:
grid_search.best_score_

0.9806707033069401