# Wine data set 

In [7]:
from sklearn import datasets

# Load the wine data set as pandas objects: X holds the features, y the target.
X, y = datasets.load_wine(return_X_y=True, as_frame=True)
# Show both shapes as the cell output.
X.shape, y.shape

((178, 13), (178,))

In [8]:
# Display the feature frame and the target together as the cell output.
X, y

(     alcohol  malic_acid   ash  alcalinity_of_ash  magnesium  total_phenols  \
 0      14.23        1.71  2.43               15.6      127.0           2.80   
 1      13.20        1.78  2.14               11.2      100.0           2.65   
 2      13.16        2.36  2.67               18.6      101.0           2.80   
 3      14.37        1.95  2.50               16.8      113.0           3.85   
 4      13.24        2.59  2.87               21.0      118.0           2.80   
 ..       ...         ...   ...                ...        ...            ...   
 173    13.71        5.65  2.45               20.5       95.0           1.68   
 174    13.40        3.91  2.48               23.0      102.0           1.80   
 175    13.27        4.28  2.26               20.0      120.0           1.59   
 176    13.17        2.59  2.37               20.0      120.0           1.65   
 177    14.13        4.10  2.74               24.5       96.0           2.05   
 
      flavanoids  nonflavanoid_phenols

In [11]:
from sklearn.model_selection import train_test_split
from sklearn import svm

In [22]:
import numpy as np

def find_best_split(X, y, test_sizes=None):
    """Sweep hold-out test fractions and report the best-scoring one.

    For every candidate fraction the data is split with ``train_test_split``
    (fixed ``random_state=0``), a linear ``SVC`` with ``C=1`` is fitted on the
    training part, and the classifier's ``score`` on the test part is recorded.

    Parameters
    ----------
    X, y
        Features and labels, passed unchanged to ``train_test_split``.
    test_sizes : iterable of float, optional
        Test fractions to try. Defaults to the nine tenths 0.1, 0.2, ..., 0.9.

    Returns
    -------
    tuple
        ``(best_test_size, best_score)``. On ties the first fraction tried wins.

    Raises
    ------
    ValueError
        If ``test_sizes`` is empty.
    """
    if test_sizes is None:
        # Build each tenth with a single division instead of
        # np.arange(0.1, 0.99, 0.1): the float arange produces drifted values
        # such as 0.30000000000000004 and 0.7000000000000001, which end up as
        # dictionary keys / the returned split and can round the test-set size
        # up by one sample.
        test_sizes = [tenth / 10 for tenth in range(1, 10)]

    scores = {}
    for test_size in test_sizes:
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=0
        )
        clf = svm.SVC(kernel='linear', C=1)
        clf.fit(X_train, y_train)
        scores[test_size] = clf.score(X_test, y_test)

    if not scores:
        raise ValueError("test_sizes must contain at least one test fraction")

    # max() over the dict iterates keys in insertion order, so ties keep the
    # earliest (smallest default) test fraction.
    best_split = max(scores, key=scores.get)
    return best_split, scores[best_split]

# Run the hold-out sweep on the wine data; the cell shows the returned (test_size, score) pair.
find_best_split(X, y)

(0.1, 1.0)

Ottengo le prestazioni migliori del classificatore quando utilizzo un test set piuttosto piccolo, ovvero il 10% del data set totale (il restante 90% viene usato come training set).

In [29]:
from sklearn.model_selection import cross_val_score

def find_best_cv(X, y):
    """Pick the fold count (2 through 10) with the highest mean CV score.

    A single linear ``SVC`` (``C=1``) is handed to ``cross_val_score`` once per
    fold count, and the mean of the returned scores is kept for each count.

    Returns
    -------
    tuple
        ``(n_folds, mean_score)`` for the best fold count; on ties the
        smallest fold count wins because it is evaluated first.
    """
    classifier = svm.SVC(kernel='linear', C=1)

    mean_by_folds = {
        n_folds: cross_val_score(classifier, X, y, cv=n_folds).mean()
        for n_folds in range(2, 11)
    }

    winner = max(mean_by_folds, key=mean_by_folds.get)
    return winner, mean_by_folds[winner]

# Run the fold-count sweep on the wine data; the cell shows the returned (n_folds, mean score) pair.
find_best_cv(X, y)
        

(8, 0.9612154150197629)

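In questo caso, applicando una K-Fold Cross-validation ottengo prestazioni peggiori rispetto all'applicazione di una Hold-out con un test set del 10%. Va però notato che la stima Hold-out si basa su soli 18 campioni di test (il 10% di 178), quindi è molto più variabile della media ottenuta con la Cross-validation.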

In [36]:
from sklearn import preprocessing 
from sklearn import pipeline

# Hold out 30% of the samples as a final test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

scaler = preprocessing.StandardScaler()
clf = svm.SVC(kernel='linear', C=1)

# Give the estimator its own name: assigning it to `pipeline` would rebind the
# imported `sklearn.pipeline` module, so any later `pipeline.make_pipeline(...)`
# call would fail with an AttributeError.
scaled_svm = pipeline.make_pipeline(scaler, clf)

# 8-fold cross-validation on the training portion only; the test set stays untouched.
cv_scores = cross_val_score(scaled_svm, X_train, y_train, cv=8)

print(cv_scores, cv_scores.mean())

# Fit on the whole training portion, then score once on the held-out test set.
scaled_svm.fit(X_train, y_train)
print(scaled_svm.score(X_test, y_test))

[0.875      0.75       1.         0.9375     0.93333333 1.
 0.93333333 1.        ] 0.9286458333333334
1.0


In [43]:
from sklearn.model_selection import KFold, GridSearchCV

def best_params_clf(X, y):
    """Grid-search SVC hyper-parameters with shuffled 8-fold cross-validation.

    Two grids are searched: a linear kernel with ``C`` in 1, 11, ..., 91, and
    an RBF kernel with ``C`` in 1, 11, ..., 111 combined with the ``gamma``
    values produced by ``np.arange(0.0001, 0.001, 0.0005)``.

    Parameters
    ----------
    X, y
        Features and labels, passed unchanged to ``GridSearchCV.fit``.

    Returns
    -------
    tuple
        ``(best_score_, best_params_)`` as exposed by the fitted ``GridSearchCV``.

    Notes
    -----
    Only one cross-validation loop is run. The score is therefore the one used
    to select the parameters, not an independent (nested-CV) estimate.
    """
    param_grid = [
            {'C': np.arange(1, 101, 10), 'kernel': ['linear']},
            {'C': np.arange(1, 121, 10), 'gamma': np.arange(0.0001, 0.001, 0.0005), 'kernel': ['rbf']},
        ]

    model = svm.SVC()

    # Single splitter for the search. The previously constructed `outer_cv`
    # was never used, so it has been dropped rather than implying a nested CV.
    inner_cv = KFold(n_splits=8, shuffle=True, random_state=0)

    clf = GridSearchCV(estimator=model, param_grid=param_grid, cv=inner_cv)
    clf.fit(X, y)

    return (clf.best_score_, clf.best_params_)

# Run the grid search on the full wine data; the cell shows the returned (best score, best params) pair.
best_params_clf(X, y)


(0.9604743083003953, {'C': 11, 'kernel': 'linear'})