In [5]:
import pandas as pd
import numpy as np

import timeit
from IPython.display import clear_output

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import ParameterGrid
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.dummy import DummyClassifier
from sklearn.preprocessing import StandardScaler

In [6]:
# Read every array stored in the archive into a plain dict while the file is
# still open (allow_pickle=False refuses pickled object arrays).
with np.load('cifar4-overfeat-split.npz', allow_pickle=False) as data:
    split_data = dict(data.items())

# Expose each stored array as a notebook-level variable named after its key
# (later cells rely on X_tr, y_tr, X_te and y_te).
# globals() is used rather than locals(): the dict returned by locals() is
# documented as something that should not be modified.
globals().update(split_data)

We don't have many samples, so a kernel-based `SVC` is affordable here and we use it rather than an SGD-trained linear SVM: https://stackoverflow.com/questions/29704231/in-sklearn-what-is-the-difference-between-a-svm-model-with-linear-kernel-and-a-s

### Create an SVM classifier with a linear kernel. Tune its C parameter using grid search cross-validation.

In [7]:
# 5-fold stratified splitter shared by both grid searches below.
# No random_state: without shuffle=True the folds are already deterministic,
# and passing random_state with shuffle=False is rejected by newer
# scikit-learn releases.
strat_cv = StratifiedKFold(n_splits=5)

We use PCA for the same reason as before, to reduce dimensionality and noise, and improve our computing time and results. 

In [8]:
# Candidate values for the linear SVM's regularisation parameter C.
param_grid_linear = {'C': [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 10]}

# Keep 408 principal components. The seed makes the projection reproducible:
# PCA's default solver may select a randomized SVD depending on the input
# shape, in which case an unseeded run gives slightly different components.
pca = PCA(n_components=408, random_state=0)

# Exhaustive search over C with the shared stratified 5-fold splitter,
# using all available cores.
# NOTE(review): PCA is fitted on the whole training set before this search
# runs, so each validation fold has already influenced the projection.
# Wrapping PCA and SVC in a Pipeline would keep the fit inside every fold.
grid_cv = GridSearchCV(
    SVC(kernel='linear'),
    param_grid=param_grid_linear,
    cv=strat_cv,
    n_jobs=-1,
)

In [9]:
# Fit PCA on the training features and project them in a single step.
X_tr_pca = pca.fit_transform(X_tr)

# Time the cross-validated search over C for the linear kernel.
start = timeit.default_timer()
grid_cv.fit(X_tr_pca, y_tr)
stop = timeit.default_timer()

elapsed_minutes = (stop - start) / 60
print('CV grid search done in {:.2f} minutes'.format(elapsed_minutes))

CV grid search done in 1.21 minutes


In [10]:
# Per-candidate cross-validation summary from the linear-kernel search.
params = grid_cv.cv_results_['params']
scores = grid_cv.cv_results_['mean_test_score']
st_dev = grid_cv.cv_results_['std_test_score']

# One row per candidate. C is read by key rather than by its position in the
# parameter dict, so the column cannot silently pick up another parameter.
results_linear = pd.DataFrame({
    'C': [candidate['C'] for candidate in params],
    'mean accuracy': scores,
    'standard deviation': st_dev,
})
results_linear

Unnamed: 0,C,mean accuracy,standard deviation
0,0.001,0.838,0.010416
1,0.005,0.8325,0.009779
2,0.01,0.82525,0.010259
3,0.05,0.79925,0.005734
4,0.1,0.793,0.004783
5,0.5,0.78725,0.009918
6,1.0,0.78725,0.009918
7,10.0,0.78725,0.009918


### Create an SVM classifier with an RBF kernel. Tune its C and γ parameters using grid search.

In [11]:
# Search space for the RBF kernel: the same C values as the linear search,
# crossed with gamma taken at powers of ten from 10**-5 up to 10**1.
rbf_c_values = [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 10]
rbf_gamma_values = [10**k for k in range(-5, 2)]

param_grid_rbf = ParameterGrid([{'C': rbf_c_values, 'gamma': rbf_gamma_values}])

# GridSearchCV receives the underlying list of dicts held by the ParameterGrid.
grid_cv_rbf = GridSearchCV(
    SVC(kernel='rbf'),
    param_grid=param_grid_rbf.param_grid,
    cv=strat_cv,
    n_jobs=-1,
)

In [12]:
# Time the cross-validated search over (C, gamma) for the RBF kernel,
# reusing the PCA-projected training features.
start = timeit.default_timer()
grid_cv_rbf.fit(X_tr_pca, y_tr)
stop = timeit.default_timer()

elapsed_minutes = (stop - start) / 60
print('CV grid search on SVM rbf done in {:.2f} minutes'.format(elapsed_minutes))

CV grid search on SVM rbf done in 29.82 minutes


Had a nap. And dinner. And a small digestive nap. 

In [13]:
# Candidate parameter dicts evaluated by the RBF grid search.
rbf_cv_results = grid_cv_rbf.cv_results_
params_rbf = rbf_cv_results['params']

In [14]:
# Per-candidate cross-validation summary from the RBF-kernel search.
params_rbf = grid_cv_rbf.cv_results_['params']
scores_rbf = grid_cv_rbf.cv_results_['mean_test_score']
st_dev_rbf = grid_cv_rbf.cv_results_['std_test_score']

# One row per (C, gamma) candidate. Both parameters are read by key, so the
# columns do not depend on the ordering of keys inside each parameter dict.
results_rbf = pd.DataFrame({
    'C': [candidate['C'] for candidate in params_rbf],
    'gamma': [candidate['gamma'] for candidate in params_rbf],
    'mean accuracy': scores_rbf,
    'standard deviation': st_dev_rbf,
})

# Show the 20 best candidates by mean cross-validated accuracy.
results_rbf.sort_values('mean accuracy', ascending=False).head(20)

Unnamed: 0,C,gamma,mean accuracy,standard deviation
50,10.0,0.0001,0.84025,0.012976
43,1.0,0.0001,0.8315,0.01032
49,10.0,1e-05,0.8305,0.010565
36,0.5,0.0001,0.8235,0.009097
42,1.0,1e-05,0.801,0.011549
51,10.0,0.001,0.7985,0.005208
44,1.0,0.001,0.79075,0.008314
29,0.1,0.0001,0.7875,0.010518
35,0.5,1e-05,0.78575,0.01391
22,0.05,0.0001,0.767,0.010446


In [15]:
# Best linear-kernel candidate: find the top-scoring row once and read every
# field from it, instead of repeating idxmax() for each value.
best_linear = results_linear.loc[results_linear['mean accuracy'].idxmax()]
linear_top_c = best_linear['C']
linear_stdev = best_linear['standard deviation']
linear_acc = best_linear['mean accuracy']

print('Linear SVM - top accuracy across folds: {:.5f} (std: {:.5f}) with C: {}'.format(linear_acc, linear_stdev,
                                                                                      linear_top_c))


# Best RBF-kernel candidate, selected the same way.
best_rbf = results_rbf.loc[results_rbf['mean accuracy'].idxmax()]
rbf_top_c = best_rbf['C']
rbf_stdev = best_rbf['standard deviation']
rbf_acc = best_rbf['mean accuracy']
rbf_top_gamma = best_rbf['gamma']

print('RBF SVM - top accuracy across folds: {:.5f} (std: {:.5f}) with C: {} and gamma: {}'.format(rbf_acc, rbf_stdev,
                                                                                         rbf_top_c, rbf_top_gamma))

Linear SVM - top accuracy across folds: 0.83800 (std: 0.01042) with C: 0.001
RBF SVM - top accuracy across folds: 0.84025 (std: 0.01298) with C: 10.0 and gamma: 0.0001


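The cross-validated results are very close: 0.838 for the linear kernel versus 0.840 for the RBF kernel, a gap smaller than either standard deviation (about 0.01).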

### Evaluate and report the accuracy of your (tuned) estimators on the 1,000 points from the test set.

In [16]:
# Project the test features with the PCA already fitted on the training set;
# the test data is only transformed here, never fitted.
X_te_pca = pca.transform(X_te)

# Linear kernel: retrain on the full training set with the selected C,
# then score on the held-out test set.
svc_lin = SVC(kernel='linear', C=linear_top_c)
svc_lin.fit(X_tr_pca, y_tr)
score_linear = svc_lin.score(X_te_pca, y_te)

# RBF kernel: same procedure with the selected C and gamma.
svc_rbf = SVC(kernel='rbf', C=rbf_top_c, gamma=rbf_top_gamma)
svc_rbf.fit(X_tr_pca, y_tr)
score_rbf = svc_rbf.score(X_te_pca, y_te)

# Report both test accuracies.
for message, accuracy in (
    ('Linear SVM accuracy (test set): {:.5f}', score_linear),
    ('RBF SVM accuracy (test set): {:.5f}', score_rbf),
):
    print(message.format(accuracy))

Linear SVM accuracy (test set): 0.84100
RBF SVM accuracy (test set): 0.83100


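Kids, don't fit the test set: PCA and both SVMs are fitted on the training data only, and the test set is merely transformed and scored.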

In [17]:
# Persist both test-set accuracies with IPython's %store magic so that
# another notebook can restore them with `%store -r`.
%store score_linear
%store score_rbf

Stored 'score_linear' (float64)
Stored 'score_rbf' (float64)
