## Support vector machines

In [1]:
import numpy as np
import os
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt

# Read every array stored in the .npz archive into a plain dict so the
# values stay available after the file handle is closed.
npz_path = os.path.join('data', 'cifar4-train.npz')
with np.load(npz_path, allow_pickle=False) as data:
    cifar4_data = {key: data[key] for key in data.keys()}

print('Data loaded')
print('It is a dictionary with keys:', list(cifar4_data.keys()))

Data loaded
It is a dictionary with keys: ['pixels', 'overfeat', 'labels', 'names', 'allow_pickle']


In [2]:
# Feature matrix (the 'overfeat' array) and the matching label vector
X, y = cifar4_data['overfeat'], cifar4_data['labels']

# Report the dimensions of both arrays
for label, array in (('X', X), ('y', y)):
    print(label + ' shape:', array.shape)

X shape: (5000, 4096)
y shape: (5000,)


In [3]:
from sklearn.decomposition import PCA

# PCA to reduce dimensions.
# Using 164 as this is the result from the previous exercise ensuring 90% of PVE explained.
# random_state is fixed because, for an input of this size, the default solver
# is the randomized SVD, which is otherwise not reproducible between runs.
pca = PCA(n_components=164, random_state=0)

# Always project the raw features rather than `X` itself, so re-running this
# cell does not apply PCA to data that has already been reduced.
# NOTE(review): PCA is fitted on all samples before the train/test split done
# later in the notebook, so test samples influence the projection. Consider
# fitting on the training split only (e.g. inside a Pipeline).
X = pca.fit_transform(cifar4_data['overfeat'])
print('X shape:',X.shape)

X shape: (5000, 164)


Based on the earlier data exploration, we keep the first 164 principal components (enough to explain 90% of the variance) and project the data set onto them.

In [5]:
# Hold out a stratified test set
from sklearn.model_selection import train_test_split

# 4000 training / 1000 test samples; stratify on y so the class proportions
# are preserved in both parts, and fix the seed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=4000,
    test_size=1000,
    random_state=0,
    stratify=y,
)

# Sanity check: shape and dtype of every part
for label, part in [
    ('X_train', X_train),
    ('y_train', y_train),
    ('X_test', X_test),
    ('y_test', y_test),
]:
    print(label + ':', part.shape, part.dtype)

X_train: (4000, 164) float32
y_train: (4000,) int64
X_test: (1000, 164) float32
y_test: (1000,) int64


### SVM classifier with a linear kernel

In [10]:
from sklearn.model_selection import GridSearchCV
# SVC is used below but is not imported by any earlier cell, so a fresh
# kernel would raise NameError without this line.
from sklearn.svm import SVC

# Create cross validation object: 5-fold grid search over the regularisation
# strength C of a linear-kernel SVM
grid_cv = GridSearchCV(SVC(kernel='linear'), {'C': [0.01,0.1,1,5,10]}, cv=5)

# Fit estimator
grid_cv.fit(X_train, y_train)

# Collect results in a DataFrame.
# DataFrame.from_items was removed in pandas 1.0; the plain constructor with a
# dict keeps the columns in insertion order.
df = pd.DataFrame({
    'C': grid_cv.cv_results_['param_C'],
    'mean accuracy': grid_cv.cv_results_['mean_test_score'],
    'standard deviation': grid_cv.cv_results_['std_test_score'],
})
df.sort_values(by='mean accuracy', ascending=False)


Unnamed: 0,C,mean accuracy,standard deviation
0,0.01,0.8175,0.017302
1,0.1,0.803,0.019931
2,1.0,0.7895,0.01629
3,5.0,0.786,0.016982
4,10.0,0.786,0.016628


In [13]:
# Best combination: the row of `df` with the highest mean CV accuracy
# (idxmax returns the first such row if several are tied)
idx_max = df.loc[df['mean accuracy'].idxmax()]

# Read the fields by column label; integer keys on a label-indexed Series are
# deprecated positional access in recent pandas.
print('Linear SVM - top accuracy across folds:',idx_max['mean accuracy'],
      ' (std:',idx_max['standard deviation'], ') with C:',idx_max['C'])

Linear SVM - top accuracy across folds: 0.8175  (std: 0.017302456472998287 ) with C: 0.01


### SVM classifier with an RBF kernel

In [15]:
# Hyper-parameter grid for the RBF kernel: every (C, gamma) pair is evaluated.
# NOTE(review): in the results recorded below the best gamma is the smallest
# value of this grid, so smaller gammas may be worth exploring.
rbf_param_grid = {
    'C': [0.01,0.1,1,5,10],
    'gamma': [0.1,1,10,100,1000],
}

# 5-fold cross-validated grid search, fitted on the training split
grid_cv_rbf = GridSearchCV(SVC(kernel='rbf'), rbf_param_grid, cv=5)
grid_cv_rbf.fit(X_train, y_train)

# Show which result fields are available
grid_cv_rbf.cv_results_.keys()

dict_keys(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time', 'param_C', 'param_gamma', 'params', 'split0_test_score', 'split1_test_score', 'split2_test_score', 'split3_test_score', 'split4_test_score', 'mean_test_score', 'std_test_score', 'rank_test_score', 'split0_train_score', 'split1_train_score', 'split2_train_score', 'split3_train_score', 'split4_train_score', 'mean_train_score', 'std_train_score'])

In [17]:
# Collect results in a DataFrame: one row per (C, gamma) combination.
# DataFrame.from_items was removed in pandas 1.0; the plain constructor with a
# dict keeps the columns in insertion order.
df_rbf = pd.DataFrame({
    'C': grid_cv_rbf.cv_results_['param_C'],
    'gamma': grid_cv_rbf.cv_results_['param_gamma'],
    'mean accuracy': grid_cv_rbf.cv_results_['mean_test_score'],
    'standard deviation': grid_cv_rbf.cv_results_['std_test_score'],
})
# Show the five best-scoring combinations
df_rbf.sort_values(by='mean accuracy', ascending=False).head()

Unnamed: 0,C,gamma,mean accuracy,standard deviation
0,0.01,0.1,0.69475,0.009695
5,0.1,0.1,0.69475,0.009695
21,10.0,1.0,0.2535,0.002151
6,0.1,1.0,0.2535,0.002151
11,1.0,1.0,0.2535,0.002151


In [23]:
# Best combination: the row of `df_rbf` with the highest mean CV accuracy
# (idxmax returns the first such row if several are tied)
idx_max = df_rbf.loc[df_rbf['mean accuracy'].idxmax()]

# Read the fields by column label; integer keys on a label-indexed Series are
# deprecated positional access in recent pandas.
print('RBF SVM - top accuracy across folds:',idx_max['mean accuracy'],
      ' (std:',idx_max['standard deviation'], ') with C:',idx_max['C'],
      'and gamma:',idx_max['gamma'])

RBF SVM - top accuracy across folds: 0.69475  (std: 0.00969535971483263 ) with C: 0.01 and gamma: 0.1


### Test set

In [27]:
# SVM classifier with a linear kernel.
# Take C from the grid search instead of re-typing it, so this model always
# matches the cross-validation winner even if the search results change.
svm_linear = SVC(kernel='linear', **grid_cv.best_params_)
svm_linear.fit(X_train,y_train)
# Accuracy on the held-out test set
svm_linear.score(X_test,y_test)

0.817

In [28]:
# SVM classifier with an RBF kernel.
# Take C and gamma from the grid search instead of re-typing them, so this
# model always matches the cross-validation winner even if the results change.
svm_rbf = SVC(kernel='rbf', **grid_cv_rbf.best_params_)
svm_rbf.fit(X_train,y_train)
# Accuracy on the held-out test set
svm_rbf.score(X_test,y_test)

0.709

In [29]:
# Report the test accuracies computed from the fitted models rather than
# typing the numbers by hand, so the summary cannot go stale.
# '{:.0%}' renders a fraction as a whole-number percentage (0.817 -> '82%').
print('Linear SVM accuracy (test set): {:.0%}'.format(svm_linear.score(X_test, y_test)))
print('RBF SVM accuracy (test set): {:.0%}'.format(svm_rbf.score(X_test, y_test)))

Linear SVM accuracy (test set): 82%
RBF SVM accuracy (test set): 71%
