# Kernelized Support Vector Machines for Classification: SVC

### Initial imports

In [67]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [68]:
from sklearn.datasets import make_blobs

In [69]:
# Draw a small synthetic 2-D dataset made of four blobs; the fixed seed keeps
# the sample identical from run to run.
X, y = make_blobs(centers=4, random_state=0)

# Report the dimensions of the feature matrix and of the label vector.
for template, array in (("X shape: {}", X), ("y shape: {}", y)):
    print(template.format(array.shape))

X shape: (100, 2)
y shape: (100,)


In [70]:
# Tally how many samples carry each blob label.
label_counts = pd.Series(y).value_counts()
label_counts

3    25
2    25
1    25
0    25
dtype: int64

In [71]:
# Fold the four blob labels into two classes by parity (even -> 0, odd -> 1),
# which turns the task into binary classification.
y = np.mod(y, 2)
pd.Series(y).value_counts()

1    50
0    50
dtype: int64

In [72]:
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split

In [73]:
# Hold out 25% of the samples. Stratifying on y keeps the class balance the
# same in both splits, and random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=.25, random_state=0
)

# LinearSVC's underlying liblinear solver uses a random number generator while
# optimising, so an unseeded model can give slightly different scores on each
# run. Seeding it makes the reported accuracies reproducible after a kernel
# restart.
linear_svm = LinearSVC(random_state=0)
linear_svm.fit(X_train, y_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [74]:
# Accuracy of the plain linear SVM on the training split and on the held-out split.
train_accuracy = linear_svm.score(X_train, y_train)
test_accuracy = linear_svm.score(X_test, y_test)

print("train set accuracy: {:.3f}".format(train_accuracy))
print("test set accuracy: {:.3f}".format(test_accuracy))

train set accuracy: 0.573
test set accuracy: 0.600


### Feature Engineering

In [75]:
# Append the square of every column from index 1 onward as extra feature
# column(s); with two input features this adds the squared second feature.
X_new_train = np.hstack((X_train, np.square(X_train[:, 1:])))
X_new_test = np.hstack((X_test, np.square(X_test[:, 1:])))

# Peek at the first three augmented training rows.
print(X_new_train[:3])

[[  0.91498017   9.17198797  84.12536323]
 [  0.85624076   3.86236175  14.91783827]
 [ -2.39572443   7.39763997  54.72507714]]


### Parameter tuning with GridSearchCV

In [76]:
from sklearn.model_selection import GridSearchCV

In [77]:
# Candidate values for the regularisation strength C, spaced by powers of ten.
param_grid = {"C": [0.001, 0.01, 0.1, 1, 10, 100]}

# 5-fold cross-validated search over C on the augmented features.
# The LinearSVC is seeded because its liblinear solver uses a random number
# generator; without a seed the selected C and the scores can drift between
# runs, making the printed results irreproducible.
gscv = GridSearchCV(LinearSVC(random_state=0), param_grid, cv=5)
gscv.fit(X_new_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'C': [0.001, 0.01, 0.1, 1, 10, 100]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [78]:
# Score the fitted grid search on both splits of the augmented data, then
# show which C the search selected.
for template, features, labels in [
    ("gscv-train set accuracy: {:.3f}", X_new_train, y_train),
    ("gscv-test set accuracy: {:.3f}", X_new_test, y_test),
]:
    print(template.format(gscv.score(features, labels)))
print("best parameters: {}".format(gscv.best_params_))

gscv-train set accuracy: 0.920
gscv-test set accuracy: 0.840
best parameters: {'C': 1}


# SVC

In [79]:
from sklearn.svm import SVC
from sklearn.datasets import load_breast_cancer

In [80]:
# Load the breast cancer dataset bundled with scikit-learn and list the
# fields the returned object exposes.
cancer = load_breast_cancer()
dataset_keys = cancer.keys()
print("cancer keys:\n{}".format(dataset_keys))

# Stratified 75/25 split with a fixed seed. Note that this rebinds
# X_train / X_test / y_train / y_test, replacing the blob-data splits above.
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data,
    cancer.target,
    stratify=cancer.target,
    test_size=0.25,
    random_state=0,
)
print("\nX_train shape: {}".format(X_train.shape))

cancer keys:
dict_keys(['data', 'target', 'target_names', 'DESCR', 'feature_names'])

X_train shape: (426, 30)


In [81]:
# Baseline RBF-kernel SVC on the raw, unscaled features.
# gamma is pinned to 'auto' (1 / n_features): that was the library default when
# this notebook was written, but scikit-learn 0.22 changed the default to
# 'scale'. Being explicit keeps this baseline the same on newer versions.
svc = SVC(gamma='auto')
svc.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [82]:
# Compare training accuracy with held-out accuracy for the untuned SVC.
svc_train_accuracy = svc.score(X_train, y_train)
svc_test_accuracy = svc.score(X_test, y_test)

print("SVC-train set accuracy: {:.3f}".format(svc_train_accuracy))
print("SVC-test set accuracy: {:.3f}".format(svc_test_accuracy))

SVC-train set accuracy: 1.000
SVC-test set accuracy: 0.629


### Parameter tuning with GridSearchCV

In [83]:
# Search the same powers-of-ten range for both the regularisation strength C
# and the kernel coefficient gamma.
log_spaced_values = [0.001, 0.01, 0.1, 1, 10, 100]
param_grid_svc = {
    'C': list(log_spaced_values),
    'gamma': list(log_spaced_values),
}

# 5-fold cross-validated grid search over every (C, gamma) pair.
svc_grid = GridSearchCV(estimator=SVC(), param_grid=param_grid_svc, cv=5)
svc_grid.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'C': [0.001, 0.01, 0.1, 1, 10, 100], 'gamma': [0.001, 0.01, 0.1, 1, 10, 100]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [84]:
# Score the fitted SVC grid search on both splits and show the selected
# (C, gamma) combination.
for template, features, labels in [
    ("SVC-Grid-train set accuracy: {:.3f}", X_train, y_train),
    ("SVC-Grid-test set accuracy: {:.3f}", X_test, y_test),
]:
    print(template.format(svc_grid.score(features, labels)))
print("best parameters: {}".format(svc_grid.best_params_))

SVC-Grid-train set accuracy: 0.981
SVC-Grid-test set accuracy: 0.909
best parameters: {'C': 1, 'gamma': 0.001}


### Preprocessing data

In [85]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline

In [89]:
# Chain min-max scaling with the SVC so the scaler is part of the estimator
# that the grid search cross-validates.
pipeline_steps = [('scaler', MinMaxScaler()), ("svc", SVC())]
pipe = Pipeline(steps=pipeline_steps)

# The "svc__" prefix routes each parameter to the pipeline step named "svc".
candidate_values = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
param_grid_pipe = {
    'svc__C': list(candidate_values),
    'svc__gamma': list(candidate_values),
}

# 5-fold cross-validated search over every (C, gamma) pair.
grid = GridSearchCV(estimator=pipe, param_grid=param_grid_pipe, cv=5)
grid.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(steps=[('scaler', MinMaxScaler(copy=True, feature_range=(0, 1))), ('svc', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'svc__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000], 'svc__gamma': [0.001, 0.01, 0.1, 1, 10, 100, 1000]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [90]:
# Summarise the pipeline search: best cross-validation score, accuracy on the
# training and held-out splits, and the winning hyper-parameters.
best_cv_accuracy = grid.best_score_
pipeline_train_accuracy = grid.score(X_train, y_train)
pipeline_test_accuracy = grid.score(X_test, y_test)

print("best cv accuracy: {:.3f}".format(best_cv_accuracy))
print("training set accuracy: {:.3f}".format(pipeline_train_accuracy))
print("test set accuracy: {:.3f}".format(pipeline_test_accuracy))
print("best parameters: {}".format(grid.best_params_))

best cv accuracy: 0.984
training set accuracy: 0.993
test set accuracy: 0.951
best parameters: {'svc__C': 10, 'svc__gamma': 0.1}
