# Support Vector Machine

In [1]:
import pandas as pd
import numpy as np

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
%matplotlib inline

In [4]:
from sklearn.datasets import load_breast_cancer

In [5]:
# Load scikit-learn's bundled breast cancer dataset; the loader returns a dict-like Bunch.
cancer = load_breast_cancer()

In [6]:
# Check what kind of container the loader returned.
type(cancer)

sklearn.utils.Bunch

In [7]:
# List the fields available in the dataset container.
cancer.keys()

dict_keys(['data', 'target', 'target_names', 'DESCR', 'feature_names', 'filename'])

In [8]:
# Uncomment to print the dataset description stored under the 'DESCR' key
# (left disabled to keep the notebook output short).
# print(cancer['DESCR'])

In [9]:
# Wrap the raw feature matrix in a DataFrame so every column carries its feature name.
df_feat = pd.DataFrame(
    data=cancer['data'],
    columns=cancer['feature_names'],
)

In [10]:
# Preview the first two rows to sanity-check the column names and values.
df_feat.head(2)

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902


In [11]:
  # Summarize the frame: row count plus dtype and non-null count for every feature column.
  df_feat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 30 columns):
mean radius                569 non-null float64
mean texture               569 non-null float64
mean perimeter             569 non-null float64
mean area                  569 non-null float64
mean smoothness            569 non-null float64
mean compactness           569 non-null float64
mean concavity             569 non-null float64
mean concave points        569 non-null float64
mean symmetry              569 non-null float64
mean fractal dimension     569 non-null float64
radius error               569 non-null float64
texture error              569 non-null float64
perimeter error            569 non-null float64
area error                 569 non-null float64
smoothness error           569 non-null float64
compactness error          569 non-null float64
concavity error            569 non-null float64
concave points error       569 non-null float64
symmetry error             569 

In [12]:
# Human-readable class names; the array index is the integer label used in
# cancer['target'] (0 = malignant, 1 = benign).
cancer['target_names']

array(['malignant', 'benign'], dtype='<U9')

In [13]:
from sklearn.model_selection import train_test_split

In [14]:
# Features are the full feature DataFrame; labels are the dataset's integer target vector.
X, y = df_feat, cancer['target']

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=101,
)

In [15]:
# import Support Vector Classification model
from sklearn.svm import SVC

## Model parameters

- **C (default=1.0): Penalty parameter C of the error term**. The C parameter tells the SVM optimization how much you want to avoid misclassifying each training example.
  - For large values of C:
    - The optimization will choose a smaller-margin hyperplane if that hyperplane does a better job of getting all the training points classified correctly.
  - Conversely, a very small value of C:
    - Will cause the optimizer to look for a larger-margin separating hyperplane, even if that hyperplane misclassifies more points.
  - For very tiny values of C:
    - You should get misclassified examples, often even if your training data is linearly separable.


- **cache_size=200**: Specify the size of the kernel cache (in MB).


- **class_weight=None**: Set the parameter C of class i to class_weight[i]*C for SVC.

If not given, all classes are supposed to have weight one. The "balanced" mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data as:
> n_samples / (n_classes * np.bincount(y))

- **coef0=0.0**: Independent term in kernel function. It is only significant in 'poly' and 'sigmoid'.
- **decision_function_shape='ovr'**:

Whether to return a one-vs-rest ('ovr') decision function of shape (n_samples, n_classes) as all other classifiers, or the original one-vs-one ('ovo') decision function of libsvm which has shape (n_samples, n_classes * (n_classes - 1) / 2). However, one-vs-one ('ovo') is always used as multi-class strategy.

Which decision_function_shape should be used for multi-label classification problem?

OneVsRestClassifier is designed to model each class against all of the other classes independently, and to create a classifier for each situation. The way I understand this process is that OneVsRestClassifier grabs a class and **creates a binary label for whether a point is or isn't that class**. This labelling then gets fed into whatever estimator you have chosen to use. I believe the confusion arises because SVC also allows you to make this same choice, but **in effect, with this implementation, the choice will not matter because you will always be feeding only two classes into the SVC**.


- **degree=3**: Degree of the polynomial kernel function ('poly'). Ignored by all other kernels.
- **gamma='auto'**:

Kernel coefficient for 'rbf', 'poly' and 'sigmoid'.

Current default is 'auto' which uses 1 / n_features, if gamma='scale' is passed then it uses 1 / (n_features * X.var()) as value of gamma. The current default of gamma, 'auto', will change to 'scale' in version 0.22. 'auto_deprecated', a deprecated version of 'auto' is used as a default indicating that no explicit value of gamma was passed.

- **kernel='rbf'**:

Specifies the kernel type to be used in the algorithm. It must be one of 'linear', 'poly', 'rbf', 'sigmoid', 'precomputed' or a callable. If none is given, 'rbf' will be used. If a callable is given it is used to pre-compute the kernel matrix from data matrices; that matrix should be an array of shape (n_samples, n_samples).

- **max_iter=-1**: Hard limit on iterations within solver, or -1 for no limit.
- **probability=False**: Whether to **enable probability estimates**. This must be enabled prior to calling fit, and will slow down that method.

- **random_state=None**:

The seed of the pseudo random number generator used when shuffling the data for probability estimates. If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; If None, the random number generator is the RandomState instance used by np.random.

- **shrinking=True**: Whether to use the shrinking heuristic. 

The shrinking heuristics are there to speed up the optimization. As it says in the FAQ, they sometimes help, and sometimes they do not. I believe it's a matter of runtime, rather than convergence.

- **tol=0.001**: Tolerance for stopping criterion.
- **verbose=False**: Enable verbose output.



In [16]:
# Baseline classifier: every SVC parameter is left at its default except gamma,
# which is passed explicitly as 'auto' (1 / n_features) instead of relying on the implicit default.
model = SVC(gamma='auto')

In [17]:
# Train the baseline model on the training split (features are used as-is, without scaling).
model.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [18]:
# Predict a class label for every row of the held-out test split.
predictions = model.predict(X_test)

In [19]:
from sklearn.metrics import classification_report, confusion_matrix

In [20]:
# Confusion matrix for the baseline model: rows are the true classes, columns the predicted classes.
print(confusion_matrix(y_test, predictions))
print('\n')
# Per-class precision, recall, F1-score and support on the test split.
print(classification_report(y_test, predictions))

[[  0  66]
 [  0 105]]


              precision    recall  f1-score   support

           0       0.00      0.00      0.00        66
           1       0.61      1.00      0.76       105

   micro avg       0.61      0.61      0.61       171
   macro avg       0.31      0.50      0.38       171
weighted avg       0.38      0.61      0.47       171



  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


## Warning reason

We have the following warning:
> UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.

This happens because the model is classifying everything into a single class, which means its parameters need to be adjusted.

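It may also help to normalize the data, since the features are on very different scales (for example, `mean area` is in the thousands while `mean smoothness` is around 0.1).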

What we can do is try to find the best parameters using a grid search.

A grid search lets you find good values for parameters such as C or gamma; finding those values by hand is usually a tricky task.

But luckily we can be a little lazy and just try a bunch of combinations and see what works best.
The idea of creating a grid of parameters and trying out all the possible combinations is called a **grid search**.

## Grid search (find best parameters)

In [21]:
from sklearn.model_selection import GridSearchCV

In [22]:
# Candidate values for the search: 5 values of C x 5 values of gamma = 25 combinations.
param_grid = {
    'C': [0.1, 1, 10, 100, 1000],
    'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
}

In [23]:
# Exhaustive cross-validated search over every C/gamma combination in param_grid.
# cv is pinned to 3 folds: the implicit default is deprecated (it changes from 3 to 5
# in scikit-learn 0.22), so passing it explicitly avoids the deprecation warning and keeps
# the "3 folds for each of 25 candidates" search reproducible across library versions.
# verbose=3 logs the score of each individual fit.
grid = GridSearchCV(SVC(), param_grid, cv=3, verbose=3)

In [24]:
# Run the cross-validated search on the training split; because refit=True (the default),
# the best parameter combination is then retrained on all of X_train.
grid.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s


Fitting 3 folds for each of 25 candidates, totalling 75 fits
[CV] C=0.1, gamma=1 ..................................................
[CV] .......... C=0.1, gamma=1, score=0.631578947368421, total=   0.0s
[CV] C=0.1, gamma=1 ..................................................
[CV] .......... C=0.1, gamma=1, score=0.631578947368421, total=   0.0s
[CV] C=0.1, gamma=1 ..................................................
[CV] ......... C=0.1, gamma=1, score=0.6363636363636364, total=   0.0s
[CV] C=0.1, gamma=0.1 ................................................
[CV] ........ C=0.1, gamma=0.1, score=0.631578947368421, total=   0.0s
[CV] C=0.1, gamma=0.1 ................................................
[CV] ........ C=0.1, gamma=0.1, score=0.631578947368421, total=   0.0s
[CV] C=0.1, gamma=0.1 ................................................
[CV] ....... C=0.1, gamma=0.1, score=0.6363636363636364, total=   0.0s
[CV] C=0.1, gamma=0.01 ...............................................
[CV] ....... C=0

[Parallel(n_jobs=1)]: Done  75 out of  75 | elapsed:    1.1s finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'C': [0.1, 1, 10, 100, 1000], 'gamma': [1, 0.1, 0.01, 0.001, 0.0001]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=3)

In [25]:
# Parameter combination selected by the grid search.
grid.best_params_

{'C': 10, 'gamma': 0.0001}

In [26]:
# The SVC instance configured with the selected parameters.
grid.best_estimator_

SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.0001, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

## Predict again with grid parameters

In [30]:
# Predict on the held-out test split with the fitted search object (refit=True, so it uses the best estimator).
grid_predictions = grid.predict(X_test)

In [31]:
# Confusion matrix for the tuned model: rows are the true classes, columns the predicted classes.
print(confusion_matrix(y_test, grid_predictions))
print('\n')
# Per-class precision, recall, F1-score and support on the test split.
print(classification_report(y_test, grid_predictions))

[[ 60   6]
 [  3 102]]


              precision    recall  f1-score   support

           0       0.95      0.91      0.93        66
           1       0.94      0.97      0.96       105

   micro avg       0.95      0.95      0.95       171
   macro avg       0.95      0.94      0.94       171
weighted avg       0.95      0.95      0.95       171

