#### SVM is a classifier that finds the optimal hyperplane that maximises the margin between two classes
#### In 2-D the hyperplane is a line and in 3-D it is a plane
#### To separate data that cannot be separated by a line or hyperplane we can use the kernel trick
#### The kernel trick implicitly maps data that is not linearly separable in its original N-dimensional space into a higher-dimensional space where it can become separable

### When to use SVM

* In case of binary classification targets
* When the feature-to-row ratio is high (short and fat data)
* For complex relationships
* If there are a lot of outliers: SVM only considers the points close to the decision boundary, so most of the outliers will be ignored

### But don't use it 
* If your feature-to-row ratio is low
* If you are looking for transparency
* If you want a quick benchmark model, as SVM takes a lot of time to train

In [1]:
from sklearn.svm import SVC # Support Vector Classifier

# Instantiate with default settings so the notebook displays them;
# C and kernel are the two parameters tuned further down.
SVC()

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [2]:
# List the attribute and method names exposed by the SVC class (e.g. fit, predict, score).
dir(SVC)

['__abstractmethods__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__doc__',
 '__format__',
 '__getattribute__',
 '__getstate__',
 '__hash__',
 '__init__',
 '__module__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_cache',
 '_abc_negative_cache',
 '_abc_negative_cache_version',
 '_abc_registry',
 '_check_proba',
 '_compute_kernel',
 '_decision_function',
 '_dense_decision_function',
 '_dense_fit',
 '_dense_predict',
 '_dense_predict_proba',
 '_estimator_type',
 '_get_coef',
 '_get_param_names',
 '_impl',
 '_pairwise',
 '_predict_log_proba',
 '_predict_proba',
 '_sparse_decision_function',
 '_sparse_fit',
 '_sparse_kernels',
 '_sparse_predict',
 '_sparse_predict_proba',
 '_validate_for_predict',
 '_validate_targets',
 '_warn_from_fit_status',
 'coef_',
 'decision_function',
 'fit',
 'get_params',
 'predict',
 'predict_log_proba',
 'predict_proba',
 'score',
 'set_param

In [3]:
import joblib # used to pickle the fitted model and save it to disk
import pandas as pd
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV # used for hyperparameter tuning
import warnings
# Silence all warnings from here on.
# NOTE(review): this blanket filter also hides scikit-learn deprecation notices
# (the estimator reprs show 'auto_deprecated' / 'warn' defaults) — consider narrowing it.
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

In [4]:
# print the results
def print_results(results):
    """Print the outcome of a fitted hyperparameter search.

    The first line shows the best parameter combination; each following line
    shows, for one candidate, the mean test score, twice the standard
    deviation of the test score (both rounded to 3 decimals), and the
    parameter combination itself.

    Parameters
    ----------
    results : object
        Any object exposing ``best_params_`` and a ``cv_results_`` mapping with
        the keys ``'mean_test_score'``, ``'std_test_score'`` and ``'params'``.
    """
    print("Best params is {}".format(results.best_params_))

    scores = results.cv_results_
    rows = zip(scores['mean_test_score'], scores['std_test_score'], scores['params'])
    for avg, spread, combo in rows:
        # The spread is reported as two standard deviations.
        print('{} (+/-){} for {}'.format(round(avg, 3), round(spread * 2, 3), combo))

In [10]:
# Hyperparameter grid: every kernel is paired with every C value (2 x 7 = 14 candidates).
parameters = {
    'kernel': ['linear', 'rbf'],
    'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
}

# Exhaustive search over the grid, scoring each candidate with 5-fold cross-validation.
svc = SVC()
cv = GridSearchCV(estimator=svc, param_grid=parameters, cv=5)

# Once fit is called, the grid search above will grab the first hyperparameter combination,
# i.e. the linear kernel with C set to 0.001.
# It'll pass those hyperparameters into SVC, and then it'll use that setting and run cross-validation.
# Because we're doing 5-fold cross-validation, it'll loop through the five subsets of data,
# each time fitting on four and evaluating on the fifth, and then it'll store the average test score
# for that loop. It'll do this for each hyperparameter combination.

In [11]:
# Features file: the first row is used as the column header (pandas default).
train_features = pd.read_csv('train_features.csv')

# Labels file: without header=None pandas would treat the first record as the header
# and that label would be dropped from the data.
train_labels = pd.read_csv('train_labels.csv', header=None)

In [12]:
# Run the grid search; .values.ravel() flattens the label frame into a 1-D array.
cv.fit(train_features, train_labels.values.ravel())

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'kernel': ['linear', 'rbf'], 'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [13]:
# Show the best parameter combination followed by the mean score and spread of every candidate.
print_results(cv)

Best params is {'kernel': 'linear', 'C': 10}
0.667 (+/-)0.065 for {'kernel': 'linear', 'C': 0.001}
0.624 (+/-)0.005 for {'kernel': 'rbf', 'C': 0.001}
0.713 (+/-)0.087 for {'kernel': 'linear', 'C': 0.01}
0.624 (+/-)0.005 for {'kernel': 'rbf', 'C': 0.01}
0.796 (+/-)0.116 for {'kernel': 'linear', 'C': 0.1}
0.624 (+/-)0.005 for {'kernel': 'rbf', 'C': 0.1}
0.796 (+/-)0.116 for {'kernel': 'linear', 'C': 1}
0.62 (+/-)0.009 for {'kernel': 'rbf', 'C': 1}
0.807 (+/-)0.123 for {'kernel': 'linear', 'C': 10}
0.614 (+/-)0.019 for {'kernel': 'rbf', 'C': 10}
0.8 (+/-)0.138 for {'kernel': 'linear', 'C': 100}
0.614 (+/-)0.019 for {'kernel': 'rbf', 'C': 100}
0.801 (+/-)0.109 for {'kernel': 'linear', 'C': 1000}
0.614 (+/-)0.019 for {'kernel': 'rbf', 'C': 1000}


In [14]:
# Display the estimator configured with the best parameter combination found by the search.
cv.best_estimator_

SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [16]:
# Pickle the best estimator to disk so it can be reloaded later without re-running the search.
# NOTE: joblib files are pickles — only load them from sources you trust.
joblib.dump(cv.best_estimator_, 'SVM_model.pkl')

['SVM_model.pkl']