# __Support Vector Machines__

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

In [2]:
df = pd.read_csv('breast_cancer.csv')
df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,outcome
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,1
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,1
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,1
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,1
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,1


In [3]:
df.shape

(569, 31)

In [4]:
X = df.iloc[:,:-1]
y = df.outcome

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=47)

In [6]:
X_train.shape

(455, 30)

In [7]:
X_test.shape

(114, 30)

In [8]:
from sklearn.metrics import accuracy_score, classification_report

# C parameter
# gamma value

model = SVC(
    kernel='linear',
    C=4,
    random_state=225,
    gamma=0.5
)

model.fit(X_train, y_train)

y_predict_test = model.predict(X_test)
y_predict_train = model.predict(X_train)

print('train: ', accuracy_score(y_train, y_predict_train))
print('test: ', accuracy_score(y_test, y_predict_test))
print(classification_report(y_test, y_predict_test))

train:  0.9714285714285714
test:  0.9473684210526315
              precision    recall  f1-score   support

           0       0.98      0.93      0.96        70
           1       0.90      0.98      0.93        44

    accuracy                           0.95       114
   macro avg       0.94      0.95      0.95       114
weighted avg       0.95      0.95      0.95       114



# __HyperParameter Tuning__

## __GridSearchCV__

In [14]:
from sklearn.model_selection import GridSearchCV

parameters = {
    'kernel': ['linear', 'rbf', 'poly'],
    'C': [0.1,0.5,1.0,2.0,5.0,10.0,25.0,50.0,10.0],
    'gamma': [0.01,0.05,0.1,0.25,0.5,1.0,5.0,10.0]
}

In [15]:
clf =  GridSearchCV(SVC(), param_grid=parameters,  verbose=3)

clf.fit(X_train, y_train)

Fitting 5 folds for each of 216 candidates, totalling 1080 fits
[CV 1/5] END ...............C=0.1, gamma=0.01, kernel=linear; total time=   0.0s
[CV 2/5] END ...............C=0.1, gamma=0.01, kernel=linear; total time=   0.0s
[CV 3/5] END ...............C=0.1, gamma=0.01, kernel=linear; total time=   0.0s
[CV 4/5] END ...............C=0.1, gamma=0.01, kernel=linear; total time=   0.0s
[CV 5/5] END ...............C=0.1, gamma=0.01, kernel=linear; total time=   0.0s
[CV 1/5] END ..................C=0.1, gamma=0.01, kernel=rbf; total time=   0.0s
[CV 2/5] END ..................C=0.1, gamma=0.01, kernel=rbf; total time=   0.0s
[CV 3/5] END ..................C=0.1, gamma=0.01, kernel=rbf; total time=   0.0s
[CV 4/5] END ..................C=0.1, gamma=0.01, kernel=rbf; total time=   0.0s
[CV 5/5] END ..................C=0.1, gamma=0.01, kernel=rbf; total time=   0.0s
[CV 1/5] END .................C=0.1, gamma=0.01, kernel=poly; total time=   0.3s
[CV 2/5] END .................C=0.1, gamma=0.

GridSearchCV(estimator=SVC(),
             param_grid={'C': [0.1, 0.5, 1.0, 2.0, 5.0, 10.0, 25.0, 50.0, 10.0],
                         'gamma': [0.01, 0.05, 0.1, 0.25, 0.5, 1.0, 5.0, 10.0],
                         'kernel': ['linear', 'rbf', 'poly']},
             verbose=3)

In [16]:
clf.best_score_

0.9130434782608695

In [17]:
clf.best_params_

{'C': 0.1, 'gamma': 0.01, 'kernel': 'linear'}

In [18]:
pd.DataFrame(clf.cv_results_).head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_gamma,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.015803,0.006225,0.002789,0.001156,0.1,0.01,linear,"{'C': 0.1, 'gamma': 0.01, 'kernel': 'linear'}",0.913043,0.826087,0.913043,0.913043,1.0,0.913043,0.054996,1
1,0.002198,0.001165,0.001412,0.000795,0.1,0.01,rbf,"{'C': 0.1, 'gamma': 0.01, 'kernel': 'rbf'}",0.608696,0.608696,0.608696,0.608696,0.636364,0.614229,0.011067,145
2,1.119293,0.798338,0.000821,0.001007,0.1,0.01,poly,"{'C': 0.1, 'gamma': 0.01, 'kernel': 'poly'}",0.782609,0.782609,1.0,0.913043,0.954545,0.886561,0.089223,73
3,0.012363,0.004273,0.002048,0.003145,0.1,0.05,linear,"{'C': 0.1, 'gamma': 0.05, 'kernel': 'linear'}",0.913043,0.826087,0.913043,0.913043,1.0,0.913043,0.054996,1
4,0.005252,0.004348,0.000421,0.000842,0.1,0.05,rbf,"{'C': 0.1, 'gamma': 0.05, 'kernel': 'rbf'}",0.608696,0.608696,0.608696,0.608696,0.636364,0.614229,0.011067,145


# Best parameters are found

In [20]:
model2 = SVC(**clf.best_params_)

model2.fit(X_train, y_train)
y_predict = model2.predict(X_test)

print('test: ', accuracy_score(y_test, y_predict))

test:  0.9385964912280702


In [21]:
print(classification_report(y_test, y_predict))

              precision    recall  f1-score   support

           0       0.98      0.91      0.95        70
           1       0.88      0.98      0.92        44

    accuracy                           0.94       114
   macro avg       0.93      0.95      0.94       114
weighted avg       0.94      0.94      0.94       114



# K Fold Cross Validation

- K / n / folds is the number of folds we need
- for each fold accuracy result is calculated
- then an arithmetic mean of those scores will be the accuracy score of the model
- hence reliance on random_state is eradicated