# Support Vector Machine Classification

#### Load the packages and import the data

In [1]:
import pandas as pd
import numpy as np

data = pd.read_csv("./Data Files/cancer_dataset.csv")
data.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


In [2]:
data.columns

Index(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error', 'fractal dimension error',
       'worst radius', 'worst texture', 'worst perimeter', 'worst area',
       'worst smoothness', 'worst compactness', 'worst concavity',
       'worst concave points', 'worst symmetry', 'worst fractal dimension',
       'target'],
      dtype='object')

#### Split data into an X DataFrame and y vector

In [3]:
X = data.drop("target", axis = 1)
y = data["target"]

#### Split the data into a train_set and test_set

In [4]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 1111)

#### Center and Scale X

In [5]:
X_train_scaled = X_train.copy()
X_test_scaled = X_test.copy()
for i in X_train.columns:
    if X_train[i].dtypes in ["float64", "int64"]:
        m = X_train[i].mean()
        s = X_train[i].std()
        X_train_scaled[i] = (X_train[i] - m) / s
        X_test_scaled[i] = (X_test[i] - m) / s

#### Fit the base SVM Model

In [6]:
from sklearn.svm import SVC

svm_model = SVC(random_state = 1111)
svm_model.fit(X_train_scaled, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=1111, shrinking=True,
  tol=0.001, verbose=False)

#### Predict base SVM model on Test Set

In [7]:
y_pred = svm_model.predict(X_test_scaled)
pred_summary = X_test.copy()
pred_summary[y.name] = y_test  # dependent variable
pred_summary["y_pred"] = y_pred
pred_summary.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target,y_pred
472,14.92,14.93,96.45,686.9,0.08098,0.08549,0.05539,0.03221,0.1687,0.05669,...,112.0,906.6,0.1065,0.2791,0.3151,0.1147,0.2688,0.08273,1,1
175,8.671,14.45,54.42,227.2,0.09138,0.04276,0.0,0.0,0.1722,0.06724,...,58.36,259.2,0.1162,0.07057,0.0,0.0,0.2592,0.07848,1,1
538,7.729,25.49,47.98,178.8,0.08098,0.04878,0.0,0.0,0.187,0.07285,...,57.17,248.0,0.1256,0.0834,0.0,0.0,0.3058,0.09938,1,1
550,10.86,21.48,68.51,360.5,0.07431,0.04227,0.0,0.0,0.1661,0.05948,...,74.08,412.3,0.1001,0.07348,0.0,0.0,0.2458,0.06592,1,1
44,13.17,21.81,85.42,531.5,0.09714,0.1047,0.08259,0.05252,0.1746,0.06177,...,105.5,740.7,0.1503,0.3904,0.3728,0.1607,0.3693,0.09618,0,0


#### Evaluate the base SVM Model

In [8]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

print("Base SVM Model", "\n")
print(pd.DataFrame(confusion_matrix(y_test, y_pred)), 
      "      Accuracy:", round(accuracy_score(y_test, y_pred), 3), 
      "\n")
print(classification_report(y_test, y_pred))

Base SVM Model 

    0    1
0  64    3
1   1  103       Accuracy: 0.977 

             precision    recall  f1-score   support

          0       0.98      0.96      0.97        67
          1       0.97      0.99      0.98       104

avg / total       0.98      0.98      0.98       171



#### Use Grid Search (Cross Validation) to tune the C, gamma, and kernel parameters

In [9]:
from sklearn.model_selection import GridSearchCV
parameters = {"C": [0.1, 1, 10, 100, 1000],  # Adjust these values over and over
              "gamma": [1, 0.1, 0.01, 0.001, 0.0001],
              "kernel": ['linear', 'poly', 'rbf', 'sigmoid']}
grid = GridSearchCV(SVC(), parameters, verbose = 2)
grid.fit(X_train_scaled, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits
[CV] C=0.1, gamma=1, kernel=linear ...................................
[CV] .................... C=0.1, gamma=1, kernel=linear, total=   0.0s
[CV] C=0.1, gamma=1, kernel=linear ...................................
[CV] .................... C=0.1, gamma=1, kernel=linear, total=   0.0s
[CV] C=0.1, gamma=1, kernel=linear ...................................
[CV] .................... C=0.1, gamma=1, kernel=linear, total=   0.0s
[CV] C=0.1, gamma=1, kernel=poly .....................................
[CV] ...................... C=0.1, gamma=1, kernel=poly, total=   0.0s
[CV] C=0.1, gamma=1, kernel=poly .....................................
[CV] ...................... C=0.1, gamma=1, kernel=poly, total=   0.0s
[CV] C=0.1, gamma=1, kernel=poly .....................................
[CV] ...................... C=0.1, gamma=1, kernel=poly, total=   0.0s
[CV] C=0.1, gamma=1, kernel=rbf ......................................
[CV] .........

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s


[CV] ................. C=0.1, gamma=0.1, kernel=sigmoid, total=   0.0s
[CV] C=0.1, gamma=0.1, kernel=sigmoid ................................
[CV] ................. C=0.1, gamma=0.1, kernel=sigmoid, total=   0.0s
[CV] C=0.1, gamma=0.01, kernel=linear ................................
[CV] ................. C=0.1, gamma=0.01, kernel=linear, total=   0.0s
[CV] C=0.1, gamma=0.01, kernel=linear ................................
[CV] ................. C=0.1, gamma=0.01, kernel=linear, total=   0.0s
[CV] C=0.1, gamma=0.01, kernel=linear ................................
[CV] ................. C=0.1, gamma=0.01, kernel=linear, total=   0.0s
[CV] C=0.1, gamma=0.01, kernel=poly ..................................
[CV] ................... C=0.1, gamma=0.01, kernel=poly, total=   0.0s
[CV] C=0.1, gamma=0.01, kernel=poly ..................................
[CV] ................... C=0.1, gamma=0.01, kernel=poly, total=   0.0s
[CV] C=0.1, gamma=0.01, kernel=poly ..................................
[CV] .

[Parallel(n_jobs=1)]: Done 300 out of 300 | elapsed:    2.5s finished


GridSearchCV(cv=None, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'C': [0.1, 1, 10, 100, 1000], 'gamma': [1, 0.1, 0.01, 0.001, 0.0001], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=2)

In [10]:
grid.best_params_

{'C': 0.1, 'gamma': 1, 'kernel': 'linear'}

#### Predict on test set using new parameters

In [11]:
grid_pred = grid.predict(X_test_scaled)
pred_summary = X_test.copy()
pred_summary[y.name] = y_test  # dependent variable
pred_summary["y_pred"] = grid_pred
pred_summary.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target,y_pred
472,14.92,14.93,96.45,686.9,0.08098,0.08549,0.05539,0.03221,0.1687,0.05669,...,112.0,906.6,0.1065,0.2791,0.3151,0.1147,0.2688,0.08273,1,1
175,8.671,14.45,54.42,227.2,0.09138,0.04276,0.0,0.0,0.1722,0.06724,...,58.36,259.2,0.1162,0.07057,0.0,0.0,0.2592,0.07848,1,1
538,7.729,25.49,47.98,178.8,0.08098,0.04878,0.0,0.0,0.187,0.07285,...,57.17,248.0,0.1256,0.0834,0.0,0.0,0.3058,0.09938,1,1
550,10.86,21.48,68.51,360.5,0.07431,0.04227,0.0,0.0,0.1661,0.05948,...,74.08,412.3,0.1001,0.07348,0.0,0.0,0.2458,0.06592,1,1
44,13.17,21.81,85.42,531.5,0.09714,0.1047,0.08259,0.05252,0.1746,0.06177,...,105.5,740.7,0.1503,0.3904,0.3728,0.1607,0.3693,0.09618,0,0


#### Evaluate the model using new parameters

In [12]:
print("Refined SVM Model", "\n")
print(pd.DataFrame(confusion_matrix(y_test, y_pred)), 
      "      Accuracy:", round(accuracy_score(y_test, y_pred), 3), 
      "\n")
print(classification_report(y_test, y_pred))

Refined SVM Model 

    0    1
0  64    3
1   1  103       Accuracy: 0.977 

             precision    recall  f1-score   support

          0       0.98      0.96      0.97        67
          1       0.97      0.99      0.98       104

avg / total       0.98      0.98      0.98       171

