### Sample program for classification prediction by Support Vector Machine  

#### Import libraries  

In [11]:
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import datasets

#### Load data and set parameters  

In [12]:
# Load the dataset: the first 14 lines of cancer.csv are skipped and the
# next line is used as the column header.
df = pd.read_csv('cancer.csv', delimiter=',', skiprows=14, header=0)
print(df.shape)
# DataFrame.info() writes its summary straight to stdout and returns None,
# so it must not be wrapped in print() (that would emit a stray "None").
df.info()
display(df.head())

(569, 32)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 32 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   ID       569 non-null    int64  
 1   Diag     569 non-null    object 
 2   ave_a    569 non-null    float64
 3   ave_b    569 non-null    float64
 4   ave_c    569 non-null    float64
 5   ave_d    569 non-null    float64
 6   ave_e    569 non-null    float64
 7   ave_f    569 non-null    float64
 8   ave_g    569 non-null    float64
 9   ave_h    569 non-null    float64
 10  ave_i    569 non-null    float64
 11  ave_j    569 non-null    float64
 12  se_a     569 non-null    float64
 13  se_b     569 non-null    float64
 14  se_c     569 non-null    float64
 15  se_d     569 non-null    float64
 16  se_e     569 non-null    float64
 17  se_f     569 non-null    float64
 18  se_g     569 non-null    float64
 19  se_h     569 non-null    float64
 20  se_i     569 non-null    float64
 21  se_j  

Unnamed: 0,ID,Diag,ave_a,ave_b,ave_c,ave_d,ave_e,ave_f,ave_g,ave_h,...,worst_a,worst_b,worst_c,worst_d,worst_e,worst_f,worst_g,worst_h,worst_i,worst_j
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [13]:
# Features: every column from 'ave_a' through 'worst_j' (label slice, both
# ends inclusive). Target: the 'Diag' column.
X, y = df.loc[:, 'ave_a':'worst_j'], df['Diag']
for part in (X, y):
    print(part.shape)

(569, 30)
(569,)


In [14]:
# Hold out 30% of the rows for testing. Stratifying on y keeps the label
# proportions the same in both subsets; the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=11,
)

#### Check number of labels  

In [15]:
# np.bincount only accepts non-negative integers, so it cannot count the
# string labels stored in 'Diag'; value_counts() works for any label dtype.
# sort_index() keeps the label order identical across the three subsets.
for name, labels in (('y', y), ('y_train', y_train), ('y_test', y_test)):
    print(name, labels.value_counts().sort_index().to_dict())

#### Scaling to [0, 1] (NOTE: fit the scaler on X_train only; X_test must not be included)  

In [16]:
# Learn each feature's min/max from the training rows only and rescale the
# training features in the same step.
sc = MinMaxScaler(copy=True, feature_range=(0, 1))
X_train_std = sc.fit_transform(X_train)

#### Train Support Vector Classifier  

In [17]:
# RBF-kernel support vector classifier, trained on the scaled training
# features. fit() is left as the last expression so the cell still displays
# the fitted estimator.
svc = SVC(C=1.0, kernel='rbf', gamma='scale')
svc.fit(X=X_train_std, y=y_train)

SVC()

#### Prediction using train_data  

In [18]:
# Predict labels for the same scaled rows the classifier was trained on.
y_train_pred = svc.predict(X=X_train_std)

#### Report accuracy and confusion matrix for train_data  

In [19]:
print('accuracy(Q10) for train data:', accuracy_score(y_train, y_train_pred))

# Confusion matrix for the training data. The axes are named explicitly;
# without colnames the predicted axis is rendered as the opaque "col_0",
# leaving it unclear which axis is actual and which is predicted.
ct_pred = pd.crosstab(y_train, y_train_pred,
                      rownames=['Actual'], colnames=['Predicted'])
display(ct_pred)

accuracy(Q10) for train data: 0.9773869346733668


col_0,B,M
Diag,Unnamed: 1_level_1,Unnamed: 2_level_1
B,247,3
M,6,142


#### Scaling of X_test (using min and max of X_train)  

In [20]:
# Reuse the already-fitted scaler; it is deliberately not re-fitted on X_test.
X_test_std = sc.transform(X=X_test)

#### Prediction using test_data  

In [21]:
# Predict labels for the scaled held-out rows.
y_test_pred = svc.predict(X=X_test_std)

#### Report accuracy and confusion matrix for test_data  

In [22]:
print('accuracy(Q10) for test data:', accuracy_score(y_test, y_test_pred))

# Confusion matrix for the test data. The axes are named explicitly;
# without colnames the predicted axis is rendered as the opaque "col_0",
# leaving it unclear which axis is actual and which is predicted.
ct_test = pd.crosstab(y_test, y_test_pred,
                      rownames=['Actual'], colnames=['Predicted'])
display(ct_test)

accuracy(Q10) for test data: 0.9824561403508771


col_0,B,M
Diag,Unnamed: 1_level_1,Unnamed: 2_level_1
B,107,0
M,3,61
