# SVM

In [1]:
from sklearn import svm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Imputer
from sklearn import preprocessing
from sklearn import metrics
from sklearn.metrics import confusion_matrix
import time
np.random.seed(7)

In [14]:
# Read the dataset from disk; "?" entries in the file are parsed as missing (NaN).
DATA_FILE = "breast-cancer-wisconsin.csv"
dataset = pd.read_csv(DATA_FILE, na_values=["?"])

# Preview the first ten rows.
dataset.head(10)

Unnamed: 0,id,clump_thickness,size_uniformity,shape_uniformity,marginal_adhesion,epithelial_size,bare_nucleoli,bland_chromatin,normal_nucleoli,mitoses,class
0,1000025,5,1,1,1,2,1.0,3,1,1,2
1,1002945,5,4,4,5,7,10.0,3,2,1,2
2,1015425,3,1,1,1,2,2.0,3,1,1,2
3,1016277,6,8,8,1,3,4.0,3,7,1,2
4,1017023,4,1,1,3,2,1.0,3,1,1,2
5,1017122,8,10,10,8,7,10.0,9,7,1,4
6,1018099,1,1,1,1,2,10.0,3,1,1,2
7,1018561,2,1,2,1,2,1.0,3,1,1,2
8,1033078,2,1,1,1,2,1.0,1,1,5,2
9,1033078,4,2,1,1,2,1.0,2,1,1,2


In [3]:
# Impute missing values with the per-column median.
# pandas is used here instead of sklearn's `Imputer` (removed in scikit-learn 0.22;
# `np.NaN` is likewise gone in NumPy 2.0). Filling in place of wrapping the imputer's
# ndarray in a fresh DataFrame also keeps the original column labels and dtypes.
# Re-running this cell is harmless: once the NaNs are filled it changes nothing.
column_medians = dataset.median(numeric_only=True)
dataset = dataset.fillna(column_medians)

# Sanity check: nothing should be missing after imputation.
assert not dataset.isna().values.any(), "missing values remain after median imputation"

In [4]:
# Separate the predictors (X) from the target (y) by column position.
# NOTE(review): assumes column 0 is the sample id (left out), columns 1-9 are the
# nine feature columns and column 10 is the class label — confirm against the CSV.
X, y = dataset.iloc[:, 1:10], dataset.iloc[:, 10]

In [5]:
# Binarize the output labels from 2,4 to 0,1.
# For a two-class target LabelBinarizer returns an (n_samples, 1) column; flatten it
# to 1-D so estimators get the target shape they expect instead of emitting
# DataConversionWarning ("y = column_or_1d(y, warn=True)") on every fit.
lb = preprocessing.LabelBinarizer()
y = pd.Series(lb.fit_transform(y).ravel(), index=y.index, name="class")

In [6]:
# Split the data: training = 80%, testing = 20%.
# random_state pins the shuffle to this cell (same seed value as np.random.seed(7)
# in the import cell), so re-running the cell reproduces the same split instead of
# drawing from whatever state the global NumPy RNG happens to be in.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=7
)

# Linear SVM

In [10]:
# Create a linear-kernel SVM classifier
svmclassifier = svm.SVC(kernel='linear')

# Train the classifier on the training data.
# np.ravel hands fit() a 1-D target, which avoids the DataConversionWarning raised
# when y_train is a single-column DataFrame (and is a no-op for a 1-D target).
# perf_counter is a monotonic, high-resolution clock intended for timing intervals.
start_time = time.perf_counter()
svmclassifier.fit(X_train, np.ravel(y_train))
end_time = time.perf_counter()

# Predict on the test dataset
y_pred = svmclassifier.predict(X_test)

# Evaluate predicted values against actual output values
acc = metrics.accuracy_score(y_test, y_pred)
print("\nacc: %.2f%% " % (acc * 100))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nTime taken:", end_time - start_time)  # elapsed training time in seconds


acc: 96.43% 

Confusion Matrix:
 [[88  3]
 [ 2 47]]

Time taken: 0.007981061935424805


  y = column_or_1d(y, warn=True)


# Polynomial Kernel SVM

In [11]:
# Create a polynomial-kernel SVM classifier (C=1, gamma=1)
svmclassifier1 = svm.SVC(kernel='poly', C=1, gamma=1)

# Train the classifier on the training data.
# np.ravel hands fit() a 1-D target, which avoids the DataConversionWarning raised
# when y_train is a single-column DataFrame (and is a no-op for a 1-D target).
# perf_counter is a monotonic, high-resolution clock intended for timing intervals.
start_time = time.perf_counter()
svmclassifier1.fit(X_train, np.ravel(y_train))
end_time = time.perf_counter()

# Predict on the test dataset
y_pred = svmclassifier1.predict(X_test)

# Evaluate predicted values against actual output values
acc = metrics.accuracy_score(y_test, y_pred)
print("\nacc: %.2f%% " % (acc * 100))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nTime taken:", end_time - start_time)  # elapsed training time in seconds


acc: 93.57% 

Confusion Matrix:
 [[88  3]
 [ 6 43]]

Time taken: 0.10870933532714844


  y = column_or_1d(y, warn=True)


# Radial (RBF) Kernel SVM: C=1, gamma=1

In [12]:
# Create an RBF-kernel SVM classifier (C=1, gamma=1)
svmclassifier2 = svm.SVC(kernel='rbf', C=1, gamma=1)

# Train the classifier on the training data.
# np.ravel hands fit() a 1-D target, which avoids the DataConversionWarning raised
# when y_train is a single-column DataFrame (and is a no-op for a 1-D target).
# perf_counter is a monotonic, high-resolution clock intended for timing intervals.
start_time = time.perf_counter()
svmclassifier2.fit(X_train, np.ravel(y_train))
end_time = time.perf_counter()

# Predict on the test dataset
y_pred = svmclassifier2.predict(X_test)

# Evaluate predicted values against actual output values
acc = metrics.accuracy_score(y_test, y_pred)
print("\nacc: %.2f%% " % (acc * 100))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nTime taken:", end_time - start_time)  # elapsed training time in seconds


acc: 90.00% 

Confusion Matrix:
 [[77 14]
 [ 0 49]]

Time taken: 0.01794910430908203


  y = column_or_1d(y, warn=True)


# Radial (RBF) Kernel SVM: C=5, gamma=10

In [13]:
# Create an RBF-kernel SVM classifier (C=5, gamma=10)
# NOTE(review): in the recorded run this model predicts class 0 for every test
# sample. Presumably gamma=10 is far too large for these unscaled features —
# confirm, and consider scaling the inputs or tuning gamma.
svmclassifier3 = svm.SVC(kernel='rbf', C=5, gamma=10)

# Train the classifier on the training data.
# np.ravel hands fit() a 1-D target, which avoids the DataConversionWarning raised
# when y_train is a single-column DataFrame (and is a no-op for a 1-D target).
# perf_counter is a monotonic, high-resolution clock intended for timing intervals.
start_time = time.perf_counter()
svmclassifier3.fit(X_train, np.ravel(y_train))
end_time = time.perf_counter()

# Predict on the test dataset
y_pred = svmclassifier3.predict(X_test)

# Evaluate predicted values against actual output values
acc = metrics.accuracy_score(y_test, y_pred)
print("\nacc: %.2f%% " % (acc * 100))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nTime taken:", end_time - start_time)  # elapsed training time in seconds


acc: 65.00% 

Confusion Matrix:
 [[91  0]
 [49  0]]

Time taken: 0.02496027946472168


  y = column_or_1d(y, warn=True)
