<a href="https://colab.research.google.com/github/drm1072/Workload-DRM/blob/main/SVM_Algorithm_tutorial.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd #Used for high level Data Indexing processes
import numpy as np  #Numpy provides fast mathematical function processing
import scipy.optimize as opt #Algorithm library used for high level plotting
from sklearn import preprocessing #A library used to make Scikit learn algorithms operate smoothly
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation
from sklearn.model_selection import train_test_split #Has replaced "import sklearn.cross validation as cross_validation"
import matplotlib.pyplot as plt #Machine learning library to create plots and data visualizations
%matplotlib inline

In [2]:
#This is the raw github data to import csv files
cancerdata = pd.read_csv('https://raw.githubusercontent.com/gerstung-lab/ProbCox/main/paper/ProbCox/data/application/colon.csv')

In [3]:
cancerdata

Unnamed: 0.1,Unnamed: 0,time,status,sex,age,obstruct,perfor,adhere,nodes,node4,surg,extent,differ
0,1,1521,1,1,-1.411352,0,0,0,0.377695,1,0,0.242496,-0.121247
1,2,968,1,1,-1.411352,0,0,0,0.377695,1,0,0.242496,-0.121247
2,3,3087,0,1,0.267749,0,0,0,-0.752526,0,0,0.242496,-0.121247
3,4,3087,0,1,0.267749,0,0,0,-0.752526,0,0,0.242496,-0.121247
4,5,963,1,0,0.939389,0,0,1,0.942806,1,0,-1.848146,-0.121247
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1771,1772,851,1,1,1.359164,0,0,1,-0.752526,0,0,0.242496,1.836340
1772,1773,2072,0,0,-0.991577,1,0,0,0.095140,1,1,0.242496,-0.121247
1773,1774,2072,0,0,-0.991577,1,0,0,0.095140,1,1,0.242496,-0.121247
1774,1775,1820,0,0,0.519614,1,0,0,-0.752526,0,0,0.242496,-0.121247


In [4]:
#Shows us the entire dataset in rows and columns
cancerdata.shape

(1776, 13)

In [5]:
#This is the dataframe
#Now we want to preview our data using the code below
#the number of rows and colums are displayed at the bottom
cancerdata.head()
#ID = Patient numbers
#Clump = Clump thickness
#Unifsize = Uniformity of cell size
#Unifshape + Uniformity of cell shape
#MargAdh = Marginal adhesion
#SingEpisize = Single epithelial cell size
#BareNuc = Bare Nuclei
#BlandChrom = Bland Chromatin
#NormNucl = Normal Nuclei
#Mit = Mitoses
#Class = Benign or Malignant

Unnamed: 0.1,Unnamed: 0,time,status,sex,age,obstruct,perfor,adhere,nodes,node4,surg,extent,differ
0,1,1521,1,1,-1.411352,0,0,0,0.377695,1,0,0.242496,-0.121247
1,2,968,1,1,-1.411352,0,0,0,0.377695,1,0,0.242496,-0.121247
2,3,3087,0,1,0.267749,0,0,0,-0.752526,0,0,0.242496,-0.121247
3,4,3087,0,1,0.267749,0,0,0,-0.752526,0,0,0.242496,-0.121247
4,5,963,1,0,0.939389,0,0,1,0.942806,1,0,-1.848146,-0.121247


In [6]:
cancerdata.isnull().values.any()

False

In [7]:
cancerdata

Unnamed: 0.1,Unnamed: 0,time,status,sex,age,obstruct,perfor,adhere,nodes,node4,surg,extent,differ
0,1,1521,1,1,-1.411352,0,0,0,0.377695,1,0,0.242496,-0.121247
1,2,968,1,1,-1.411352,0,0,0,0.377695,1,0,0.242496,-0.121247
2,3,3087,0,1,0.267749,0,0,0,-0.752526,0,0,0.242496,-0.121247
3,4,3087,0,1,0.267749,0,0,0,-0.752526,0,0,0.242496,-0.121247
4,5,963,1,0,0.939389,0,0,1,0.942806,1,0,-1.848146,-0.121247
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1771,1772,851,1,1,1.359164,0,0,1,-0.752526,0,0,0.242496,1.836340
1772,1773,2072,0,0,-0.991577,1,0,0,0.095140,1,1,0.242496,-0.121247
1773,1774,2072,0,0,-0.991577,1,0,0,0.095140,1,1,0.242496,-0.121247
1774,1775,1820,0,0,0.519614,1,0,0,-0.752526,0,0,0.242496,-0.121247


In [8]:
cancerdata.describe(include='all')

Unnamed: 0.1,Unnamed: 0,time,status,sex,age,obstruct,perfor,adhere,nodes,node4,surg,extent,differ
count,1776.0,1776.0,1776.0,1776.0,1776.0,1776.0,1776.0,1776.0,1776.0,1776.0,1776.0,1776.0,1776.0
mean,888.5,1542.55518,0.493243,0.518018,-1.650332e-16,0.192568,0.030405,0.144144,-7.751557000000001e-17,0.26464,0.268018,4.748766e-16,-5.966667e-16
std,512.831356,946.741234,0.500095,0.499816,1.0,0.394427,0.171748,0.351335,1.0,0.441265,0.443052,1.0,1.0
min,1.0,8.0,0.0,0.0,-3.510228,0.0,0.0,0.0,-1.035082,0.0,0.0,-3.938787,-2.078834
25%,444.75,573.0,0.0,0.0,-0.5718019,0.0,0.0,0.0,-0.7525265,0.0,0.0,0.2424956,-0.1212469
50%,888.5,1856.0,0.0,1.0,0.09983843,0.0,0.0,0.0,-0.4699711,0.0,0.0,0.2424956,-0.1212469
75%,1332.25,2331.0,1.0,1.0,0.7714788,0.0,0.0,0.0,0.3776951,1.0,1.0,0.2424956,-0.1212469
max,1776.0,3329.0,1.0,1.0,2.11476,1.0,1.0,1.0,8.289246,1.0,1.0,2.333137,1.83634


In [9]:
#code to show us the different datatypes
cancerdata.dtypes

Unnamed: 0      int64
time            int64
status          int64
sex             int64
age           float64
obstruct        int64
perfor          int64
adhere          int64
nodes         float64
node4           int64
surg            int64
extent        float64
differ        float64
dtype: object

In [10]:
#Age should be an integer based on the data set
#extended code to make that change
cancerdata = cancerdata[pd.to_numeric(cancerdata['age'], errors='coerce').notnull()]
cancerdata['age'] = cancerdata['age'].astype('int')

In [11]:
#Check datatypes again
cancerdata.dtypes

Unnamed: 0      int64
time            int64
status          int64
sex             int64
age             int64
obstruct        int64
perfor          int64
adhere          int64
nodes         float64
node4           int64
surg            int64
extent        float64
differ        float64
dtype: object

In [12]:
cancerdata.describe(include='all')
#run describe all function again

Unnamed: 0.1,Unnamed: 0,time,status,sex,age,obstruct,perfor,adhere,nodes,node4,surg,extent,differ
count,1776.0,1776.0,1776.0,1776.0,1776.0,1776.0,1776.0,1776.0,1776.0,1776.0,1776.0,1776.0,1776.0
mean,888.5,1542.55518,0.493243,0.518018,-0.032658,0.192568,0.030405,0.144144,-7.751557000000001e-17,0.26464,0.268018,4.748766e-16,-5.966667e-16
std,512.831356,946.741234,0.500095,0.499816,0.674738,0.394427,0.171748,0.351335,1.0,0.441265,0.443052,1.0,1.0
min,1.0,8.0,0.0,0.0,-3.0,0.0,0.0,0.0,-1.035082,0.0,0.0,-3.938787,-2.078834
25%,444.75,573.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.7525265,0.0,0.0,0.2424956,-0.1212469
50%,888.5,1856.0,0.0,1.0,0.0,0.0,0.0,0.0,-0.4699711,0.0,0.0,0.2424956,-0.1212469
75%,1332.25,2331.0,1.0,1.0,0.0,0.0,0.0,0.0,0.3776951,1.0,1.0,0.2424956,-0.1212469
max,1776.0,3329.0,1.0,1.0,2.0,1.0,1.0,1.0,8.289246,1.0,1.0,2.333137,1.83634


In [13]:
#surg is the feature Im going to run this test on
cancerdata["surg"].value_counts()

0    1300
1     476
Name: surg, dtype: int64

In [14]:
#I want to see the full data
pd.options.display.max_rows = 999

In [15]:
#check dataframe again
cancerdata

Unnamed: 0.1,Unnamed: 0,time,status,sex,age,obstruct,perfor,adhere,nodes,node4,surg,extent,differ
0,1,1521,1,1,-1,0,0,0,0.377695,1,0,0.242496,-0.121247
1,2,968,1,1,-1,0,0,0,0.377695,1,0,0.242496,-0.121247
2,3,3087,0,1,0,0,0,0,-0.752526,0,0,0.242496,-0.121247
3,4,3087,0,1,0,0,0,0,-0.752526,0,0,0.242496,-0.121247
4,5,963,1,0,0,0,0,1,0.942806,1,0,-1.848146,-0.121247
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1771,1772,851,1,1,1,0,0,1,-0.752526,0,0,0.242496,1.836340
1772,1773,2072,0,0,0,1,0,0,0.095140,1,1,0.242496,-0.121247
1773,1774,2072,0,0,0,1,0,0,0.095140,1,1,0.242496,-0.121247
1774,1775,1820,0,0,0,1,0,0,-0.752526,0,0,0.242496,-0.121247


In [19]:
#Lets setup our model to be trained and tested
#features are the columns
#attributes are the values in the columns
cancerdata_features_columns = cancerdata[['time', 'status', 'sex', 'age', 'obstruct', 'perfor', 'adhere', 'nodes', 'node4', 'extent', 'differ']]
#This is where the Numpy library comes in play
#We are setting up the X (independant variables)
x = np.asarray(cancerdata_features_columns)

In [21]:
#The y variable is our dependent variable
#This is our Y (dependant variable) because it is the answer we are looking for
cancerdata['surg'] = cancerdata['surg'].astype('int')
y = np.asarray(cancerdata['surg'])

In [22]:
#split the dataset into the training set and test the set
#We're splitting the data for training and testing
#70% of it will be for training and 30% of it will be tested for accuracy 70/30
#xtrain: this will be your training group
#xtest: this will be your test group
#ytrain: this will be your target for your training group
#ytest: target for your test group
#random_state of 0 gives you same value each time you change it, You will get different values 
xtrain, xtest, ytrain, ytest = train_test_split(x, y, train_size = 0.70, test_size = 0.30, random_state = 0, shuffle = True)

In [23]:
#print function shows us the actual values of the train and test the splits of the data rows we have
print ('Train set:', xtrain.shape, ytrain.shape)
print ('Test set:', xtrain.shape, ytrain.shape)

Train set: (1243, 11) (1243,)
Test set: (1243, 11) (1243,)


In [24]:
#Modeling (SVM with Scikit-learn)
#The SVM algorithm offers a choice of kernel functions for performing its processing.
#Basically, mapping data into a higher dimension for better accuracy is "kerneling."
#The mathematical function used for the transformation is known as the kernel function (1. Linear 2.Polynomial 3.Radius basis function(RBF) 4.Sigmoid)
#clfdata stands for classifier 
from sklearn import svm
clfdata = svm.SVC(kernel='rbf') #rbf is chosen because it is popular
clfdata.fit(xtrain, ytrain)#We are doing whats called fitting the model with the data we trained earlier

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [25]:
#Implement (yhat) function to all test values
#373 represents 30% of 1243 rows/values 
yhat = clfdata.predict(xtest)
yhat [0:373]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [27]:
from sklearn.metrics import classification_report, confusion_matrix #importing confusion matrix library
import itertools
#itertools is an additional library frequently used in healthcare modeling/analysis because of speed and efficient looping

In [32]:
#compute confusion matrix
cnf_matrix = confusion_matrix(ytest, yhat, labels=[2,4])
np.set_printoptions(precision=2)

print(classification_report(ytest, yhat))

#plot non-normalized confusion matrix
plt.figure()
plot_confusion_matrix(cnf_matrix, classes= ['Benign(2)', 'Malignant(4)'], normalize= False, title='Confusion matrix')
#precision is a measure of the accuracy provided that a class label has been predicted. It is defined by: precision = TP / (TP + FP)
#Recall is true positive rate.  It is defined as = Recall = TP (TP + FN)

ValueError: ignored