In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# Show which data files are available under the input directory.
input_files = os.listdir("../input")
print(input_files)

# Any results you write to the current directory are saved as output.

['cell_samples.csv']


# Dataset:
The example is based on a dataset that is publicly available from the [UCI Machine Learning Repository (Asuncion and Newman, 2007)](http://mlearn.ics.uci.edu/MLRepository.html). The dataset consists of several hundred human cell sample records, each of which contains the values of a set of cell characteristics. The ID field contains the patient identifiers. The characteristics of the cell samples from each patient are contained in the fields Clump through Mit. The values are graded from 1 to 10, with 1 being the closest to benign. The Class field contains the diagnosis, as confirmed by separate medical procedures, indicating whether the samples are benign (value = 2) or malignant (value = 4).


In [2]:
# Read the cell-sample records from the input directory into a DataFrame.
cell_samples_path = "../input/cell_samples.csv"
df = pd.read_csv(cell_samples_path)

In [3]:
# Preview the first rows of the frame to sanity-check the load.
df.head()

Unnamed: 0,ID,Clump,UnifSize,UnifShape,MargAdh,SingEpiSize,BareNuc,BlandChrom,NormNucl,Mit,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [4]:
import matplotlib.pyplot as plt

## Checking for datatypes

In [5]:

# List every column's dtype; any non-numeric column must be converted before modelling.
df.dtypes

ID              int64
Clump           int64
UnifSize        int64
UnifShape       int64
MargAdh         int64
SingEpiSize     int64
BareNuc        object
BlandChrom      int64
NormNucl        int64
Mit             int64
Class           int64
dtype: object

## 'BareNuc' is stored as `object` rather than an integer type. To apply any algorithm we first drop the rows whose 'BareNuc' value is not numeric and then convert the column to `int`.

In [6]:
# Keep only the rows whose BareNuc value parses as a number (unparseable
# entries become NaN under errors='coerce' and are filtered out).
# .copy() makes the result an independent frame, so assigning to one of its
# columns later does not raise SettingWithCopyWarning.
barenuc_is_numeric = pd.to_numeric(df["BareNuc"], errors='coerce').notnull()
df = df[barenuc_is_numeric].copy()

In [7]:
# Re-check dtypes after the row filter: dropping rows does not change a column's dtype by itself.
df.dtypes

ID              int64
Clump           int64
UnifSize        int64
UnifShape       int64
MargAdh         int64
SingEpiSize     int64
BareNuc        object
BlandChrom      int64
NormNucl        int64
Mit             int64
Class           int64
dtype: object

In [8]:
# Convert BareNuc to an integer dtype so it can be used as a numeric feature.
# .assign builds a new frame instead of writing into a filtered slice of the
# original data, which avoids pandas' SettingWithCopyWarning.
df = df.assign(BareNuc=df["BareNuc"].astype('int'))

In [9]:
# Feature matrix: the nine graded cell characteristics; target: the diagnosis column.
feature_columns = [
    'Clump',
    'UnifSize',
    'UnifShape',
    'MargAdh',
    'SingEpiSize',
    'BareNuc',
    'BlandChrom',
    'NormNucl',
    'Mit',
]
X = df[feature_columns]
y = df["Class"]

# Train/Test data

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
# Hold out 30% of the samples for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=3,
)

In [12]:
# Report the dimensions of each split, in the order train/test features then train/test labels.
for split in (x_train, x_test, y_train, y_test):
    print(split.shape)

(478, 9)
(205, 9)
(478,)
(205,)


In [13]:
from sklearn import svm

In [14]:
# jaccard_similarity_score was deprecated in scikit-learn 0.21 and removed in
# 0.23. For single-label (binary/multiclass) targets it computed exactly the
# fraction of correctly classified samples, i.e. accuracy_score, so import that
# under the same alias to keep the later cells working with identical results.
from sklearn.metrics import accuracy_score as jsc

# Fitting Support Vector Machines from the sklearn library with several different kernels and comparing them by accuracy, measured with the `jsc` metric imported above (for single-label classification, as here, this score is the fraction of correctly classified samples).

In [15]:
# Train an SVM classifier with the RBF kernel on the training split.
clf = svm.SVC(kernel='rbf')
clf.fit(x_train, y_train)



In [16]:
# Predict the class of every held-out sample with the RBF-kernel model.
yhat = clf.predict(X=x_test)

In [17]:
# Display the predicted labels (2 = benign, 4 = malignant) for the test split.
yhat

array([2, 4, 2, 4, 4, 4, 4, 4, 4, 4, 2, 2, 4, 4, 2, 2, 4, 4, 2, 4, 2, 2,
       2, 2, 2, 2, 4, 2, 2, 2, 2, 4, 2, 4, 2, 2, 2, 4, 4, 4, 4, 4, 2, 4,
       2, 2, 2, 2, 2, 2, 4, 2, 4, 2, 4, 2, 2, 2, 4, 4, 2, 2, 2, 2, 4, 4,
       4, 4, 2, 4, 2, 4, 2, 4, 2, 4, 4, 2, 4, 4, 2, 4, 2, 2, 4, 4, 2, 4,
       4, 2, 4, 4, 2, 2, 2, 2, 2, 4, 2, 4, 4, 4, 4, 2, 4, 2, 4, 2, 4, 4,
       4, 2, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 2, 4, 2, 2, 4, 4, 2,
       2, 2, 2, 4, 4, 2, 2, 4, 4, 2, 4, 4, 2, 4, 2, 2, 2, 2, 4, 2, 2, 2,
       2, 2, 4, 2, 2, 2, 4, 2, 2, 4, 2, 2, 2, 2, 2, 4, 4, 4, 2, 2, 2, 2,
       2, 2, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 4, 2, 2, 2, 4, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 4])

In [18]:
# Score the RBF-kernel predictions against the true labels and report as a percentage.
rbf_score_pct = jsc(y_test, yhat) * 100
print("The accuracy score for our model when we use RBF kernel is:", rbf_score_pct, "%")

The accuracy score for our model when we use RBF kernel is: 96.58536585365853 %




## For the RBF kernel, the accuracy score is approximately 96.59%

In [19]:
# Train an SVM classifier with the linear kernel on the same training split.
clf = svm.SVC(kernel='linear')
clf.fit(x_train, y_train)

In [20]:
# Predict the class of every held-out sample with the linear-kernel model.
yhat1 = clf.predict(X=x_test)

In [21]:
# Display the linear-kernel model's predicted labels for the test split.
yhat1

array([2, 2, 2, 4, 4, 4, 4, 4, 4, 4, 2, 2, 4, 2, 2, 2, 4, 4, 2, 4, 2, 2,
       2, 2, 2, 2, 4, 2, 2, 2, 2, 4, 2, 4, 2, 2, 2, 4, 4, 4, 4, 4, 2, 4,
       2, 2, 2, 2, 2, 2, 4, 2, 4, 2, 4, 2, 2, 2, 4, 4, 2, 2, 2, 2, 4, 4,
       4, 4, 2, 4, 2, 4, 2, 4, 2, 4, 4, 2, 4, 4, 2, 4, 2, 2, 4, 4, 2, 4,
       2, 2, 2, 4, 2, 2, 2, 2, 2, 4, 2, 4, 4, 4, 4, 2, 4, 2, 4, 2, 4, 4,
       4, 2, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 2, 4, 2, 2, 4, 4, 2,
       2, 2, 2, 2, 4, 2, 2, 4, 4, 2, 4, 4, 2, 4, 2, 2, 2, 2, 4, 2, 2, 2,
       2, 2, 4, 2, 2, 2, 4, 2, 2, 4, 2, 2, 2, 2, 2, 4, 4, 4, 2, 2, 2, 2,
       2, 2, 4, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 4, 2, 2, 2, 4, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 4])

In [22]:
# Score the linear-kernel predictions against the true labels and report as a percentage.
linear_score_pct = jsc(y_test, yhat1) * 100
print("The accuracy score for our model when we use Linear kernel is:", linear_score_pct, "%")

The accuracy score for our model when we use Linear kernel is: 96.09756097560975 %




## The accuracy score for our model when we use Linear kernel is: 96.09756097560975 %


In [23]:
# Train an SVM classifier with the sigmoid kernel on the same training split.
clf = svm.SVC(kernel='sigmoid')
clf.fit(x_train, y_train)



In [24]:
# Predict with the sigmoid-kernel model and display the predicted labels.
yhat2 = clf.predict(X=x_test)
yhat2

array([2, 2, 4, 2, 2, 2, 2, 2, 2, 2, 2, 4, 2, 2, 2, 2, 2, 2, 4, 2, 4, 4,
       4, 4, 4, 4, 2, 4, 4, 2, 4, 2, 4, 2, 2, 4, 2, 2, 2, 2, 2, 2, 2, 2,
       4, 4, 2, 4, 2, 4, 2, 2, 2, 4, 2, 2, 2, 2, 2, 2, 4, 4, 4, 2, 2, 2,
       2, 2, 4, 2, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 2, 2, 4, 2,
       2, 2, 2, 2, 4, 4, 2, 2, 4, 2, 2, 2, 2, 2, 2, 4, 2, 4, 2, 2, 2, 2,
       2, 4, 2, 2, 4, 2, 4, 4, 4, 4, 2, 4, 2, 2, 2, 4, 2, 2, 2, 2, 2, 4,
       2, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 2, 2, 4, 2, 4, 2, 4, 4, 4,
       2, 2, 2, 4, 2, 4, 2, 2, 2, 2, 2, 4, 2, 4, 4, 2, 2, 2, 4, 2, 2, 4,
       2, 2, 2, 2, 2, 4, 2, 2, 2, 2, 2, 4, 4, 2, 2, 2, 4, 2, 4, 2, 4, 4,
       4, 2, 2, 2, 2, 2, 2])

In [25]:
# Score the predictions of the model fitted with kernel='sigmoid'.
# The message previously said "Polynomial kernel", which mislabelled the result:
# no polynomial-kernel model is trained in this notebook.
print("The accuracy score for our model when we use Sigmoid kernel is:",jsc(y_test,yhat2)*100,"%")

The accuracy score for our model when we use Polynomial kernel is: 32.19512195121951 %




## The accuracy score for our model when we use the Sigmoid kernel is: 32.19512195121951 %


# Therefore, the sigmoid kernel performs worst and the RBF kernel performs best in our case (a polynomial kernel was not evaluated)