# Python Machine Learning In Biology:
# Support Vector Machines

We'll build a support vector machine (SVM) classifier that predicts the `diagnosis` label from the features in the `cancer.csv` dataset.

#### Import modules

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder

#### Read in dataset 

In [3]:
# Load the cancer dataset from the relative data/ directory into a DataFrame.
cancer = pd.read_csv("data/cancer.csv")

In [4]:
# Preview the first rows to check the column names and value types.
cancer.head()

Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [4]:
# (rows, columns) of the loaded table.
cancer.shape

(569, 31)

#### Store features as "X"

In [5]:
# Every column except the 'diagnosis' label is used as a feature.
# `columns=` replaces the positional axis argument (`drop(labels, 1)`), which
# pandas deprecated in 1.3 and rejects with a TypeError from 2.0 onward.
X = cancer.drop(columns=['diagnosis'])


#### Store the response as "y" and encode it as numbers

In [6]:
# Encode the diagnosis labels as integers; the fitted encoder is kept in
# `labelencoder_Y` so the mapping stays available.
labelencoder_Y = LabelEncoder()
y = labelencoder_Y.fit_transform(cancer["diagnosis"])

#### Split dataset into training set and test set

In [7]:
# Hold out 30% of the samples for testing (70% train); the fixed seed makes
# the split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=109,
)

#### Create an SVM classifier with a linear kernel

In [9]:
# Support vector classifier with a linear kernel; all other settings are left
# at their library defaults.
clf = svm.SVC(kernel="linear")

#### Train the model using the training sets

In [10]:
# Fit on the training portion only; the fitted estimator is the cell's output.
clf.fit(X=X_train, y=y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

#### Predict the response for test dataset

In [11]:
# Predicted (encoded) labels for the held-out test features.
y_pred = clf.predict(X_test)

#### Evaluate the Model

Model Accuracy: how often is the classifier correct?

In [12]:
# Fraction of test samples whose predicted label matches the true label.
print(
    "Accuracy:",
    metrics.accuracy_score(y_true=y_test, y_pred=y_pred),
)

Accuracy: 0.9649122807017544


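Model Precision (AKA positive predictive value): of the samples predicted positive, what fraction are actually positive?  
Model Recall (AKA sensitivity): of the samples that are actually positive, what fraction does the classifier find?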

*We'll talk more in depth about these when we talk about evaluation metrics*

<img src = "precisionrecall.png"/>

In [14]:
# Both scores use the library's default positive class (the label encoded as 1).
print(
    "Precision:",
    metrics.precision_score(y_true=y_test, y_pred=y_pred),
)
print(
    "Recall:",
    metrics.recall_score(y_true=y_test, y_pred=y_pred),
)

Precision: 0.9384615384615385
Recall: 0.9682539682539683


# Independent Practice

Compare three SVMs with different kernels on the iris data.
* Gaussian (RBF)
* Linear
* Poly of degree 3

In [15]:
from sklearn import preprocessing
# Load the iris dataset from the relative data/ directory into a DataFrame.
iris = pd.read_csv("data/iris.csv")

In [16]:
# Preview the first rows to check the feature columns and the species label.
iris.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [17]:
# Take the species column as the response and create the encoder that will
# turn its string labels into integers.
y = iris["species"]
le = preprocessing.LabelEncoder()

In [18]:
# Fit the encoder on the response and rebind `y` to the encoded labels.
y = le.fit_transform(y)

In [19]:
# Every column except the 'species' label is used as a feature.
# `columns=` replaces the positional axis argument (`drop(labels, 1)`), which
# pandas deprecated in 1.3 and rejects with a TypeError from 2.0 onward.
X = iris.drop(columns=['species'])

In [20]:
# SVM regularization parameter, shared by all three classifiers below so the
# kernels are compared at the same C.
C = 1.0

In [21]:
# Gaussian (RBF) kernel classifier with an explicit kernel coefficient.
rbf_svc = svm.SVC(
    C=C,
    kernel="rbf",
    gamma=0.7,
)

In [22]:
# Fit on the full feature matrix and response (no train/test split here).
rbf_svc.fit(X, y)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.7, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [23]:
# Linear-kernel classifier using the shared regularization strength.
svc = svm.SVC(
    C=C,
    kernel="linear",
)

In [24]:
# Fit on the full feature matrix and response (no train/test split here).
svc.fit(X, y)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [25]:
# Degree-3 polynomial-kernel classifier using the shared regularization strength.
poly_svc = svm.SVC(
    C=C,
    kernel="poly",
    degree=3,
)

In [26]:
# Fit on the full feature matrix and response (no train/test split here).
poly_svc.fit(X, y)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='poly',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)