# SVM

This notebook demonstrates SVM classification using OneVsRestClassifier on the Iris dataset.

Outline:

- Import the required tools
- Load the dataset
- Divide the dataset into train/test sets
- Train a classifier
- Classify using SVM
- Accuracy measures
- Precision/Recall
- AUC/ROC
- Plotting what we have learnt

In [1]:
import matplotlib.pyplot as plt
import numpy as np
from itertools import cycle

from sklearn import svm, datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier

In [None]:
# Fetch the iris data: X is the feature matrix, y holds the integer class targets
iris = datasets.load_iris()
X, y = iris.data, iris.target

# Get a feel for the data before going any further
n_samples, n_features = X.shape
print("samples = {}".format(n_samples))

# The features are sepal length, sepal width, petal length and petal width
print("features = {}".format(n_features))
print("first row = {}".format(X[0]))

# Based on the dimensions of the plant and its flower,
# each iris is classified as Setosa, Versicolour or Virginica

# Plot colors - consumed later when the per-class curves are drawn
colors = cycle(['navy', 'turquoise', 'darkorange', 'cornflowerblue', 'teal'])

# One-hot encode the targets so that every class gets its own 0/1 column,
# e.g. y = 2 becomes [0, 0, 1] and y = 0 becomes [1, 0, 0]
y = label_binarize(y, classes=[0, 1, 2])
n_classes = y.shape[1]
print(n_classes)
print(y[0])

In [None]:
# Hold out 50% of the samples for testing and train on the other 50%.
# The fixed random_state makes the split, and so the results, reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.5, random_state=8)

# Multi-class SVM via one-vs-rest (one-vs-all): a single binary SVM is fitted
# per class, so only n_classes classifiers are needed, which keeps it
# computationally efficient. It is also interpretable: since each class is
# represented by exactly one classifier, inspecting that classifier tells us
# about the class. This is the most commonly used multiclass strategy and a
# fair default choice.
# Source: sklearn documentation
base_svm = svm.SVC(C=1000, probability=True)
classifier = OneVsRestClassifier(base_svm)

# Let the classifier learn from the training half
classifier.fit(X_train, y_train)

# Decision function values for the test half (one column per class)
y_score = classifier.decision_function(X_test)

In [None]:
# Predict the binarized class labels for every test instance,
# then show the prediction for the first one
predictions = classifier.predict(X_test)
first_prediction = predictions[0]
print(first_prediction)

In [None]:
# Accuracy on the test set, reported two ways because `predictions` and
# `y_test` are binarized (one 0/1 column per class):
#   * label-wise  - fraction of individual 0/1 entries that agree. A sample
#                   with one wrong column still earns partial credit, so this
#                   number is optimistic.
#   * sample-wise - fraction of test instances whose whole label row agrees,
#                   i.e. the instances that were actually classified correctly.
matches = (predictions == y_test)
label_accuracy = np.mean(matches)
sample_accuracy = np.mean(np.all(matches, axis=1))
print("label-wise accuracy = {}".format(label_accuracy))
print("sample-wise (exact match) accuracy = {}".format(sample_accuracy))

In [None]:
# Decision function values of the first test sample: one signed score per
# class, reflecting its position relative to that class's separating hyperplane
first_scores = y_score[0]
print(first_scores)

In [None]:
# Precision-recall analysis, treating every class as its own binary problem
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score

# Per-class results, keyed by class index
precision, recall, average_precision = {}, {}, {}

for i in range(n_classes):
    # Column i of y_test / y_score (both 75 x 3) isolates class i
    truth_i = y_test[:, i]
    score_i = y_score[:, i]
    # The third return value is the list of thresholds; it is not used here
    precision[i], recall[i], _ = precision_recall_curve(truth_i, score_i)
    average_precision[i] = average_precision_score(truth_i, score_i)


In [None]:
# Draw one precision-recall curve per class on a single set of axes

# Start from an empty figure
plt.clf()
lw = 2  # line width

# `colors` is the itertools.cycle created when the data was loaded, so each
# run of this cell takes the next n_classes colors from it
for i, color in zip(range(n_classes), colors):
    curve_label = ('Precision-recall curve of class {0} (area = {1:0.2f})'
                   .format(i, average_precision[i]))
    plt.plot(recall[i], precision[i], color=color, lw=lw, label=curve_label)

# Axis labels and title
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Extension of Precision-Recall curve to multi-class')

# Axis ranges; the y-axis gets a little headroom above 1.0
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])

plt.legend(loc="lower right")
plt.show()

In [None]:
# ROC analysis: false/true positive rates and area under the curve per class
from sklearn.metrics import roc_curve, auc

# Per-class results, keyed by class index
fpr, tpr, roc_auc = {}, {}, {}

for i in range(n_classes):
    # Third return value (the thresholds) is not needed
    fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
    # Area under this class's ROC curve
    roc_auc[i] = auc(fpr[i], tpr[i])

In [None]:
# ROC curve of the class with index 2, drawn against the diagonal
plt.figure()
lw = 2
roc_label = 'ROC curve (area = %0.2f)' % roc_auc[2]
plt.plot(fpr[2], tpr[2], color='darkorange', lw=lw, label=roc_label)
# Dashed reference line from (0, 0) to (1, 1)
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')

plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC example')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.legend(loc="lower right")
plt.show()

# How to read the plot:
# - The closer the curve follows the left-hand border and then the top border
#   of the ROC space, the more accurate the test.
# - The closer the curve comes to the 45-degree diagonal of the ROC space,
#   the less accurate the test.
# - Accuracy is measured by the area under the ROC curve; an area of 1
#   represents a perfect test.

## Plotting Decision Boundaries

In [None]:
import numpy as np

# Reload iris for the 2-D decision-boundary plot.
# Note: X and y are rebound here, replacing the four-feature matrix and the
# binarized targets used in the cells above.
iris = datasets.load_iris()
# Keep only the first two features (columns 0 and 1); a dataset that is
# two-dimensional to begin with would make this slicing unnecessary
X = iris.data[:, 0:2]
y = iris.target

# Step size of the mesh
h = 0.02


In [None]:
# SVM regularization parameter
C = 1.0
# A plain SVC fitted directly on the integer targets (no OneVsRest wrapper here)
svc = svm.SVC(C=C)
svc.fit(X, y)

In [None]:
# Build the mesh to plot in: it spans each feature's range padded by 1 on
# both sides, sampled every h units
x_min = X[:, 0].min() - 1
x_max = X[:, 0].max() + 1
y_min = X[:, 1].min() - 1
y_max = X[:, 1].max() + 1

x_steps = np.arange(x_min, x_max, h)
y_steps = np.arange(y_min, y_max, h)
xx, yy = np.meshgrid(x_steps, y_steps)

In [None]:
# Plot the decision boundary. For that, we will assign a color to each
# point in the mesh [x_min, x_max]x[y_min, y_max].
# Only one classifier is drawn, so a plain single-panel figure is used: no
# subplot grid and no dependence on a loop index left over from another cell.
plt.figure()

# Predict a class for every mesh point (columns: first feature, second feature)
Z = svc.predict(np.c_[xx.ravel(), yy.ravel()])

# Put the result into a color plot
Z = Z.reshape(xx.shape)
plt.contourf(xx, yy, Z, cmap=plt.cm.coolwarm, alpha=0.8)

# Plot also the training points, colored by their true class
plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.coolwarm)
plt.xlabel('Sepal length')
plt.ylabel('Sepal width')
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
# Hide the tick marks; only the shape of the regions matters here
plt.xticks(())
plt.yticks(())
plt.title('SVM')
plt.show()