In [1]:
import os
import sys

import numpy as np
import plotly.graph_objects as go
from scipy import stats
# make_blobs is imported from the public sklearn.datasets namespace: the
# sklearn.datasets.samples_generator module path was deprecated and has been
# removed in later scikit-learn releases, while this path works on old and new.
from sklearn.datasets import make_blobs

# Put the parent of the current working directory on sys.path (once) so the
# project-local `erudition` package below can be imported from this notebook.
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
from erudition.learning.helpers.plots.plotly_render import render, scatter, shape

In [2]:
# Draw 50 points around two cluster centres; the fixed random_state keeps the
# sample identical on every run.
X, y = make_blobs(
    n_samples=50,
    centers=2,
    cluster_std=0.6,
    random_state=0,
)

# Colour every point by its cluster label: label 0 -> orange, label 1 -> white.
palette = ('orange', 'white')
point_colors = [palette[label] for label in y]

data_plot = scatter(
    X[:, 0],
    X[:, 1],
    'Circle Blobs',
    mode='markers',
    size=5,
    opacity=1,
    color=point_colors,
)
fig = go.Figure(data=[data_plot])
render(fig, title='Circle Blobs', height=600, width=600)

# The Problem

A linear discriminative classifier would attempt to draw a straight line separating the two sets of data, 
and thereby create a model for classification. For two-dimensional data like that shown here, this is a 
task we could do by hand. But immediately we see a problem: there is more than one possible dividing line 
that can perfectly discriminate between the two classes!

We can draw them as follows:

In [3]:
# x coordinates shared by every candidate line (linspace defaults to 50 samples)
xfit = np.linspace(-1, 3.5)

# (gradient, intercept) pairs picked by eye so that each line separates the two blobs
candidate_lines = [(1, 0.65), (0.5, 1.6), (-0.2, 2.9)]

# start from the blob scatter and add one line trace per candidate
data = [data_plot]
data.extend(
    scatter(xfit, gradient * xfit + intercept, 'Line', mode='lines')
    for gradient, intercept in candidate_lines
)

# one extra point placed in the region where the candidate lines disagree
new_point = scatter([0.6], [2.1], 'Point', mode='markers', size=10, color='yellow', opacity=1)
data.append(new_point)

fig = go.Figure(data=data)
render(fig, title='Classification Split Example', height=600, width=600)

These three lines perfectly discriminate between the samples. However, you can see our new data point would be assigned a different classification, depending on which line we select.

# Support Vector Machines
## Maximizing the Margin

Support vector machines offer one way to improve on this. The intuition is this: rather than simply drawing a zero-width line between the classes, we can draw around each line a margin of some width, up to the nearest point. Here is an example of how this might look:

In [4]:
# setup the x coordinates
xfit = np.linspace(-1, 3.5)

# Draw the same three candidate lines, this time each with a shaded band around it.
# Each tuple is (gradient, intercept, band half-height), all obtained visually;
# d is applied as a vertical offset above and below the line.
data = [data_plot]
for m, b, d in [(1, 0.65, 0.33), (0.5, 1.6, 0.55), (-0.2, 2.9, 0.2)]:
    y_fit = m * xfit + b

    # Closed polygon for fill='toself': walk the upper edge left-to-right,
    # then the lower edge right-to-left. Plain array arithmetic replaces the
    # per-element map/lambda and yields the same coordinates.
    data.append(go.Scatter(
        x=np.concatenate([xfit, xfit[::-1]]),
        y=np.concatenate([y_fit + d, y_fit[::-1] - d]),
        fill='toself',
        line_color='rgba(255,255,255,0)',  # fully transparent outline; only the fill shows
        showlegend=False,
        opacity=0.2,
    ))

    # the candidate line itself, drawn after (on top of) its band
    data.append(scatter(xfit, y_fit, 'Line', mode='lines'))

# let's also add a point to the plot that lies between each classification
data.append(scatter([0.6], [2.1], 'Point', mode='markers', size=10, color='yellow', opacity=1))
fig = go.Figure(data=data)
render(fig, title='Classification Split Example<br>(With Margins)', height=600, width=600)

In support vector machines, the line that maximizes this margin is the one we will choose as the optimal model. Support vector machines are an example of such a maximum margin estimator.

# Fitting a support vector machine
Let's see the result of an actual fit to this data: we will use Scikit-Learn's support vector classifier to train an SVM model on this data. For the time being, we will use a linear kernel and set the C parameter to a very large number (we'll discuss the meaning of these in more depth momentarily).

In [5]:
from sklearn.svm import SVC

# Linear-kernel support vector classifier with a very large C, trained on the
# blob data. The fit call is left as the final expression so the notebook
# still displays the fitted estimator.
model = SVC(C=1e10, kernel='linear')
model.fit(X, y)

SVC(C=10000000000.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [6]:
# Axis samples for the evaluation grid. These are deliberately NOT named
# x / y / X / Y: X and y hold the training data created by make_blobs, and
# rebinding them here would make a re-run of model.fit(X, y) (or any later use
# of the training set) silently operate on the grid instead.
grid_x = np.linspace(-1, 3.5, 30)
grid_y = np.linspace(-1, 6, 30)

# create a grid to evaluate the model
grid_Y, grid_X = np.meshgrid(grid_y, grid_x)
# one (x, y) row per grid node, the 2-column layout decision_function is given
grid_points = np.vstack([grid_X.ravel(), grid_Y.ravel()]).T
# decision-function value at every grid node, reshaped back onto the grid;
# computed here but not drawn by this cell
P = model.decision_function(grid_points).reshape(grid_X.shape)

fig = go.Figure(data=[data_plot])

# one layout shape per support vector, positioned at its two coordinates
shapes = [shape(sv[0], sv[1]) for sv in model.support_vectors_]

fig.update_layout(shapes=shapes)
render(fig, title='Show Support Vectors', height=600, width=600)