In [None]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.datasets.samples_generator import make_blobs

# Support Vector Machines

A method that builds on the linear regression we've seen so far is the Support Vector Machine (SVM). This algorithm can be used for both classification and regression, but let's look at a classification example to understand how it works. Let's assume we have data with two classes:

In [None]:
# Draw 50 points from two blobs; the fixed random_state keeps the sample
# identical from run to run.
X, y = make_blobs(
    n_samples=50,
    centers=2,
    cluster_std=0.60,
    random_state=0,
)

# Scatter the two features against each other, coloured by class label.
fig, ax = plt.subplots(figsize=(10, 5))
ax.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='viridis');

When fitting a linear model, there are many lines that separate our training data:

In [None]:
# Hand-picked (slope, intercept) pairs; each one is drawn as a solid black
# line on top of the training points.
candidate_lines = [(1, 0.65), (0.5, 1.6), (-0.2, 2.9)]

# x positions at which every line is evaluated (np.linspace default sample count).
xfit = np.linspace(-1, 3.5)

plt.figure(figsize=(10, 5))
plt.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='viridis')
for slope, intercept in candidate_lines:
    plt.plot(xfit, slope * xfit + intercept, '-k')

How can we make sure the line we have is the best one? To do so, we'll define an area around the line as the "margin":

In [None]:
xfit = np.linspace(-1, 3.5)
plt.figure(figsize=(10, 5))
plt.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='viridis')

# (slope, intercept, half-width of the shaded band) for each candidate line.
banded_lines = [(1, 0.65, 0.33), (0.5, 1.6, 0.55), (-0.2, 2.9, 0.2)]

for slope, intercept, half_width in banded_lines:
    yfit = slope * xfit + intercept
    lower_edge = yfit - half_width
    upper_edge = yfit + half_width
    # The line itself, then a translucent grey band of +/- half_width around it.
    plt.plot(xfit, yfit, '-k')
    plt.fill_between(xfit, lower_edge, upper_edge,
                     edgecolor='none', color='#AAAAAA', alpha=0.4)

The data points that lie exactly on the edges of the margin are called "support vectors". The insight with SVM is that we can constrain our fitting method so that the margin is determined by the data itself. For example, we can require that each edge of the margin passes directly through at least one data point.

In [None]:
from sklearn.svm import SVC
# Linear-kernel classifier with a very large C (1e10); the effect of C on the
# fit is discussed later in this notebook.
model = SVC(kernel='linear', C=1e10)
# Fit on the two-class blob data. Kept as the cell's final bare expression, so
# the notebook echoes whatever fit() returns.
model.fit(X, y)

In [None]:
from figures.plot_svc import plot_svc_decision_function

In [None]:
# Training points coloured by class, with the fitted model's decision function
# drawn on top by the imported helper.
plt.figure(figsize=(10, 5))
first_feature, second_feature = X[:, 0], X[:, 1]
plt.scatter(first_feature, second_feature, c=y, s=50, cmap='viridis')
plot_svc_decision_function(model);

Now we can be sure that the line we have is based on the data, and not just randomness from the fitting method. 

SVMs also address one of the main issues we've seen so far: data is usually not linear. For example, sometimes data is more like a circle:

In [None]:
# make_circles is exported by sklearn.datasets itself; the old
# sklearn.datasets.samples_generator module path was removed in
# scikit-learn 0.24.
from sklearn.datasets import make_circles

# Concentric-circle data. random_state is pinned (as in the make_blobs calls
# elsewhere in this notebook) so the figure is the same on every run.
X, y = make_circles(100, factor=.1, noise=.1, random_state=0)

# A linear kernel is fitted here on purpose, to show how it behaves on
# circular data before the kernel trick is introduced.
clf = SVC(kernel='linear').fit(X, y)
plt.figure(figsize=(10,5))
plt.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='viridis')
plot_svc_decision_function(clf, plot_support=False);

To overcome this problem, SVMs use what is called the "kernel trick". In this method, the data is modified by a function, called a kernel. The first kernel we'll look at is called a "Radial Basis Function":

In [None]:
# Same estimator class as before, now with the RBF kernel and C=1E6.
clf = SVC(kernel='rbf', C=1E6)
# Fit on the current X, y. Kept as the cell's final bare expression, so the
# notebook echoes whatever fit() returns.
clf.fit(X, y)

The radial basis function is able to transform the data from circular into linear, which allows our linear classification method to find a minimum. Once the classifier is found, the same kernel can be used to transform the classifier to match the original data.

In [None]:
plt.figure(figsize=(10,5))
plt.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='viridis')
plot_svc_decision_function(clf)
# Ring the support vectors with large hollow circles. The edge colour must be
# given explicitly: on matplotlib >= 2.0 the default edge colour follows the
# face colour, so facecolors='none' alone draws nothing visible.
plt.scatter(clf.support_vectors_[:, 0], clf.support_vectors_[:, 1],
            s=300, lw=1, facecolors='none', edgecolors='k');

The choice of kernel is an important parameter in SVMs. Another important parameter is C, which determines the "rigidity" of our support vectors. A smaller C will allow for more data points to be within the boundaries, which can be good for data which is tightly mixed together.

In [None]:
# Two overlapping blobs (wider cluster_std than the first example).
X, y = make_blobs(n_samples=100, centers=2,
                  random_state=0, cluster_std=0.8)

# One panel per value of C, side by side.
fig, ax = plt.subplots(1, 2, figsize=(16, 6))
fig.subplots_adjust(left=0.0625, right=0.95, wspace=0.1)

for axi, C in zip(ax, [10.0, 0.1]):
    model = SVC(kernel='linear', C=C).fit(X, y)
    axi.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='autumn')
    plot_svc_decision_function(model, axi)
    # Ring the support vectors. The edge colour must be given explicitly: on
    # matplotlib >= 2.0 the default edge colour follows the face colour, so
    # facecolors='none' alone draws nothing visible.
    axi.scatter(model.support_vectors_[:, 0],
                model.support_vectors_[:, 1],
                s=300, lw=1, facecolors='none', edgecolors='k')
    axi.set_title('C = {0:.1f}'.format(C), size=14)

Kernels can be any transformation function. Let's look at multiplying our data by a matrix:

In [None]:
from sklearn import svm, datasets

# Load the iris dataset bundled with scikit-learn.
iris = datasets.load_iris()
# Keep only the first two feature columns so the points can be drawn in 2-D.
X = iris.data[:, :2]
# Class labels (upper-case Y here, unlike the lower-case y of earlier cells).
Y = iris.target

# Fixed 2x2 matrix used by the custom kernel below; the same matrix is also
# applied directly to the data when plotting the "Transformed" panel.
M = np.array([
    [0.5, 1.0],
    [0, 1.6],
])


def my_kernel(X, Y):
    """Custom kernel k(X, Y) = X . M . Y^T using the module-level matrix M.

    X and Y each need two columns so that they are compatible with the 2x2
    matrix M. For X with n_x rows and Y with n_y rows, the result has shape
    (n_x, n_y).
    """
    projected = np.dot(X, M)
    return np.dot(projected, Y.T)

# Apply the kernel matrix to the raw features so the two views can be compared.
transformed = np.dot(X, M)

fig, ax = plt.subplots(1, 2, figsize=(16, 6))
# Left panel: original features; right panel: features after multiplying by M.
for panel, points, title in zip(ax, (X, transformed), ("Data", "Transformed")):
    panel.scatter(points[:, 0], points[:, 1],
                  c=Y, cmap=plt.cm.Paired, edgecolors='k')
    panel.set_title(title)

# Baseline: built-in linear kernel, scored on the data it was fitted to.
linear_clf = svm.SVC(kernel='linear')
linear_clf.fit(X, Y)
# Labelled as the linear kernel so the two printed scores can be told apart
# (this line previously also said "Custom kernel score").
print("Linear kernel score: ", linear_clf.score(X, Y))

# Same data, with the callable my_kernel used as the kernel.
custom_clf = svm.SVC(kernel=my_kernel)
custom_clf.fit(X, Y)
print("Custom kernel score: ", custom_clf.score(X, Y))

<div class="alert alert-success">
    <b>EXERCISE</b>:
     <ul>
      <li>
      Download ``03_svm_iris.py`` from the course website. Here there is a custom kernel called "my_kernel." Modify the values in the kernel, or define a new operation, to try to maximize the classification score. You can also try the 'linear', 'poly', and 'rbf' kernels for comparison. Modify C to also get a better classification score.
      </li>
    </ul>
</div>

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm, datasets

# import some data to play with
# Load the iris dataset bundled with scikit-learn.
iris = datasets.load_iris()
X = iris.data[:, :2]  # Only the first two features are kept; a dataset that
                      # is already two-dimensional would not need this slice.
# Class labels for the rows of X.
Y = iris.target


def my_kernel(X, Y):
    """
    Custom kernel that weights the first feature twice as much as the second:

                 (2  0)
    k(X, Y) = X  (    ) Y.T
                 (0  1)

    X and Y each need two columns to be compatible with the 2x2 matrix; for
    X with n_x rows and Y with n_y rows the result has shape (n_x, n_y).
    """
    weights = np.array([[2, 0], [0, 1.0]])
    weighted = np.dot(X, weights)
    return np.dot(weighted, Y.T)



h = .02  # spacing between neighbouring mesh points

# Build an SVM that uses the custom kernel and fit it to our data.
clf = svm.SVC(kernel=my_kernel).fit(X, Y)

# To draw the decision regions, predict a class for every point of a mesh
# covering [x_min, x_max] x [y_min, y_max] (the data range padded by 1).
first_feature, second_feature = X[:, 0], X[:, 1]
x_min, x_max = first_feature.min() - 1, first_feature.max() + 1
y_min, y_max = second_feature.min() - 1, second_feature.max() + 1
mesh_x = np.arange(x_min, x_max, h)
mesh_y = np.arange(y_min, y_max, h)
xx, yy = np.meshgrid(mesh_x, mesh_y)

# One row per mesh point; predictions are folded back into the mesh shape so
# they can be shown as a colour plot.
mesh_points = np.column_stack((xx.ravel(), yy.ravel()))
Z = clf.predict(mesh_points).reshape(xx.shape)
plt.pcolormesh(xx, yy, Z, cmap=plt.cm.Paired)

# Overlay the training points on the predicted regions.
plt.scatter(first_feature, second_feature, c=Y,
            cmap=plt.cm.Paired, edgecolors='k')
plt.title('3-Class classification using Support Vector Machine '
          'with custom kernel')
plt.axis('tight')
plt.show()

print("Classification score: ", clf.score(X, Y))

## Regression

Support vector machines can also easily be used for regression. The idea is similar: when fitting the line, use data points to determine the margins on the line. Kernels can also be used to transform the data to achieve a better fit.

In [None]:
from sklearn.svm import SVR

# Dedicated, seeded generator so the sample, the noise and therefore the
# figure are identical on every run (the global np.random state is untouched).
rng = np.random.RandomState(0)

# Generate sample data: 40 sorted x values in [0, 5) with a scaled sine target.
X = np.sort(5 * rng.rand(40, 1), axis=0)
y = 5 * np.sin(X).ravel()

# Add noise to every fifth target. The number of noise values is taken from
# the slice itself, so it stays correct if the sample size changes.
y[::5] += 3 * (0.5 - rng.rand(y[::5].size))

# Fit one regression model per kernel and predict on the training inputs.
svr_rbf = SVR(kernel='rbf', gamma=0.1)
svr_lin = SVR(kernel='linear')
svr_poly = SVR(kernel='poly', degree=2)
y_rbf = svr_rbf.fit(X, y).predict(X)
y_lin = svr_lin.fit(X, y).predict(X)
y_poly = svr_poly.fit(X, y).predict(X)

# Look at the results: raw data as points, each model's prediction as a line.
plt.figure(figsize=(10,5))
plt.scatter(X, y, color='darkorange', label='data')
plt.plot(X, y_rbf, color='navy', label='RBF model')
plt.plot(X, y_lin, color='c', label='Linear model')
plt.plot(X, y_poly, color='cornflowerblue', label='Polynomial model')
plt.xlabel('data')
plt.ylabel('target')
plt.title('Support Vector Regression')
plt.legend()
plt.show()

<div class="alert alert-success">
    <b>EXERCISE</b>:
     <ul>
      <li>
      Download ``03_svr_diabetes.py`` from the course website. None of the kernels seem to be fitting the data very well. Do you have any guesses why? Try to fix it so that you get a better fit.
      </li>
    </ul>
</div>

In [None]:
%matplotlib inline
import numpy as np
from sklearn.svm import SVR
import matplotlib.pyplot as plt
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

diabetes = datasets.load_diabetes()
feature = 3  # column of X used as the x-axis of the plot below

X = diabetes.data
y = diabetes.target

# Hold out a quarter of the rows for testing; the split is seeded.
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.25,
                                                    random_state=1234)
# Fit regression model
# NOTE: the kernel settings are deliberately left unchanged -- improving the
# fit is the task set by the accompanying exercise.
svr_lin = SVR(kernel='linear')
svr_rbf = SVR(kernel='rbf', gamma=0.1)
svr_poly = SVR(kernel='poly', degree=2)
y_lin = svr_lin.fit(X_train, y_train).predict(X_train)
y_rbf = svr_rbf.fit(X_train, y_train).predict(X_train)
y_poly = svr_poly.fit(X_train, y_train).predict(X_train)

# Report train and test mean squared error per model. Each row pairs a model
# with its own training predictions, so every test error is computed with the
# matching estimator (the polynomial line previously used svr_rbf by mistake).
for label, svr, y_fit in (("Linear", svr_lin, y_lin),
                          ("RBF", svr_rbf, y_rbf),
                          ("Polynomial", svr_poly, y_poly)):
    print(label + " train error: ", mean_squared_error(y_train, y_fit),
          " test error: ", mean_squared_error(y_test, svr.predict(X_test)))

# Plot the targets and each model's training predictions against one feature.
plt.figure(figsize=(20,10))
plt.scatter(X_train[:, feature], y_train, color='darkorange', label='data')
plt.scatter(X_train[:, feature], y_lin, color='c', label='Linear model')
plt.scatter(X_train[:, feature], y_rbf, color='navy', label='RBF model')
plt.scatter(X_train[:, feature], y_poly, color='cornflowerblue', label='Polynomial model')
plt.xlabel('data')
plt.ylabel('target')
plt.title('Support Vector Regression')
plt.legend()
plt.show()