In [1]:
# Useful starting lines
%matplotlib inline

import random
from datetime import datetime

import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2

# Support Vector Machines
## Classification Using SVM
Load dataset. We will re-use the CERN dataset from project 1, available from https://inclass.kaggle.com/c/epfml-project-1/data

In [2]:
from helpers import load_csv_data

seed = 666
DATA_TRAIN_PATH = 'data/train.csv'

np.random.seed(seed)
y, X, ids = load_csv_data(DATA_TRAIN_PATH, sub_sample=True)
print(y.shape, X.shape)

(5000,) (5000, 30)


## prepare cost function and error function

In [3]:
def support_vector(y, X, w):
    return 1 - y * (X @ w)

def clipped_support_vector(y, X, w):
    return np.clip(support_vector(y, X, w), 0, np.inf)

In [4]:
def calculate_primal_loss(y, X, w, lambda_):
    """compute the full cost (the primal objective), that is loss plus regularizer.
    X: the full dataset matrix, shape = (num_samples, num_features)
    y: the corresponding +1 or -1 labels, shape = (num_samples)
    w: shape = (num_features)
    """
    v = clipped_support_vector(y, X, w)
    return np.sum(v) + lambda_ / 2 * np.sum(w ** 2)

In [5]:
def accuracy(y1, y2):
    return np.mean(y1 == y2)

def prediction(X, w):
    return (X @ w > 0) * 2 - 1

def calculate_accuracy(y, X, w):
    """compute the full cost (the primal objective), that is loss plus regularizer.
    X: the full dataset matrix, shape = (num_samples, num_features)
    y: the corresponding +1 or -1 labels, shape = (num_samples)
    w: shape = (num_features)
    """
    predicted_y = prediction(X, w)
    return accuracy(predicted_y, y)

## Stochastic Gradient Descent for SVM

Compute the (stochastic) subgradient for the n-th summand of the SVM optimization objective

In [6]:
def calculate_stochastic_gradient(y, X, w, lambda_, n):
    """compute the stochastic gradient of loss plus regularizer.
    X: the dataset matrix, shape = (num_samples, num_features)
    y: the corresponding +1 or -1 labels, shape = (num_samples)
    w: shape = (num_features)
    N: num_samples
    """
    # Be careful about the constant N (size) term!
    # The complete objective for SVM is a sum, not an average as in earlier SGD examples!
    def is_support(y_n, x_n, w):
        """a datapoint is support if max{} is not 0. """
        return y_n * x_n @ w < 1
    
    x_n, y_n = X[n], y[n]
    grad = - y_n * x_n.T if is_support(y_n, x_n, w) else np.zeros_like(x_n.T)
    grad = np.squeeze(grad) + lambda_ / X.shape[0] * w
    return grad

Implement stochastic gradient descent: Pick a data point uniformly at random and update w based on the gradient for the n-th summand of the objective

In [7]:
def sgd_for_svm_demo(y, X):
    
    max_iter = 100
    gamma = 0.0001
    lambda_ = 0.1
    
    num_samples, num_features = X.shape
    sample_indices = range(num_samples)
    w = np.random.rand(num_features)
    
    for cur_iter in range(max_iter):
        # n = sample one data point uniformly at random data from x
        num_of_points = random.sample(sample_indices, num_samples)
        for n in num_of_points: 
            loss = calculate_primal_loss(y, X, w, lambda_)
            grad = calculate_stochastic_gradient(y, X, w, lambda_, n)
            w -= gamma * grad
        
        if cur_iter % 10 == 0:
            print("Current iteration={i}, the loss={l}".format(i=cur_iter, l=loss))
    
    print("accuracy = {l}".format(l=calculate_accuracy(y, X, w)))

start_time = datetime.now()
sgd_for_svm_demo(y, X)
end_time = datetime.now()

print('it takes {}'.format(end_time - start_time))

Current iteration=0, the loss=662340.5535586537
Current iteration=10, the loss=395162.8087122275
Current iteration=20, the loss=1793593.3868853478
Current iteration=30, the loss=488859.97792401194
Current iteration=40, the loss=376929.77122837794
Current iteration=50, the loss=427103.47353419947
Current iteration=60, the loss=413552.6945558849
Current iteration=70, the loss=1021680.4641977681
Current iteration=80, the loss=1767759.027591515
Current iteration=90, the loss=564897.6092979783
accuracy = 0.708
it takes 0:04:05.606112


## Coordinate Descent (Ascent) for SVM

Compute the closed-form update for the n-th variable alpha, in the dual optimization problem, given alpha and the current corresponding w

In [8]:
def calculate_coordinate_update(y, X, lambda_, alpha, w, n):
    """compute the stochastic gradient of loss plus regularizer.
    X: the dataset matrix, shape = (num_samples, num_features)
    y: the corresponding +1 or -1 labels, shape = (num_samples)
    w: shape = (num_features)
    N: num_samples
    """        
    # calculate the update of coordinate at index=n.
    x_n, y_n = X[n], y[n]
    old_alpha_n = alpha[n]

    g = 1.0 * (y_n * x_n.dot(w) - 1)

    if old_alpha_n == 0:
        g = min(g, 0)
    elif old_alpha_n == 1.0 / lambda_:
        g = max(g, 0)
    else:
        g = g
    if g != 0:
        alpha[n] = min(
            max(old_alpha_n - g / (x_n.T.dot(x_n)), 0.0),
            1.0 / lambda_)
    
        # compute the corresponding update on the primal vector w
        w += 1.0 * (alpha[n] - old_alpha_n) * y_n * x_n
    return w, alpha

In [10]:
def calculate_dual_loss(y, X, w, alpha, lambda_):
    """calculate the loss for dual problem."""
    return lambda_ * np.sum(alpha) - lambda_ / 2 * np.sum(w ** 2)

In [11]:
def coordinate_descent_for_svm_demo(y, X):
    max_iter = 10000
    lambda_ = 0.1
    
    num_samples, num_features = X.shape
    sample_indices = range(num_samples)
    w = np.zeros(num_features)
    alpha = np.zeros(num_samples)
    
    for cur_iter in range(max_iter):
        # n = uniformly random data point from x
        num_of_points = random.sample(sample_indices, num_samples)

        for iteration in num_of_points:
            n = sample_indices[iteration]
            w, alpha = calculate_coordinate_update(y, X, lambda_, alpha, w, n)

        if cur_iter % 1000 == 0:
            # primal objective
            primal_value = calculate_primal_loss(y, X, w, lambda_)
            # dual objective
            dual_value = calculate_dual_loss(y, X, w, alpha, lambda_)
            # primal dual gap
            duality_gap = primal_value - dual_value
            print('cur_iter:%i, primal:%.5f, dual:%.5f, gap:%.5f'%(
                    cur_iter, primal_value, dual_value, duality_gap))

    print("accuracy = {l}".format(l=calculate_accuracy(y, X, w)))

start_time = datetime.now()
coordinate_descent_for_svm_demo(y, X)
end_time = datetime.now()

print('it takes {}'.format(end_time - start_time))

cur_iter:0, primal:3807.29140, dual:0.00049, gap:3807.29091
cur_iter:1000, primal:4535.83538, dual:0.41155, gap:4535.42384
cur_iter:2000, primal:6622.38311, dual:0.81297, gap:6621.57013
cur_iter:3000, primal:3802.04106, dual:1.21119, gap:3800.82988
cur_iter:4000, primal:3132.62319, dual:1.60773, gap:3131.01546
cur_iter:5000, primal:3654.16174, dual:2.00418, gap:3652.15756
cur_iter:6000, primal:4517.17608, dual:2.40025, gap:4514.77583
cur_iter:7000, primal:3408.09117, dual:2.79624, gap:3405.29493
cur_iter:8000, primal:3911.54770, dual:3.19182, gap:3908.35588
cur_iter:9000, primal:3830.62083, dual:3.58723, gap:3827.03361
accuracy = 0.6012
it takes 0:30:57.292089
