In [None]:
from collections import defaultdict
import numpy as np
import scipy
import scipy.sparse as sps
import math
import matplotlib.pyplot as plt
import time
from sklearn.datasets import load_svmlight_file
import random
%matplotlib inline

# Support Vector Machines
## Classification Using SVM
Load the dataset. We will use the w1a dataset from the LibSVM dataset collection: https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/

The original optimization problem for the Support Vector Machine (SVM) is given by
\begin{equation}\label{eq:primal}
  \min_{w \in R^d} \  \sum_{i=1}^n \ell(y_i A_i^\top w) + \frac\lambda2 \|w\|^2
\end{equation}
where $\ell : R\rightarrow R$, $\ell(z) := \max\{0,1-z\}$ is the hinge loss function.
Here for any $i$, $1\le i\le n$, the vector $A_i\in R^d$ is the $i$-th data example, and $y_i\in\{\pm1\}$ is the corresponding label.
  
The dual optimization problem for the SVM is given by 
\begin{equation}\label{eq:dual}
 \max_{\boldsymbol{\alpha} \in R^n } \  \alpha^\top\boldsymbol{1} - \tfrac1{2\lambda} \alpha^\top Y A A^\top Y\alpha
 \text{    such that    $0\le \alpha_i \le 1  \ \forall i$}
\end{equation}
where $Y := \mathop{diag}(y)$, and $A\in R^{n \times d}$ again collects all $n$ data examples as its rows. 

Note that $w$ can be derived from $\alpha$ as
\begin{equation}
    w(\alpha) = \frac{1}{\lambda} A^\top Y \alpha.
\end{equation}

In [None]:
DATA_TRAIN_PATH = 'data/w1a'

# load_svmlight_file returns the feature matrix in sparse form together with
# the label vector; the rest of the notebook indexes rows of a dense array,
# so densify once here.
features_sparse, y = load_svmlight_file(DATA_TRAIN_PATH)
A = features_sparse.toarray()
print(y.shape, A.shape)

## Prepare cost and prediction functions

In [None]:
def calculate_primal_objective(y, A, w, lambda_):
    """
    Compute the full cost (the primal objective), that is loss plus regularizer:

        sum_i max(0, 1 - y_i * A_i^T w)  +  lambda_ / 2 * ||w||^2

    Note that the hinge loss is summed over the examples, not averaged.

    y: +1 or -1 labels, shape = (num_examples)
    A: Dataset matrix, shape = (num_examples, num_features)
    w: Model weights, shape = (num_features)
    lambda_: Regularization parameter, scalar
    return: scalar value
    """
    # Signed margin y_i * A_i^T w of every example.
    margins = y * A.dot(w)
    # Hinge loss is zero for examples classified with margin >= 1.
    hinge_losses = np.maximum(0.0, 1.0 - margins)
    regularizer = 0.5 * lambda_ * np.dot(w, w)
    return np.sum(hinge_losses) + regularizer

In [None]:
def calculate_accuracy(y, A, w):
    """
    Compute the classification accuracy of the linear model w on (A, y).
    Can be called on the training set as well as on a test set.

    An example is predicted as +1 when A_i^T w > 0 and as -1 otherwise
    (so a score of exactly zero is predicted as -1).

    y: +1 or -1 labels, shape = (num_examples)
    A: Dataset matrix, shape = (num_examples, num_features)
    w: Model weights, shape = (num_features)
    return: scalar value in [0, 1], the fraction of correctly classified examples
    """
    scores = A.dot(w)
    predictions = np.where(scores > 0, 1, -1)
    return np.mean(predictions == y)

## Coordinate Descent (Ascent) for SVM

Compute the closed-form update for the i-th variable alpha, in the dual optimization problem, given alpha and the current corresponding w.


Hints: 
- Differentiate the dual objective with respect to one `alpha[i]`.
- Set the derivative to zero to compute a new `alpha[i]`.
- Make sure the values of alpha stay inside a `[0, 1]` box.
- You can formulate the update as `alpha[i] = projection(alpha[i] + lambda_ * (some update))`.
- You can test the correctness of your implementation by checking if the difference between the dual objective and primal objective goes to zero. This difference, the duality gap, should get smaller than 10 in 700000 iterations.

In [None]:
def calculate_coordinate_update(y, A, lambda_, alpha, w, i):
    """
    Compute a coordinate update (closed form) for coordinate i of the dual problem.

    The dual objective restricted to alpha[i] is a concave quadratic with
    derivative  1 - y_i * A_i^T w  and curvature  -||A_i||^2 / lambda_,
    so its unconstrained maximizer is
        alpha[i] + lambda_ * (1 - y_i * A_i^T w) / ||A_i||^2,
    which is then projected onto the box [0, 1]. w is updated incrementally so
    that it stays equal to (1 / lambda_) * A^T Y alpha.

    Both `alpha` and `w` are modified in place (and also returned).

    y: +1 or -1 labels, shape = (num_examples)
    A: Dataset matrix, shape = (num_examples, num_features)
    lambda_: Regularization parameter, scalar
    alpha: Dual variables, shape = (num_examples)
    w: Model weights, shape = (num_features)
    i: Index of the entry of the dual variable 'alpha' that is to be updated
    return: New weights w (shape (num_features)), New dual variables alpha (shape (num_examples))
    """
    # calculate the update of the coordinate at index=i.
    a_i, y_i = A[i], y[i]
    old_alpha_i = float(alpha[i])

    # Partial derivative of the dual objective with respect to alpha[i].
    gradient_i = 1.0 - y_i * a_i.dot(w)
    squared_norm = a_i.dot(a_i)

    if squared_norm > 0:
        new_alpha_i = old_alpha_i + lambda_ * gradient_i / squared_norm
        # Project back onto the feasible box [0, 1].
        new_alpha_i = min(max(new_alpha_i, 0.0), 1.0)
    else:
        # An all-zero example makes the dual linear in alpha[i] with slope 1,
        # so the maximizer is the upper bound; this also avoids dividing by 0.
        new_alpha_i = 1.0

    if new_alpha_i != old_alpha_i:
        # Keep w consistent with alpha: w = (1 / lambda_) * A^T Y alpha.
        w += (new_alpha_i - old_alpha_i) / lambda_ * y_i * a_i
        alpha[i] = new_alpha_i

    return w, alpha

In [None]:
def calculate_dual_objective(y, A, w, alpha, lambda_):
    """
    Calculate the objective for the dual problem:

        alpha^T 1  -  1 / (2 * lambda_) * || A^T Y alpha ||^2

    The value is computed directly from alpha, so it does not rely on `w`
    being in sync with alpha.

    y: +1 or -1 labels, shape = (num_examples)
    A: Dataset matrix, shape = (num_examples, num_features)
    w: Model weights, shape = (num_features); accepted for interface
       compatibility, not used in the computation
    alpha: Dual variables, shape = (num_examples)
    lambda_: Regularization parameter, scalar
    return: Scalar value
    """
    # A^T Y alpha, i.e. lambda_ * w(alpha).
    weighted_sum = A.T.dot(y * alpha)
    return np.sum(alpha) - np.dot(weighted_sum, weighted_sum) / (2.0 * lambda_)

In [None]:
def coordinate_descent_for_svm_demo(y, A, trace=False, max_iter=1000000, lambda_=0.01):
    """
    Run randomized coordinate ascent on the dual SVM problem.

    Starting from alpha = 0 and w = 0, each iteration picks one example
    uniformly at random and applies `calculate_coordinate_update` to it.
    Every 100000 iterations the primal value, dual value and duality gap are
    printed; at the end the training accuracy is printed.

    y: +1 or -1 labels, shape = (num_examples)
    A: Dataset matrix, shape = (num_examples, num_features)
    trace: If True, record the primal objective every 1000 iterations
    max_iter: Number of coordinate updates to perform
    lambda_: Regularization parameter, scalar
    return: None if trace is False, otherwise a dict with the lists
            'objective_function' (primal values) and 'iter' (iteration numbers)
    """
    history = defaultdict(list) if trace else None

    num_examples, num_features = A.shape
    w = np.zeros(num_features)
    alpha = np.zeros(num_examples)

    for it in range(max_iter):
        # i = sample one data point uniformly at random from the rows of A
        i = random.randint(0, num_examples - 1)

        w, alpha = calculate_coordinate_update(y, A, lambda_, alpha, w, i)

        log_now = it % 100000 == 0
        record_now = trace and it % 1000 == 0

        # The primal objective is a full pass over the data: evaluate it only
        # when it is actually printed or recorded, and only once per iteration.
        if log_now or record_now:
            primal_value = calculate_primal_objective(y, A, w, lambda_)

        if log_now:
            dual_value = calculate_dual_objective(y, A, w, alpha, lambda_)
            # primal dual gap
            duality_gap = primal_value - dual_value

            print('iteration=%i, primal:%.5f, dual:%.5f, gap:%.5f'%(
                    it, primal_value, dual_value, duality_gap))

        if record_now:
            history["objective_function"].append(primal_value)
            history['iter'].append(it)

    print("training accuracy = {l}".format(l=calculate_accuracy(y, A, w)))
    return history

# Run coordinate ascent on the loaded training data with tracing enabled, so
# the returned history holds the recorded primal objective values.
history_cd = coordinate_descent_for_svm_demo(y, A, trace=True)

# Stochastic gradient descent for SVM

Let's now compare it with SGD on the original (primal) problem for the SVM. In this part, you will implement stochastic gradient descent on the primal SVM objective. The stochasticity comes from sampling data points.

In [None]:
def compute_stoch_gradient_svm(A_sample, b_sample, lambda_, w_t, num_data_points):
    """
    Calculate a stochastic (sub)gradient of the primal SVM objective from a
    single sampled data point (A_sample, b_sample).

    The primal objective sums the hinge loss over all examples, so the
    contribution of the sampled example is scaled by num_data_points to make
    the estimate unbiased under uniform sampling. The regularizer gradient
    lambda_ * w_t is always added.

    A_sample: A data sample, shape=(num_features)
    b_sample: Corresponding +1 or -1 label, scalar
    lambda_: Regularization parameter, scalar
    w_t: Model weights, shape=(num_features)
    num_data_points: Total size of the dataset, scalar integer
    return: Stochastic gradient, shape=(num_features)
    """
    margin = b_sample * A_sample.dot(w_t)
    gradient = lambda_ * w_t
    # The hinge loss only has a non-zero (sub)gradient while the margin is
    # violated; at margin == 1 the subgradient 0 is chosen.
    if margin < 1:
        gradient = gradient - num_data_points * b_sample * A_sample
    return gradient

In [None]:
def stochastic_gradient_descent_svm_demo(A, b, gamma, batch_size=1, trace=False,
                                         max_iter=1000000, lambda_=0.01):
    """
    Run stochastic gradient descent on the primal SVM objective.

    Starting from w = 0, each iteration samples one data point uniformly at
    random and takes a step of size gamma along the negative stochastic
    gradient. Every 100000 iterations the primal value is printed; at the end
    the training accuracy is printed.

    A: Dataset matrix, shape = (num_examples, num_features)
    b: +1 or -1 labels, shape = (num_examples)
    gamma: Step size, scalar
    batch_size: Currently unused; exactly one sample is drawn per iteration
    trace: If True, record the primal objective every 1000 iterations
    max_iter: Number of SGD steps to perform
    lambda_: Regularization parameter, scalar
    return: None if trace is False, otherwise a dict with the lists
            'objective_function' (primal values) and 'iter' (iteration numbers)
    """
    history = defaultdict(list) if trace else None
    num_data_points, num_features = np.shape(A)

    w_t = np.zeros(num_features)

    for current_iter in range(max_iter):
        i = random.randint(0, num_data_points - 1)
        b_sample, A_sample = b[i], A[i]
        gradient = compute_stoch_gradient_svm(A_sample, b_sample, lambda_, w_t, num_data_points)
        w_t = w_t - gamma * gradient

        log_now = current_iter % 100000 == 0
        record_now = trace and current_iter % 1000 == 0

        # Evaluate the objective on the labels passed in as `b` (not on a
        # notebook-level variable), and only when the value is actually used.
        if log_now or record_now:
            primal_value = calculate_primal_objective(b, A, w_t, lambda_)

        if log_now:
            print('iteration=%i, primal:%.5f'%(
                    current_iter, primal_value))

        if record_now:
            history['objective_function'].append(primal_value)
            history['iter'].append(current_iter)

    print("training accuracy = {l}".format(l=calculate_accuracy(b, A, w_t)))
    return history


Try different step sizes and find the best one.

In [None]:
# ***************************************************
# INSERT YOUR CODE HERE
# TODO
# ***************************************************

Plot learning curves

In [None]:
# ***************************************************
# INSERT YOUR CODE HERE
# TODO
# ***************************************************

## Compare SGD with Coordinate Descent

Compare the two algorithms in terms of convergence and time complexity per iteration. Which one is easier to use?