## DATA 558: Homework 7

Geoffrey Li

May 31, 2019

In [73]:
import matplotlib.pyplot as plt
import numpy as np
import scipy
from scipy.spatial.distance import cdist

## Exercise 2

### Load and Standardize Data

In [2]:
# Read the comma-separated train / test files into 2-D float arrays.
train = np.loadtxt('./optdigits.tra', delimiter=',')
test = np.loadtxt('./optdigits.tes', delimiter=',')

# The last column is used as the target; every column before it is a feature.
X_train, y_train = train[:, :-1], train[:, -1]
X_test, y_test = test[:, :-1], test[:, -1]

In [3]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training features only, then apply that same
# transformation to both splits so the test set does not influence it.
scaler = StandardScaler()
scaler.fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [4]:
# Restrict the problem to two digit classes: 1 is the positive class (+1)
# and 0 is the negative class (-1).
pos_class = 1
neg_class = 0

# Boolean row masks that keep only samples belonging to one of the two classes.
train_subset = np.isin(y_train, [pos_class, neg_class])
test_subset = np.isin(y_test, [pos_class, neg_class])

# Apply the masks to labels and features of each split.
y_train_subset = y_train[train_subset]
X_train_subset = X_train[train_subset]

y_test_subset = y_test[test_subset]
X_test_subset = X_test[test_subset]

In [5]:
# Recode the labels as +1 (pos_class) / -1 (everything else) and store them as
# integer column vectors of shape (n, 1).
y_train_subset = np.where(y_train_subset == pos_class, 1, -1).reshape(-1, 1)
y_test_subset = np.where(y_test_subset == pos_class, 1, -1).reshape(-1, 1)

### Define Gradient Functions

Compute the gradient ∇F(α) of F.

$$ \nabla F(\alpha) = \frac{1}{n} \sum_{i=1}^n \nabla_\alpha\, \ell_{hh}\big(y_i, (K\alpha)_i\big) + 2\lambda K \alpha $$

where, with $h = 0.5$ and $K_i$ denoting the $i$-th column of $K$,

$$
\nabla_\alpha\, \ell_{hh}\big(y_i, (K\alpha)_i\big) =
\begin{cases}
      0, & \text{if}\ y_i(K\alpha)_i > 1.5 \\
      -y_i K_i \big(1.5 - y_i(K\alpha)_i\big), & \text{if}\ |1 - y_i(K\alpha)_i| \leq 0.5 \\
      -y_i K_i, & \text{if}\ y_i(K\alpha)_i < 0.5
\end{cases}
$$


In [25]:
def grad(alpha, K, y, lam):
    """Gradient of the kernel-SVM objective with the smoothed hinge loss.

    The objective is F(alpha) = (1/n) * sum_i l(y_i, (K alpha)_i) + lam * alpha^T K alpha,
    where l is the smoothed hinge loss with smoothing half-width h = 0.5.

    Parameters
    ----------
    alpha : array-like, shape (n, 1) or (n,)
        Coefficient vector.
    K : ndarray, shape (n, n)
        Gram matrix. The formula (like the original loop) treats K as symmetric.
    y : array-like, shape (n, 1) or (n,)
        Labels in {-1, +1}.
    lam : float
        Regularisation strength.

    Returns
    -------
    ndarray, shape (n, 1)
        The gradient of F at ``alpha``.
    """
    h = 0.5
    n = len(y)
    K = np.asarray(K, dtype=float)
    alpha = np.asarray(alpha, dtype=float).reshape(-1, 1)
    labels = np.asarray(y, dtype=float).reshape(-1, 1)

    t = K @ alpha
    margin = labels * t

    # Derivative of the loss with respect to t_i = (K alpha)_i, per sample.
    # Samples with margin > 1 + h contribute nothing and keep the initial zero.
    slope = np.zeros_like(margin)
    linear = margin < 1 - h
    quadratic = (margin >= 1 - h) & (margin <= 1 + h)
    slope[linear] = -labels[linear]
    # d/dt of (1 + h - y t)^2 / (4h) is -y (1 + h - y t) / (2h); the 1/(2h)
    # factor equals 1 for h = 0.5 but is kept so the formula holds for any h.
    slope[quadratic] = -labels[quadratic] * (1 + h - margin[quadratic]) / (2 * h)

    # Only a NaN margin falls outside all three regions.
    if np.isnan(margin).any():
        print('Gradient compute error.')

    # sum_i slope_i * K[:, i] is exactly K @ slope.
    return K @ slope / n + 2 * lam * t

In [7]:
def obj(alpha, K, y, lam):
    """Objective of the kernel SVM with the smoothed hinge loss.

    F(alpha) = (1/n) * sum_i l(y_i, (K alpha)_i) + lam * alpha^T K alpha, where,
    with h = 0.5 and m = y_i * (K alpha)_i,

        l = 0                       if m > 1 + h
        l = (1 + h - m)^2 / (4h)    if |1 - m| <= h
        l = 1 - m                   if m < 1 - h

    Parameters
    ----------
    alpha : array-like, shape (n, 1) or (n,)
        Coefficient vector.
    K : ndarray, shape (n, n)
        Gram matrix.
    y : array-like, shape (n, 1) or (n,)
        Labels in {-1, +1}.
    lam : float
        Regularisation strength.

    Returns
    -------
    ndarray, shape (1,)
        The objective value, kept as a one-element array as before.
    """
    h = 0.5
    n = len(y)
    K = np.asarray(K, dtype=float)
    alpha = np.asarray(alpha, dtype=float).reshape(-1, 1)
    labels = np.asarray(y, dtype=float).reshape(-1, 1)

    t = K @ alpha
    margin = labels * t

    # Per-sample loss; samples with margin > 1 + h keep the initial zero.
    loss = np.zeros_like(margin)
    linear = margin < 1 - h
    quadratic = (margin >= 1 - h) & (margin <= 1 + h)
    loss[linear] = 1 - margin[linear]
    loss[quadratic] = (1 + h - margin[quadratic]) ** 2 / (4 * h)

    # Only a NaN margin falls outside all three regions.
    if np.isnan(margin).any():
        print('Objective compute error.')

    # alpha^T (K alpha) is the quadratic penalty alpha^T K alpha.
    total = loss.sum() / n + lam * (alpha.T @ t)
    return total.reshape(1)

Write a function computegram that computes, for any set of datapoints x1, . . . , xn, the
kernel matrix K.

In [99]:
def computegram(X, kernel, **kwargs):
    """Compute the n x n kernel (Gram) matrix K[i, j] = kernel(X[i], X[j], ...).

    Parameters
    ----------
    X : sequence of n datapoints (e.g. ndarray of shape (n, d))
    kernel : callable
        Called as ``kernel(x_i, x_j, param)`` when a ``power`` or ``sigma``
        keyword is supplied (``power`` takes precedence), and as
        ``kernel(x_i, x_j)`` otherwise.
    **kwargs
        Optional ``power`` or ``sigma`` forwarded positionally to ``kernel``.

    Returns
    -------
    ndarray, shape (n, n)
    """
    # Resolve the extra kernel argument once instead of on every (i, j) pair.
    # A kernel with no hyper-parameter is evaluated as kernel(x_i, x_j); it used
    # to leave the matrix silently filled with zeros.
    if 'power' in kwargs:
        extra = (kwargs['power'],)
    elif 'sigma' in kwargs:
        extra = (kwargs['sigma'],)
    else:
        extra = ()

    n = len(X)
    K = np.zeros((n, n))
    for i in range(n):
        for j in range(n):
            K[i, j] = kernel(X[i], X[j], *extra)

    return K

Write a function kerneleval that computes, for any set of datapoints x1, . . . , xn and a
new datapoint x⋆, the vector of kernel evaluations [k(x1, x⋆), . . . , k(xn, x⋆)]T .

In [9]:
def kerneleval(X, x_star, kernel, **kwargs):
    """Evaluate the kernel between every datapoint in X and a new point x_star.

    Parameters
    ----------
    X : sequence of n datapoints (e.g. ndarray of shape (n, d))
    x_star : the new datapoint
    kernel : callable
        Called as ``kernel(x_i, x_star, param)`` when a ``power`` or ``sigma``
        keyword is supplied (``power`` takes precedence), and as
        ``kernel(x_i, x_star)`` otherwise.

    Returns
    -------
    ndarray, shape (n,)
        The vector [k(x_1, x_star), ..., k(x_n, x_star)].
    """
    # Previously only `power` was handled (anything else returned None) and the
    # result was built with the removed `np.float` alias.
    if 'power' in kwargs:
        extra = (kwargs['power'],)
    elif 'sigma' in kwargs:
        extra = (kwargs['sigma'],)
    else:
        extra = ()

    return np.array([kernel(x_i, x_star, *extra) for x_i in X], dtype=float)

You know now by heart the fast gradient algorithm, so no need to recall it here.

In [91]:
def bt_line_search(beta, K, y, lam, eta=1, alpha=0.5, betaparam=0.5, maxiter=100):
    """Backtracking line search for the step size at ``beta``.

    Starting from ``eta``, the step is multiplied by ``betaparam`` until the
    sufficient-decrease condition

        obj(beta - eta * g) < obj(beta) - alpha * eta * ||g||^2,   g = grad(beta)

    holds, or until ``maxiter`` candidate steps have been tried.

    Parameters
    ----------
    beta : ndarray, shape (n, 1)
        Point at which the step size is chosen.
    K, y, lam :
        Gram matrix, labels and regularisation strength, passed to obj / grad.
    eta : float
        Initial step size.
    alpha : float
        Sufficient-decrease constant.
    betaparam : float
        Multiplicative shrink factor applied to the step size.
    maxiter : int
        Maximum number of candidate step sizes tested.

    Returns
    -------
    float
        The accepted step size, or the last candidate tested if none was accepted.
    """
    grad_beta = grad(beta, K, y, lam)
    sq_norm_grad = np.linalg.norm(grad_beta) ** 2
    # obj(beta) does not depend on eta, so evaluate it once, not per trial.
    obj_beta = obj(beta, K, y, lam)

    for attempt in range(maxiter):
        if obj(beta - eta * grad_beta, K, y, lam) < obj_beta - alpha * eta * sq_norm_grad:
            return eta
        if attempt == maxiter - 1:
            print('Max number of iterations of backtracking line search reached')
            break
        eta *= betaparam

    return eta

def fastgradalgo(beta_init, theta_init, K, y, lam, eta_init, maxiter, eps=1e-5, verbose=False):
    """Fast (accelerated) gradient descent with backtracking line search.

    Parameters
    ----------
    beta_init, theta_init : ndarray, shape (n, 1)
        Starting values for the iterate and the momentum point.
    K, y, lam :
        Gram matrix, labels and regularisation strength, passed to obj / grad.
    eta_init : float
        Initial step size handed to the backtracking rule.
    maxiter : int
        Maximum number of iterations.
    eps : float
        Stop once the gradient norm at the current iterate is <= eps.
    verbose : bool
        When True, print the step size, gradient norm and objective each iteration.

    Returns
    -------
    list of ndarray
        The iterates beta_1, ..., beta_T in order. If no iteration is performed
        (the start already satisfies the stopping rule, or maxiter <= 0), the
        list holds a copy of ``beta_init`` so the last element is always valid.
    """
    beta = beta_init
    theta = theta_init
    eta = eta_init
    grad_theta = grad(theta, K, y, lam)
    grad_beta_norm = np.linalg.norm(grad(beta, K, y, lam))
    beta_list = []
    t = 0

    while t < maxiter and grad_beta_norm > eps:
        if verbose:
            print('Fastgradalgo Iteration:', t)

        # Each search starts from the previously accepted step size.
        eta = bt_line_search(theta, K, y, lam, eta=eta)
        if verbose:
            print('Eta:', eta)

        beta_new = theta - eta * grad_theta
        # Momentum step uses the previous iterate, so update theta before beta.
        theta = beta_new + t / (t + 3) * (beta_new - beta)
        beta = beta_new
        beta_list.append(beta)

        grad_theta = grad(theta, K, y, lam)
        # Measure convergence at the new iterate; this used to be evaluated
        # before beta was updated and so lagged one iteration behind.
        grad_beta_norm = np.linalg.norm(grad(beta, K, y, lam))
        t += 1

        if verbose:
            print('Norm of Gradient at Current Iteration:', grad_beta_norm)
            print('Objective Value at Current Iteration:', obj(beta, K, y, lam))

    if not beta_list:
        beta_list.append(np.array(beta, copy=True))

    return beta_list

In [11]:
def initstepsize(K, lam):
    """Initial step size: reciprocal of the largest eigenvalue of 2/n * K K + 2 * lam * K.

    Parameters
    ----------
    K : ndarray, shape (n, n)
        Symmetric Gram matrix.
    lam : float
        Regularisation strength.

    Returns
    -------
    float
        1 / lambda_max(2/n * K @ K + 2 * lam * K).
    """
    n = len(K)
    curvature_bound = 2 / n * np.dot(K, K) + 2 * lam * K
    # eigvalsh returns the eigenvalues of a symmetric matrix in ascending order,
    # so the last entry is the largest. This avoids the `eigvals=` keyword of
    # scipy.linalg.eigh, which has been removed from recent SciPy releases.
    return 1 / np.linalg.eigvalsh(curvature_bound)[-1]

Write a function mysvm that implements the fast gradient algorithm to train the kernel support vector machine with the smoothed hinge loss. The function takes as input the initial step-size value for the backtracking rule and a stopping criterion based on the norm of the gradient.

In [82]:
def mysvm(K, y, lam, eta_init, eps=1e-5, maxiter=1000):
    """Train the kernel SVM (smoothed hinge loss) with the fast gradient algorithm.

    Parameters
    ----------
    K : ndarray, shape (n, n)
        Gram matrix of the training data.
    y : ndarray, shape (n, 1)
        Labels in {-1, +1}.
    lam : float
        Regularisation strength.
    eta_init : float
        Initial step size for the backtracking rule.
    eps : float
        Stopping threshold on the gradient norm.
    maxiter : int
        Maximum number of fast-gradient iterations.

    Returns
    -------
    list of ndarray
        The iterates returned by ``fastgradalgo``; the last entry is the final
        coefficient vector.
    """
    n = len(K)
    alpha_init = np.zeros((n, 1))
    theta_init = np.zeros((n, 1))

    # Forward the caller's tolerance; it used to be overridden by a hard-coded 1e-3.
    return fastgradalgo(alpha_init, theta_init, K, y, lam, eta_init, maxiter=maxiter, eps=eps)

Define kernels.

In [13]:
def pth_order_poly_kernel(x, y, p):
    """Polynomial kernel of order ``p`` with unit offset: (x . y + 1) ** p.

    ``x`` must provide a ``.dot`` method (e.g. a NumPy array); the result has
    whatever shape ``x.dot(y)`` produces.
    """
    offset = 1
    inner = x.dot(y)
    return (inner + offset) ** p

### Run my implementation of SVM Classifier

In [81]:
# Regularisation strength used for this run.
lambduh = 0.1

# Gram matrix of the two-class training subset under the order-7 polynomial kernel.
K = computegram(X_train_subset, pth_order_poly_kernel, power=7)

# Initial step size for the backtracking rule, derived from K and lambduh.
eta_init = initstepsize(K, lambduh)

eta_init

2.733736490150803e-46

In [83]:
# Train on the order-7 Gram matrix; mysvm returns the list of iterates produced
# by fastgradalgo, so opt_alpha holds one coefficient vector per iteration.
opt_alpha = mysvm(K, y_train_subset, lam=lambduh, eta_init=eta_init, eps=1e-3)

Fastgradalgo Iteration: 0
Eta: 2.733736490150803e-46
Norm of Gradient at Current Iteration: 1.5551414885906986e+21
Objective Value at Current Iteration: [0.99933886]
Fastgradalgo Iteration: 1
Eta: 2.733736490150803e-46
Norm of Gradient at Current Iteration: 1.5507004275261503e+21
Objective Value at Current Iteration: [0.99884583]
Fastgradalgo Iteration: 2
Eta: 2.733736490150803e-46
Norm of Gradient at Current Iteration: 7.753814446920945e+20
Objective Value at Current Iteration: [0.99870458]
Fastgradalgo Iteration: 3
Eta: 2.733736490150803e-46
Norm of Gradient at Current Iteration: 2.908502490951307e+20
Objective Value at Current Iteration: [0.99868208]
Fastgradalgo Iteration: 4
Eta: 2.733736490150803e-46
Norm of Gradient at Current Iteration: 4.900913801033545e+19
Objective Value at Current Iteration: [0.99868102]
Fastgradalgo Iteration: 5
Eta: 2.733736490150803e-46
Norm of Gradient at Current Iteration: 1.1507679934572472e+19
Objective Value at Current Iteration: [0.99868062]
Fastgra

Norm of Gradient at Current Iteration: 1.1507679934572472e+19
Objective Value at Current Iteration: [0.99866767]
Fastgradalgo Iteration: 50
Eta: 2.733736490150803e-46
Norm of Gradient at Current Iteration: 1.1507679934572472e+19
Objective Value at Current Iteration: [0.99866719]
Fastgradalgo Iteration: 51
Eta: 2.733736490150803e-46
Norm of Gradient at Current Iteration: 1.1507679934572472e+19
Objective Value at Current Iteration: [0.9986667]
Fastgradalgo Iteration: 52
Eta: 2.733736490150803e-46
Norm of Gradient at Current Iteration: 1.1507679934572472e+19
Objective Value at Current Iteration: [0.9986662]
Fastgradalgo Iteration: 53
Eta: 2.733736490150803e-46
Norm of Gradient at Current Iteration: 1.1507679934572472e+19
Objective Value at Current Iteration: [0.99866569]
Fastgradalgo Iteration: 54
Eta: 2.733736490150803e-46
Norm of Gradient at Current Iteration: 1.1507679934572472e+19
Objective Value at Current Iteration: [0.99866518]
Fastgradalgo Iteration: 55
Eta: 2.733736490150803e-46


Norm of Gradient at Current Iteration: 1.1507679934572472e+19
Objective Value at Current Iteration: [0.99863259]
Fastgradalgo Iteration: 100
Eta: 2.733736490150803e-46
Norm of Gradient at Current Iteration: 1.1507679934572472e+19
Objective Value at Current Iteration: [0.99863166]
Fastgradalgo Iteration: 101
Eta: 2.733736490150803e-46
Norm of Gradient at Current Iteration: 1.1507679934572472e+19
Objective Value at Current Iteration: [0.99863072]
Fastgradalgo Iteration: 102
Eta: 2.733736490150803e-46
Norm of Gradient at Current Iteration: 1.1507679934572472e+19
Objective Value at Current Iteration: [0.99862976]
Fastgradalgo Iteration: 103
Eta: 2.733736490150803e-46
Norm of Gradient at Current Iteration: 1.1507679934572472e+19
Objective Value at Current Iteration: [0.99862881]
Fastgradalgo Iteration: 104
Eta: 2.733736490150803e-46
Norm of Gradient at Current Iteration: 1.1507679934572472e+19
Objective Value at Current Iteration: [0.99862784]
Fastgradalgo Iteration: 105
Eta: 2.733736490150

Norm of Gradient at Current Iteration: 1.1507679934572472e+19
Objective Value at Current Iteration: [0.99857489]
Fastgradalgo Iteration: 150
Eta: 2.733736490150803e-46
Norm of Gradient at Current Iteration: 1.1507679934572472e+19
Objective Value at Current Iteration: [0.99857351]
Fastgradalgo Iteration: 151
Eta: 2.733736490150803e-46
Norm of Gradient at Current Iteration: 1.1507679934572472e+19
Objective Value at Current Iteration: [0.99857211]
Fastgradalgo Iteration: 152
Eta: 2.733736490150803e-46
Norm of Gradient at Current Iteration: 1.1507679934572472e+19
Objective Value at Current Iteration: [0.99857071]
Fastgradalgo Iteration: 153
Eta: 2.733736490150803e-46
Norm of Gradient at Current Iteration: 1.1507679934572472e+19
Objective Value at Current Iteration: [0.9985693]
Fastgradalgo Iteration: 154
Eta: 2.733736490150803e-46
Norm of Gradient at Current Iteration: 1.1507679934572472e+19
Objective Value at Current Iteration: [0.99856788]
Fastgradalgo Iteration: 155
Eta: 2.7337364901508

Norm of Gradient at Current Iteration: 1.1507679934572472e+19
Objective Value at Current Iteration: [0.99849639]
Fastgradalgo Iteration: 199
Eta: 2.733736490150803e-46
Norm of Gradient at Current Iteration: 1.1507679934572472e+19
Objective Value at Current Iteration: [0.99849457]
Fastgradalgo Iteration: 200
Eta: 2.733736490150803e-46
Norm of Gradient at Current Iteration: 1.1507679934572472e+19
Objective Value at Current Iteration: [0.99849273]
Fastgradalgo Iteration: 201
Eta: 2.733736490150803e-46
Norm of Gradient at Current Iteration: 1.1507679934572472e+19
Objective Value at Current Iteration: [0.99849088]
Fastgradalgo Iteration: 202
Eta: 2.733736490150803e-46
Norm of Gradient at Current Iteration: 1.1507679934572472e+19
Objective Value at Current Iteration: [0.99848903]
Fastgradalgo Iteration: 203
Eta: 2.733736490150803e-46
Norm of Gradient at Current Iteration: 1.1507679934572472e+19
Objective Value at Current Iteration: [0.99848716]
Fastgradalgo Iteration: 204
Eta: 2.733736490150

Norm of Gradient at Current Iteration: 1.1507679934572472e+19
Objective Value at Current Iteration: [0.9983939]
Fastgradalgo Iteration: 249
Eta: 2.733736490150803e-46
Norm of Gradient at Current Iteration: 1.1507679934572472e+19
Objective Value at Current Iteration: [0.99839162]
Fastgradalgo Iteration: 250
Eta: 2.733736490150803e-46
Norm of Gradient at Current Iteration: 1.1507679934572472e+19
Objective Value at Current Iteration: [0.99838933]
Fastgradalgo Iteration: 251
Eta: 2.733736490150803e-46
Norm of Gradient at Current Iteration: 1.1507679934572472e+19
Objective Value at Current Iteration: [0.99838703]
Fastgradalgo Iteration: 252
Eta: 2.733736490150803e-46
Norm of Gradient at Current Iteration: 1.1507679934572472e+19
Objective Value at Current Iteration: [0.99838472]
Fastgradalgo Iteration: 253
Eta: 2.733736490150803e-46
Norm of Gradient at Current Iteration: 1.1507679934572472e+19
Objective Value at Current Iteration: [0.9983824]
Fastgradalgo Iteration: 254
Eta: 2.73373649015080

Norm of Gradient at Current Iteration: 1.1507679934572472e+19
Objective Value at Current Iteration: [0.99826877]
Fastgradalgo Iteration: 299
Eta: 2.733736490150803e-46
Norm of Gradient at Current Iteration: 1.1507679934572472e+19
Objective Value at Current Iteration: [0.99826604]
Fastgradalgo Iteration: 300
Eta: 2.733736490150803e-46
Norm of Gradient at Current Iteration: 1.1507679934572472e+19
Objective Value at Current Iteration: [0.9982633]
Fastgradalgo Iteration: 301
Eta: 2.733736490150803e-46
Norm of Gradient at Current Iteration: 1.1507679934572472e+19
Objective Value at Current Iteration: [0.99826055]
Fastgradalgo Iteration: 302
Eta: 2.733736490150803e-46
Norm of Gradient at Current Iteration: 1.1507679934572472e+19
Objective Value at Current Iteration: [0.99825779]
Fastgradalgo Iteration: 303
Eta: 2.733736490150803e-46
Norm of Gradient at Current Iteration: 1.1507679934572472e+19
Objective Value at Current Iteration: [0.99825502]
Fastgradalgo Iteration: 304
Eta: 2.7337364901508

Norm of Gradient at Current Iteration: 1.1507679934572472e+19
Objective Value at Current Iteration: [0.99812102]
Fastgradalgo Iteration: 349
Eta: 2.733736490150803e-46
Norm of Gradient at Current Iteration: 1.1507679934572472e+19
Objective Value at Current Iteration: [0.99811784]
Fastgradalgo Iteration: 350
Eta: 2.733736490150803e-46
Norm of Gradient at Current Iteration: 1.1507679934572472e+19
Objective Value at Current Iteration: [0.99811464]
Fastgradalgo Iteration: 351
Eta: 2.733736490150803e-46
Norm of Gradient at Current Iteration: 1.1507679934572472e+19
Objective Value at Current Iteration: [0.99811144]
Fastgradalgo Iteration: 352
Eta: 2.733736490150803e-46
Norm of Gradient at Current Iteration: 1.1507679934572472e+19
Objective Value at Current Iteration: [0.99810823]
Fastgradalgo Iteration: 353
Eta: 2.733736490150803e-46
Norm of Gradient at Current Iteration: 1.1507679934572472e+19
Objective Value at Current Iteration: [0.99810501]
Fastgradalgo Iteration: 354
Eta: 2.733736490150

Norm of Gradient at Current Iteration: 1.1507679934572472e+19
Objective Value at Current Iteration: [0.99795065]
Fastgradalgo Iteration: 399
Eta: 2.733736490150803e-46
Norm of Gradient at Current Iteration: 1.1507679934572472e+19
Objective Value at Current Iteration: [0.99794701]
Fastgradalgo Iteration: 400
Eta: 2.733736490150803e-46
Norm of Gradient at Current Iteration: 1.1507679934572472e+19
Objective Value at Current Iteration: [0.99794336]
Fastgradalgo Iteration: 401
Eta: 2.733736490150803e-46
Norm of Gradient at Current Iteration: 1.1507679934572472e+19
Objective Value at Current Iteration: [0.99793971]
Fastgradalgo Iteration: 402
Eta: 2.733736490150803e-46
Norm of Gradient at Current Iteration: 1.1507679934572472e+19
Objective Value at Current Iteration: [0.99793604]
Fastgradalgo Iteration: 403
Eta: 2.733736490150803e-46
Norm of Gradient at Current Iteration: 1.1507679934572472e+19
Objective Value at Current Iteration: [0.99793237]
Fastgradalgo Iteration: 404
Eta: 2.733736490150

Norm of Gradient at Current Iteration: 1.1507679934572472e+19
Objective Value at Current Iteration: [0.99775765]
Fastgradalgo Iteration: 449
Eta: 2.733736490150803e-46
Norm of Gradient at Current Iteration: 1.1507679934572472e+19
Objective Value at Current Iteration: [0.99775356]
Fastgradalgo Iteration: 450
Eta: 2.733736490150803e-46
Norm of Gradient at Current Iteration: 1.1507679934572472e+19
Objective Value at Current Iteration: [0.99774946]
Fastgradalgo Iteration: 451
Eta: 2.733736490150803e-46
Norm of Gradient at Current Iteration: 1.1507679934572472e+19
Objective Value at Current Iteration: [0.99774535]
Fastgradalgo Iteration: 452
Eta: 2.733736490150803e-46
Norm of Gradient at Current Iteration: 1.1507679934572472e+19
Objective Value at Current Iteration: [0.99774123]
Fastgradalgo Iteration: 453
Eta: 2.733736490150803e-46
Norm of Gradient at Current Iteration: 1.1507679934572472e+19
Objective Value at Current Iteration: [0.9977371]
Fastgradalgo Iteration: 454
Eta: 2.7337364901508

Norm of Gradient at Current Iteration: 1.1507679934572472e+19
Objective Value at Current Iteration: [0.99754202]
Fastgradalgo Iteration: 499
Eta: 2.733736490150803e-46
Norm of Gradient at Current Iteration: 1.1507679934572472e+19
Objective Value at Current Iteration: [0.99753748]
Fastgradalgo Iteration: 500
Eta: 2.733736490150803e-46
Norm of Gradient at Current Iteration: 1.1507679934572472e+19
Objective Value at Current Iteration: [0.99753292]
Fastgradalgo Iteration: 501
Eta: 2.733736490150803e-46
Norm of Gradient at Current Iteration: 1.1507679934572472e+19
Objective Value at Current Iteration: [0.99752836]
Fastgradalgo Iteration: 502
Eta: 2.733736490150803e-46
Norm of Gradient at Current Iteration: 1.1507679934572472e+19
Objective Value at Current Iteration: [0.99752379]
Fastgradalgo Iteration: 503
Eta: 2.733736490150803e-46
Norm of Gradient at Current Iteration: 1.1507679934572472e+19
Objective Value at Current Iteration: [0.99751921]
Fastgradalgo Iteration: 504
Eta: 2.733736490150

Eta: 2.733736490150803e-46
Norm of Gradient at Current Iteration: 1.1507679934572472e+19
Objective Value at Current Iteration: [0.99730377]
Fastgradalgo Iteration: 549
Eta: 2.733736490150803e-46
Norm of Gradient at Current Iteration: 1.1507679934572472e+19
Objective Value at Current Iteration: [0.99729877]
Fastgradalgo Iteration: 550
Eta: 2.733736490150803e-46
Norm of Gradient at Current Iteration: 1.1507679934572472e+19
Objective Value at Current Iteration: [0.99729376]
Fastgradalgo Iteration: 551
Eta: 2.733736490150803e-46
Norm of Gradient at Current Iteration: 1.1507679934572472e+19
Objective Value at Current Iteration: [0.99728875]
Fastgradalgo Iteration: 552
Eta: 2.733736490150803e-46
Norm of Gradient at Current Iteration: 1.1507679934572472e+19
Objective Value at Current Iteration: [0.99728373]
Fastgradalgo Iteration: 553
Eta: 2.733736490150803e-46
Norm of Gradient at Current Iteration: 1.1507679934572472e+19
Objective Value at Current Iteration: [0.9972787]
Fastgradalgo Iteratio

Eta: 2.733736490150803e-46
Norm of Gradient at Current Iteration: 1.1507679934572472e+19
Objective Value at Current Iteration: [0.99704288]
Fastgradalgo Iteration: 599
Eta: 2.733736490150803e-46
Norm of Gradient at Current Iteration: 1.1507679934572472e+19
Objective Value at Current Iteration: [0.99703744]
Fastgradalgo Iteration: 600
Eta: 2.733736490150803e-46
Norm of Gradient at Current Iteration: 1.1507679934572472e+19
Objective Value at Current Iteration: [0.99703198]
Fastgradalgo Iteration: 601
Eta: 2.733736490150803e-46
Norm of Gradient at Current Iteration: 1.1507679934572472e+19
Objective Value at Current Iteration: [0.99702651]
Fastgradalgo Iteration: 602
Eta: 2.733736490150803e-46
Norm of Gradient at Current Iteration: 1.1507679934572472e+19
Objective Value at Current Iteration: [0.99702104]
Fastgradalgo Iteration: 603
Eta: 2.733736490150803e-46
Norm of Gradient at Current Iteration: 1.1507679934572472e+19
Objective Value at Current Iteration: [0.99701555]
Fastgradalgo Iterati

Eta: 2.733736490150803e-46
Norm of Gradient at Current Iteration: 1.1507679934572472e+19
Objective Value at Current Iteration: [0.99675938]
Fastgradalgo Iteration: 649
Eta: 2.733736490150803e-46
Norm of Gradient at Current Iteration: 1.1507679934572472e+19
Objective Value at Current Iteration: [0.99675348]
Fastgradalgo Iteration: 650
Eta: 2.733736490150803e-46
Norm of Gradient at Current Iteration: 1.1507679934572472e+19
Objective Value at Current Iteration: [0.99674757]
Fastgradalgo Iteration: 651
Eta: 2.733736490150803e-46
Norm of Gradient at Current Iteration: 1.1507679934572472e+19
Objective Value at Current Iteration: [0.99674165]
Fastgradalgo Iteration: 652
Eta: 2.733736490150803e-46
Norm of Gradient at Current Iteration: 1.1507679934572472e+19
Objective Value at Current Iteration: [0.99673572]
Fastgradalgo Iteration: 653
Eta: 2.733736490150803e-46
Norm of Gradient at Current Iteration: 1.1507679934572472e+19
Objective Value at Current Iteration: [0.99672978]
Fastgradalgo Iterati

Eta: 2.733736490150803e-46
Norm of Gradient at Current Iteration: 1.1507679934572472e+19
Objective Value at Current Iteration: [0.99645325]
Fastgradalgo Iteration: 699
Eta: 2.733736490150803e-46
Norm of Gradient at Current Iteration: 1.1507679934572472e+19
Objective Value at Current Iteration: [0.99644689]
Fastgradalgo Iteration: 700
Eta: 2.733736490150803e-46
Norm of Gradient at Current Iteration: 1.1507679934572472e+19
Objective Value at Current Iteration: [0.99644053]
Fastgradalgo Iteration: 701
Eta: 2.733736490150803e-46
Norm of Gradient at Current Iteration: 1.1507679934572472e+19
Objective Value at Current Iteration: [0.99643416]
Fastgradalgo Iteration: 702
Eta: 2.733736490150803e-46
Norm of Gradient at Current Iteration: 1.1507679934572472e+19
Objective Value at Current Iteration: [0.99642778]
Fastgradalgo Iteration: 703
Eta: 2.733736490150803e-46
Norm of Gradient at Current Iteration: 1.1507679934572472e+19
Objective Value at Current Iteration: [0.99642139]
Fastgradalgo Iterati

Eta: 2.733736490150803e-46
Norm of Gradient at Current Iteration: 1.1507679934572472e+19
Objective Value at Current Iteration: [0.99612449]
Fastgradalgo Iteration: 749
Eta: 2.733736490150803e-46
Norm of Gradient at Current Iteration: 1.1507679934572472e+19
Objective Value at Current Iteration: [0.99611768]
Fastgradalgo Iteration: 750
Eta: 2.733736490150803e-46
Norm of Gradient at Current Iteration: 1.1507679934572472e+19
Objective Value at Current Iteration: [0.99611086]
Fastgradalgo Iteration: 751
Eta: 2.733736490150803e-46
Norm of Gradient at Current Iteration: 1.1507679934572472e+19
Objective Value at Current Iteration: [0.99610404]
Fastgradalgo Iteration: 752
Eta: 2.733736490150803e-46
Norm of Gradient at Current Iteration: 1.1507679934572472e+19
Objective Value at Current Iteration: [0.99609721]
Fastgradalgo Iteration: 753
Eta: 2.733736490150803e-46
Norm of Gradient at Current Iteration: 1.1507679934572472e+19
Objective Value at Current Iteration: [0.99609037]
Fastgradalgo Iterati

Eta: 2.733736490150803e-46
Norm of Gradient at Current Iteration: 1.1503752948217391e+19
Objective Value at Current Iteration: [0.99577311]
Fastgradalgo Iteration: 799
Eta: 2.733736490150803e-46
Norm of Gradient at Current Iteration: 1.1500509993056481e+19
Objective Value at Current Iteration: [0.99576585]
Fastgradalgo Iteration: 800
Eta: 2.733736490150803e-46
Norm of Gradient at Current Iteration: 1.1497263168300906e+19
Objective Value at Current Iteration: [0.99575859]
Fastgradalgo Iteration: 801
Eta: 2.733736490150803e-46
Norm of Gradient at Current Iteration: 1.1494012480009978e+19
Objective Value at Current Iteration: [0.99575133]
Fastgradalgo Iteration: 802
Eta: 2.733736490150803e-46
Norm of Gradient at Current Iteration: 1.1490757934230809e+19
Objective Value at Current Iteration: [0.99574405]
Fastgradalgo Iteration: 803
Eta: 2.733736490150803e-46
Norm of Gradient at Current Iteration: 1.1487456047447792e+19
Objective Value at Current Iteration: [0.99573677]
Fastgradalgo Iterati

Norm of Gradient at Current Iteration: 1.1084707892240744e+19
Objective Value at Current Iteration: [0.99541368]
Fastgradalgo Iteration: 848
Eta: 2.733736490150803e-46
Norm of Gradient at Current Iteration: 1.1074827959159958e+19
Objective Value at Current Iteration: [0.9954063]
Fastgradalgo Iteration: 849
Eta: 2.733736490150803e-46
Norm of Gradient at Current Iteration: 1.106493996361942e+19
Objective Value at Current Iteration: [0.99539892]
Fastgradalgo Iteration: 850
Eta: 2.733736490150803e-46
Norm of Gradient at Current Iteration: 1.1055043955661982e+19
Objective Value at Current Iteration: [0.99539154]
Fastgradalgo Iteration: 851
Eta: 2.733736490150803e-46
Norm of Gradient at Current Iteration: 1.1045139985267972e+19
Objective Value at Current Iteration: [0.99538416]
Fastgradalgo Iteration: 852
Eta: 2.733736490150803e-46
Norm of Gradient at Current Iteration: 1.1035228102355818e+19
Objective Value at Current Iteration: [0.99537678]
Fastgradalgo Iteration: 853
Eta: 2.73373649015080

Norm of Gradient at Current Iteration: 1.0581802433094218e+19
Objective Value at Current Iteration: [0.99504463]
Fastgradalgo Iteration: 898
Eta: 2.733736490150803e-46
Norm of Gradient at Current Iteration: 1.0571579590135138e+19
Objective Value at Current Iteration: [0.99503727]
Fastgradalgo Iteration: 899
Eta: 2.733736490150803e-46
Norm of Gradient at Current Iteration: 1.0561351121638148e+19
Objective Value at Current Iteration: [0.9950299]
Fastgradalgo Iteration: 900
Eta: 2.733736490150803e-46
Norm of Gradient at Current Iteration: 1.0551117075209286e+19
Objective Value at Current Iteration: [0.99502254]
Fastgradalgo Iteration: 901
Eta: 2.733736490150803e-46
Norm of Gradient at Current Iteration: 1.0540877498418549e+19
Objective Value at Current Iteration: [0.99501518]
Fastgradalgo Iteration: 902
Eta: 2.733736490150803e-46
Norm of Gradient at Current Iteration: 1.0530632438800314e+19
Objective Value at Current Iteration: [0.99500782]
Fastgradalgo Iteration: 903
Eta: 2.7337364901508

Norm of Gradient at Current Iteration: 1.0051520138529526e+19
Objective Value at Current Iteration: [0.99468597]
Fastgradalgo Iteration: 947
Eta: 2.733736490150803e-46
Norm of Gradient at Current Iteration: 1.0037005744826638e+19
Objective Value at Current Iteration: [0.99467873]
Fastgradalgo Iteration: 948
Eta: 2.733736490150803e-46
Norm of Gradient at Current Iteration: 1.0022482857028639e+19
Objective Value at Current Iteration: [0.9946715]
Fastgradalgo Iteration: 949
Eta: 2.733736490150803e-46
Norm of Gradient at Current Iteration: 1.0007951538284558e+19
Objective Value at Current Iteration: [0.99466427]
Fastgradalgo Iteration: 950
Eta: 2.733736490150803e-46
Norm of Gradient at Current Iteration: 9.993411851650175e+18
Objective Value at Current Iteration: [0.99465705]
Fastgradalgo Iteration: 951
Eta: 2.733736490150803e-46
Norm of Gradient at Current Iteration: 9.978863860088498e+18
Objective Value at Current Iteration: [0.99464984]
Fastgradalgo Iteration: 952
Eta: 2.733736490150803

Norm of Gradient at Current Iteration: 9.31661245270926e+18
Objective Value at Current Iteration: [0.99433227]
Fastgradalgo Iteration: 997
Eta: 2.733736490150803e-46
Norm of Gradient at Current Iteration: 9.301748993787664e+18
Objective Value at Current Iteration: [0.99432539]
Fastgradalgo Iteration: 998
Eta: 2.733736490150803e-46
Norm of Gradient at Current Iteration: 9.286880093170276e+18
Objective Value at Current Iteration: [0.99431852]
Fastgradalgo Iteration: 999
Eta: 2.733736490150803e-46
Norm of Gradient at Current Iteration: 9.27200580990161e+18
Objective Value at Current Iteration: [0.99431165]


In [84]:
# The final iterate is the last entry of the list returned by mysvm.
opt_alpha_T = opt_alpha[-1]
opt_alpha_T

array([[-3.11266546e-31],
       [-8.39134820e-30],
       [-5.80503191e-31],
       [ 1.67837211e-30],
       [-2.96690209e-31],
       [ 2.77594888e-26],
       [-4.65342418e-31],
       [ 6.72678359e-30],
       [-4.88945649e-31],
       [ 9.62367339e-30],
       [ 1.94689269e-26],
       [ 1.79622525e-30],
       [-2.26363575e-31],
       [ 2.95843607e-30],
       [-6.19378864e-31],
       [ 8.82255601e-31],
       [-8.22507999e-31],
       [ 7.13279101e-31],
       [ 5.63205974e-32],
       [-4.16414502e-31],
       [ 1.39413025e-30],
       [-5.86774660e-31],
       [ 3.60058035e-31],
       [-8.84003475e-30],
       [ 5.45891049e-30],
       [ 3.73439196e-30],
       [-2.89007417e-31],
       [-9.89140641e-32],
       [ 4.46073528e-32],
       [-1.84168657e-31],
       [-2.85560228e-31],
       [ 6.23383654e-32],
       [ 7.23393730e-32],
       [-2.21065433e-31],
       [ 1.27032314e-31],
       [ 9.71123793e-32],
       [-5.00431132e-32],
       [ 1.42564243e-31],
       [-3.6

### Performance of SVM Classifier

In [107]:
def misclassification_error(beta, X_train, X_test, y_test, kernel, **kwargs):
    """Fraction of test points whose predicted sign differs from the true label.

    For each test point x the decision value is sum_j beta_j * k(x_j, x), where
    the kernel is evaluated against all training points in a single call.

    Parameters
    ----------
    beta : array-like, shape (n_train, 1) or (n_train,)
        Fitted coefficients.
    X_train : ndarray, shape (n_train, d)
    X_test : ndarray, shape (n_test, d)
    y_test : array-like, shape (n_test, 1) or (n_test,)
        True labels in {-1, +1}.
    kernel : callable
        With ``power``: called as ``kernel(X_train, x_column, power)`` where
        ``x_column`` has shape (d, 1). With ``sigma``: called as
        ``kernel(X_train, x_row, sigma)`` where ``x_row`` has shape (1, d).
        Otherwise: ``kernel(X_train, x_column)``. It must return n_train values.

    Returns
    -------
    float
        Mean misclassification rate. A decision value of exactly 0 is given
        sign 0 and therefore counts as an error.
    """
    n_test = len(y_test)
    # Flatten the coefficients so each dot product below is a true scalar;
    # storing a size-1 array into y_vals[i] relies on deprecated NumPy behaviour.
    coef = np.asarray(beta, dtype=float).reshape(-1)
    y_vals = np.zeros(n_test)

    for i in range(n_test):
        x_i = X_test[i, :]
        if 'power' in kwargs:
            k_vec = kernel(X_train, x_i[:, np.newaxis], kwargs['power'])
        elif 'sigma' in kwargs:
            k_vec = kernel(X_train, x_i.reshape(1, -1), kwargs['sigma'])
        else:
            k_vec = kernel(X_train, x_i[:, np.newaxis])
        y_vals[i] = np.dot(np.asarray(k_vec).reshape(-1), coef)

    y_pred = np.sign(y_vals)
    return np.mean(y_pred != np.asarray(y_test).reshape(-1))

Misclassification Error of Polynomial Kernel (p = 7)

In [86]:
# Test-subset error of the final iterate under the order-7 polynomial kernel.
misclassification_error(opt_alpha_T, X_train_subset, X_test_subset, y_test_subset, 
                        pth_order_poly_kernel, power=7)

0.08333333333333333

In [None]:
# Plot misclassification error
# plt.figure(figsize=(10,8))
# plt.plot(range(0,len(opt_alpha)), list(map(lambda a: misclassification_error(
#     a, X_train_subset, X_test_subset, y_test_subset, pth_order_poly_kernel, power=7), opt_alpha)))
# plt.legend(['Test data'], loc='upper right')
# plt.title('Misclassification Error vs. Iteration t')
# plt.xlabel('Iteration t')
# plt.ylabel('Misclassification Error');

### Training with other Kernels.

Misclassification Error of Polynomial Kernel (p = 2)

In [92]:
# Same pipeline as for p = 7, now with the quadratic kernel.
lambduh = 0.1
K = computegram(X_train_subset, pth_order_poly_kernel, power=2)
eta_init = initstepsize(K, lambduh)

# mysvm returns every iterate; keep the last one as the fitted coefficients.
opt_alpha = mysvm(K, y_train_subset, lam=lambduh, eta_init=eta_init, eps=1e-3)
opt_alpha_T = opt_alpha[-1]

misclassification_error(
    opt_alpha_T, X_train_subset, X_test_subset, y_test_subset,
    pth_order_poly_kernel, power=2,
)

0.002777777777777778

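Power 2 gives a much lower test error than power 7 (about 0.3% versus 8.3%). Note that the p = 7 run above stopped at the iteration cap with a gradient norm still around 1e19 and a step size of about 2.7e-46, so its error reflects an unconverged fit rather than the kernel alone.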

Trying Gaussian RBF Kernel.

In [101]:
def gaussian_RBF_kernel(X, Z, sigma=0.5):
    """Gaussian RBF kernel value exp(-||X - Z||^2 / (2 * sigma^2)).

    All entries of ``X - Z`` are squared and summed, so a single scalar is returned.
    """
    diff = X - Z
    sq_dist = np.sum(diff ** 2)
    scale = -1 / (2 * sigma ** 2)
    return np.exp(scale * sq_dist)

In [105]:
def gaussian_RBF_gram(X, Z=None, sigma=0.5):
    """Gaussian RBF kernel matrix between the rows of X and the rows of Z.

    Parameters
    ----------
    X : ndarray, shape (n, d)
    Z : ndarray, shape (m, d), optional
        Defaults to ``X``, which yields the square Gram matrix of X.
    sigma : float
        Kernel bandwidth.

    Returns
    -------
    ndarray, shape (n, m)
        gram[i, j] = exp(-||X[i] - Z[j]||^2 / (2 * sigma^2)).
    """
    if Z is None:
        Z = X

    # All pairwise squared Euclidean distances in one call, replacing the
    # Python-level double loop over every (i, j) pair.
    sq_dists = cdist(X, Z, metric='sqeuclidean')
    return np.exp(-1 / (2 * sigma ** 2) * sq_dists)

In [109]:
# Same pipeline again, this time with the Gaussian RBF kernel (sigma = 0.5).
lambduh = 0.1
K = gaussian_RBF_gram(X_train_subset, sigma=0.5)
eta_init = initstepsize(K, lambduh)

# mysvm returns every iterate; keep the last one as the fitted coefficients.
opt_alpha = mysvm(K, y_train_subset, lam=lambduh, eta_init=eta_init, eps=1e-3)
opt_alpha_T = opt_alpha[-1]

# gaussian_RBF_gram is passed as the kernel: it accepts the whole training
# matrix together with a single test row of shape (1, d).
misclassification_error(
    opt_alpha_T, X_train_subset, X_test_subset, y_test_subset,
    gaussian_RBF_gram, sigma=0.5,
)

0.0

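On this test subset the Gaussian RBF kernel (σ = 0.5) performs best, with a misclassification error of 0.0 compared with about 0.003 for the polynomial kernel with p = 2 and about 0.083 for p = 7.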

### Training with sklearn.SVC

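This isn't an exact mirror of our problem, since `SVC` uses the standard hinge loss rather than the smoothed hinge loss, and its polynomial kernel is fitted here with `coef0=0.0` and `gamma='scale'` rather than our offset of 1. However, we can still specify a polynomial kernel of degree 7, which is probably the most important choice for getting as similar a model as possible.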

In [53]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

# Candidate values for C: powers of ten from 10**-10 up to 10**9.
parameters = dict(C=[10**exponent for exponent in range(-10, 10)])

# Degree-7 polynomial-kernel SVC, tuned with 5-fold cross-validation.
svc = SVC(kernel='poly', degree=7, gamma='scale')
grd = GridSearchCV(svc, parameters, cv=5)

# sklearn expects 1-D targets, so drop the trailing axis of the (n, 1) labels.
grd.fit(X_train_subset, np.squeeze(y_train_subset))



GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=7, gamma='scale', kernel='poly',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'C': [1e-10, 1e-09, 1e-08, 1e-07, 1e-06, 1e-05, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000, 100000, 1000000, 10000000, 100000000, 1000000000]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [54]:
# Value of C selected by the cross-validated grid search.
opt_param_svc = grd.best_params_['C']
opt_param_svc

10

In [55]:
grd.best_score_

0.9973856209150327

In [56]:
# Refit a degree-7 polynomial SVC on the full training subset with the selected C.
svc_opt = SVC(kernel='poly', degree=7, gamma='scale', C=opt_param_svc)
svc_opt.fit(X_train_subset, np.squeeze(y_train_subset))

SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=7, gamma='scale', kernel='poly',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [57]:
svc_opt.score(X_test_subset, y_test_subset)

1.0

In [58]:
# NOTE(review): X_test / y_test are the full, unsubsetted test split with the
# labels as read from the data file, while svc_opt was fit on labels recoded to
# +1 / -1. This accuracy is therefore not comparable to the subset score.
svc_opt.score(X_test, y_test)

0.10127991096271564

### Training with sklearn.SGDClassifier

This classifier supports the modified Huber loss (a smoothed hinge loss), but it fits a linear model, so no polynomial kernel can be used. I am trying it just to see what happens; performance on the two-class subset seems fine.

In [78]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import SGDClassifier

# Candidate regularisation strengths: 1e-1 down to 1e-6.
parameters = dict(alpha=10.0 ** -np.arange(1, 7))

# Linear classifier with the modified Huber loss and an L2 penalty.
# NOTE(review): no random_state is set, so results can vary between runs.
sgdc = SGDClassifier(
    loss='modified_huber',
    penalty='l2',
    l1_ratio=0,
    max_iter=1000,
    tol=1e-3,
)
grd = GridSearchCV(sgdc, parameters, cv=5)

# sklearn expects 1-D targets, so drop the trailing axis of the (n, 1) labels.
grd.fit(X_train_subset, np.squeeze(y_train_subset))

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=SGDClassifier(alpha=0.0001, average=False, class_weight=None,
       early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
       l1_ratio=0, learning_rate='optimal', loss='modified_huber',
       max_iter=1000, n_iter=None, n_iter_no_change=5, n_jobs=None,
       penalty='l2', power_t=0.5, random_state=None, shuffle=True,
       tol=0.001, validation_fraction=0.1, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'alpha': array([1.e-01, 1.e-02, 1.e-03, 1.e-04, 1.e-05, 1.e-06])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [79]:
# Value of alpha selected by the cross-validated grid search.
opt_param_sgdc = grd.best_params_['alpha']
opt_param_sgdc

0.0001

In [80]:
grd.best_score_

0.9973856209150327

In [81]:
# Refit the modified-Huber SGD classifier on the full training subset with the
# selected alpha.
sgdc_opt = SGDClassifier(
    loss='modified_huber',
    penalty='l2',
    l1_ratio=0,
    alpha=opt_param_sgdc,
    max_iter=1000,
    tol=1e-3,
)
sgdc_opt.fit(X_train_subset, np.squeeze(y_train_subset))

SGDClassifier(alpha=0.0001, average=False, class_weight=None,
       early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
       l1_ratio=0, learning_rate='optimal', loss='modified_huber',
       max_iter=1000, n_iter=None, n_iter_no_change=5, n_jobs=None,
       penalty='l2', power_t=0.5, random_state=None, shuffle=True,
       tol=0.001, validation_fraction=0.1, verbose=0, warm_start=False)

In [82]:
sgdc_opt.score(X_test_subset, y_test_subset)

1.0

In [83]:
# NOTE(review): X_test / y_test are the full, unsubsetted test split with the
# labels as read from the data file, while sgdc_opt was fit on labels recoded to
# +1 / -1. This accuracy is therefore not comparable to the subset score.
sgdc_opt.score(X_test, y_test)

0.10127991096271564

In [92]:
# First row of the fitted model's coef_ array.
# NOTE(review): presumably these are per-feature weights of the linear model
# rather than kernel coefficients like opt_alpha_T, despite the similar name -
# confirm before comparing the two.
opt_alpha_T_sgdc = sgdc_opt.coef_[0]
opt_alpha_T_sgdc

array([  0.        ,  -4.40906236,  -2.41523458,  -0.61319783,
         7.124864  ,  24.16999469,   5.31515457,  -1.7165444 ,
        -0.29971039,   0.16415272, -27.48639361,  -2.26196044,
        -2.18547596,  10.05437967,  23.10877152,  -1.94976873,
        -0.52601318,  30.86545045,  -1.6825286 ,  28.43278502,
        47.11331391,  -6.54974566, -17.16338411,  -1.43256225,
        -0.41049564, -20.2892575 , -28.64785909,  18.19545706,
        32.15702301, -18.74110328, -13.09003478,  -0.61614694,
        -0.45900818, -17.54879335, -41.86815375,  17.95591059,
        18.11163579, -14.67225408, -25.1531893 ,   0.        ,
        -1.10190168,  -0.88932153, -35.99086188,   1.59369764,
         4.97767836, -13.71017738,  -7.28863624,  -1.18026797,
        -0.8384643 ,  23.86788053,  -0.15435687, -22.41677921,
       -14.71400626, -17.26023894,   4.80357237,  23.98595268,
        -0.20516725,  12.53426782,   4.95227302, -11.53970308,
       -16.41918361,   1.53633705,   8.48480512,  55.09