In [1]:
from IPython.display import IFrame

In [2]:
IFrame("hw8.pdf", width=1000, height=1000)

## Problem 1

__[d]__

This was a QP problem with $d+1$ variables.

## Polynomial Kernels

In [3]:
import os
import random
import numpy as np
import matplotlib.path as mpath
import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
from scipy.optimize import minimize
from sklearn.svm import SVC
from sklearn import cross_validation



In [8]:
train_file = 'features.train'
test_file = 'features.test'

In [10]:
def load_data(filename):    
    with open(filename, "r") as f:
        data = []
        for line in f:
            if line.strip():
                y, x1, x2 = line.split()
                data.append([ int(float(y)), float(x1), float(x2) ])
    return data

In [11]:
data_train = load_data(train_file)
data_test = load_data(test_file)

In [12]:
C = 0.01
Q = 2

In [13]:
svm = SVC(C=C, kernel='poly', degree = Q, coef0 = 1.0, gamma = 1.0)

In [14]:
def k_vs_all(k, data):
    """
    k-vs-all: recall in 1-vs-all one digit has class +1 w/ rest -1.
    k is just number k--so basically if the data pt has value k (e.g. 1, etc.) we label it with +1 in new y
    """
    y = [data[i][0] for i in range(len(data))]
    X = [[data[i][1], data[i][2]] for i in range(len(data))]
    
    y_new = []
    ct = 0
    for i in range(len(y)):
        if y[i] == k:
            y_new.append(1.0)
            ct += 1
        else:
            y_new.append(-1.0)
            
    return y_new, X

In [15]:
def k_vs_k(k1, k2, data):
    """
    e.g. if 1 vs 5 call k_vs_k(1, 5, data)
    """
    y = [data[i][0] for i in range(len(data))]
    X = [[data[i][1], data[i][2]] for i in range(len(data))]
    
    y_new = []
    X_new = []
    for i in range(len(y)):
        if y[i] == k1:
            y_new.append(1.0)
            X_new.append(X[i])
            
        if y[i] == k2:
            y_new.append(-1.0)
            X_new.append(X[i])
            
    return y_new, X_new

In [16]:
alphs_even = np.array([])
alphs_odd = np.array([])

In [17]:
for k in range(10):
    y, X = k_vs_all(k, data_train)
    svm.fit(X, y)
    
    score = svm.score(X, y)
    num_alphas = svm.n_support_
    alpha_sum = np.array(num_alphas).sum()
    print("%d-vs-all binary classifier in-sample error: %f" % (k, (1.0 - score)))
    
    if k % 2 == 0:
        alphs_even = np.concatenate((alphs_even, [alpha_sum]))
    else:
        alphs_odd = np.concatenate((alphs_odd, [alpha_sum]))

0-vs-all binary classifier in-sample error: 0.105884
1-vs-all binary classifier in-sample error: 0.014401
2-vs-all binary classifier in-sample error: 0.100261
3-vs-all binary classifier in-sample error: 0.090248
4-vs-all binary classifier in-sample error: 0.089425
5-vs-all binary classifier in-sample error: 0.076258
6-vs-all binary classifier in-sample error: 0.091071
7-vs-all binary classifier in-sample error: 0.088465
8-vs-all binary classifier in-sample error: 0.074338
9-vs-all binary classifier in-sample error: 0.088328


In [18]:
aodd_sum = np.sum(alphs_odd)
aeven_sum = np.sum(alphs_even)
a_diff = abs(aodd_sum - aeven_sum)
print("Difference in # Support Vectors between two classifiers: %d" % a_diff)

Difference in # Support Vectors between two classifiers: 2071


## Problem 2

__[e]__

Reading from the above run, we see the best $k$-vs-all, that which has the lowest $E_{in}$, is $8$ versus all.

## Problem 3

__[a]__

Again reading from the above, the best classifier is $1$-vs-all.

## Problem 4

__[c]__

The above computation tells us the two models differ by $2071$ support vectors. This is closest to the answer $1800$.

## Problem 5

__[d]__

In [21]:
Q_1 = 2
Q_2 = 5

In [22]:
C_vals = [0.001, 0.01, 0.1, 1]

In [23]:
Ein_2 = [] # Ein values for Q = 2, C ranging from 0.001 to 1
Ein_5 = [] # Ein values for Q = 5

In [24]:
SV_2 = [] # num SVs when Q = 2, C ranging
SV_5 = [] # num SVs when Q = 5, C ranging

In [25]:
alphas_2 = np.array([])
alphas_5 = np.array([])

In [31]:
for C in C_vals:
    # get data
    y, X = k_vs_k(1, 5, data_train)
    y_out, X_out = k_vs_k(1, 5, data_test)
    
    # create SVMs
    svm_2 = SVC(C=C, kernel='poly', degree = Q_1, coef0 = 1.0, gamma = 1.0)
    svm_5 = SVC(C=C, kernel='poly', degree = Q_2, coef0 = 1.0, gamma = 1.0)
    
    # fit SVMs
    svm_2.fit(X,y)
    svm_5.fit(X,y)
    
    # get scores
    score2 = svm_2.score(X,y)
    score5 = svm_5.score(X,y)
    
    # get out-of-sample scores
    score2_out = svm_2.score(X_out, y_out)
    score5_out = svm_5.score(X_out, y_out)
    
    # convert to errors
    err2 = 1.0 - score2
    err5 = 1.0 - score5
    
    err2_out = 1.0 - score2_out
    err5_out = 1.0 - score5_out
    
    print("1-vs-5 classifier with C = %f, Q = 2 in-sample error: %f" % (C, err2))
    print("1-vs-5 classifier with C = %f, Q = 2 out-sample error: %f" % (C, err2_out))
    print("")
    print("1-vs-5 classifier with C = %f, Q = 5 in-sample error: %f" % (C, err5))
    print("1-vs-5 classifier with C = %f, Q = 5 out-sample error: %f" % (C, err5_out))
    
    # get num SVs
    num_alphas2 = svm_2.n_support_
    num_alphas5 = svm_5.n_support_
    alphas2_sum = np.array(num_alphas2).sum()
    alphas5_sum = np.array(num_alphas5).sum()
    
    # print num SVs
    print("")
    print("number of support vectors with C = %f, Q = 2: %d" % (C, alphas2_sum))
    print("number of support vectors with C = %f, Q = 5: %d" % (C, alphas5_sum))
    print("")

1-vs-5 classifier with C = 0.001000, Q = 2 in-sample error: 0.004484
1-vs-5 classifier with C = 0.001000, Q = 2 out-sample error: 0.016509

1-vs-5 classifier with C = 0.001000, Q = 5 in-sample error: 0.004484
1-vs-5 classifier with C = 0.001000, Q = 5 out-sample error: 0.021226

number of support vectors with C = 0.001000, Q = 2: 76
number of support vectors with C = 0.001000, Q = 5: 25

1-vs-5 classifier with C = 0.010000, Q = 2 in-sample error: 0.004484
1-vs-5 classifier with C = 0.010000, Q = 2 out-sample error: 0.018868

1-vs-5 classifier with C = 0.010000, Q = 5 in-sample error: 0.003844
1-vs-5 classifier with C = 0.010000, Q = 5 out-sample error: 0.021226

number of support vectors with C = 0.010000, Q = 2: 34
number of support vectors with C = 0.010000, Q = 5: 23

1-vs-5 classifier with C = 0.100000, Q = 2 in-sample error: 0.004484
1-vs-5 classifier with C = 0.100000, Q = 2 out-sample error: 0.018868

1-vs-5 classifier with C = 0.100000, Q = 5 in-sample error: 0.003203
1-vs-5 cl

Recall that we're looking for strictly going down or up. We can see that for $Q=2$, the in-sample error $E_{in}$ is achieved with the maximum value of $C$.

## Problem 6

__[b]__

We see that when $C = 0.01$, there are $25$ Support Vectors for the SVM with $Q=5$, as opposed to $76$ for the SVM with $Q=2$.

## Cross Validation

In [32]:
from sklearn.model_selection import KFold

In [33]:
cv_Q = 2
cv_C = [pow(10,-x) for x in reversed(range(5))]
cv_runs = 100 # num runs
cv_splits = 10 # since doing 10-fold

In [34]:
k_fold = KFold(n_splits = cv_splits)

In [35]:
e_cvs = np.ndarray((0, len(cv_C)))
cv_best = np.zeros(len(cv_C))

In [36]:
y_1v5, X_1v5 = k_vs_k(1,5,data_train)

In [38]:
y_1v5

[-1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 -1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 -1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 -1.0,
 1.0,
 -1.0,
 -1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 -1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 -1.0,
 -1.0,
 -1.0,
 1.0,
 -1.0,
 1.0,
 1.0,
 1.0,
 -1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 -1.0,
 1.0,
 -1.0,
 -1.0,
 1.0,
 1.0,
 -1.0,
 -1.0,
 1.0,
 -1.0,
 -1.0,
 1.0,
 1.0,
 -1.0,
 1.0,
 1.0,
 -1.0,
 -1.0,
 1.0,
 -1.0,
 1.0,
 1.0,
 -1.0,
 -1.0,
 1.0,
 1.0,
 1.0,
 -1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 -1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 -1.0,
 1.0,
 1.0,
 1.0,
 -1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 -1.0,
 1.0,
 -1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 -1.0,
 1.0,
 -1.0,
 1.0,
 1.0,
 -1.0,
 1.0,
 1.0,
 -1.0,
 1.0,
 1.0,
 1.0,
 -1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 -1.0,
 1.0,
 -1.0,
 -1.0,
 1.0,
 -1.0,
 -1.0,
 -1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 -1.0,
 -1.0,
 -

In [43]:
print(np.array(X_1v5).shape)

(1561, 2)


In [44]:
X_1v5 = np.array(X_1v5)
y_1v5 = np.array(y_1v5)

In [45]:
for run in range(cv_runs):
    Ecvs = np.array([])
    for C in cv_C:
        svm.C = C
        e_vals = np.array([]) # validation errors
        
        # go thru folds
        for train_idx, test_idx in k_fold.split(X_1v5):
            #print(train_idx)
            cv_Xtrain, cv_Xtest = X_1v5[train_idx], X_1v5[test_idx]
            cv_ytrain, cv_ytest = y_1v5[train_idx], y_1v5[test_idx]
            svm.fit(cv_Xtrain, cv_ytrain)
            
            err = 1.0 - svm.score(cv_Xtest, cv_ytest)
            e_vals = np.concatenate((e_vals, [err]))
        
        Ecv = np.average(e_vals)
        Ecvs = np.concatenate((Ecvs, [Ecv]))
        
    best_idx = np.argmin(Ecvs) # index of best C
    
    # save best C
    cv_best[best_idx] = cv_best[best_idx] + 1
    
    # add CV errors
    e_cvs = np.vstack((e_cvs, Ecvs))

In [46]:
Ecv_avg = np.average(e_cvs, axis=0) # get avg CV error
best_overall = np.argmax(cv_best)
Ecv_best = Ecv_avg[best_overall]

In [47]:
print("C = %f is selected most often with average E_cv %f" % (cv_C[best_overall], Ecv_best))

C = 0.001000 is selected most often with average E_cv 0.004483


## Problem 7

__[b]__

Reading from the above, $C=0.001$ is selected most often.

## Problem 8

__[c]__

Again, reading from the above, for the winning selection, the average $E_{CV} = 0.004483$, which is closest to the answer choice $E_{CV} =  0.005$.

## Problem 9

__[e]__

Again, we'll use X_1v5 and y_1v5.

In [53]:
y1v5_out, X1v5_out = k_vs_k(1,5,data_test)

In [54]:
y1v5_out = np.array(y1v5_out)
X1v5_out = np.array(X1v5_out)

In [50]:
C_vals = [pow(10, x) for x in range(-2,8,2)]

In [51]:
C_vals

[0.01, 1, 100, 10000, 1000000]

In [52]:
svm.kernel = 'rbf'
svm.gamma = 1

In [55]:
for C in C_vals:
    svm.C = C
    svm.fit(X_1v5, y_1v5)
    Ein = 1.0 - svm.score(X_1v5, y_1v5)
    Eout = 1.0 - svm.score(X1v5_out, y1v5_out)
    
    print("C = %f | E_in = %f, E_out = %f" % (C, Ein, Eout))

C = 0.010000 | E_in = 0.003844, E_out = 0.023585
C = 1.000000 | E_in = 0.004484, E_out = 0.021226
C = 100.000000 | E_in = 0.003203, E_out = 0.018868
C = 10000.000000 | E_in = 0.002562, E_out = 0.023585
C = 1000000.000000 | E_in = 0.000641, E_out = 0.023585


Reading from above $C = 10^6$ results in the lowest in-sample-error $E_{in}$.

## Problem 10

__[c]__

Again reading from the above results, the intermediate value $C = 100$ results in the lowest out-of-sample error $E_{out}$.