In [None]:
import random
import numpy as np
import matplotlib.pyplot as plt

# This is a bit of magic to make matplotlib figures appear inline in the notebook
# rather than in a new window.

%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

# Some more magic so that the notebook will reload external python modules;
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython

%load_ext autoreload
%autoreload 2

In [4]:
from sklearn import preprocessing, metrics
import utils
import scipy.io
import numpy as np
from linear_classifier import LinearSVM_twoclass

# --- SPAM email training set ---------------------------------------------
# utils.load_mat is a project helper; it is unpacked here into the feature
# matrix X and the label array y.

X, y = utils.load_mat('data/spamTrain.mat')

# Re-encode the labels for the SVM: entries equal to 0 become -1 and every
# other entry becomes +1 (float array with the same shape as y).
yy = np.where(y == 0, -1.0, 1.0)

# --- SPAM email test set -------------------------------------------------
# The test file is read directly with scipy; labels are flattened to 1-D.
# They are still in their original encoding here.

test_data = scipy.io.loadmat('data/spamTest.mat')
X_test, y_test = test_data['Xtest'], test_data['ytest'].flatten()

##################################################################################
#  YOUR CODE HERE for training the best performing SVM for the data above.       #
#  What should C be? What should num_iters be? Should X be scaled?               #
#  Should X be kernelized? What should the learning rate be? What should the     #
#  number of iterations be?                                                      #
##################################################################################

from sklearn.metrics.pairwise import rbf_kernel

# Grid search over (sigma, C, learning rate) for an RBF-kernelized two-class
# SVM. For every sigma the kernel matrix against the training rows is built,
# standardized, and given a leading column of ones (intercept); every (C, lr)
# pair is then trained on it and scored on the held-out validation rows.

# Positional hold-out split: first 3600 rows train, the rest validate.
# NOTE(review): no shuffling is done here -- confirm the rows of the training
# file are not ordered by class.
X_train = X[:3600]
yy_train = yy[:3600]
X_val = X[3600:]
yy_val = yy[3600:]

sigma_vals = [0.3,1,3,10,30,100,300]
Cvals = [10,30,100,300,1000,3000,10000]
learning_rates = [1e-5,3e-5,1e-4,3e-4,1e-3,3e-3,1e-2]

# Start below any achievable accuracy so the first trial is always recorded
# and best_svm is guaranteed to be set once the loops have run.
bestAcc = -1.0
bests = (0, 0, 0)
best_svm = None

for s in sigma_vals:
    gamma = 1.0 / (2 * s * s)

    # Kernel features of the training rows, standardized column-wise.
    Ktrain = rbf_kernel(X_train, X_train, gamma=gamma)
    scaler = preprocessing.StandardScaler().fit(Ktrain)
    scaleKtrain = scaler.transform(Ktrain)
    KKtrain = np.hstack([np.ones((scaleKtrain.shape[0], 1)), scaleKtrain])

    # Validation rows are expressed against the same training rows and
    # scaled with the statistics fitted on the training kernel.
    Kval = rbf_kernel(X_val, X_train, gamma=gamma)
    scaleKval = scaler.transform(Kval)
    KKval = np.hstack([np.ones((scaleKval.shape[0], 1)), scaleKval])

    for c in Cvals:
        for lr in learning_rates:
            # Fresh classifier with zeroed weights for every trial. The
            # earlier version shared one object across the whole C/lr grid
            # and zeroed theta only once per sigma, so (unless train()
            # re-initializes theta internally -- not visible from this cell)
            # each trial started from the previous trial's weights, and
            # best_svm was merely another name for that shared object.
            svm = LinearSVM_twoclass()
            svm.theta = np.zeros((KKtrain.shape[1],))
            svm.train(KKtrain, yy_train, learning_rate=lr, reg=c, num_iters=200, batch_size=400)

            yy_val_pre = svm.predict(KKval)
            curAcc = np.mean((yy_val_pre == yy_val) * 1.0)
            # Single pre-formatted argument: prints the same text under
            # Python 2 and Python 3.
            print("%s %s %s %s" % (s, c, lr, curAcc))
            if curAcc > bestAcc:
                bestAcc = curAcc
                bests = (s, c, lr)
                best_svm = svm

print("Best sigma, C, lr are (%f, %f, %f) with an accuracy on the validation set of %f." % (bests[0], bests[1], bests[2], bestAcc))

##################################################################################
# YOUR CODE HERE for testing your best model's performance                       #
# What is the accuracy of your best model on the test set? On the training set?  #
##################################################################################

# Refit on the full training set with the hyperparameters selected on the
# validation split, then report accuracy on the training and test sets.
s, c, lr = bests[0], bests[1], bests[2]
gamma = 1.0 / (2 * s * s)

# Kernel features against every training row, standardized column-wise and
# prefixed with a column of ones for the intercept.
K = rbf_kernel(X, X, gamma=gamma)
scaler = preprocessing.StandardScaler().fit(K)
scaleK = scaler.transform(K)
KK = np.hstack([np.ones((scaleK.shape[0], 1)), scaleK])

# The feature count differs from the validation-time model (more kernel
# columns), so the weights are re-zeroed at the new width before training.
# NOTE(review): C and lr were selected with num_iters=200 / batch_size=400 but
# are used here for 30000 full-batch iterations; the recorded run shows
# occasional loss spikes, which may mean lr is close to unstable for this
# regime -- consider validating under the same settings.
best_svm.theta = np.zeros((KK.shape[1],))
best_svm.train(KK, yy, learning_rate=lr, reg=c, num_iters=30000, batch_size=KK.shape[0], verbose=True)
yy_pre = best_svm.predict(KK)
acc_train = np.mean((yy_pre == yy) * 1.0)
# Single pre-formatted argument so the line prints identically under
# Python 2 and Python 3 (two spaces: literal trailing space + separator).
print("Accuracy on the training set:  %s" % acc_train)

# Test labels get the same encoding as the training labels: 0 -> -1, else +1.
yy_test = np.ones(y_test.shape)
yy_test[y_test==0] = -1

# Test rows are expressed against the training rows and scaled with the
# statistics fitted on the training kernel.
Ktest = rbf_kernel(X_test, X, gamma=gamma)
scaleKtest = scaler.transform(Ktest)
KKtest = np.hstack([np.ones((scaleKtest.shape[0], 1)), scaleKtest])

yy_test_pre = best_svm.predict(KKtest)
acc_test = np.mean((yy_test_pre == yy_test) * 1.0)
print("Accuracy on the testing set:  %s" % acc_test)

##################################################################################
# ANALYSIS OF MODEL: Print the top 15 words that are predictive of spam and for  #
# ham. Hint: use the coefficient values of the learned model                     #
##################################################################################
words, inv_words = utils.get_vocab_dict()

# best_svm.theta[0] multiplies the column of ones (intercept); theta[1:] has
# one coefficient per kernel column, i.e. one per training row of X because
# the final model was fit on rbf_kernel(X, X). Multiplying by X folds those
# per-example coefficients into one score per column of X.
# NOTE(review): this is a heuristic projection of a kernel model back onto
# the input features, not an exact per-word weight.
t = np.dot(best_svm.theta[1:], X).argsort()[::-1]

# Highest scores push toward the +1 class (labels other than 0).
# presumably +1 = spam -- verify against the dataset's label convention.
# NOTE(review): w is a 0-based column index; if the vocabulary dict is keyed
# from 1 this lookup is off by one -- confirm against utils.get_vocab_dict.
print("###### top 15 words ######")
w15 = t[:15]
for w in w15:
    print(words[w])

# The cell instructions ask for the ham side as well: the lowest scores push
# toward the -1 class, listed most negative first.
print("###### top 15 ham words ######")
ham15 = t[::-1][:15]
for w in ham15:
    print(words[w])
##################################################################################
#                    END OF YOUR CODE                                            #
##################################################################################

0.3 10 1e-05 0.76
0.3 10 3e-05 0.76
0.3 10 0.0001 0.76
0.3 10 0.0003 0.76
0.3 10 0.001 0.76
0.3 10 0.003 0.76
0.3 10 0.01 0.76
0.3 30 1e-05 0.76
0.3 30 3e-05 0.76
0.3 30 0.0001 0.76
0.3 30 0.0003 0.76
0.3 30 0.001 0.76
0.3 30 0.003 0.76
0.3 30 0.01 0.76
0.3 100 1e-05 0.76
0.3 100 3e-05 0.76
0.3 100 0.0001 0.76
0.3 100 0.0003 0.76
0.3 100 0.001 0.76
0.3 100 0.003 0.76
0.3 100 0.01 0.76
0.3 300 1e-05 0.76
0.3 300 3e-05 0.76
0.3 300 0.0001 0.76
0.3 300 0.0003 0.76
0.3 300 0.001 0.76
0.3 300 0.003 0.76
0.3 300 0.01 0.76
0.3 1000 1e-05 0.76
0.3 1000 3e-05 0.76
0.3 1000 0.0001 0.76
0.3 1000 0.0003 0.76
0.3 1000 0.001 0.76
0.3 1000 0.003 0.76
0.3 1000 0.01 0.76
0.3 3000 1e-05 0.76
0.3 3000 3e-05 0.76
0.3 3000 0.0001 0.76
0.3 3000 0.0003 0.76
0.3 3000 0.001 0.76
0.3 3000 0.003 0.76
0.3 3000 0.01 0.76
0.3 10000 1e-05 0.76
0.3 10000 3e-05 0.76
0.3 10000 0.0001 0.76
0.3 10000 0.0003 0.76
0.3 10000 0.001 0.76
0.3 10000 0.003 0.76
0.3 10000 0.01 0.76
1 10 1e-05 0.78
1 10 3e-05 0.7775
1 10 0.0001 0.

iteration 3700 / 30000: loss 20.185126
iteration 3800 / 30000: loss 19.621561
iteration 3900 / 30000: loss 18.908397
iteration 4000 / 30000: loss 18.564405
iteration 4100 / 30000: loss 17.730183
iteration 4200 / 30000: loss 17.160617
iteration 4300 / 30000: loss 16.566787
iteration 4400 / 30000: loss 16.047913
iteration 4500 / 30000: loss 15.492350
iteration 4600 / 30000: loss 15.068716
iteration 4700 / 30000: loss 14.590547
iteration 4800 / 30000: loss 14.364386
iteration 4900 / 30000: loss 13.774913
iteration 5000 / 30000: loss 13.527979
iteration 5100 / 30000: loss 13.026667
iteration 5200 / 30000: loss 12.967604
iteration 5300 / 30000: loss 12.350128
iteration 5400 / 30000: loss 12.009084
iteration 5500 / 30000: loss 3868.568910
iteration 5600 / 30000: loss 57.705348
iteration 5700 / 30000: loss 35.303553
iteration 5800 / 30000: loss 23.705368
iteration 5900 / 30000: loss 19.159569
iteration 6000 / 30000: loss 16.778969
iteration 6100 / 30000: loss 15.697044
iteration 6200 / 30000:

iteration 24800 / 30000: loss 1.073457
iteration 24900 / 30000: loss 1.049663
iteration 25000 / 30000: loss 1.004457
iteration 25100 / 30000: loss 3.410862
iteration 25200 / 30000: loss 1.114961
iteration 25300 / 30000: loss 1.031377
iteration 25400 / 30000: loss 0.982316
iteration 25500 / 30000: loss 0.960435
iteration 25600 / 30000: loss 0.932242
iteration 25700 / 30000: loss 2.875808
iteration 25800 / 30000: loss 2.314165
iteration 25900 / 30000: loss 0.956215
iteration 26000 / 30000: loss 0.952815
iteration 26100 / 30000: loss 0.888755
iteration 26200 / 30000: loss 0.851491
iteration 26300 / 30000: loss 0.858553
iteration 26400 / 30000: loss 1.209110
iteration 26500 / 30000: loss 1.195036
iteration 26600 / 30000: loss 0.795747
iteration 26700 / 30000: loss 0.790423
iteration 26800 / 30000: loss 0.787614
iteration 26900 / 30000: loss 1.173664
iteration 27000 / 30000: loss 0.800775
iteration 27100 / 30000: loss 0.796958
iteration 27200 / 30000: loss 0.777370
iteration 27300 / 30000: 