In [2]:
import numpy as np
import pandas as pd
from sklearn.svm import SVC

In [3]:
# Load the MNIST archive and flatten every training image into one feature row.
# NOTE(review): this is an absolute, machine-specific path; the notebook only runs
# where this exact file exists. Consider a configurable data directory.
mnist_file = np.load("/Users/garrettbrown/Desktop/CS189/HWs/hw1/data/mnist-data.npz")

mnist_data = mnist_file["training_data"]
# Keep one row per sample and let numpy infer the feature count, instead of
# hard-coding 60000 samples; the result is identical for the 60000 x 784 case.
mnist_data = mnist_data.reshape(mnist_data.shape[0], -1)
mnist_labels = mnist_file["training_labels"]

In [4]:
# Randomly permute the MNIST rows (labels are reordered with the same permutation),
# then hold out the first 10,000 shuffled examples for validation; the rest is training.
indices = np.random.permutation(mnist_data.shape[0])
mnist_data, mnist_labels = mnist_data[indices, :], mnist_labels[indices]

mnist_valid, mnist_training = mnist_data[:10000, :], mnist_data[10000:, :]
mnist_valid_labels, mnist_training_labels = mnist_labels[:10000], mnist_labels[10000:]

In [5]:
# Open the spam archive and pull out its training features and training labels.
spam_file = np.load("/Users/garrettbrown/Desktop/CS189/HWs/hw1/data/spam-data.npz")

spam_data, spam_labels = (spam_file[key] for key in ("training_data", "training_labels"))

In [6]:
# Randomly permute the spam rows (labels are reordered with the same permutation),
# then hold out the first fifth (integer division) for validation; the rest is training.
indices = np.random.permutation(spam_data.shape[0])
spam_data, spam_labels = spam_data[indices, :], spam_labels[indices]
share = len(spam_data) // 5

spam_valid, spam_training = spam_data[:share, :], spam_data[share:, :]
spam_valid_labels, spam_training_labels = spam_labels[:share], spam_labels[share:]

In [7]:
#write a function that takes predicted and true labels and compute the share that is correct
def accuracy(predicted, true):
    """Return the fraction of predictions that equal the true labels.

    Parameters
    ----------
    predicted : array-like
        Predicted labels.
    true : array-like
        Ground-truth labels, with as many elements as ``predicted``.

    Returns
    -------
    float
        Number of matching positions divided by the number of labels.

    Raises
    ------
    ValueError
        If the two inputs do not contain the same number of elements.
    ZeroDivisionError
        If the inputs are empty.
    """
    # Compare strictly by position on flat arrays. Going through pandas Series
    # would align on the index (failing for a label Series whose index is not
    # 0..n-1), and flattening also accepts (n, 1) column-vector labels without
    # accidentally broadcasting to an (n, n) comparison.
    predicted = np.asarray(predicted).ravel()
    true = np.asarray(true).ravel()
    if predicted.size != true.size:
        raise ValueError(
            "predicted and true must have the same number of elements, "
            "got {} and {}".format(predicted.size, true.size)
        )
    matches = int(np.count_nonzero(predicted == true))
    return matches / true.size

In [8]:
# Learning curve on MNIST: fit a linear SVM on progressively larger random
# training subsets and print the validation accuracy obtained with each size.
clf = SVC(kernel='linear')

examples = [100, 200, 500, 1000, 2000, 5000, 10000]
for example in examples:
    # Re-permute the whole training pool (rows and labels together) so that the
    # leading `example` rows form a fresh random subset on every pass.
    indices = np.random.permutation(mnist_training.shape[0])
    mnist_training, mnist_training_labels = mnist_training[indices, :], mnist_training_labels[indices]

    cur_training, cur_training_labels = mnist_training[:example, :], mnist_training_labels[:example]

    # fit() returns the estimator itself, so predict() can be chained onto it.
    predictions = clf.fit(cur_training, cur_training_labels).predict(mnist_valid)
    accurate = accuracy(predictions, mnist_valid_labels)
    print(accurate)

0.7459
0.8175
0.8486
0.8862
0.8986
0.911
0.9128


In [9]:
# Learning curve on the spam data: fit a linear SVM on progressively larger random
# training subsets (the last size is the full training split) and print the
# validation accuracy obtained with each size.
clf = SVC(kernel='linear')

examples = [100, 200, 500, 1000, 2000, spam_training.shape[0]]
for example in examples:
    # Re-permute the whole training pool (rows and labels together) so that the
    # leading `example` rows form a fresh random subset on every pass.
    indices = np.random.permutation(spam_training.shape[0])
    spam_training, spam_training_labels = spam_training[indices, :], spam_training_labels[indices]

    cur_training, cur_training_labels = spam_training[:example, :], spam_training_labels[:example]

    # fit() returns the estimator itself, so predict() can be chained onto it.
    predictions = clf.fit(cur_training, cur_training_labels).predict(spam_valid)
    accurate = accuracy(predictions, spam_valid_labels)
    print(accurate)

0.7877697841726619
0.7997601918465228
0.8033573141486811
0.8141486810551559
0.815347721822542
0.8201438848920863


In [10]:
# Sweep the SVM regularization hyperparameter C on MNIST with a linear kernel.
# Every value of C is fit on the SAME random 10,000-example training subset and
# scored on the same validation set, so accuracy differences reflect C alone.

hyper = [.0001, .001, .01, .1, 1, 10, 100, 1000]

# Draw the training subset once, before the loop. Reshuffling inside the loop
# would give each C a different subset and mix sampling noise into the comparison.
indices = np.arange(mnist_training.shape[0])
np.random.shuffle(indices)
mnist_training = mnist_training[indices, :]
mnist_training_labels = mnist_training_labels[indices]

cur_training = mnist_training[:10000, :]
cur_training_labels = mnist_training_labels[:10000]

for param in hyper:
    clf = SVC(C = param, kernel = 'linear')

    clf.fit(cur_training, cur_training_labels)
    predictions = clf.predict(mnist_valid)
    accurate = accuracy(predictions, mnist_valid_labels)
    # Output format: validation accuracy on one line, then the C that produced it.
    print(accurate)
    print(param)

0.911
0.0001
0.911
0.001
0.9129
0.01
0.9123
0.1
0.91
1
0.9097
10
0.9055
100
0.9114
1000


In [11]:
# 5-fold cross-validation of a linear SVM on the spam data for several values of C.
# Folds are contiguous row ranges of spam_data / spam_labels in their current order,
# and the same folds are used for every C so the averaged accuracies are comparable.
# NOTE(review): the larger C values were observed to train very slowly. This is most
# likely slow solver convergence under weak regularization rather than overfitting
# as such -- TODO confirm.

n_folds = 5
step = spam_data.shape[0] // n_folds
# Fold boundaries: the first n_folds - 1 folds hold `step` rows each, and the last
# fold also absorbs the remainder when the row count is not divisible by n_folds.
steps = [i * step for i in range(n_folds)] + [spam_data.shape[0]]
hyper = [.000001, .00001, .0001, .001, .01, .1, 1, 10]

for param in hyper:

    accurate = 0
    clf = SVC(C = param, kernel = 'linear')

    for i in range(n_folds):
        lo, hi = steps[i], steps[i + 1]

        # Rows [lo, hi) are held out for validation in this fold ...
        valid_data = spam_data[lo:hi, :]
        valid_labels = spam_labels[lo:hi]

        # ... and everything before and after that range is used for training.
        training_data = np.append(spam_data[:lo, :], spam_data[hi:, :], axis=0)
        training_labels = np.append(spam_labels[:lo], spam_labels[hi:], axis=0)

        clf.fit(training_data, training_labels)
        predictions = clf.predict(valid_data)
        accurate += accuracy(predictions, valid_labels)

    # Mean validation accuracy over the folds, printed above the C that produced it.
    accurate = accurate / n_folds
    print(accurate)
    print(param)
        
        

0.7123048866296184
1e-06
0.7123048866296184
1e-05
0.7180588463361047
0.0001
0.7494694065107196
0.001
0.7755995921825415
0.01
0.7954996481856431
0.1
0.8012527463059493
1
0.8046080500868765
10
