In [2]:
# This file is in scripts/load.py
import sys
if sys.version_info[0] < 3:
    raise Exception("Python 3 not detected.")
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm
from scipy import io
from tqdm import tqdm
if __name__ == "__main__":
    # Sanity check: report the shape of every array shipped with each dataset.
    for data_name in ["mnist", "spam", "toy"]:
        # np.load on an .npz returns an NpzFile that keeps the archive open;
        # the context manager closes it once the shapes have been printed.
        with np.load(f"../data/{data_name}-data.npz") as data:
            print("\nloaded %s data!" % data_name)
            fields = "test_data", "training_data", "training_labels"
            for field in fields:
                print(field, data[field].shape)

# Seed NumPy's global RNG so the np.random.permutation-based shuffles used by
# the helpers in this notebook are repeatable from a fresh kernel.
np.random.seed(10072001)


loaded mnist data!
test_data (10000, 1, 28, 28)
training_data (60000, 1, 28, 28)
training_labels (60000,)

loaded spam data!
test_data (1000, 32)
training_data (4171, 32)
training_labels (4171,)

loaded toy data!
test_data (0,)
training_data (1000, 2)
training_labels (1000,)


In [3]:
def shuffle_data(data, labels):
    """Reorder ``data`` and ``labels`` with one shared random permutation.

    A single permutation of ``range(len(data))`` is drawn from NumPy's global
    RNG and applied to both inputs, so sample/label pairs stay aligned.

    Args:
        data: Array of samples, indexable by an integer index array.
        labels: Array of labels aligned with ``data``.

    Returns:
        Tuple ``(shuffled_data, shuffled_labels)``.
    """
    order = np.random.permutation(len(data))
    return data[order], labels[order]

In [4]:
def partition_data(data, labels, size):
    """Shuffle ``data``/``labels`` together and split them into two parts.

    Args:
        data: Array of samples (indexed along its first axis).
        labels: Array of labels aligned with ``data``.
        size: Number of shuffled samples to put in the first part.

    Returns:
        Tuple ``(first_data, first_labels, rest_data, rest_labels)``. The
        first part holds ``size`` samples and the second part holds the
        remaining ``len(data) - size``.
    """
    sdata, slabels = shuffle_data(data, labels)

    # Slice ends are exclusive: [:size] yields exactly `size` rows and [size:]
    # the remainder, so every sample lands in exactly one part (and size == 0
    # gives an empty first part rather than "all but the last row").
    first_data, first_labels = sdata[:size], slabels[:size]
    rest_data, rest_labels = sdata[size:], slabels[size:]

    return first_data, first_labels, rest_data, rest_labels

In [5]:
def mnist_partition():
    """Load the MNIST training set, flatten it, and split off a validation set.

    Each sample is reshaped to a single row, then ``partition_data`` is called
    with a split size of 10000; its first part becomes the validation set and
    the remainder the training set. The shuffle inside ``partition_data`` uses
    NumPy's global RNG, so the split depends on the current seed state.

    Returns:
        Tuple ``(train_data, train_labels, val_data, val_labels)``.
    """
    # Indexing the archive reads each array fully into memory, so the arrays
    # stay valid after the file is closed and need no extra copy.
    with np.load("../data/mnist-data.npz") as archive:
        mnist_data = archive["training_data"]
        mnist_labels = archive["training_labels"]

    # One flat feature row per image.
    mnist_data = mnist_data.reshape(len(mnist_data), -1)

    val_data, val_labels, train_data, train_labels = partition_data(mnist_data, mnist_labels, 10000)

    return train_data, train_labels, val_data, val_labels

In [6]:
def spam_partition():
    """Load the spam training set and split off roughly 20% for validation.

    The split size is ``round(len(spam_data) / 5)``; the first part returned
    by ``partition_data`` becomes the validation set and the remainder the
    training set. The shuffle inside ``partition_data`` uses NumPy's global
    RNG, so the split depends on the current seed state.

    Returns:
        Tuple ``(train_data, train_labels, val_data, val_labels)``.
    """
    # Indexing the archive reads each array fully into memory, so the arrays
    # stay valid after the file is closed and need no extra copy.
    with np.load("../data/spam-data.npz") as archive:
        spam_data = archive["training_data"]
        spam_labels = archive["training_labels"]

    # reshape returns a new array rather than modifying in place, so the
    # result has to be assigned for the flattening to take effect.
    spam_data = spam_data.reshape(len(spam_data), -1)

    perc = round(len(spam_data) / 5)

    val_data, val_labels, train_data, train_labels = partition_data(spam_data, spam_labels, perc)

    return train_data, train_labels, val_data, val_labels

In [7]:
def accuracy(y, y_hat):
    """Return the fraction of positions where ``y`` and ``y_hat`` agree.

    Args:
        y: Sequence of reference labels; its length sets how many positions
            are compared.
        y_hat: Sequence of predicted labels, indexed at the same positions.

    Returns:
        The match ratio rounded to 4 decimal places.

    Raises:
        ZeroDivisionError: If ``y`` is empty.
    """
    total = len(y)
    # Count position-wise matches over the first `total` entries.
    hits = sum(1 for idx in range(total) if y[idx] == y_hat[idx])
    return round(hits / total, 4)

In [8]:
# QUESTION 4

In [9]:
def train_svm_mnist():
    """Plot linear-SVM accuracy on MNIST against the training-set size.

    For every size in a fixed list, a subset is drawn with ``partition_data``
    from the training split returned by ``mnist_partition``, a linear-kernel
    ``svm.SVC`` is fit on it, and accuracy is measured on that subset and on
    the validation split. Both curves are drawn on one matplotlib figure.
    """
    pool, pool_labels, val, val_labels = mnist_partition()
    sizes = [100, 200, 500, 1000, 2000, 5000, 10000]
    train_accs, val_accs = [], []

    for n_samples in tqdm(sizes):
        # partition_data reshuffles on every call; only its first part is used.
        subset, subset_labels, _, _ = partition_data(pool, pool_labels, n_samples)

        model = svm.SVC(kernel="linear")
        model.fit(subset, subset_labels)
        subset_pred = model.predict(subset)
        val_pred = model.predict(val)

        train_accs.append(accuracy(subset_labels, subset_pred))
        val_accs.append(accuracy(val_labels, val_pred))

    for curve, caption in ((train_accs, "Training Accuracy"), (val_accs, "Validation Accuracy")):
        plt.plot(sizes, curve, label=caption)
    plt.xlabel("Training Sizes")
    plt.ylabel("Accuracy")
    plt.legend()
    plt.show()

In [10]:
def train_svm_spam():
    """Plot linear-SVM accuracy on the spam data against the training-set size.

    For every size in a fixed list (ending with the length of the training
    split), a subset is drawn with ``partition_data`` from the training split
    returned by ``spam_partition``, a linear-kernel ``svm.SVC`` is fit on it,
    and accuracy is measured on that subset and on the validation split. Both
    curves are drawn on one matplotlib figure.
    """
    pool, pool_labels, val, val_labels = spam_partition()
    sizes = [100, 200, 500, 1000, 2000, len(pool)]
    train_accs, val_accs = [], []

    for n_samples in tqdm(sizes):
        # partition_data reshuffles on every call; only its first part is used.
        subset, subset_labels, _, _ = partition_data(pool, pool_labels, n_samples)

        model = svm.SVC(kernel="linear")
        model.fit(subset, subset_labels)
        subset_pred = model.predict(subset)
        val_pred = model.predict(val)

        train_accs.append(accuracy(subset_labels, subset_pred))
        val_accs.append(accuracy(val_labels, val_pred))

    for curve, caption in ((train_accs, "Training Accuracy"), (val_accs, "Validation Accuracy")):
        plt.plot(sizes, curve, label=caption)
    plt.xlabel("Training Sizes")
    plt.ylabel("Accuracy")
    plt.legend()
    plt.show()

In [24]:
def c_manipulation(c_values=(.0001,), train_size=None):
    """Measure MNIST validation accuracy of a linear SVM for each C value.

    Args:
        c_values: Iterable of regularisation strengths passed as ``C`` to
            ``svm.SVC``. The default evaluates the single value ``.0001``.
        train_size: If given, fit on a random subset of this many samples
            drawn with ``partition_data``; if ``None`` (default), fit on the
            whole training split returned by ``mnist_partition``.

    Returns:
        List of ``[C, validation_accuracy]`` pairs, one per entry of
        ``c_values``, in the order given.
    """
    data, labels, val, val_labels = mnist_partition()

    # Sub-sample only on request: the default fits on the full training
    # split, and no dataset archive is opened here beyond what
    # mnist_partition loads.
    if train_size is not None:
        data, labels, _, _ = partition_data(data, labels, train_size)

    results = []
    for c in c_values:
        clf = svm.SVC(C=c, kernel="linear")
        clf.fit(data, labels)
        pred_val_labels = clf.predict(val)
        results.append([c, accuracy(val_labels, pred_val_labels)])

    return results

In [25]:
# Run the C experiment; as the cell's last expression, the returned list of
# [C, validation accuracy] pairs is displayed as the cell output.
c_manipulation()

[[0.0001, 0.9308]]