In [None]:
from sklearn import svm
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Copy of q3b: Evaluation metric

def eval_metric(true_labels, predicted_labels):
    """
    Compute plain classification accuracy: the fraction of positions at which
    the predicted label equals the true label.

    Args:
        true_labels: indexable collection of ground-truth labels
        predicted_labels: indexable collection of labels produced by the model;
            must have the same length as true_labels

    Returns:
        the (unweighted) accuracy score, i.e. matches / number of labels
    """
    sample_count = len(true_labels)
    assert sample_count == len(predicted_labels)

    # Count the positions where prediction and ground truth agree.
    hits = sum(
        1
        for idx in range(sample_count)
        if true_labels[idx] == predicted_labels[idx]
    )

    return hits / sample_count

In [None]:
# Train a linear SVM on the MNIST and spam datasets with a given size of the training set

def mnist_linear_svm(train_set_size):
    """
    Train a linear SVM on a subset of MNIST and measure its accuracy.

    The labelled examples in "data/mnist-data.npz" are shuffled with a random
    permutation, 10,000 of them are set aside as a validation set, and the
    first `train_set_size` of the remaining examples are used for training.
    NOTE: the permutation is drawn from NumPy's global random state on every
    call, so separate calls use different train/validation splits.

    Args:
        train_set_size: number of training examples to fit the model on

    Returns:
        (train_acc, val_acc): accuracy on the training subset and accuracy on
        the validation set, both computed with eval_metric
    """
    # Read the arrays inside a `with` block so the .npz archive is closed
    # as soon as the data is in memory.
    with np.load("data/mnist-data.npz") as mnist_data:
        mnist_train_val_data = mnist_data["training_data"]
        mnist_train_val_labels = mnist_data["training_labels"]

    # For the MNIST dataset, set aside 10,000 training images as a validation set.
    mnist_val_data_num = 10000
    mnist_indices = np.random.permutation(len(mnist_train_val_data))
    val_indices = mnist_indices[:mnist_val_data_num]
    train_indices = mnist_indices[mnist_val_data_num:]
    mnist_val_data = mnist_train_val_data[val_indices]
    mnist_val_labels = mnist_train_val_labels[val_indices]
    mnist_train_data = mnist_train_val_data[train_indices]
    mnist_train_labels = mnist_train_val_labels[train_indices]

    # use linear SVM to train the model
    model = svm.SVC(kernel="linear", max_iter=10000)
    # train the model with 0~train_set_size subset of all the train data
    mnist_train_data_subset = mnist_train_data[:train_set_size]
    mnist_train_labels_subset = mnist_train_labels[:train_set_size]
    # In MNIST, our feature vector for an image will be a row vector with all the pixel values
    # concatenated in a row major (or column major) order.
    # Reshape by the actual number of rows: slicing silently yields fewer rows
    # than requested when train_set_size exceeds the available training data.
    flattened_mnist_train_data_subset = mnist_train_data_subset.reshape(
        (len(mnist_train_data_subset), -1)
    )
    model.fit(flattened_mnist_train_data_subset, mnist_train_labels_subset)

    # Same reasoning for the validation set: use its real length, not the requested one.
    flattened_mnist_val_data = mnist_val_data.reshape((len(mnist_val_data), -1))
    pred_mnist_val_labels = model.predict(flattened_mnist_val_data)
    pred_mnist_train_labels_subset = model.predict(flattened_mnist_train_data_subset)
    train_acc = eval_metric(mnist_train_labels_subset, pred_mnist_train_labels_subset)
    val_acc = eval_metric(mnist_val_labels, pred_mnist_val_labels)

    return train_acc, val_acc


def spam_linear_svm(train_set_size):
    """
    Train a linear SVM on a subset of the spam dataset and measure its accuracy.

    The labelled examples in "data/spam-data.npz" are shuffled with a random
    permutation, 20% of them are set aside as a validation set, and the first
    `train_set_size` of the remaining examples are used for training.
    NOTE: the permutation is drawn from NumPy's global random state on every
    call, so separate calls use different train/validation splits.

    Args:
        train_set_size: number of training examples to fit the model on

    Returns:
        (train_acc, val_acc): accuracy on the training subset and accuracy on
        the validation set, both computed with eval_metric
    """
    # Read the arrays inside a `with` block so the .npz archive is closed
    # as soon as the data is in memory.
    with np.load("data/spam-data.npz") as spam_data:
        spam_train_val_data = spam_data["training_data"]
        spam_train_val_labels = spam_data["training_labels"]

    # For spam dataset, set aside 20% of the training data as a validation set.
    spam_val_data_split_ratio = 0.2
    spam_val_data_num = int(len(spam_train_val_data) * spam_val_data_split_ratio)
    spam_indices = np.random.permutation(len(spam_train_val_data))
    val_indices = spam_indices[:spam_val_data_num]
    train_indices = spam_indices[spam_val_data_num:]
    spam_val_data = spam_train_val_data[val_indices]
    spam_val_labels = spam_train_val_labels[val_indices]
    spam_train_data = spam_train_val_data[train_indices]
    spam_train_labels = spam_train_val_labels[train_indices]

    # use linear SVM to train the model
    model = svm.SVC(kernel="linear")
    # train the model with 0~train_set_size subset of all the train data
    spam_train_data_subset = spam_train_data[:train_set_size]
    spam_train_labels_subset = spam_train_labels[:train_set_size]
    model.fit(spam_train_data_subset, spam_train_labels_subset)

    pred_spam_val_labels = model.predict(spam_val_data)
    pred_spam_train_labels_subset = model.predict(spam_train_data_subset)
    train_acc = eval_metric(spam_train_labels_subset, pred_spam_train_labels_subset)
    val_acc = eval_metric(spam_val_labels, pred_spam_val_labels)

    return train_acc, val_acc

In [None]:
# plot the training accuracy and validation accuracy versus the number of samples
# for both MNIST and spam datasets

# MNIST datasets
# Training-set sizes to sweep; one SVM is trained per size.
mnist_train_set_sizes = np.array((100, 200, 500, 1000, 2000, 5000, 10000))
mnist_training_accuracies = np.zeros(len(mnist_train_set_sizes))
mnist_validation_accuracies = np.zeros(len(mnist_train_set_sizes))
for mnist_index, mnist_train_set_size in enumerate(mnist_train_set_sizes):
    # mnist_linear_svm returns (training accuracy, validation accuracy)
    mnist_training_accuracies[mnist_index], \
        mnist_validation_accuracies[mnist_index] = mnist_linear_svm(mnist_train_set_size)

# Draw on an explicit figure/axes so the curves never land on a figure left
# open by another cell (which can happen with pyplot's implicit current figure).
mnist_fig, mnist_ax = plt.subplots()
mnist_ax.plot(mnist_train_set_sizes, mnist_training_accuracies, label="MNIST training accuracy")
mnist_ax.plot(mnist_train_set_sizes, mnist_validation_accuracies, label="MNIST validation accuracy")
mnist_ax.set_title("MNIST-linearSVM Accuracy vs Training Size")
mnist_ax.set_xlabel("number of training examples")
mnist_ax.set_ylabel("training and validation accuracy")
mnist_ax.legend()
mnist_fig.savefig("q4a_mnist_svm.png")

In [None]:
# Spam datasets
spam_data = np.load("data/spam-data.npz")
spam_train_val_num = len(spam_data["training_labels"])
spam_val_data_split_ratio = 0.2
# Mirror the split arithmetic used inside spam_linear_svm: the validation size is
# truncated first and the training set is whatever remains. Computing
# int(n * (1 - ratio)) instead can come out one short of the real training size
# (e.g. n = 5172 gives 4137 rather than 4138).
spam_val_data_num = int(spam_train_val_num * spam_val_data_split_ratio)
spam_train_data_num = spam_train_val_num - spam_val_data_num

# Training-set sizes to sweep; the last entry is the full training split.
spam_train_set_sizes = np.array((100, 200, 500, 1000, 2000, spam_train_data_num))
spam_training_accuracies = np.zeros(len(spam_train_set_sizes))
spam_validation_accuracies = np.zeros(len(spam_train_set_sizes))
for spam_index, spam_train_set_size in enumerate(spam_train_set_sizes):
    # spam_linear_svm returns (training accuracy, validation accuracy)
    spam_training_accuracies[spam_index], \
        spam_validation_accuracies[spam_index] = spam_linear_svm(spam_train_set_size)

# Draw on an explicit figure/axes so the curves never land on a figure left
# open by another cell (which can happen with pyplot's implicit current figure).
spam_fig, spam_ax = plt.subplots()
spam_ax.plot(spam_train_set_sizes, spam_training_accuracies, label="Spam training accuracy")
spam_ax.plot(spam_train_set_sizes, spam_validation_accuracies, label="Spam validation accuracy")
spam_ax.set_title("Spam-linearSVM Accuracy vs Training Size")
spam_ax.set_xlabel("number of training examples")
spam_ax.set_ylabel("training and validation accuracy")
spam_ax.legend()
spam_fig.savefig("q4b_spam_svm.png")