In [None]:
from sklearn import svm
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Copy of q3b: Evaluation metric

def eval_metric(true_labels, predicted_labels):
    """
    Compute plain classification accuracy: the fraction of positions at which
    the predicted label equals the true label.

    Args:
        true_labels: indexable sequence holding the ground-truth labels
        predicted_labels: indexable sequence holding the model's predictions;
            must have the same length as true_labels

    Returns:
        the (unweighted) accuracy score, i.e. matches divided by total count
    """
    sample_count = len(true_labels)
    assert sample_count == len(predicted_labels)

    # Count positions where prediction and ground truth agree.
    hits = sum(
        1
        for pos in range(sample_count)
        if true_labels[pos] == predicted_labels[pos]
    )

    return hits / sample_count

In [None]:
# Train a linear SVM on the spam dataset with a given hyperparameter C

def spam_linear_svm(train_data, train_labels, hyper_c):
    """
    Fit a linear-kernel support vector classifier.

    Args:
        train_data: the training samples handed to the classifier's fit()
        train_labels: the labels that go with train_data
        hyper_c: value passed as the C parameter of svm.SVC

    Returns:
        the fitted svm.SVC model
    """
    # SVC.fit returns the fitted estimator itself, so the call can be chained.
    return svm.SVC(C=hyper_c, kernel="linear").fit(train_data, train_labels)

In [None]:
def random_k_fold_split(data, labels, k):
    """
    Randomly partition a dataset into k folds of (nearly) equal size.

    The sample indices are shuffled and then cut into k consecutive chunks
    whose sizes differ by at most one, so every fold carries comparable
    weight when per-fold scores are averaged.

    Args:
        data: the to-be-split dataset; must support indexing with an integer
            index array (e.g. a numpy array)
        labels: the labels of the to-be-split dataset, same length as data
        k: the number of folds, with 1 <= k <= len(data)

    Returns:
        data_folds: a list of k elements, each of which is a split set containing data
        labels_folds: a list of k elements, each of which is a split set containing labels

    Raises:
        ValueError: if k is smaller than 1 or larger than the number of samples
    """
    assert len(data) == len(labels)
    n_samples = len(data)
    if not 1 <= k <= n_samples:
        raise ValueError(
            "k must satisfy 1 <= k <= number of samples "
            "(got k={}, n_samples={})".format(k, n_samples)
        )

    # Shuffle once, then cut the shuffled indices into k balanced chunks.
    indices = np.random.permutation(n_samples)
    index_folds = np.array_split(indices, k)

    # The same index chunk is applied to data and labels so they stay aligned.
    data_folds = [data[fold] for fold in index_folds]
    labels_folds = [labels[fold] for fold in index_folds]

    return data_folds, labels_folds

In [None]:
def kFold_cross_validation(train_val_data, train_val_labels, k, hyper_c):
    """
    Run k-fold cross-validation of the linear SVM for one value of C.

    The dataset is split with random_k_fold_split; each fold in turn is held
    out for validation while a model is trained on the remaining folds.

    Args:
        train_val_data: the samples to split into training/validation folds
        train_val_labels: the labels that go with train_val_data
        k: the number of folds
        hyper_c: the C hyperparameter forwarded to spam_linear_svm

    Returns:
        train_acc: training accuracy averaged (unweighted) over the k rounds
        val_acc: validation accuracy averaged (unweighted) over the k rounds
    """
    data_folds, labels_folds = random_k_fold_split(train_val_data, train_val_labels, k)

    train_scores = []
    val_scores = []
    for held_out in range(k):
        # Every fold except the held-out one goes into the training set.
        kept = [pos for pos in range(len(data_folds)) if pos != held_out]
        fit_data = np.concatenate([data_folds[pos] for pos in kept], axis=0)
        fit_labels = np.concatenate([labels_folds[pos] for pos in kept], axis=0)

        # The held-out fold is the validation set for this round.
        holdout_data = data_folds[held_out]
        holdout_labels = labels_folds[held_out]

        classifier = spam_linear_svm(fit_data, fit_labels, hyper_c)
        fit_predictions = classifier.predict(fit_data)
        holdout_predictions = classifier.predict(holdout_data)

        train_scores.append(eval_metric(fit_labels, fit_predictions))
        val_scores.append(eval_metric(holdout_labels, holdout_predictions))

    return sum(train_scores) / k, sum(val_scores) / k

In [None]:
# load the spam dataset
spam_data = np.load("data/spam-data.npz")
fields = "test_data", "training_data", "training_labels"
spam_train_val_data = spam_data[fields[1]]
spam_train_val_labels = spam_data[fields[2]]

# candidate values of the SVM hyperparameter C, spanning 1e-9 .. 10
spam_hyper_C = np.array((1.e-9, 1.e-8, 1.e-7, 1.e-6, 1.e-5, 1.e-4, 1.e-3, 1.e-2, 1.e-1, 1, 10))
spam_training_accuracies = np.zeros(len(spam_hyper_C))
spam_validation_accuracies = np.zeros(len(spam_hyper_C))

k = 5
# random_k_fold_split draws from NumPy's global random generator. Re-seeding
# it with the same value before every cross-validation call makes the run
# reproducible and scores every C on the same folds, so differences between
# points on the curve come from C rather than from a different random split.
spam_cv_seed = 42
for spam_index, hyper_c in enumerate(spam_hyper_C):
    np.random.seed(spam_cv_seed)
    spam_training_accuracies[spam_index], spam_validation_accuracies[spam_index] = \
        kFold_cross_validation(spam_train_val_data, spam_train_val_labels, k, hyper_c)

# Plot the accuracies against C (log-scaled x axis, since C spans many decades)
plt.plot(spam_hyper_C, spam_training_accuracies, label="spam training accuracy")
plt.plot(spam_hyper_C, spam_validation_accuracies, label="spam validation accuracy")
plt.title("spam-kFold-Cross-Validation Accuracy vs Hyperparameter C")
plt.xlabel("hyper-C")
plt.xscale("log")
plt.ylabel("training and validation accuracy")
plt.legend()
plt.savefig("q6_k-fold_cross-validation.png")