Ex4(a)

In [2]:
import numpy as np

data = np.load('../data/spam-data.npz')
print("data: ", data.files)
for name in data.files:
    arr = data[name]
    print(f"{name} shape: {arr.shape}")
    # print(arr)

data:  ['training_data', 'training_labels', 'test_data']
training_data shape: (4171, 32)
training_labels shape: (4171,)
test_data shape: (1000, 32)


In [3]:
# Shuffle and split MNIST data into training and validation sets

def split_mnist_data(X, y, num_val=10000, random_seed=42):
    """
    Shuffle and split MNIST data into training and validation sets.
    Args:
        X: numpy array of images, shape (N, D)
        y: numpy array of labels, shape (N,)
        num_val: number of validation samples
        random_seed: random seed for reproducibility
    Returns:
        X_train, y_train, X_val, y_val
    """
    np.random.seed(random_seed)
    idx = np.random.permutation(X.shape[0])
    X_shuffled = X[idx]
    y_shuffled = y[idx]
    X_val = X_shuffled[:num_val]
    y_val = y_shuffled[:num_val]
    X_train = X_shuffled[num_val:]
    y_train = y_shuffled[num_val:]
    return X_train, y_train, X_val, y_val

# Example usage:
# X = data['train_images']
# y = data['train_labels']
# X_train, y_train, X_val, y_val = split_mnist_data(X, y)

In [4]:
# Compute classification accuracy

def accuracy_score(y_true, y_pred):
    """
    Compute the unweighted classification accuracy.
    Args:
        y_true: numpy array of true labels, shape (N,)
        y_pred: numpy array of predicted labels, shape (N,)
    Returns:
        accuracy: float, percentage of correct predictions
    """
    return np.mean(y_true == y_pred)

In [5]:
import numpy as np

# Load MNIST data
data = np.load('../data/mnist-data.npz')
X = data['training_data']
y = data['training_labels']

# Split data into training and validation sets
X_train, y_train, X_val, y_val = split_mnist_data(X, y, num_val=10000, random_seed=42)

# Example: suppose you have a model and get predictions
# Here we use random predictions as a placeholder
y_val_pred = np.random.choice(np.unique(y), size=y_val.shape[0])

# Compute accuracy on validation set
def accuracy_score(y_true, y_pred):
    """
    Compute the unweighted classification accuracy.
    Args:
        y_true: numpy array of true labels, shape (N,)
        y_pred: numpy array of predicted labels, shape (N,)
    Returns:
        accuracy: float, percentage of correct predictions
    """
    return np.mean(y_true == y_pred)

val_accuracy = accuracy_score(y_val, y_val_pred)
print("Validation accuracy:", val_accuracy)

Validation accuracy: 0.1002


In [6]:
from sklearn import svm

# Reshape images to (num_samples, num_features)
X_train_flat = X_train.reshape(X_train.shape[0], -1)
X_val_flat = X_val.reshape(X_val.shape[0], -1)

# Train SVM model
clf = svm.SVC(kernel='linear')  # You can also try 'rbf' kernel
clf.fit(X_train_flat[:5000], y_train[:5000])

# Predict on validation set
y_val_pred = clf.predict(X_val_flat)

# Compute accuracy
val_accuracy = accuracy_score(y_val, y_val_pred)
print("Validation accuracy (SVM):", val_accuracy)

Validation accuracy (SVM): 0.8999


In [None]:
from sklearn import svm

C_values = [0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]
accuracies = []

for C in C_values:
    clf = svm.SVC(kernel='linear', C=C)
    clf.fit(X_train_flat[:10000], y_train[:10000])  # 至少10000个样本
    y_val_pred = clf.predict(X_val_flat)
    acc = accuracy_score(y_val, y_val_pred)
    print(f"C={C}, Validation accuracy={acc}")
    accuracies.append(acc)

best_C = C_values[np.argmax(accuracies)]
print("Best C:", best_C)

C=0.001, Validation accuracy=0.9079
C=0.01, Validation accuracy=0.9079
C=0.1, Validation accuracy=0.9079
C=1, Validation accuracy=0.9079
C=10, Validation accuracy=0.9079
C=100, Validation accuracy=0.9079
C=1000, Validation accuracy=0.9079
C=10000, Validation accuracy=0.9079
Best C: 0.001


In [None]:
import numpy as np
from sklearn import svm

# Load spam data
data = np.load('../data/spam-data.npz')
X = data['training_data']
y = data['training_labels']

# Set random seed for reproducibility
np.random.seed(42)
indices = np.random.permutation(X.shape[0])
X_shuffled = X[indices]
y_shuffled = y[indices]

k = 5  # Number of folds
fold_size = X.shape[0] // k

C_values = [0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]
cv_accuracies = []

for C in C_values:
    fold_accuracies = []
    for fold in range(k):
        # Split into train and validation for this fold
        start = fold * fold_size
        end = (fold + 1) * fold_size if fold < k - 1 else X.shape[0]
        X_val = X_shuffled[start:end]
        y_val = y_shuffled[start:end]
        X_train = np.concatenate([X_shuffled[:start], X_shuffled[end:]], axis=0)
        y_train = np.concatenate([y_shuffled[:start], y_shuffled[end:]], axis=0)
        
        # Train SVM
        clf = svm.SVC(kernel='linear', C=C)
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_val)
        acc = np.mean(y_pred == y_val)
        fold_accuracies.append(acc)
    avg_acc = np.mean(fold_accuracies)
    print(f"C={C}, 5-fold CV accuracy={avg_acc}")
    cv_accuracies.append(avg_acc)

best_C = C_values[np.argmax(cv_accuracies)]
print("Best C (5-fold CV):", best_C)

C=0.001, 5-fold CV accuracy=0.7497008859977885
C=0.01, 5-fold CV accuracy=0.7748738494234553
C=0.1, 5-fold CV accuracy=0.7930938123752495
C=1, 5-fold CV accuracy=0.8002874825887792
C=10, 5-fold CV accuracy=0.8005275779376498
C=100, 5-fold CV accuracy=0.8005267163514697


k 折交叉验证（k-fold cross-validation）是一种常用的模型评估方法。其核心思想如下：

- 首先将全部训练数据**随机打乱**，然后**平均分成 $k$ 份**（称为“折”）。
- 每次选择其中一份作为**验证集**，其余 $k-1$ 份作为**训练集**，训练模型并在验证集上评估。
- 这个过程**重复 $k$ 次**，每一份都轮流做一次验证集。
- 最终的模型评估结果是这 $k$ 次验证的**平均准确率**。

这样做的好处是：每个样本都被用作过一次验证集，评估结果更稳定，能更好地反映模型的泛化能力，尤其适合数据量较小的情况。

例如，5 折交叉验证（$k=5$）就是把数据分成 5 份，分别轮流做验证集，最后取 5 次准确率的平均值。