编程题：

In [2]:
# todo 编程题: 在不使用sklearn的情况下，仅使用Numpy，为softmax回归实现带早停的批量梯度下降，将它用于分类任务，例如鸢尾花数据集
#  注意：
#  1. 要实现l2正则化
#  2. 除了数据读取，其他仅使用numpy，包括训练集+验证集分离，以及softmax预测 和 损失计算

In [12]:
import numpy as np
from sklearn.datasets import load_iris

# Load the iris dataset as pandas objects, then keep two petal features
# and the class labels as plain NumPy arrays.
iris = load_iris(as_frame=True)
feature_columns = ["petal width (cm)", "petal length (cm)"]
X = iris.data[feature_columns].to_numpy()
y = iris["target"].to_numpy()

# Shuffle the rows with a fixed seed so the split that follows is reproducible.
# Features and labels are indexed by the same permutation to stay aligned.
np.random.seed(42)
m = X.shape[0]
indices = np.arange(m)
np.random.shuffle(indices)
X_shuffled, y_shuffled = X[indices], y[indices]

# Split: the first 80% of the shuffled rows are used for training,
# the remaining 20% are held out.
split = int(0.8 * m)
X_train, X_test = X_shuffled[:split], X_shuffled[split:]
y_train, y_test = y_shuffled[:split], y_shuffled[split:]

# Add the bias term: prepend a column of ones so that row 0 of the weight
# matrix plays the role of the intercept.
X_train = np.hstack([np.ones((X_train.shape[0], 1)), X_train])  # (n_train, d+1)
X_test = np.hstack([np.ones((X_test.shape[0], 1)), X_test])     # (n_test, d+1)

# Number of distinct classes, and number of input columns (bias column included).
n_classes = np.unique(y).size
n_features = X_train.shape[1]

# Weight matrix W with shape (n_features, n_classes), initialized with
# small Gaussian noise.
W = 0.01 * np.random.randn(n_features, n_classes)

# Hyperparameters
learning_rate, l2_lambda = 0.1, 0.01
max_epochs = 10_000
patience = 5       # consecutive non-improving epochs tolerated before stopping
tolerance = 1e-5   # minimum drop in validation loss that counts as an improvement

# Early-stopping state
best_loss = np.inf
patience_counter = 0
best_W = None

def OneHotEncoder(y, num_classes):
    """One-hot encode integer class labels.

    Args:
        y: integer label(s) used to index rows of the identity matrix.
        num_classes: size of the identity matrix, i.e. the length of each
            encoded row.

    Returns:
        Float array in which each label is replaced by the matching row of
        the num_classes x num_classes identity matrix.
    """
    identity = np.identity(num_classes)
    one_hot = identity[y]
    return one_hot

# One-hot encode the labels of both splits with the same number of classes.
y_train_OHE, y_test_OHE = (
    OneHotEncoder(labels, n_classes) for labels in (y_train, y_test)
)

def softmax(z):
    """Row-wise softmax of a 2-D array of logits.

    Each row is shifted by its own maximum before exponentiating. The shift
    cancels in the ratio, so the result is mathematically unchanged, but
    np.exp can no longer overflow to inf (which turned the probabilities
    into nan for large logits), and at least one entry per row is exp(0) = 1,
    so the denominator can never be zero.

    Args:
        z: array of logits; normalization is performed over axis 1.

    Returns:
        Array with the same shape as z whose entries along axis 1 are
        non-negative and sum to 1.
    """
    shifted = z - np.max(z, axis=1, keepdims=True)
    exp_z = np.exp(shifted)
    return exp_z / np.sum(exp_z, axis=1, keepdims=True)

# Cost function: mean cross-entropy plus an L2 penalty.
def l2_coss(X,y_OHE,W,l2_lambda):
    """Return the L2-regularized cross-entropy loss of a softmax model.

    Args:
        X: input matrix; X @ W gives the logits.
        y_OHE: one-hot encoded targets, multiplied element-wise with the
            log-probabilities.
        W: weight matrix; row 0 is excluded from the penalty.
        l2_lambda: multiplier of the sum of squared weights.

    Returns:
        Cross-entropy averaged over X.shape[0] rows, plus
        l2_lambda * sum(W[1:] ** 2).
    """
    n_samples = X.shape[0]
    probs = softmax(X @ W)
    # The small epsilon keeps log() finite when a probability is exactly 0.
    log_probs = np.log(probs + 1e-15)
    cross_entropy = -np.sum(y_OHE * log_probs) / n_samples
    # L2 penalty; W[1:] skips the first row of W.
    penalty = l2_lambda * np.sum(W[1:] ** 2)
    return cross_entropy + penalty

# Gradient of the L2-regularized cross-entropy loss.
def l2_gradient(X, y_OHE, W, l2_lambda):
    """Return the gradient of the regularized loss with respect to W.

    Args:
        X: input matrix; X @ W gives the logits.
        y_OHE: one-hot encoded targets, same shape as the predicted
            probabilities.
        W: weight matrix.
        l2_lambda: L2 regularization strength.

    Returns:
        Array with the same shape as W: X.T @ (probs - y_OHE) averaged over
        X.shape[0] rows, with 2 * l2_lambda * W added to every row except
        row 0.
    """
    n_samples = X.shape[0]
    residual = softmax(X @ W) - y_OHE
    grad = X.T @ residual / n_samples
    # Derivative of l2_lambda * sum(W[1:] ** 2); row 0 is left unpenalized.
    grad[1:] += 2 * l2_lambda * W[1:]
    return grad

# Batch gradient descent with early stopping.
# NOTE(review): the held-out split (X_test / y_test_OHE) is what drives early
# stopping here, so it acts as a validation set; any metric computed on X_test
# afterwards does not come from an untouched test set.
for epoch in range(max_epochs):
    # l2_coss and l2_gradient each run their own forward pass, so no separate
    # logits/probabilities are computed in the loop body.
    # The training loss is measured before this epoch's update (logging only).
    train_loss = l2_coss(X_train, y_train_OHE, W, l2_lambda)
    grad = l2_gradient(X_train, y_train_OHE, W, l2_lambda)
    W -= learning_rate * grad  # in-place update; best_W holds an independent copy

    # Held-out loss after the update decides whether this epoch is an improvement.
    val_loss = l2_coss(X_test, y_test_OHE, W, l2_lambda)
    if val_loss < best_loss - tolerance:
        best_loss = val_loss
        best_W = W.copy()
        patience_counter = 0
    else:
        patience_counter += 1

    # Log the losses every 1000 epochs.
    if epoch % 1000 == 0:
        print(f"Epoch {epoch}, Train Loss: {train_loss:.6f}, Val Loss: {val_loss:.6f}")

    # Stop once `patience` consecutive epochs failed to improve by more than `tolerance`.
    if patience_counter >= patience:
        print(f"Early stopping at epoch {epoch}")
        break

def predict(X, W):
    """Return the predicted class index for every row of X.

    Softmax is monotonic within a row, so the arg-max of the raw logits
    X @ W is taken directly.

    Args:
        X: input matrix.
        W: weight matrix; X @ W gives one logit per class per row.

    Returns:
        1-D integer array with the index of the largest logit in each row
        (the first index on ties).
    """
    logits = X @ W
    return logits.argmax(axis=1)
# Predict both splits with the weights saved at the best validation loss.
y_train_pred, y_test_pred = (
    predict(features, best_W) for features in (X_train, X_test)
)
print("y_test_pred:",y_test_pred)

Epoch 0, Train Loss: 1.102275, Val Loss: 0.990831
Epoch 1000, Train Loss: 0.380144, Val Loss: 0.416591
Epoch 2000, Train Loss: 0.336080, Val Loss: 0.362503
Epoch 3000, Train Loss: 0.318111, Val Loss: 0.338302
Epoch 4000, Train Loss: 0.309242, Val Loss: 0.325621
Epoch 5000, Train Loss: 0.304442, Val Loss: 0.318396
Epoch 6000, Train Loss: 0.301696, Val Loss: 0.314046
Epoch 7000, Train Loss: 0.300066, Val Loss: 0.311319
Early stopping at epoch 7189
y_test_pred: [1 0 1 1 0 1 2 2 0 1 2 2 0 2 0 1 2 2 1 2 1 1 2 2 0 1 1 0 1 2]


In [13]:
# Accuracy: fraction of predictions that equal the true labels.
train_acc = (y_train_pred == y_train).mean()
test_acc = (y_test_pred == y_test).mean()

# Report both accuracies and the best held-out loss seen during training.
print(f"Training Accuracy: {train_acc:.4f}")
print(f"Test Accuracy: {test_acc:.4f}")
print(f"Best Validation Loss: {best_loss:.6f}")

Training Accuracy: 0.9583
Test Accuracy: 0.9667
Best Validation Loss: 0.310936
