# 实验二（2025 夏季）——前馈神经网络：回归/二分类/多分类（手动实现与torch.nn）

- 按课程任务与图片要求，实现如下内容：
  - 手动实现前馈网络解决回归、二分类、多分类（MNIST），并绘制训练/测试 loss 曲线
  - 使用 torch.nn 实现相同任务，并绘制训练/测试 loss 曲线
  - 多分类任务上对比至少三种激活函数（ReLU/Tanh/LeakyReLU），并研究隐藏层层数与隐藏单元数的影响
  - 在多分类任务中，分别“手动实现”和“torch.nn 实现” dropout 与 L2 正则，并分析不同超参的影响
  - 回归/二分类/多分类分别选取最优模型，做 10 折交叉验证，表格展示每折结果
- 按图片要求，合成数据：
  - 回归：N=10000（train=7000/test=3000），p=500，高维线性：$y = 0.028 + \sum_{i=1}^{p} 0.0056\,x_i + \epsilon$
  - 二分类：共两个数据集，N=10000（train=7000/test=3000），p=200，两个类特征来自均值互为相反数、方差相同的正态分布
  - 多分类：使用 MNIST（28x28 展平为 784）

说明：代码默认在 CPU 可运行，epoch/网格规模已控制到课程实验可完成的量，如需更高精度可增大 epoch。


In [None]:
# 公共依赖与工具
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader, random_split
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold

# Set the DPI used for every figure created after this point.
plt.rcParams['figure.dpi'] = 120

def set_seed(s=42):
    """Seed the global NumPy and PyTorch random number generators with ``s``."""
    np.random.seed(s)
    torch.manual_seed(s)

def split_train_test(X, y, n_train):
    """Split ``X`` and ``y`` at position ``n_train`` without shuffling.

    Returns:
        ``(Xtr, ytr, Xte, yte)`` where the train parts are the first
        ``n_train`` entries and the test parts are everything after them.
    """
    head = slice(None, n_train)
    tail = slice(n_train, None)
    return X[head], y[head], X[tail], y[tail]

def plot_curves(history, title):
    """Plot the loss curves (and accuracy curves, if present) stored in ``history``.

    Args:
        history: dict of per-epoch lists. ``'train_loss'`` is required;
            ``'test_loss'``, ``'train_acc'`` and ``'test_acc'`` are optional.
        title: prefix for the subplot titles.
    """
    plt.figure(figsize=(10, 4))

    # Left panel: loss curves.
    plt.subplot(1, 2, 1)
    plt.plot(history['train_loss'], label='train')
    if 'test_loss' in history:
        plt.plot(history['test_loss'], label='test')
    plt.title(title + ' Loss')
    plt.legend()

    # Right panel: accuracy curves, drawn only when training accuracy was recorded.
    if 'train_acc' in history:
        plt.subplot(1, 2, 2)
        plt.plot(history['train_acc'], label='train')
        if 'test_acc' in history:
            plt.plot(history['test_acc'], label='test')
        plt.title(title + ' Acc')
        plt.legend()

    plt.tight_layout()
    plt.show()


In [None]:
# Task A: regression (scratch feed-forward net + torch.nn) -- N=10000, p=500
set_seed(0)
N, P = 10000, 500
# Standard-normal features.
X = np.random.randn(N, P)
# Every feature has the same true coefficient 0.0056.
true_w = np.full((P,), 0.0056)
# Linear target with intercept 0.028 plus Gaussian noise of standard deviation 0.1.
y = 0.028 + X @ true_w + 0.1*np.random.randn(N)
# First 7000 rows are the training set, the remaining rows the test set.
Xtr, ytr, Xte, yte = split_train_test(X, y, 7000)
# float32 tensors; targets are reshaped to a column so they match the (n, 1) model output.
Xtr_t = torch.tensor(Xtr, dtype=torch.float32)
ytr_t = torch.tensor(ytr, dtype=torch.float32).view(-1,1)
Xte_t = torch.tensor(Xte, dtype=torch.float32)
yte_t = torch.tensor(yte, dtype=torch.float32).view(-1,1)

# 手动前馈（1隐层）
class MLPRegScratch:
    """One-hidden-layer ReLU regressor built directly from raw tensors.

    Args:
        p: number of input features.
        h: number of hidden units.
    """
    def __init__(self, p, h):
        # Scale first, then enable gradients. ``torch.randn(..., requires_grad=True)*0.01``
        # would produce a non-leaf tensor whose ``.grad`` stays None after backward(),
        # which breaks the manual SGD update that reads ``p.grad``.
        self.W1 = (torch.randn(p, h)*0.01).requires_grad_()
        self.b1 = torch.zeros(h, requires_grad=True)
        self.W2 = (torch.randn(h, 1)*0.01).requires_grad_()
        self.b2 = torch.zeros(1, requires_grad=True)

    def forward(self, x):
        """Return predictions of shape ``(len(x), 1)`` for inputs ``x`` of shape ``(n, p)``."""
        z1 = x @ self.W1 + self.b1
        a1 = torch.relu(z1)
        out = a1 @ self.W2 + self.b2
        return out

    def params(self):
        """Return the trainable tensors in the order W1, b1, W2, b2."""
        return [self.W1,self.b1,self.W2,self.b2]

def train_reg_scratch(model, Xtr, ytr, Xte, yte, lr=1e-2, epochs=50, bs=256):
    """Train a scratch regressor with mini-batch SGD on the mean squared error.

    Args:
        model: object exposing ``forward(x)`` and ``params()`` (list of tensors
            whose ``.grad`` is populated by ``backward()``).
        Xtr, ytr: training inputs and targets (tensors).
        Xte, yte: test inputs and targets (tensors).
        lr: SGD step size.
        epochs: number of passes over the training data.
        bs: mini-batch size.

    Returns:
        dict with per-epoch lists ``'train_loss'`` (sample-weighted mean of the
        batch losses) and ``'test_loss'`` (MSE on the full test set).
    """
    loader = DataLoader(TensorDataset(Xtr, ytr), batch_size=bs, shuffle=True)
    hist = {'train_loss': [], 'test_loss': []}
    for ep in range(epochs):
        loss_sum = 0
        seen = 0
        for xb, yb in loader:
            residual = model.forward(xb) - yb
            loss = torch.mean(residual**2)
            loss.backward()
            # Plain SGD step; gradients are cleared right after use.
            with torch.no_grad():
                for w in model.params():
                    w -= lr*w.grad
                    w.grad.zero_()
            batch = len(xb)
            loss_sum += loss.item()*batch
            seen += batch
        with torch.no_grad():
            test_mse = torch.mean((model.forward(Xte)-yte)**2).item()
        train_mse = loss_sum/seen
        hist['train_loss'].append(train_mse)
        hist['test_loss'].append(test_mse)
        if ep % 10 == 0:
            print(f'[Reg-S] Ep{ep:3d} | trainMSE={train_mse:.4f} | testMSE={test_mse:.4f}')
    return hist

# Train the scratch regressor with 128 hidden units and plot its train/test loss curves.
scratch = MLPRegScratch(P, 128)
h1 = train_reg_scratch(scratch, Xtr_t,ytr_t,Xte_t,yte_t, lr=5e-3, epochs=60)
plot_curves(h1,'Reg Scratch')

# torch.nn 版
class MLPReg(nn.Module):
    """torch.nn regressor: Linear(p, h) -> ReLU -> Linear(h, 1)."""

    def __init__(self, p, h):
        super().__init__()
        hidden = nn.Linear(p, h)
        head = nn.Linear(h, 1)
        self.net = nn.Sequential(hidden, nn.ReLU(), head)

    def forward(self, x):
        """Return the network output for the batch ``x``."""
        return self.net(x)

def train_reg_torch(model, Xtr,ytr,Xte,yte, lr=1e-2, epochs=60, bs=256):
    """Train an ``nn.Module`` regressor with Adam on the mean squared error.

    Args:
        model: module mapping a batch of inputs to predictions.
        Xtr, ytr: training inputs and targets (tensors).
        Xte, yte: test inputs and targets (tensors).
        lr: Adam learning rate.
        epochs: number of passes over the training data.
        bs: mini-batch size.

    Returns:
        dict with per-epoch lists ``'train_loss'`` (sample-weighted mean of the
        batch losses) and ``'test_loss'`` (MSE on the full test set).
    """
    loader = DataLoader(TensorDataset(Xtr, ytr), batch_size=bs, shuffle=True)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    hist = {'train_loss': [], 'test_loss': []}
    for ep in range(epochs):
        loss_sum = 0
        seen = 0
        for xb, yb in loader:
            loss = F.mse_loss(model(xb), yb)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            batch = len(xb)
            loss_sum += loss.item()*batch
            seen += batch
        with torch.no_grad():
            test_mse = F.mse_loss(model(Xte), yte).item()
        train_mse = loss_sum/seen
        hist['train_loss'].append(train_mse)
        hist['test_loss'].append(test_mse)
        if ep % 10 == 0:
            print(f'[Reg-N] Ep{ep:3d} | trainMSE={train_mse:.4f} | testMSE={test_mse:.4f}')
    return hist

# Train the torch.nn regressor with 128 hidden units and plot its train/test loss curves.
m = MLPReg(P,128)
h2 = train_reg_torch(m,Xtr_t,ytr_t,Xte_t,yte_t, lr=1e-3, epochs=60)
plot_curves(h2,'Reg Torch')


In [None]:
# Task B: binary classification (two classes, p=200, opposite means, equal variance),
# scratch implementation + torch.nn.
# NOTE: this cell rebinds N, P, X, y, Xtr, ytr, Xte, yte and the *_t tensors from Task A.
set_seed(1)
N, P = 10000, 200
mu = 2.0; sigma = 1.0
# Class 0 is drawn around +mu, class 1 around -mu, with the same standard deviation.
X0 = np.random.randn(N//2, P)*sigma + mu
X1 = np.random.randn(N//2, P)*sigma - mu
X  = np.vstack([X0,X1]); y = np.hstack([np.zeros(N//2), np.ones(N//2)])
# Shuffle rows so the positional train/test split below contains both classes.
idx=np.arange(N); np.random.shuffle(idx); X=X[idx]; y=y[idx]
Xtr,ytr,Xte,yte = split_train_test(X,y,7000)
# float32 tensors; labels are reshaped to a column so they match the (n, 1) model output.
Xtr_t = torch.tensor(Xtr, dtype=torch.float32)
ytr_t = torch.tensor(ytr, dtype=torch.float32).view(-1,1)
Xte_t = torch.tensor(Xte, dtype=torch.float32)
yte_t = torch.tensor(yte, dtype=torch.float32).view(-1,1)

# 手动前馈（分类）
class MLPBinScratch:
    """One-hidden-layer ReLU binary classifier built from raw tensors.

    Args:
        p: number of input features.
        h: number of hidden units.
    """
    def __init__(self,p,h):
        # Scale first, then enable gradients. Multiplying a tensor created with
        # ``requires_grad=True`` yields a non-leaf tensor whose ``.grad`` is never
        # populated, so the manual SGD update would fail.
        self.W1=(torch.randn(p,h)*0.01).requires_grad_()
        self.b1=torch.zeros(h,requires_grad=True)
        self.W2=(torch.randn(h,1)*0.01).requires_grad_()
        self.b2=torch.zeros(1,requires_grad=True)

    def forward(self,x):
        """Return sigmoid probabilities of shape ``(len(x), 1)``."""
        a=torch.relu(x@self.W1+self.b1)
        z=a@self.W2+self.b2
        return torch.sigmoid(z)

    def params(self):
        """Return the trainable tensors in the order W1, b1, W2, b2."""
        return [self.W1,self.b1,self.W2,self.b2]

def train_bin_scratch(model,Xtr,ytr,Xte,yte,lr=5e-3,epochs=40,bs=128):
    """Train a scratch binary classifier with mini-batch SGD on binary cross-entropy.

    Args:
        model: object exposing ``forward(x)`` (returns probabilities) and
            ``params()`` (list of tensors whose ``.grad`` is populated by backward).
        Xtr, ytr: training inputs and 0/1 labels (labels shaped like the model output).
        Xte, yte: test inputs and 0/1 labels.
        lr: SGD step size.
        epochs: number of passes over the training data.
        bs: mini-batch size.

    Returns:
        dict with per-epoch lists ``'train_loss'``, ``'test_loss'``,
        ``'train_acc'`` and ``'test_acc'``.
    """
    eps = 1e-9  # keeps log() finite when a probability saturates at 0 or 1

    def bce(prob, target):
        return -torch.mean(target*torch.log(prob+eps)+(1-target)*torch.log(1-prob+eps))

    dl=DataLoader(TensorDataset(Xtr,ytr),batch_size=bs,shuffle=True)
    hist={'train_loss':[],'test_loss':[],'train_acc':[],'test_acc':[]}
    for ep in range(epochs):
        tl=0;ta=0;n=0
        for xb,yb in dl:
            p=model.forward(xb)
            loss=bce(p,yb)
            loss.backward()
            with torch.no_grad():
                for pm in model.params():
                    pm -= lr*pm.grad
                    pm.grad.zero_()
            tl+=loss.item()*len(xb)
            ta+=torch.sum((p>0.5).float().eq(yb)).item()
            n+=len(xb)
        with torch.no_grad():
            pt=model.forward(Xte)
            te=bce(pt,yte).item()
            ta_tr=ta/n
            # eq() returns a bool tensor and torch.mean rejects bool input,
            # so cast to float before averaging.
            ta_te=(pt>0.5).float().eq(yte).float().mean().item()
        hist['train_loss'].append(tl/n); hist['test_loss'].append(te)
        hist['train_acc'].append(ta_tr); hist['test_acc'].append(ta_te)
        if ep%10==0: print(f'[Bin-S] Ep{ep:3d} | trLoss={tl/n:.4f} teLoss={te:.4f} | trAcc={ta_tr:.4f} teAcc={ta_te:.4f}')
    return hist

# Train the scratch binary classifier with 256 hidden units and plot loss/accuracy curves.
hS = train_bin_scratch(MLPBinScratch(P,256), Xtr_t,ytr_t,Xte_t,yte_t)
plot_curves(hS,'Binary Scratch')

# torch.nn
class MLPBin(nn.Module):
    """torch.nn binary classifier: Linear(p, h) -> ReLU -> Linear(h, 1), returning raw logits."""

    def __init__(self, p, h):
        super().__init__()
        hidden = nn.Linear(p, h)
        head = nn.Linear(h, 1)
        self.net = nn.Sequential(hidden, nn.ReLU(), head)

    def forward(self, x):
        """Return the logit for each row of ``x`` (no sigmoid is applied here)."""
        return self.net(x)

def train_bin_torch(m,Xtr,ytr,Xte,yte,lr=1e-3,epochs=40,bs=128):
    """Train an ``nn.Module`` binary classifier with Adam and BCE-with-logits loss.

    Args:
        m: module returning one logit per sample.
        Xtr, ytr: training inputs and 0/1 labels (labels shaped like the model output).
        Xte, yte: test inputs and 0/1 labels.
        lr: Adam learning rate.
        epochs: number of passes over the training data.
        bs: mini-batch size.

    Returns:
        dict with per-epoch lists ``'train_loss'``, ``'test_loss'``,
        ``'train_acc'`` and ``'test_acc'``.
    """
    dl=DataLoader(TensorDataset(Xtr,ytr),batch_size=bs,shuffle=True)
    crit=nn.BCEWithLogitsLoss(); opt=torch.optim.Adam(m.parameters(), lr=lr)
    hist={'train_loss':[],'test_loss':[],'train_acc':[],'test_acc':[]}
    for ep in range(epochs):
        tl=0;ta=0;n=0
        for xb,yb in dl:
            z=m(xb); loss=crit(z,yb)
            opt.zero_grad(); loss.backward(); opt.step()
            tl+=loss.item()*len(xb)
            ta+=torch.sum((torch.sigmoid(z)>0.5).float().eq(yb)).item()
            n+=len(xb)
        with torch.no_grad():
            zt=m(Xte); te=crit(zt,yte).item()
            ta_tr=ta/n
            # eq() returns a bool tensor and torch.mean rejects bool input,
            # so cast to float before averaging.
            ta_te=(torch.sigmoid(zt)>0.5).float().eq(yte).float().mean().item()
        hist['train_loss'].append(tl/n); hist['test_loss'].append(te)
        hist['train_acc'].append(ta_tr); hist['test_acc'].append(ta_te)
        if ep%10==0: print(f'[Bin-N] Ep{ep:3d} | trLoss={tl/n:.4f} teLoss={te:.4f} | trAcc={ta_tr:.4f} teAcc={ta_te:.4f}')
    return hist

# Train the torch.nn binary classifier with 256 hidden units and plot loss/accuracy curves.
hN = train_bin_torch(MLPBin(P,256), Xtr_t,ytr_t,Xte_t,yte_t)
plot_curves(hN,'Binary Torch')


In [None]:
# 任务C：MNIST 多分类 —— 激活函数 / 隐藏层配置 / Dropout / L2
import torchvision
import torchvision.transforms as T

# Data: convert each image to a tensor and flatten it into a 1-D vector.
transform = T.Compose([T.ToTensor(), T.Lambda(lambda x: x.view(-1))])
mnist_train = torchvision.datasets.MNIST(root='./data', train=True, download=True, transform=transform)
mnist_test  = torchvision.datasets.MNIST(root='./data', train=False, download=True, transform=transform)

# Use subsets to keep the demo fast; increase ntr/nte for a fuller run.
ntr=20000; nte=5000
train_subset, _ = random_split(mnist_train, [ntr, len(mnist_train)-ntr])
# Unzip the (image, label) pairs in a single pass so each sample is loaded and
# transformed once instead of once for the images and again for the labels.
# NOTE: this rebinds Xtr/Xte, which earlier cells used for the synthetic data.
train_imgs, train_lbls = zip(*train_subset)
Xtr = torch.stack(train_imgs)
Ytr = torch.tensor(train_lbls)
test_imgs, test_lbls = zip(*(mnist_test[i] for i in range(nte)))
Xte = torch.stack(test_imgs)
Yte = torch.tensor(test_lbls)

# 模型工厂
class MLPCls(nn.Module):
    """Configurable MLP classifier with 10 output logits.

    Args:
        d: input dimension.
        h: width of every hidden layer.
        layers: number of hidden (Linear + activation) stages.
        act: activation key, one of ``'relu'``, ``'tanh'``, ``'leaky'``.
        dropout: dropout probability applied after each activation; no
            Dropout module is added when it is not greater than 0.
    """

    def __init__(self, d=784, h=256, layers=1, act='relu', dropout=0.0):
        super().__init__()
        acts = {'relu': nn.ReLU(), 'tanh': nn.Tanh(), 'leaky': nn.LeakyReLU(0.1)}
        stack = []
        width = d
        for _ in range(layers):
            stack.append(nn.Linear(width, h))
            stack.append(acts[act])
            if dropout > 0:
                stack.append(nn.Dropout(dropout))
            width = h
        # Output layer producing the 10 class logits.
        stack.append(nn.Linear(width, 10))
        self.net = nn.Sequential(*stack)

    def forward(self, x):
        """Return the class logits for the batch ``x``."""
        return self.net(x)


def train_eval(dconf, wd=0.0, epochs=10, bs=128):
    """Train an ``MLPCls`` on the module-level MNIST tensors and record its curves.

    Reads the globals ``Xtr``, ``Ytr`` (training set) and ``Xte``, ``Yte`` (test set).

    Args:
        dconf: keyword arguments forwarded to ``MLPCls``.
        wd: Adam ``weight_decay`` (L2 penalty strength).
        epochs: number of passes over the training data.
        bs: mini-batch size.

    Returns:
        dict with per-epoch lists ``'train_loss'``, ``'test_loss'``,
        ``'train_acc'`` and ``'test_acc'``.
    """
    ds = TensorDataset(Xtr, Ytr); dl=DataLoader(ds,batch_size=bs,shuffle=True)
    m=MLPCls(**dconf)
    opt=torch.optim.Adam(m.parameters(), lr=1e-3, weight_decay=wd)
    crit=nn.CrossEntropyLoss()
    hist={'train_loss':[],'test_loss':[],'train_acc':[],'test_acc':[]}
    for ep in range(epochs):
        # Enable dropout for the training pass.
        m.train()
        tl=0;ta=0;n=0
        for xb,yb in dl:
            z=m(xb); loss=crit(z,yb)
            opt.zero_grad(); loss.backward(); opt.step()
            tl+=loss.item()*len(xb); ta+=torch.sum(z.argmax(1).eq(yb)).item(); n+=len(xb)
        # Disable dropout for evaluation; otherwise the test metrics of dropout
        # configurations are computed on randomly masked activations.
        m.eval()
        with torch.no_grad():
            zt=m(Xte); te=crit(zt,Yte).item()
            tr=ta/n; te_acc=zt.argmax(1).eq(Yte).float().mean().item()
        hist['train_loss'].append(tl/n); hist['test_loss'].append(te)
        hist['train_acc'].append(tr); hist['test_acc'].append(te_acc)
        if ep%2==0: print(f"[MNIST] ep={ep:2d} {dconf} wd={wd} | trAcc={tr:.3f} teAcc={te_acc:.3f}")
    return hist

# Activation function comparison: same one-hidden-layer architecture, three activations.
cfg_relu = {'d':784,'h':256,'layers':1,'act':'relu','dropout':0.0}
cfg_tanh = {'d':784,'h':256,'layers':1,'act':'tanh','dropout':0.0}
cfg_leak = {'d':784,'h':256,'layers':1,'act':'leaky','dropout':0.0}

h_relu = train_eval(cfg_relu, wd=0.0)
h_tanh = train_eval(cfg_tanh, wd=0.0)
h_leak = train_eval(cfg_leak, wd=0.0)

plot_curves(h_relu,'ReLU'); plot_curves(h_tanh,'Tanh'); plot_curves(h_leak,'LeakyReLU')

# Effect of the number of hidden layers / hidden units: two hidden layers of 256 units.
cfg_deep = {'d':784,'h':256,'layers':2,'act':'relu','dropout':0.0}
h_deep = train_eval(cfg_deep, wd=0.0)
plot_curves(h_deep,'2-Layers')

# Dropout and L2 regularization on the two-hidden-layer network.
cfg_do = {'d':784,'h':256,'layers':2,'act':'relu','dropout':0.5}
h_do = train_eval(cfg_do, wd=0.0)
plot_curves(h_do,'Dropout-0.5')

# L2 is applied through the optimizer's weight_decay argument.
h_wd = train_eval(cfg_deep, wd=1e-4)
plot_curves(h_wd,'L2-1e-4')


In [None]:
# 任务D：在多分类任务上，手动实现 Dropout 与 L2（简洁版）
class LinearScratch:
    """Affine layer ``x @ W + b`` built from raw tensors.

    Args:
        d: input dimension.
        h: output dimension.
    """
    def __init__(self, d, h):
        # Scale first, then enable gradients. Multiplying a tensor created with
        # ``requires_grad=True`` yields a non-leaf tensor whose ``.grad`` is never
        # populated, so a manual SGD update reading ``p.grad`` would fail.
        self.W = (torch.randn(d,h)*0.01).requires_grad_()
        self.b = torch.zeros(h,requires_grad=True)

    def __call__(self,x):
        """Apply the layer to a batch ``x`` of shape ``(n, d)``."""
        return x@self.W + self.b

    def params(self):
        """Return the trainable tensors in the order W, b."""
        return [self.W,self.b]

class MLPDropScratch:
    """Two-layer scratch classifier (10 logits) with hand-written inverted dropout.

    Args:
        d: input dimension.
        h: number of hidden units.
        pdrop: probability of zeroing a hidden activation during training.
    """

    def __init__(self, d=784, h=256, pdrop=0.5):
        self.l1 = LinearScratch(d, h)
        self.l2 = LinearScratch(h, 10)
        self.pdrop = pdrop

    def forward(self, x, train=True):
        """Return logits; dropout is applied only when ``train`` is true and ``pdrop > 0``."""
        hidden = torch.relu(self.l1(x))
        apply_dropout = train and self.pdrop > 0
        if apply_dropout:
            # Keep a unit when its uniform draw exceeds pdrop, then rescale the
            # survivors by 1/(1-pdrop).
            keep = (torch.rand_like(hidden) > self.pdrop).float()
            hidden = hidden * keep / (1.0 - self.pdrop)
        return self.l2(hidden)

    def params(self):
        """Return the parameters of both layers, first layer first."""
        return self.l1.params() + self.l2.params()

def train_scratch_cls(model, Xtr,Ytr,Xte,Yte, lr=1e-3, wd=0.0, epochs=8, bs=128):
    """Train a scratch classifier with mini-batch SGD, cross-entropy and optional L2.

    Args:
        model: object exposing ``forward(x, train=...)``, ``params()`` and ``pdrop``.
        Xtr, Ytr: training inputs and integer class labels.
        Xte, Yte: test inputs and integer class labels.
        lr: SGD step size.
        wd: L2 strength; when positive, ``wd * sum(p**2) / batch_size`` over all
            parameters is added to each batch loss.
        epochs: number of passes over the training data.
        bs: mini-batch size.

    Returns:
        dict with per-epoch lists ``'train_loss'`` (including the L2 term),
        ``'test_loss'``, ``'train_acc'`` and ``'test_acc'``.
    """
    loader = DataLoader(TensorDataset(Xtr, Ytr), batch_size=bs, shuffle=True)
    hist = {'train_loss': [], 'test_loss': [], 'train_acc': [], 'test_acc': []}
    for ep in range(epochs):
        loss_sum = 0
        correct = 0
        seen = 0
        for xb, yb in loader:
            batch = len(xb)
            logits = model.forward(xb, train=True)
            loss = F.cross_entropy(logits, yb)
            if wd > 0:
                penalty = 0
                for w in model.params():
                    penalty += (w**2).sum()
                loss = loss + wd*penalty/batch
            loss.backward()
            # Plain SGD step; gradients are cleared right after use.
            with torch.no_grad():
                for w in model.params():
                    w -= lr*w.grad
                    w.grad.zero_()
            loss_sum += loss.item()*batch
            correct += torch.sum(logits.argmax(1).eq(yb)).item()
            seen += batch
        with torch.no_grad():
            # Evaluation pass with dropout switched off.
            test_logits = model.forward(Xte, train=False)
            te = F.cross_entropy(test_logits, Yte).item()
            tr = correct/seen
            te_acc = test_logits.argmax(1).eq(Yte).float().mean().item()
        hist['train_loss'].append(loss_sum/seen)
        hist['test_loss'].append(te)
        hist['train_acc'].append(tr)
        hist['test_acc'].append(te_acc)
        print(f'[Scratch-DO/L2] ep={ep:2d} lr={lr} wd={wd} pdrop={model.pdrop} | trAcc={tr:.3f} teAcc={te_acc:.3f}')
    return hist

Xs, Ys = Xtr, Ytr  # reuse the MNIST subset
# Train the scratch network with dropout 0.5 and L2 strength 1e-4, then plot its curves.
h_scr = train_scratch_cls(MLPDropScratch(784,256,0.5), Xs,Ys,Xte,Yte, lr=5e-4, wd=1e-4)
plot_curves(h_scr,'Scratch Dropout+L2')


In [None]:
# 任务E：10折交叉验证（回归/二分类/多分类选最优配置各一）

def kfold_eval(model_fn, loss_fn, X, y, k=10, task='reg'):
    """Run shuffled k-fold cross-validation and return one score per fold.

    Each fold builds a fresh model, trains it with full-batch Adam (lr=1e-3)
    for a fixed number of steps (20 for ``'reg'``/``'bin'``, 5 for ``'multi'``),
    and scores it on the held-out fold.

    Args:
        model_fn: zero-argument callable returning a new ``nn.Module``.
        loss_fn: callable ``loss_fn(output, target)`` returning a scalar tensor.
        X: features, array-like or tensor of shape ``(n, p)``.
        y: targets, array-like or tensor of length ``n``.
        k: number of folds.
        task: ``'reg'`` (score = held-out loss), ``'bin'`` (score = accuracy of
            ``sigmoid(logit) > 0.5``) or ``'multi'`` (score = argmax accuracy).

    Returns:
        ``np.ndarray`` of ``k`` fold scores.

    Raises:
        ValueError: if ``task`` is not one of the three supported values.
    """
    if task not in ('reg', 'bin', 'multi'):
        raise ValueError(f"task must be 'reg', 'bin' or 'multi', got {task!r}")

    # as_tensor accepts both arrays and tensors without the copy-construct
    # warning that torch.tensor() emits for tensor input.
    X = torch.as_tensor(X, dtype=torch.float32)
    y = torch.as_tensor(y)
    if task == 'multi':
        y = y.long()
        n_steps = 5
    else:
        # Column-shaped float targets to match the (n, 1) model output.
        y = y.reshape(-1, 1).float()
        n_steps = 20

    kf = KFold(n_splits=k, shuffle=True, random_state=42)
    scores = []
    # KFold only needs the number of samples, so split over plain indices.
    for tr_idx, te_idx in kf.split(np.arange(len(X))):
        m = model_fn()
        opt = torch.optim.Adam(m.parameters(), lr=1e-3)
        X_fit, y_fit = X[tr_idx], y[tr_idx]
        m.train()
        for _ in range(n_steps):
            loss = loss_fn(m(X_fit), y_fit)
            opt.zero_grad()
            loss.backward()
            opt.step()
        # Score in eval mode so layers such as dropout are inactive.
        m.eval()
        with torch.no_grad():
            out = m(X[te_idx])
            y_te = y[te_idx]
            if task == 'reg':
                score = loss_fn(out, y_te).item()
            elif task == 'bin':
                score = (torch.sigmoid(out) > 0.5).float().eq(y_te).float().mean().item()
            else:
                score = out.argmax(1).eq(y_te).float().mean().item()
        scores.append(score)
    return np.array(scores)

# Regression (MLPReg with 128 hidden units).
# The globals X, y and P were overwritten by Task B, so rebuild the Task A data
# here (same seed and same draw order) instead of reusing stale names.
set_seed(0)
N_REG, P_REG = 10000, 500
Xr = np.random.randn(N_REG, P_REG)
yr = 0.028 + Xr @ np.full((P_REG,), 0.0056) + 0.1*np.random.randn(N_REG)
reg_scores = kfold_eval(lambda: MLPReg(P_REG,128), F.mse_loss, Xr, yr, k=10, task='reg')
print('Reg 10-fold MSE:', np.round(reg_scores,4), '| mean±std=', reg_scores.mean(), reg_scores.std())

# Binary classification (MLPBin with 256 hidden units).
# Xtr was rebound to the MNIST tensor in Task C, so regenerate the Task B data
# (same distribution parameters) rather than pairing MNIST features with Task B labels.
# No explicit shuffle is needed: KFold shuffles the rows itself.
set_seed(1)
N_BIN, P_BIN = 10000, 200
MU_BIN, SIGMA_BIN = 2.0, 1.0
Xb = np.vstack([
    np.random.randn(N_BIN//2, P_BIN)*SIGMA_BIN + MU_BIN,
    np.random.randn(N_BIN//2, P_BIN)*SIGMA_BIN - MU_BIN,
])
yb = np.hstack([np.zeros(N_BIN//2), np.ones(N_BIN//2)])
bin_scores = kfold_eval(lambda: MLPBin(P_BIN,256), lambda z,t: F.binary_cross_entropy_with_logits(z,t.float()), Xb, yb, k=10, task='bin')
print('Bin 10-fold Acc:', np.round(bin_scores,4), '| mean±std=', bin_scores.mean(), bin_scores.std())

# Multi-class (MLPCls with two ReLU hidden layers) on the MNIST training subset.
Xm, ym = Xtr.numpy(), Ytr.numpy()
multi_scores = kfold_eval(lambda: MLPCls(784,256,2,'relu',0.0), nn.CrossEntropyLoss(), Xm, ym, k=10, task='multi')
print('Multi 10-fold Acc:', np.round(multi_scores,4), '| mean±std=', multi_scores.mean(), multi_scores.std())
