# 两层全连接神经网络
- 输入1000个节点，隐藏层100个，输出10个
- 只考虑W不考虑b
- 全连接ReLU神经网络

---

- $h = XW_1$
- $h_{relu} = max(0, h)$
- $y_{pred} = h_{relu}W_2$
- $f = || y - y_{pred} ||^2_F$

---
- $\frac{\partial f}{\partial y_{pred}} = 2(y_{pred} - y)$
- $\frac{\partial f}{\partial h_{relu}} = \frac{\partial f}{\partial y_{pred}}W_2^T$
- $\frac{\partial f}{\partial W_2} = h_{relu}^T \frac{\partial f}{\partial y_{pred}}$
- $\frac{\partial f}{\partial h} = \frac{\partial f}{\partial h_{relu}} \odot \mathbb{1}[h \ge 0]$
- $\frac{\partial f}{\partial W_1} = X^T\frac{\partial f}{\partial h}$

In [1]:
import numpy as np
import torch
import matplotlib.pyplot as plt

In [2]:
# Hyperparameters shared by every experiment below.
BATCH = 64   # number of samples in the single fixed batch
EPOCH = 500  # number of gradient-descent iterations
LR = 1e-6    # learning rate

# Layer widths: input -> hidden -> output.
N_in = 1000
N_hidden = 100
N_out = 10

## 一. Numpy手动计算梯度

In [7]:
# One fixed random batch of inputs and regression targets.
x = np.random.randn(BATCH, N_in)
y = np.random.randn(BATCH, N_out)

# Weights of the two bias-free layers, drawn from a standard normal.
w1 = np.random.randn(N_in, N_hidden)
w2 = np.random.randn(N_hidden, N_out)

for it in range(EPOCH):
    # Forward pass: linear -> ReLU -> linear.
    h = np.dot(x, w1)
    h_relu = np.maximum(h, 0)
    y_pred = np.dot(h_relu, w2)

    # Sum-of-squares loss, reported every 50 iterations.
    residual = y_pred - y
    loss = np.sum(np.square(residual))
    if it % 50 == 0:
        print("Epoch {}: Loss {}".format(it, loss))

    # Backward pass: chain rule applied by hand, layer by layer.
    grad_y_pred = 2 * residual
    grad_w2 = np.dot(h_relu.T, grad_y_pred)
    grad_h_relu = np.dot(grad_y_pred, w2.T)
    # ReLU blocks the gradient wherever its input was negative.
    grad_h = np.where(h < 0, 0.0, grad_h_relu)
    grad_w1 = np.dot(x.T, grad_h)

    # Plain gradient-descent step, applied in place.
    w1 -= LR * grad_w1
    w2 -= LR * grad_w2

Epoch 0: Loss 35408475.5708653
Epoch 50: Loss 13210.723218441559
Epoch 100: Loss 424.07916714308783
Epoch 150: Loss 24.458366132394872
Epoch 200: Loss 1.7288604054217553
Epoch 250: Loss 0.13823869620770285
Epoch 300: Loss 0.012206723331863328
Epoch 350: Loss 0.0011717242860535743
Epoch 400: Loss 0.0001203650115225353
Epoch 450: Loss 1.3028675705281583e-05


## 二. Torch手动计算梯度

In [12]:
# One fixed random batch of inputs and regression targets.
x = torch.randn(BATCH, N_in)
y = torch.randn(BATCH, N_out)

# Plain tensors (no autograd): gradients are derived by hand below.
w1 = torch.randn(N_in, N_hidden)
w2 = torch.randn(N_hidden, N_out)

for it in range(EPOCH):
    # Forward pass: linear -> ReLU -> linear.
    h = torch.mm(x, w1)
    h_relu = torch.clamp(h, min=0)
    y_pred = torch.mm(h_relu, w2)

    # Sum-of-squares loss, reported every 50 iterations.
    residual = y_pred - y
    loss = residual.pow(2).sum()
    if it % 50 == 0:
        print("Epoch {}: Loss {}".format(it, loss))

    # Backward pass: chain rule applied by hand, layer by layer.
    grad_y_pred = 2 * residual
    grad_w2 = torch.mm(h_relu.t(), grad_y_pred)
    grad_h_relu = torch.mm(grad_y_pred, w2.t())
    # ReLU blocks the gradient wherever its input was negative.
    grad_h = grad_h_relu.masked_fill(h < 0, 0)
    grad_w1 = torch.mm(x.t(), grad_h)

    # Plain gradient-descent step, applied in place.
    w1 -= LR * grad_w1
    w2 -= LR * grad_w2

Epoch 0: Loss 26807970.0
Epoch 50: Loss 10895.0634765625
Epoch 100: Loss 404.9765930175781
Epoch 150: Loss 27.17841339111328
Epoch 200: Loss 2.4550464153289795
Epoch 250: Loss 0.2618061602115631
Epoch 300: Loss 0.030567120760679245
Epoch 350: Loss 0.003984278533607721
Epoch 400: Loss 0.0007121993694454432
Epoch 450: Loss 0.00019911790150217712


## PyTorch

### 三. Autograd

In [14]:
# One fixed random batch of inputs and regression targets.
x = torch.randn(BATCH, N_in)
y = torch.randn(BATCH, N_out)

# Leaf tensors tracked by autograd; their .grad is filled by loss.backward().
w1 = torch.randn(N_in, N_hidden, requires_grad=True)
w2 = torch.randn(N_hidden, N_out, requires_grad=True)

for it in range(EPOCH):
    # Forward pass: linear -> ReLU -> linear.
    h_relu = torch.mm(x, w1).clamp(min=0)
    y_pred = torch.mm(h_relu, w2)

    # Sum-of-squares loss, reported every 50 iterations.
    loss = (y_pred - y).pow(2).sum()
    if it % 50 == 0:
        print("Epoch {}: Loss {}".format(it, loss))

    loss.backward()

    # Update outside the graph, then clear the accumulated gradients so the
    # next backward() starts from zero.
    with torch.no_grad():
        for weight in (w1, w2):
            weight -= LR * weight.grad
            weight.grad.zero_()

Epoch 0: Loss 32670436.0
Epoch 50: Loss 20055.0078125
Epoch 100: Loss 1657.7003173828125
Epoch 150: Loss 257.3371887207031
Epoch 200: Loss 46.99903869628906
Epoch 250: Loss 9.062090873718262
Epoch 300: Loss 1.7900952100753784
Epoch 350: Loss 0.35835203528404236
Epoch 400: Loss 0.07240660488605499
Epoch 450: Loss 0.014984498731791973


### 四. Optim

In [17]:
# One fixed random batch of inputs and regression targets.
x = torch.randn(BATCH, N_in)
y = torch.randn(BATCH, N_out)

# Leaf tensors tracked by autograd.
w1 = torch.randn(N_in, N_hidden, requires_grad=True)
w2 = torch.randn(N_hidden, N_out, requires_grad=True)

# The optimizer now owns the update rule that was written by hand before.
optimizer = torch.optim.SGD([w1, w2], lr=LR)

for it in range(EPOCH):
    # Forward pass: linear -> ReLU -> linear.
    h_relu = x.mm(w1).clamp(min=0)
    y_pred = h_relu.mm(w2)

    # Sum-of-squares loss, reported every 50 iterations.
    loss = torch.sum((y_pred - y) ** 2)
    if it % 50 == 0:
        print("Epoch {}: Loss {}".format(it, loss))

    # Backpropagate, apply one SGD step, then reset gradients.
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

Epoch 0: Loss 34224736.0
Epoch 50: Loss 13434.0263671875
Epoch 100: Loss 644.0214233398438
Epoch 150: Loss 51.45646286010742
Epoch 200: Loss 5.0250163078308105
Epoch 250: Loss 0.5542936325073242
Epoch 300: Loss 0.0669383704662323
Epoch 350: Loss 0.008839967660605907
Epoch 400: Loss 0.0014848707942292094
Epoch 450: Loss 0.00039268750697374344


### 五. Loss

In [18]:
# One fixed random batch of inputs and regression targets.
x = torch.randn(BATCH, N_in)
y = torch.randn(BATCH, N_out)

# Leaf tensors tracked by autograd.
w1 = torch.randn(N_in, N_hidden, requires_grad=True)
w2 = torch.randn(N_hidden, N_out, requires_grad=True)

# Summed (not averaged) squared error replaces the hand-written loss.
loss_func = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.SGD([w1, w2], lr=LR)

for it in range(EPOCH):
    # Forward pass: linear -> ReLU -> linear.
    h_relu = x.mm(w1).clamp(min=0)
    y_pred = h_relu.mm(w2)

    loss = loss_func(y_pred, y)
    if it % 50 == 0:
        print("Epoch {}: Loss {}".format(it, loss))

    # Backpropagate, apply one SGD step, then reset gradients.
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

Epoch 0: Loss 30201670.0
Epoch 50: Loss 19577.216796875
Epoch 100: Loss 1194.46875
Epoch 150: Loss 116.95330810546875
Epoch 200: Loss 14.979321479797363
Epoch 250: Loss 2.248992443084717
Epoch 300: Loss 0.367341548204422
Epoch 350: Loss 0.06274251639842987
Epoch 400: Loss 0.011243206448853016
Epoch 450: Loss 0.0023114148061722517


### 六. Sequential

In [21]:
# One fixed random batch of inputs and regression targets.
x = torch.randn(BATCH, N_in)
y = torch.randn(BATCH, N_out)

# Same bias-free two-layer ReLU network, expressed with nn building blocks.
model = torch.nn.Sequential(
    torch.nn.Linear(in_features=N_in, out_features=N_hidden, bias=False),
    torch.nn.ReLU(),
    torch.nn.Linear(in_features=N_hidden, out_features=N_out, bias=False),
)

# Overwrite the default Linear initialization with standard-normal weights
# (index 1 is the ReLU, which has no weight).
for layer_idx in (0, 2):
    torch.nn.init.normal_(model[layer_idx].weight)

loss_func = torch.nn.MSELoss(reduction='sum')

for it in range(EPOCH):
    y_pred = model(x)

    loss = loss_func(y_pred, y)
    if it % 50 == 0:
        print("Epoch {}: Loss {}".format(it, loss))

    loss.backward()

    # Manual gradient-descent step on every parameter, outside the graph.
    with torch.no_grad():
        for param in model.parameters():
            param.sub_(LR * param.grad)

    model.zero_grad()

Epoch 0: Loss 33341480.0
Epoch 50: Loss 15438.4150390625
Epoch 100: Loss 652.8851928710938
Epoch 150: Loss 58.59893798828125
Epoch 200: Loss 7.505269527435303
Epoch 250: Loss 1.141122579574585
Epoch 300: Loss 0.18748416006565094
Epoch 350: Loss 0.0320446640253067
Epoch 400: Loss 0.005843315739184618
Epoch 450: Loss 0.001299908384680748


### 七. Sequential + Optim

In [27]:
# One fixed random batch of inputs and regression targets.
x = torch.randn(BATCH, N_in)
y = torch.randn(BATCH, N_out)

# Bias-free two-layer ReLU network built from nn modules.
model = torch.nn.Sequential(
    torch.nn.Linear(in_features=N_in, out_features=N_hidden, bias=False),
    torch.nn.ReLU(),
    torch.nn.Linear(in_features=N_hidden, out_features=N_out, bias=False),
)

# With bias disabled the only parameters are the two weight matrices, in
# layer order; re-draw both from a standard normal.
for weight in model.parameters():
    torch.nn.init.normal_(weight)

loss_func = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.SGD(model.parameters(), lr=LR)

for it in range(EPOCH):
    y_pred = model(x)

    loss = loss_func(y_pred, y)
    if it % 50 == 0:
        print("Epoch {}: Loss {}".format(it, loss))

    # Backpropagate, apply one SGD step, then reset gradients.
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

Epoch 0: Loss 31621422.0
Epoch 50: Loss 9968.7177734375
Epoch 100: Loss 345.5548095703125
Epoch 150: Loss 21.064655303955078
Epoch 200: Loss 1.687893033027649
Epoch 250: Loss 0.1620352864265442
Epoch 300: Loss 0.017796725034713745
Epoch 350: Loss 0.0024050897918641567
Epoch 400: Loss 0.0004838006279896945
Epoch 450: Loss 0.0001473624724894762


### 八. 自定义网络(显式参数)

In [50]:
# One fixed random batch: inputs of width N_in and targets of width N_out.
x = torch.randn((BATCH, N_in))
y = torch.randn((BATCH, N_out))

class Net(torch.nn.Module):
    """Two-layer, bias-free ReLU network with explicitly declared weights.

    The forward pass computes ``relu(x @ w1) @ w2``.

    Args:
        n_in: Number of input features. Defaults to the module-level ``N_in``.
        n_hidden: Width of the hidden layer. Defaults to the module-level
            ``N_hidden``.
        n_out: Number of outputs. Defaults to the module-level ``N_out``.
    """

    def __init__(self, n_in=None, n_hidden=None, n_out=None):
        super().__init__()

        # Fall back to the notebook-level sizes so that ``Net()`` keeps
        # working exactly as before.
        n_in = N_in if n_in is None else n_in
        n_hidden = N_hidden if n_hidden is None else n_hidden
        n_out = N_out if n_out is None else n_out

        # torch.randn already samples from N(0, 1), so both weights are
        # created the same way with no extra init pass.
        # (torch.nn.init.xavier_normal_ is a possible alternative init.)
        self.w1 = torch.nn.Parameter(torch.randn(n_in, n_hidden))
        self.w2 = torch.nn.Parameter(torch.randn(n_hidden, n_out))

    def forward(self, x):
        """Return predictions for a 2-D batch ``x`` of shape (batch, n_in)."""
        y_pred = x.mm(self.w1).clamp(min=0).mm(self.w2)
        return y_pred

# Train the custom module with summed squared error and plain SGD.
model = Net()
loss_func = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.SGD(params=model.parameters(), lr=LR)

for it in range(EPOCH):
    y_pred = model(x)

    loss = loss_func(y_pred, y)
    if it % 50 == 0:
        print("Epoch {}: Loss {}".format(it, loss))

    # Backpropagate, apply one SGD step, then reset gradients.
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

Epoch 0: Loss 34467236.0
Epoch 50: Loss 12329.94140625
Epoch 100: Loss 551.3489990234375
Epoch 150: Loss 40.532169342041016
Epoch 200: Loss 3.521925687789917
Epoch 250: Loss 0.33698779344558716
Epoch 300: Loss 0.03455748409032822
Epoch 350: Loss 0.003984588198363781
Epoch 400: Loss 0.0006867757765576243
Epoch 450: Loss 0.00020060440874658525


### 九. 自定义网络(隐式参数)

In [43]:
# One fixed random batch; x is drawn before y, as in the other experiments.
x, y = torch.randn(BATCH, N_in), torch.randn(BATCH, N_out)

class Net(torch.nn.Module):
    """Two-layer, bias-free ReLU network built from ``nn.Linear`` layers.

    The weights live inside the two ``Linear`` sub-modules and are
    re-initialized from a standard normal distribution.

    Args:
        n_in: Number of input features. Defaults to the module-level ``N_in``.
        n_hidden: Width of the hidden layer. Defaults to the module-level
            ``N_hidden``.
        n_out: Number of outputs. Defaults to the module-level ``N_out``.
    """

    def __init__(self, n_in=None, n_hidden=None, n_out=None):
        super().__init__()

        # Fall back to the notebook-level sizes so that ``Net()`` keeps
        # working exactly as before.
        n_in = N_in if n_in is None else n_in
        n_hidden = N_hidden if n_hidden is None else n_hidden
        n_out = N_out if n_out is None else n_out

        self.linear1 = torch.nn.Linear(n_in, n_hidden, bias=False)
        self.linear2 = torch.nn.Linear(n_hidden, n_out, bias=False)

        # Replace the default Linear initialization with N(0, 1) weights.
        torch.nn.init.normal_(self.linear1.weight)
        torch.nn.init.normal_(self.linear2.weight)

    def forward(self, x):
        """Return ``linear2(relu(linear1(x)))`` for the input batch ``x``."""
        y_pred = self.linear2(self.linear1(x).clamp(min=0))
        return y_pred

# Train the Linear-based module with summed squared error and plain SGD.
model = Net()
loss_func = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.SGD(params=model.parameters(), lr=LR)

for it in range(EPOCH):
    y_pred = model(x)

    loss = loss_func(y_pred, y)
    if it % 50 == 0:
        print("Epoch {}: Loss {}".format(it, loss))

    # Backpropagate, apply one SGD step, then reset gradients.
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

Epoch 0: Loss 40366000.0
Epoch 50: Loss 12587.9111328125
Epoch 100: Loss 353.7375183105469
Epoch 150: Loss 16.398738861083984
Epoch 200: Loss 0.9396277666091919
Epoch 250: Loss 0.0601460263133049
Epoch 300: Loss 0.00429706322029233
Epoch 350: Loss 0.0005215808050706983
Epoch 400: Loss 0.00013343743921723217
Epoch 450: Loss 5.4725031077396125e-05
