## Example
running example: train a two-layer ReLU network on random data with L2 loss

In [52]:
import torch

device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)
print(f"Using {device} device")

Using cpu device


In [53]:
torch.randn(2, 3, device = device)

tensor([[ 0.5936,  0.9675,  0.2479],
        [-1.4069, -1.0570, -2.0190]])

In [54]:
N, D_in, H, D_out = 64, 1000, 100, 10 # N is the number of data points

x = torch.randn(N, D_in, device=device)
y = torch.randn(N, D_out, device=device)
w1 = torch.randn(D_in, H, device=device)
w2= torch.randn(H, D_out, device=device)

In [55]:
w2[:1]

tensor([[ 0.4898, -0.5194, -0.8267, -0.3929, -1.0933,  0.3315, -1.8265, -0.2080,
          1.7972,  0.5134]])

In [56]:
learning_rate = 1e-6

In [57]:
for t in range(500):
    h = x.mm(w1) # X matmul w1
    h_relu = h.clamp(min=0) # use activation function to the hidden layer
    y_pred = h_relu.mm(w2) # hidden matmult w2
    loss = (y_pred - y).pow(2).sum() # sqaure sum loss

    grad_y_pred = 2.0 * (y_pred - y) # it is just differentiation
    grad_w2 = h_relu.t().mm(grad_y_pred)
    grad_h_relu = grad_y_pred.mm(w2.t())
    grad_h = grad_h_relu.clone()
    grad_h[h < 0] = 0
    grad_w1 = x.t().mm(grad_h)

    w1 -= learning_rate* grad_w1
    w2 -= learning_rate* grad_w2


In [58]:
w1[:1]

tensor([[-2.5315, -2.2143, -0.8954, -0.6610, -0.6031,  0.8249,  0.6192,  1.2545,
          0.3083, -0.2887,  0.8703,  1.5056, -1.0473,  0.3107,  0.7957,  0.3389,
          1.0303, -0.7558, -1.6617,  0.1515,  2.2062,  0.5581,  0.5029,  1.4396,
         -0.6068, -0.8064, -0.3301,  0.2012,  0.2661, -1.6686, -0.1905, -1.0241,
          0.4694,  0.8759, -1.0050, -0.0091, -0.8329, -0.5159, -2.0456, -0.2293,
         -1.1314, -0.9064,  0.6369,  0.8547,  0.3533, -1.0125, -0.3455, -1.1306,
          1.4173, -1.8185,  0.6905, -1.1772, -1.4044,  0.9248,  0.0805,  0.6364,
          0.0789,  0.6480,  0.9278,  1.0812,  0.7277,  0.2533, -1.5227,  0.5346,
          1.4883,  2.3632,  1.1129, -3.0738,  0.4576,  0.6122, -1.4783, -0.6386,
          0.0418,  0.1857,  0.2793,  0.7301,  0.7003, -0.9364,  1.6653,  2.4008,
          1.0775, -0.7225, -0.0830,  2.0328, -0.8451,  0.6982, -0.7560,  0.9785,
         -1.4675,  1.3310,  1.6482, -0.8352,  0.8862,  1.1501,  0.6076, -0.8103,
         -1.6767, -0.2094, -

In [59]:
w2[:1]

tensor([[ 0.3317,  0.1398, -0.7393, -0.8986, -0.6101,  0.2838, -1.8702, -0.2033,
          1.4925,  0.6795]])

## lets use autograd

In [60]:
N, D_in, H, D_out = 64, 1000, 100, 10 # N is the number of data points

x = torch.randn(N, D_in, device=device)
y = torch.randn(N, D_out, device=device)
w1 = torch.randn(D_in, H, device=device, requires_grad=True)
w2= torch.randn(H, D_out, device=device, requires_grad=True)

In [61]:
w2[:1]

tensor([[ 0.2342,  1.2281, -1.5796,  0.3111, -1.3187, -0.3890,  1.4018, -2.7042,
         -1.0479, -0.2453]], grad_fn=<SliceBackward0>)

In [62]:
for t in range(500):
    y_pred = x.mm(w1).clamp(min=0).mm(w2)
    loss = (y_pred - y).pow(2).sum()

    loss.backward()

    with torch.no_grad():
        w1 -= learning_rate*w1.grad
        w2 -= learning_rate*w2.grad
        w1.grad.zero_()
        w2.grad.zero_()

In [63]:
w2[:1]

tensor([[ 0.1197,  0.8572, -1.6828,  0.1270, -1.3622, -0.8218,  1.3890, -2.3421,
         -1.1848, -0.7086]], grad_fn=<SliceBackward0>)

## using custom functions

In [64]:
def sigmoid(x):
    return 1.0/(1.0+(-x).exp())

In [65]:
for t in range(500):
    y_pred = sigmoid(x.mm(w1)).mm(w2)
    loss = (y_pred - y).pow(2).sum()

    loss.backward()

    if t % 50 == 0:
        print(t, loss.item())

    with torch.no_grad():
        w1 -= learning_rate*w1.grad
        w2 -= learning_rate*w2.grad
        w1.grad.zero_()
        w2.grad.zero_()

0 5747.49072265625
50 nan
100 nan
150 nan
200 nan
250 nan
300 nan
350 nan
400 nan
450 nan


> Caution: the sigmod python function is not the best way for that you can checks the notes

## Lets use NN

In [66]:
N, D_in, H, D_out = 64, 1000, 100, 10 # N is the number of data points

x = torch.randn(N, D_in, device=device)
y = torch.randn(N, D_out, device=device)

In [67]:
model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H,D_out)
)

In [68]:
for t in range(500):
    y_pred = model(x)
    loss = torch.nn.functional.mse_loss(y_pred, y)

    loss.backward()

    with torch.no_grad():
        for param in model.parameters():
            param -= learning_rate*param.grad
    model.zero_grad()

## now optim: adam

In [69]:
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [70]:
for t in range(500):
    y_pred = model(x)
    loss = torch.nn.functional.mse_loss(y_pred, y)

    loss.backward()

    optimizer.step()
    optimizer.zero_grad()

## define module

In [71]:
class TwoLayerNet(torch.nn.Module):
    def __init__(self, D_in, H, D_out):
        super(TwoLayerNet, self).__init__()
        self.linear1 = torch.nn.Linear(D_in, H)
        self.linear2 = torch.nn.Linear(H, D_out)
    def forward(self, x):
        h_relu = self.linear1(x).clamp(min=0)
        y_pred = self.linear2(h_relu)
        return y_pred

In [76]:
# N, D_in, H, D_out = 64, 1000, 100, 10 # N is the number of data points

# x = torch.randn(N, D_in, device=device)
# y = torch.randn(N, D_out, device=device)
from torch.utils.data import TensorDataset, DataLoader

loader = DataLoader(TensorDataset(x, y), batch_size=8)

In [77]:
model = TwoLayerNet(D_in, H,D_out)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
for epoch in range(20):
    for x_batch, y_batch in loader:
        y_pred = model(x_batch)
        loss = torch.nn.functional.cross_entropy(y_pred, y_batch)

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()