In [3]:
import numpy as np 
import torch

In [4]:
# Two-layer fully connected network (linear -> ReLU -> linear) fitted to
# random data using NumPy only. The forward pass, the loss and every gradient
# are written out by hand.
N, D_in, H, D_out = 64, 1000, 100, 10  # batch size, input / hidden / output widths

# Random inputs and targets.
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)

# Random initial weights for the two linear layers.
w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)

lr = 1e-6
for t in range(500):
    # Forward pass: hidden pre-activation, ReLU, output prediction.
    h = x @ w1
    h_relu = np.maximum(h, 0)
    y_pred = h_relu @ w2

    # Sum-of-squared-errors loss; the residual is reused by the backward pass.
    residual = y_pred - y
    loss = np.square(residual).sum()
    print(t, loss)

    # Backward pass: chain rule applied layer by layer, from output to input.
    grad_y_pred = 2.0 * residual
    grad_w2 = h_relu.T @ grad_y_pred
    grad_h_relu = grad_y_pred @ w2.T
    # ReLU passes no gradient where its input was negative.
    grad_h = np.where(h < 0, 0.0, grad_h_relu)
    grad_w1 = x.T @ grad_h

    # Plain gradient-descent step, applied in place.
    w1 -= lr * grad_w1
    w2 -= lr * grad_w2

0 30061337.285521127
1 24881588.99563973
2 22659870.659497745
3 20199631.57466445
4 16604042.527000437
5 12271353.116277408
6 8309836.16745984
7 5309170.253203824
8 3346504.409575032
9 2146427.8979948107
10 1437526.7939032083
11 1014271.194956638
12 753955.4377582639
13 585992.37647834
14 471823.48328578356
15 389884.9745614141
16 328362.4589436631
17 280375.61743013974
18 241809.69419782297
19 210227.5648610438
20 183917.9425215019
21 161681.4394858007
22 142720.80515775952
23 126454.3285365882
24 112411.02418376922
25 100218.02409826289
26 89584.81075477668
27 80294.50288674672
28 72137.13945124808
29 64946.362250799684
30 58592.04092406382
31 52960.292533468324
32 47953.47070558178
33 43492.428206904835
34 39509.77894977154
35 35947.454413535525
36 32754.11956111245
37 29886.224073614918
38 27306.38819331745
39 24981.503144955062
40 22884.433904141362
41 20985.12057269267
42 19264.951919531002
43 17706.042939900763
44 16289.378083097137
45 15000.61440387246
46 13827.719264281437
47 

In [5]:
# Same two-layer network as the NumPy version, now on torch tensors.
# Gradients are still derived and applied by hand (no autograd).
dtype = torch.float
device = torch.device('cpu')

N, D_in, H, D_out = 64, 1000, 100, 10  # batch size, input / hidden / output widths

# Random inputs and targets.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Random initial weights for the two linear layers.
w1 = torch.randn(D_in, H, device=device, dtype=dtype)
w2 = torch.randn(H, D_out, device=device, dtype=dtype)

lr = 1e-6
for t in range(500):
    # Forward pass: linear -> ReLU (clamp at zero) -> linear.
    h = torch.mm(x, w1)
    h_relu = torch.clamp(h, min=0)
    y_pred = torch.mm(h_relu, w2)

    # Sum-of-squared-errors loss, pulled out as a Python float for printing.
    residual = y_pred - y
    loss = residual.pow(2).sum().item()
    if (t + 1) % 100 == 0:  # report on iterations 99, 199, ...
        print(t, loss)

    # Backward pass: chain rule applied layer by layer, from output to input.
    grad_y_pred = 2.0 * residual
    grad_w2 = torch.mm(h_relu.t(), grad_y_pred)
    grad_h_relu = torch.mm(grad_y_pred, w2.t())
    # ReLU passes no gradient where its input was negative.
    grad_h = grad_h_relu.masked_fill(h < 0, 0)
    grad_w1 = torch.mm(x.t(), grad_h)

    # Plain gradient-descent step, applied in place.
    w1 -= lr * grad_w1
    w2 -= lr * grad_w2

99 788.1539916992188
199 7.497045993804932
299 0.11784081161022186
399 0.0026257396675646305
499 0.0002073047071462497


In [7]:
# Two-layer network trained with autograd: only the forward pass is written
# out; loss.backward() fills in w1.grad and w2.grad.
dtype = torch.float
device = torch.device("cpu")
# device = torch.device("cuda:0") # Uncomment this to run on GPU

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Step size for gradient descent. Defined here so the cell does not depend on
# a value left behind in the kernel by an earlier cell.
lr = 1e-6

# Create random Tensors to hold input and outputs.
# requires_grad is left at its default (False), so no gradients are computed
# with respect to these Tensors during the backward pass.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Create random Tensors for weights.
# Setting requires_grad=True indicates that we want to compute gradients with
# respect to these Tensors during the backward pass.
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

for t in range(500):
    # Forward pass: linear -> ReLU (clamp at zero) -> linear.
    y_pred = x.mm(w1).clamp(min=0).mm(w2)

    # Sum-of-squared-errors loss, kept as a Tensor so backward() can be called.
    loss = (y_pred - y).pow(2).sum()
    if t % 100 == 99:
        print(t, loss.item())

    # Populate w1.grad and w2.grad with d(loss)/d(w1) and d(loss)/d(w2).
    loss.backward()

    # The update itself must not be recorded by autograd.
    with torch.no_grad():
        w1 -= lr * w1.grad
        w2 -= lr * w2.grad

        # backward() accumulates into .grad, so reset before the next iteration.
        w1.grad.zero_()
        w2.grad.zero_()

99 655.915771484375
199 3.9228463172912598
299 0.036498039960861206
399 0.0006362242274917662
499 7.542649836977944e-05
