In [1]:
import numpy as np

In [2]:
# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10


In [7]:
# Create random input and output data
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)

In [8]:
# Randomly initialize weights
w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)


In [18]:
learning_rate = 1e-6

In [19]:
for t in range(500):
    # Forward pass: compute predicted y
    h = x.dot(w1)
    h_relu = np.maximum(h, 0)
    y_pred = h_relu.dot(w2)

    # Compute and print loss
    loss = np.square(y_pred - y).sum()
    print(t, loss)

    # Backprop to compute gradients of w1 and w2 with respect to loss
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = h_relu.T.dot(grad_y_pred)
    grad_h_relu = grad_y_pred.dot(w2.T)
    grad_h = grad_h_relu.copy()
    grad_h[h < 0] = 0
    grad_w1 = x.T.dot(grad_h)

    # Update weights
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

RuntimeError: dot: Expected 1-D argument self, but got 2-D

In [17]:
import torch


dtype = torch.float
device = torch.device("cpu")
# device = torch.device("cuda:0") # Uncomment this to run on GPU

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random input and output data
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Randomly initialize weights
w1 = torch.randn(D_in, H, device=device, dtype=dtype)
w2 = torch.randn(H, D_out, device=device, dtype=dtype)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: compute predicted y
    h = x.mm(w1)
    h_relu = h.clamp(min=0)
    y_pred = h_relu.mm(w2)

    # Compute and print loss
    loss = (y_pred - y).pow(2).sum().item()
    print(t, loss)

    # Backprop to compute gradients of w1 and w2 with respect to loss
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = h_relu.t().mm(grad_y_pred)
    grad_h_relu = grad_y_pred.mm(w2.t())
    grad_h = grad_h_relu.clone()
    grad_h[h < 0] = 0
    grad_w1 = x.t().mm(grad_h)

    # Update weights using gradient descent
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 29575190.0
1 27795432.0
2 29504916.0
3 30556572.0
4 27873944.0
5 21219902.0
6 13535675.0
7 7620063.0
8 4162873.0
9 2382284.5
10 1502111.75
11 1048805.75
12 795486.9375
13 637857.25
14 529578.8125
15 448938.75
16 385616.875
17 334156.375
18 291444.96875
19 255635.65625
20 225208.640625
21 199165.5
22 176761.46875
23 157360.8125
24 140495.71875
25 125798.9296875
26 112932.1953125
27 101624.109375
28 91638.140625
29 82808.0703125
30 74977.2734375
31 68005.9453125
32 61784.51953125
33 56222.56640625
34 51235.37109375
35 46756.609375
36 42727.109375
37 39096.03125
38 35814.9140625
39 32846.421875
40 30156.712890625
41 27716.31640625
42 25498.650390625
43 23483.2421875
44 21645.51171875
45 19969.0390625
46 18439.107421875
47 17039.109375
48 15757.919921875
49 14584.2265625
50 13506.830078125
51 12519.1845703125
52 11613.29296875
53 10780.4384765625
54 10017.318359375
55 9313.80859375
56 8664.53125
57 8064.6953125
58 7510.56689453125
59 6998.2177734375
60 6524.001953125
61 6084.78759765625
