In [1]:
# -*- coding: utf-8 -*-
import numpy as np

# Train a two-layer fully connected network (linear -> ReLU -> linear) on
# random data with plain NumPy, implementing the backward pass by hand.

# Fixed seed: every random draw below comes from this generator, so a
# Restart Kernel -> Run All reproduces the same data, weights and loss curve.
SEED = 0
rng = np.random.default_rng(SEED)

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random input and output data
x = rng.standard_normal((N, D_in))
y = rng.standard_normal((N, D_out))

# Randomly initialize weights
w1 = rng.standard_normal((D_in, H))
w2 = rng.standard_normal((H, D_out))

learning_rate = 1e-6
for t in range(500):
    # Forward pass: compute predicted y
    h = x.dot(w1)
    h_relu = np.maximum(h, 0)
    y_pred = h_relu.dot(w2)

    # Compute and print loss (sum of squared errors over the whole batch)
    loss = np.square(y_pred - y).sum()
    print(t, loss)

    # Backprop to compute gradients of w1 and w2 with respect to loss
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = h_relu.T.dot(grad_y_pred)
    grad_h_relu = grad_y_pred.dot(w2.T)
    # ReLU passes gradient only where its input was non-negative; copy first
    # so that grad_h_relu itself is left untouched.
    grad_h = grad_h_relu.copy()
    grad_h[h < 0] = 0
    grad_w1 = x.T.dot(grad_h)

    # Update weights with one step of gradient descent
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 36949462.6074
1 35109238.2467
2 33116181.8753
3 26919821.4408
4 17860522.8582
5 10096995.2039
6 5392160.3754
7 3055985.36163
8 1951154.94678
9 1399856.47887
10 1089260.00442
11 889302.525497
12 745876.992718
13 635602.638186
14 547282.113614
15 475004.196568
16 414953.307363
17 364317.274473
18 321357.517234
19 284681.41232
20 253190.926169
21 226033.67255
22 202451.433731
23 181890.462671
24 163893.674484
25 148073.251603
26 134100.898499
27 121735.739365
28 110754.794487
29 100956.773392
30 92196.5000867
31 84346.3459747
32 77291.376778
33 70934.3955354
34 65194.2521051
35 60002.9210427
36 55295.3739172
37 51020.5144452
38 47131.9421083
39 43588.7952015
40 40355.245669
41 37399.9502554
42 34697.584019
43 32219.4059605
44 29944.9141268
45 27859.0174242
46 25939.6687179
47 24171.0179198
48 22540.7868817
49 21035.5522276
50 19644.7628899
51 18358.888358
52 17168.4227478
53 16065.1268027
54 15041.5627735
55 14092.0567923
56 13209.9766419
57 12392.3677588
58 11632.6863438
59 10925.06128

465 0.000984867324546
466 0.000950713506653
467 0.000917736601124
468 0.000885905264173
469 0.000855181761779
470 0.00082552349973
471 0.000796896554016
472 0.000769266572979
473 0.000742601675142
474 0.000716861770356
475 0.000692014667558
476 0.000668030683386
477 0.000644886072125
478 0.000622551891852
479 0.000600990363849
480 0.000580170770729
481 0.000560076600007
482 0.000540677758452
483 0.000521953570966
484 0.000503880280519
485 0.000486437127175
486 0.000469598098109
487 0.00045334271084
488 0.000437652735046
489 0.000422506887399
490 0.000407887288604
491 0.000393783007842
492 0.000380157320888
493 0.000367005439771
494 0.000354311479007
495 0.000342056871058
496 0.000330225820706
497 0.000318804954567
498 0.000307784641086
499 0.000297142632161


In [2]:
# -*- coding: utf-8 -*-
import torch
from torch.autograd import Variable

# Train the same two-layer network (linear -> ReLU -> linear) with PyTorch,
# letting autograd compute the backward pass.

# Fixed seed so that Restart Kernel -> Run All reproduces the same data,
# initial weights and loss curve.
SEED = 0
torch.manual_seed(SEED)

dtype = torch.FloatTensor

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold input and outputs. Tensors are created with
# requires_grad=False by default, so autograd will not compute gradients with
# respect to them during the backward pass.
x = torch.randn(N, D_in).type(dtype)
y = torch.randn(N, D_out).type(dtype)

# Create random Tensors for weights. requires_grad_() marks them so that
# autograd computes gradients with respect to them during the backward pass.
w1 = torch.randn(D_in, H).type(dtype).requires_grad_()
w2 = torch.randn(H, D_out).type(dtype).requires_grad_()

learning_rate = 1e-6
for t in range(500):
    # Forward pass: compute predicted y. We do not need to keep references to
    # intermediate values since we are not implementing the backward pass by
    # hand.
    y_pred = x.mm(w1).clamp(min=0).mm(w2)

    # Compute and print loss. loss is a 0-dimensional Tensor, so it cannot be
    # indexed with [0]; loss.item() extracts its value as a Python float.
    loss = (y_pred - y).pow(2).sum()
    print(t, loss.item())

    # Use autograd to compute the backward pass. After this call w1.grad and
    # w2.grad hold the gradient of the loss with respect to w1 and w2.
    loss.backward()

    # Update weights using gradient descent. The update itself must not be
    # recorded by autograd, hence torch.no_grad().
    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad

        # backward() accumulates into .grad, so zero the gradients manually
        # after updating the weights.
        w1.grad.zero_()
        w2.grad.zero_()

0 32035690.0
1 27489600.0
2 25360172.0
3 22290430.0
4 17597250.0
5 12286941.0
6 7840175.0
7 4801167.0
8 2973353.75
9 1931018.0
10 1338014.125
11 987515.3125
12 767340.1875
13 619219.625
14 512865.40625
15 432498.6875
16 369263.25
17 318106.59375
18 275912.8125
19 240598.296875
20 210792.046875
21 185420.328125
22 163678.828125
23 144973.703125
24 128799.625
25 114739.625
26 102473.75
27 91730.359375
28 82289.984375
29 73970.6796875
30 66614.4296875
31 60094.19140625
32 54305.7890625
33 49150.7578125
34 44552.43359375
35 40439.80078125
36 36756.2265625
37 33448.0546875
38 30474.599609375
39 27801.15625
40 25391.9375
41 23214.71875
42 21247.458984375
43 19465.765625
44 17849.55859375
45 16381.849609375
46 15047.91015625
47 13835.0458984375
48 12730.8701171875
49 11724.490234375
50 10805.134765625
51 9966.21875
52 9200.29296875
53 8499.1806640625
54 7857.04638671875
55 7268.00634765625
56 6727.2978515625
57 6230.8076171875
58 5774.3173828125
59 5354.2431640625
60 4967.60009765625
61 4611.

452 8.319818880409002e-05
453 8.190382504835725e-05
454 8.001272362889722e-05
455 7.862946949899197e-05
456 7.712593651376665e-05
457 7.583005208289251e-05
458 7.449533586623147e-05
459 7.31305408407934e-05
460 7.192001794464886e-05
461 7.103579991962761e-05
462 6.975364522077143e-05
463 6.844748713774607e-05
464 6.75420742481947e-05
465 6.644660606980324e-05
466 6.554338324349374e-05
467 6.423761806217954e-05
468 6.341410335153341e-05
469 6.186519021866843e-05
470 6.128603126853704e-05
471 6.032051533111371e-05
472 5.9361769672250375e-05
473 5.8267069107387215e-05
474 5.753690129495226e-05
475 5.674963176716119e-05
476 5.591339140664786e-05
477 5.515833254321478e-05
478 5.457550287246704e-05
479 5.359473288990557e-05
480 5.262400009087287e-05
481 5.211871030041948e-05
482 5.1502371206879616e-05
483 5.055115252616815e-05
484 4.947673369315453e-05
485 4.8918987886281684e-05
486 4.8591864469926804e-05
487 4.787009675055742e-05
488 4.725686085294001e-05
489 4.649239417631179e-05
490 4.580