## 1. With NumPy only

In [8]:
import numpy as np

# Problem sizes: N = batch size, D_in = input features,
# H = hidden units, D_out = output features.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and random regression targets.
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)

# Random starting weights for the two linear layers (no biases).
w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear -> ReLU -> linear.
    h = x @ w1
    h_relu = np.maximum(h, 0)
    y_pred = h_relu @ w2

    # Sum-of-squared-errors loss, reported on every step.
    residual = y_pred - y
    loss = np.square(residual).sum()
    print(t, loss)

    # Backward pass: apply the chain rule by hand, from the loss back to
    # each weight matrix.
    grad_y_pred = 2.0 * residual
    grad_w2 = h_relu.T @ grad_y_pred
    grad_h_relu = grad_y_pred @ w2.T
    # ReLU passes gradient only where its input was not negative.
    grad_h = grad_h_relu.copy()
    grad_h[h < 0] = 0
    grad_w1 = x.T @ grad_h

    # Plain gradient-descent step, applied in place.
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2
    

0 25399476.009179894
1 20715623.717473105
2 19974112.88727703
3 20304520.7797091
4 19917669.53755856
5 17601270.75945895
6 13776967.617959501
7 9508102.455557171
8 6047094.1153080445
9 3687753.1228775615
10 2273588.3224184327
11 1461548.1215360174
12 1000297.14108949
13 730068.7628404252
14 563685.2528515186
15 454454.23232658755
16 377896.75904626935
17 321024.3386738691
18 276776.21563010477
19 241146.86573844904
20 211714.29327289213
21 186992.20180412824
22 165948.98648411673
23 147872.1766725932
24 132234.35028724372
25 118629.61338527605
26 106735.45556169891
27 96297.24685612854
28 87100.84586758386
29 78966.19050999095
30 71750.57730589688
31 65327.58758045387
32 59594.489878247776
33 54464.90454588746
34 49863.09910706791
35 45725.70489364752
36 41998.8906742777
37 38634.048474060284
38 35588.60125233632
39 32827.86754623681
40 30325.258335681632
41 28049.109279163506
42 25973.58045871318
43 24078.03081033421
44 22345.18683148324
45 20758.661542916525
46 19304.32532991065
47 1

407 0.0014257563140549758
408 0.0013677971078667635
409 0.0013122072082879907
410 0.0012588723324384679
411 0.0012077120425729932
412 0.0011586404864185422
413 0.0011115659964499747
414 0.0010664037124185673
415 0.001023079488177432
416 0.0009815223319969905
417 0.0009416543495326178
418 0.0009034081193947313
419 0.0008667292269745091
420 0.0008315360385901464
421 0.0007977712530745047
422 0.00076537918761203
423 0.0007343058814491817
424 0.0007044973895334419
425 0.0006758973482880176
426 0.0006484635121564815
427 0.0006221442286151461
428 0.000596894393269316
429 0.0005726725635738132
430 0.0005494334236398026
431 0.0005271432634795371
432 0.0005057600251448095
433 0.0004852495178742462
434 0.0004655653259714268
435 0.00044668163076458644
436 0.00042856364979608675
437 0.00041118178375677047
438 0.0003945076408497332
439 0.0003785095905995327
440 0.000363161445226781
441 0.000348437927102346
442 0.00033431166630757444
443 0.00032075885676031903
444 0.00030775591142999327
445 0.000295

## 2. Using Tensor

In [6]:
import torch

dtype = torch.float
# Use the GPU when CUDA is usable, otherwise fall back to the CPU so the
# cell still runs on machines without a GPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# N is batch size, D_in is input dim
# H is hidden dim, D_out is output dim
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random input and output data
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Randomly initialize weights
w1 = torch.randn(D_in, H, device=device, dtype=dtype)
w2 = torch.randn(H, D_out, device=device, dtype=dtype)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear -> ReLU -> linear.
    h = x.mm(w1)
    # clamp(min=0) replaces every negative entry with 0, i.e. the ReLU forward.
    h_relu = h.clamp(min=0)
    y_pred = h_relu.mm(w2)

    # Compute and print the sum-of-squares loss; .item() converts the
    # result to a plain Python number.
    loss = (y_pred - y).pow(2).sum().item()
    print(t, loss)

    # Backprop by hand: gradients of the loss with respect to w1 and w2.
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = h_relu.t().mm(grad_y_pred)
    grad_h_relu = grad_y_pred.mm(w2.t())
    grad_h = grad_h_relu.clone()
    # No gradient flows through units whose pre-activation was negative.
    grad_h[h < 0] = 0
    grad_w1 = x.t().mm(grad_h)

    # Update weights using gradient descent
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2
    
    
    

0 34892460.0
1 34517000.0
2 37614492.0
3 36724256.0
4 28941472.0
5 17429290.0
6 8740051.0
7 4163470.75
8 2207177.0
9 1380469.75
10 995550.1875
11 782304.1875
12 642814.25
13 540480.6875
14 460299.59375
15 395325.9375
16 341715.90625
17 296967.65625
18 259356.53125
19 227484.09375
20 200298.96875
21 176972.96875
22 156871.515625
23 139466.09375
24 124341.1640625
25 111152.609375
26 99596.546875
27 89460.203125
28 80523.5703125
29 72633.703125
30 65647.5
31 59438.1015625
32 53910.04296875
33 48969.75390625
34 44550.765625
35 40590.6640625
36 37032.453125
37 33831.97265625
38 30945.220703125
39 28336.61328125
40 25975.359375
41 23837.365234375
42 21898.658203125
43 20137.759765625
44 18535.16015625
45 17075.333984375
46 15745.0087890625
47 14530.83203125
48 13420.3916015625
49 12404.4853515625
50 11473.666015625
51 10619.8974609375
52 9836.318359375
53 9116.60546875
54 8454.1787109375
55 7844.310546875
56 7282.34765625
57 6764.2978515625
58 6286.3486328125
59 5845.072265625
60 5437.969726

408 0.00011535563680808991
409 0.00011289337271591648
410 0.00011025437561329454
411 0.00010754373215604573
412 0.00010540004586800933
413 0.00010298720735590905
414 0.00010046189709100872
415 9.7905911388807e-05
416 9.568009409122169e-05
417 9.389281331095845e-05
418 9.170900011667982e-05
419 8.938635437516496e-05
420 8.814279863145202e-05
421 8.631331729702652e-05
422 8.455897477688268e-05
423 8.269562385976315e-05
424 8.099851402221248e-05
425 7.921642099972814e-05
426 7.776806887704879e-05
427 7.651864143554121e-05
428 7.471998105756938e-05
429 7.342986646108329e-05
430 7.220872794277966e-05
431 7.056383765302598e-05
432 6.959454185562208e-05
433 6.813769141444936e-05
434 6.660204962827265e-05
435 6.573200516868383e-05
436 6.44130414002575e-05
437 6.29714340902865e-05
438 6.211480649653822e-05
439 6.109868991188705e-05
440 5.9862053603865206e-05
441 5.872982001164928e-05
442 5.7702374760992825e-05
443 5.7019606174435467e-05
444 5.5984026403166354e-05
445 5.5293763580266386e-05
446 

## 3. Autograd

In [5]:
import torch

dtype = torch.float
# Use the first GPU when CUDA is usable, otherwise fall back to the CPU so
# the cell still runs on machines without a GPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# N is batch size; D_in is input dim
# H is hidden dim; D_out is output dim
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random tensors to hold inputs and outputs.
# requires_grad is left at its default (False) here: no gradients are
# needed with respect to the data.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Create random weights. requires_grad=True asks autograd to compute
# gradients with respect to these tensors during the backward pass.
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: intermediate values no longer need to be kept by hand
    # because the backward pass is not written manually.
    # clamp(min=0) zeroes the negative entries, so it acts as the ReLU here.
    y_pred = x.mm(w1).clamp(min=0).mm(w2)

    # Compute the loss using operations on Tensors.
    # loss is a scalar (0-dim) Tensor; loss.item() returns the Python
    # number it holds.
    loss = (y_pred - y).pow(2).sum()
    print(t, loss.item())

    # Autograd backprop: computes the gradient of loss with respect to
    # every Tensor created with requires_grad=True (w1 and w2).
    loss.backward()

    # Manually update the weights with gradient descent. The update itself
    # must not be tracked by autograd, hence torch.no_grad().
    # torch.optim.SGD can be used to achieve the same thing.
    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad

        # Manually zero the gradients after updating the weights;
        # backward() accumulates into .grad instead of overwriting it.
        w1.grad.zero_()
        w2.grad.zero_()


0 20477330.0
1 16112718.0
2 15389859.0
3 16694331.0
4 18893108.0
5 20521368.0
6 20297012.0
7 17482504.0
8 13062984.0
9 8581346.0
10 5195616.0
11 3049706.0
12 1824940.0
13 1153566.125
14 785171.8125
15 575911.1875
16 449819.6875
17 367968.6875
18 310638.71875
19 267692.75
20 233778.84375
21 205973.4375
22 182616.421875
23 162691.40625
24 145485.265625
25 130507.6328125
26 117385.3828125
27 105837.78125
28 95631.3203125
29 86585.8984375
30 78540.09375
31 71367.90625
32 64965.5625
33 59230.2578125
34 54094.3046875
35 49475.5546875
36 45314.796875
37 41556.34375
38 38156.9375
39 35077.671875
40 32281.93359375
41 29740.875
42 27428.3125
43 25321.509765625
44 23399.994140625
45 21645.240234375
46 20039.369140625
47 18568.27734375
48 17219.564453125
49 15981.552734375
50 14844.048828125
51 13800.125
52 12840.61328125
53 11956.1416015625
54 11140.19140625
55 10387.380859375
56 9691.140625
57 9046.9130859375
58 8450.33203125
59 7898.439453125
60 7386.6904296875
61 6911.716796875
62 6470.6435546

417 0.0010789884254336357
418 0.00104778993409127
419 0.0010188180021941662
420 0.0009881338337436318
421 0.0009600361227057874
422 0.0009317424846813083
423 0.0009052353561855853
424 0.000878263614140451
425 0.0008554956875741482
426 0.0008317860192619264
427 0.0008086937596090138
428 0.0007865393999963999
429 0.0007640929543413222
430 0.0007440564804710448
431 0.0007248151232488453
432 0.0007042352808639407
433 0.0006854998064227402
434 0.0006671394221484661
435 0.0006499136216007173
436 0.0006335540092550218
437 0.0006156776798889041
438 0.0005996833788231015
439 0.0005854116170667112
440 0.0005699713947251439
441 0.0005553741939365864
442 0.0005418173968791962
443 0.0005293100839480758
444 0.0005150680663064122
445 0.0005031514447182417
446 0.0004904383094981313
447 0.0004790515231434256
448 0.000468123413156718
449 0.0004560067900456488
450 0.0004455877060536295
451 0.0004350529925432056
452 0.00042479834519326687
453 0.00041414896259084344
454 0.00040462391916662455
455 0.0003953

## Defining new Autograd functions

In [19]:
"""
In PyTorch we can easily define our own autograd operator by defining a subclass of 
torch.autograd.Function and implementing the forward and backward functions.

"""

import torch 

class MyRelU(torch.autograd.Function):
    """
    ReLU written as a custom autograd Function.

    Subclassing torch.autograd.Function and providing static ``forward`` and
    ``backward`` methods, both operating on Tensors, defines a new operation
    that autograd can differentiate through. Invoke it with ``MyRelU.apply``.
    """

    @staticmethod
    def forward(ctx, input):
        """
        Compute the output Tensor from the input Tensor.

        ``ctx`` is the context object shared with ``backward``; the input is
        stored on it with ``save_for_backward`` because the backward pass
        needs to know where the input was negative.
        """
        ctx.save_for_backward(input)
        return torch.clamp(input, min=0)

    @staticmethod
    def backward(ctx, grad_output):
        """
        Turn the gradient of the loss with respect to the output
        (``grad_output``) into the gradient with respect to the input.
        """
        # saved_tensors is a tuple holding what forward stored; the
        # one-element unpacking pulls out the single saved Tensor.
        (saved_input,) = ctx.saved_tensors
        # Gradient passes through unchanged except where the input was
        # negative, where it becomes 0. masked_fill returns a new Tensor,
        # so grad_output itself is not modified.
        return grad_output.masked_fill(saved_input < 0, 0)
    
dtype = torch.float
# Use the first GPU when CUDA is usable, otherwise fall back to the CPU so
# the cell still runs on machines without a GPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold input and outputs.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Create random Tensors for weights.
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

# A custom autograd Function is applied through Function.apply; alias it as
# 'relu'. The class defined in this cell is spelled MyRelU, so that is the
# name used here. The alias does not change between iterations, so it is
# created once outside the loop.
relu = MyRelU.apply

learning_rate = 1e-6
for t in range(500):
    # Forward pass: compute predicted y using operations; we compute
    # ReLU using our custom autograd operation.
    y_pred = relu(x.mm(w1)).mm(w2)

    # Compute and print loss
    loss = (y_pred - y).pow(2).sum()
    print(t, loss.item())

    # Use autograd to compute the backward pass.
    loss.backward()

    # Update weights using gradient descent; the update itself must not be
    # tracked by autograd.
    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad

        # Manually zero the gradients after updating weights
        w1.grad.zero_()
        w2.grad.zero_()

    

0 29703856.0
1 25456548.0
2 24244644.0
3 22675944.0
4 19485046.0
5 14773184.0
6 10089622.0
7 6374792.0
8 3953313.0
9 2500319.5
10 1668390.75
11 1185413.75
12 894431.75
13 708124.5
14 580488.8125
15 487349.40625
16 415758.0
17 358579.4375
18 311628.90625
19 272427.5
20 239253.265625
21 210902.125
22 186522.25
23 165453.0625
24 147146.875
25 131182.5625
26 117222.4765625
27 104984.90625
28 94222.15625
29 84747.453125
30 76368.9453125
31 68945.4140625
32 62357.98828125
33 56495.0859375
34 51267.75
35 46607.71875
36 42440.43359375
37 38706.3046875
38 35353.125
39 32336.357421875
40 29618.046875
41 27165.623046875
42 24947.703125
43 22939.703125
44 21118.923828125
45 19465.8671875
46 17963.76953125
47 16595.23046875
48 15348.744140625
49 14210.728515625
50 13170.69140625
51 12219.18359375
52 11347.4013671875
53 10547.947265625
54 9814.228515625
55 9139.45703125
56 8518.4296875
57 7946.71240234375
58 7419.541015625
59 6933.5732421875
60 6484.6904296875
61 6069.42333984375
62 5684.853515625
6

478 0.0006722987163811922
479 0.0006567068630829453
480 0.0006398829282261431
481 0.0006243206444196403
482 0.0006097203586250544
483 0.0005942700081504881
484 0.0005805536638945341
485 0.0005675934953615069
486 0.0005534137017093599
487 0.0005411840393207967
488 0.0005279144970700145
489 0.0005151324439793825
490 0.0005040245596319437
491 0.000492360326461494
492 0.0004815545107703656
493 0.00046947167720645666
494 0.0004590116732288152
495 0.00044882536167278886
496 0.00043803855078294873
497 0.00042825916898436844
498 0.00041996079380623996
499 0.00040944365900941193


## nn module

In [20]:
import torch

# Sizes: N = batch, D_in = input features, H = hidden units,
# D_out = output features.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and random regression targets.
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# nn.Sequential chains Modules and applies them in order. Each nn.Linear
# holds its own weight and bias Tensors and computes a linear function of
# its input.
layers = [
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
]
model = torch.nn.Sequential(*layers)

# Loss from the nn package: squared error, summed (not averaged) over all
# elements because of reduction='sum'.
loss_fn = torch.nn.MSELoss(reduction='sum')

learning_rate = 1e-4

for t in range(500):
    # Forward pass: calling the model like a function feeds x through every
    # layer and returns the output Tensor.
    y_pred = model(x)

    # Measure and report how far the predictions are from the targets.
    loss = loss_fn(y_pred, y)
    print(t, loss.item())

    # Clear gradients left over from the previous step, then backpropagate.
    model.zero_grad()
    loss.backward()

    # Gradient-descent step on every parameter, outside autograd tracking.
    with torch.no_grad():
        for param in model.parameters():
            param.sub_(learning_rate * param.grad)
            

0 652.9434204101562
1 607.1904296875
2 567.1231079101562
3 531.6373291015625
4 499.7470703125
5 471.16455078125
6 445.2689514160156
7 421.2921447753906
8 398.99334716796875
9 377.9862060546875
10 358.42999267578125
11 340.1341857910156
12 322.86138916015625
13 306.4911804199219
14 290.9460754394531
15 276.08563232421875
16 261.9029541015625
17 248.41180419921875
18 235.4343719482422
19 222.99610900878906
20 211.09219360351562
21 199.7056427001953
22 188.86260986328125
23 178.49237060546875
24 168.59213256835938
25 159.1352996826172
26 150.1284637451172
27 141.58053588867188
28 133.43643188476562
29 125.71715545654297
30 118.3956069946289
31 111.46167755126953
32 104.89895629882812
33 98.70603942871094
34 92.84928131103516
35 87.33130645751953
36 82.12065124511719
37 77.20608520507812
38 72.57354736328125
39 68.20928955078125
40 64.10840606689453
41 60.24431228637695
42 56.615325927734375
43 53.20256805419922
44 50.00001907348633
45 46.98949432373047
46 44.167518615722656
47 41.52206420

382 0.00021378471865318716
383 0.0002082691207760945
384 0.00020288812811486423
385 0.00019765982870012522
386 0.00019255962979514152
387 0.0001875954621937126
388 0.00018276454648002982
389 0.00017806403047870845
390 0.00017348652181681246
391 0.00016902631614357233
392 0.00016468262765556574
393 0.00016046107339207083
394 0.00015633934526704252
395 0.00015232873556669801
396 0.0001484309323132038
397 0.0001446283858967945
398 0.00014092325000092387
399 0.00013732195657212287
400 0.0001338133297394961
401 0.00013038820179644972
402 0.00012705987319350243
403 0.0001238215045304969
404 0.00012065946793882176
405 0.00011758074833778664
406 0.0001145842470577918
407 0.00011166679905727506
408 0.00010882168862735853
409 0.00010606022988213226
410 0.00010336189734516665
411 0.00010073088196804747
412 9.81744597083889e-05
413 9.568427049089223e-05
414 9.325516293756664e-05
415 9.089054219657555e-05
416 8.858399814926088e-05
417 8.634143887320533e-05
418 8.415732736466452e-05
419 8.2026963355

## PyTorch: optim

In [21]:
import torch

class TwoLayerNet(torch.nn.Module):
    """Two linear layers with a clamp-at-zero (ReLU) non-linearity in between."""

    def __init__(self, D_in, H, D_out):
        """
        Build the two nn.Linear layers and store them as attributes:
        ``linear1`` maps D_in -> H and ``linear2`` maps H -> D_out.
        """
        super().__init__()
        self.linear1 = torch.nn.Linear(D_in, H)
        self.linear2 = torch.nn.Linear(H, D_out)

    def forward(self, x):
        """
        Map an input Tensor to an output Tensor: apply ``linear1``, clamp
        negative values to zero, then apply ``linear2``.
        """
        hidden = torch.clamp(self.linear1(x), min=0)
        return self.linear2(hidden)
    

# Sizes: N = batch, D_in = input features, H = hidden units,
# D_out = output features.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and random regression targets.
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# Instantiate the network defined above.
model = TwoLayerNet(D_in, H, D_out)

# Summed squared-error loss, and an SGD optimizer that is handed the
# model's parameters via model.parameters().
criterion = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4)
for t in range(500):
    # Forward pass followed by the loss for this step.
    y_pred = model(x)
    loss = criterion(y_pred, y)

    # Report progress.
    print(t, loss.item())

    # Clear old gradients, backpropagate, then let the optimizer apply
    # the update.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

0 707.0040283203125
1 653.7041625976562
2 608.2230224609375
3 568.4384765625
4 533.3046875
5 501.59423828125
6 472.8901062011719
7 446.79998779296875
8 422.7680358886719
9 400.2886962890625
10 379.1805725097656
11 359.3346252441406
12 340.51116943359375
13 322.8019104003906
14 306.0471496582031
15 290.076171875
16 274.81353759765625
17 260.2462158203125
18 246.30348205566406
19 233.03945922851562
20 220.394287109375
21 208.31320190429688
22 196.82095336914062
23 185.8819580078125
24 175.4629364013672
25 165.5499725341797
26 156.10409545898438
27 147.11668395996094
28 138.6002655029297
29 130.51882934570312
30 122.85480499267578
31 115.58381652832031
32 108.69966125488281
33 102.1866455078125
34 96.0290756225586
35 90.22250366210938
36 84.74131774902344
37 79.57643127441406
38 74.70415496826172
39 70.10711669921875
40 65.78270721435547
41 61.721824645996094
42 57.91447830200195
43 54.34060287475586
44 50.99251937866211
45 47.84709548950195
46 44.90102005004883
47 42.139896392822266
48 3

421 6.932891119504347e-05
422 6.747130100848153e-05
423 6.566734373336658e-05
424 6.39090794720687e-05
425 6.220075010787696e-05
426 6.053801189409569e-05
427 5.892190529266372e-05
428 5.73510151298251e-05
429 5.582177254837006e-05
430 5.432937905425206e-05
431 5.2883155149174854e-05
432 5.147669435245916e-05
433 5.010707172914408e-05
434 4.877297396888025e-05
435 4.747546336147934e-05
436 4.621250263880938e-05
437 4.4984302803641185e-05
438 4.3790354538941756e-05
439 4.262680158717558e-05
440 4.149766027694568e-05
441 4.039321356685832e-05
442 3.932587060262449e-05
443 3.8281574234133586e-05
444 3.726836075657047e-05
445 3.628289050539024e-05
446 3.532304253894836e-05
447 3.438698695390485e-05
448 3.347864912939258e-05
449 3.259353979956359e-05
450 3.173217919538729e-05
451 3.089278470724821e-05
452 3.0079609132371843e-05
453 2.9284874472068623e-05
454 2.851338285836391e-05
455 2.7760946977650747e-05
456 2.70309901679866e-05
457 2.6318106392864138e-05
458 2.562644294812344e-05
459 2.4

## dynamic computational graph

In [24]:
import random
import torch


class DynamicNet(torch.nn.Module):
    """Network whose number of hidden layers is drawn at random on every call."""

    def __init__(self, D_in, H, D_out):
        """
        Create the three nn.Linear layers used by ``forward``:
        ``input_linear`` (D_in -> H), ``middle_linear`` (H -> H) and
        ``output_linear`` (H -> D_out).
        """
        super().__init__()
        self.input_linear = torch.nn.Linear(D_in, H)
        self.middle_linear = torch.nn.Linear(H, H)
        self.output_linear = torch.nn.Linear(H, D_out)

    def forward(self, x):
        """
        Run the input layer, then apply ``middle_linear`` 0, 1, 2 or 3 times
        (the count is drawn with ``random.randint(0, 3)`` on each call), and
        finish with the output layer. Every hidden activation is clamped at
        zero.

        Ordinary Python control flow decides how many layers a given forward
        pass uses, so the computation can differ from call to call. The same
        ``middle_linear`` Module, and therefore the same weights, is reused
        for every repetition.
        """
        hidden = torch.clamp(self.input_linear(x), min=0)
        repeats = random.randint(0, 3)
        while repeats > 0:
            hidden = torch.clamp(self.middle_linear(hidden), min=0)
            repeats -= 1
        return self.output_linear(hidden)


# Sizes: N = batch, D_in = input features, H = hidden units,
# D_out = output features.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and random regression targets.
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# Instantiate the network defined above.
model = DynamicNet(D_in, H, D_out)

# Summed squared-error loss. The optimizer is SGD with momentum 0.9; the
# original note says plain SGD has a hard time training this model.
criterion = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4, momentum=0.9)
for t in range(500):
    # Forward pass followed by the loss for this step.
    y_pred = model(x)
    loss = criterion(y_pred, y)

    # Report progress.
    print(t, loss.item())

    # Clear old gradients, backpropagate, then let the optimizer apply
    # the update.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

0 672.4888916015625
1 686.2418212890625
2 669.6068115234375
3 695.9315795898438
4 653.3483276367188
5 658.4375
6 625.5452270507812
7 660.5146484375
8 596.7185668945312
9 501.92803955078125
10 569.4170532226562
11 644.0126953125
12 654.1666870117188
13 390.4842529296875
14 651.0487670898438
15 648.8712768554688
16 293.19793701171875
17 253.55917358398438
18 482.13427734375
19 462.55291748046875
20 633.0488891601562
21 124.0434799194336
22 105.35383605957031
23 362.9795227050781
24 332.5519714355469
25 73.33924865722656
26 262.2640686035156
27 575.5018920898438
28 477.48345947265625
29 433.8481750488281
30 480.7769775390625
31 154.7144775390625
32 385.3707275390625
33 153.25057983398438
34 149.6761016845703
35 135.7842559814453
36 254.9252471923828
37 194.85269165039062
38 119.96599578857422
39 186.91346740722656
40 44.8973503112793
41 163.15963745117188
42 48.66880416870117
43 237.89022827148438
44 84.17405700683594
45 79.4248275756836
46 320.4878845214844
47 188.27691650390625
48 180.5

399 0.9619389772415161
400 0.6230021715164185
401 1.7949697971343994
402 0.7727672457695007
403 0.21332727372646332
404 0.2088049352169037
405 1.5873510837554932
406 0.4237506687641144
407 0.3547523319721222
408 0.366731196641922
409 0.8232511878013611
410 1.3625060319900513
411 0.3440248370170593
412 0.34457096457481384
413 0.8665167093276978
414 0.0971582680940628
415 0.09432286769151688
416 0.9007596969604492
417 0.44151341915130615
418 0.6078176498413086
419 0.4849146008491516
420 0.47488096356391907
421 0.8211244344711304
422 0.14081190526485443
423 0.5580092668533325
424 0.5948164463043213
425 0.5745373368263245
426 0.48072105646133423
427 0.42917656898498535
428 0.5016624927520752
429 0.4823612868785858
430 0.11466917395591736
431 0.511505126953125
432 0.5023939609527588
433 0.40811315178871155
434 0.3528721332550049
435 0.36043182015419006
436 0.6129853129386902
437 0.49648407101631165
438 0.10581345111131668
439 0.5894150733947754
440 0.5972546339035034
441 0.5332108736038208
