PyTorch official site tutorial
[link](https://pytorch.org/tutorials/beginner/pytorch_with_examples.html)

In [6]:
import numpy as np
import time

In [2]:
# Problem dimensions shared by the NumPy cells below.
N = 64        # batch size
D_in = 1000   # input dimension
H = 100       # hidden dimension
D_out = 10    # output dimension

In [3]:
# Inputs (N x D_in) and targets (N x D_out), drawn from a standard normal.
x = np.random.standard_normal((N, D_in))
y = np.random.standard_normal((N, D_out))

# Weight matrices of the two linear layers, also standard-normal initialised.
w1 = np.random.standard_normal((D_in, H))
w2 = np.random.standard_normal((H, D_out))

In [4]:
# Step size for the plain gradient-descent updates in the training loop.
learning_rate = 1.0e-6

In [7]:
# Train the two-layer network for 500 steps with hand-written forward and
# backward passes, timing the whole run (the per-step print is included).
start = time.time()
for t in range(500):
    # Forward pass: linear -> ReLU -> linear
    h = x @ w1
    h_relu = np.maximum(h, 0)
    y_pred = h_relu @ w2

    # Loss is the sum of squared errors between prediction and target
    residual = y_pred - y
    loss = np.sum(np.square(residual))
    print(t, loss)

    # Backward pass: chain rule from the loss back to both weight matrices
    grad_y_pred = 2.0 * residual
    grad_w2 = h_relu.T @ grad_y_pred

    grad_h_relu = grad_y_pred @ w2.T
    # ReLU passes gradient only where its input was non-negative
    grad_h = np.where(h < 0, 0.0, grad_h_relu)
    grad_w1 = x.T @ grad_h

    # Gradient-descent step (updates the weight arrays in place)
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

end = time.time()

0 2.9305196811144052e-06
1 2.8023533902701647e-06
2 2.6797611011492256e-06
3 2.5625455575986016e-06
4 2.450403967844863e-06
5 2.3431612386598866e-06
6 2.240648194554905e-06
7 2.142590683219541e-06
8 2.048858850395393e-06
9 1.9592388891582738e-06
10 1.8735189257485515e-06
11 1.7915587817062127e-06
12 1.7131883089462645e-06
13 1.638259274388157e-06
14 1.5666256058040259e-06
15 1.4981209202057974e-06
16 1.4325955413394615e-06
17 1.3699416176928559e-06
18 1.3100442958820044e-06
19 1.252798068355974e-06
20 1.1980316321803047e-06
21 1.1456529874243706e-06
22 1.0955645584800078e-06
23 1.047672550232233e-06
24 1.0018742414252327e-06
25 9.581183069312595e-07
26 9.162357696802187e-07
27 8.761935008712818e-07
28 8.37890153891796e-07
29 8.012791829701048e-07
30 7.662579510012317e-07
31 7.327809446381734e-07
32 7.007548578811724e-07
33 6.701344439648674e-07
34 6.408527167304133e-07
35 6.128499829909495e-07
36 5.86106760875167e-07
37 5.605008315839416e-07
38 5.360196403927433e-07
39 5.12605303768649

342 7.215391620688179e-13
343 6.902334845430762e-13
344 6.602927798487975e-13
345 6.316491170353668e-13
346 6.0425104811817e-13
347 5.780405450422439e-13
348 5.529768818823967e-13
349 5.290072534876369e-13
350 5.060587470992438e-13
351 4.841087285097491e-13
352 4.631218429001127e-13
353 4.430359948910689e-13
354 4.2381667985734986e-13
355 4.054366049087541e-13
356 3.878495580125805e-13
357 3.710268944066233e-13
358 3.5494366061805747e-13
359 3.395459138637922e-13
360 3.248189315576854e-13
361 3.1072910293492306e-13
362 2.9725149334828913e-13
363 2.843590270587153e-13
364 2.72025781128551e-13
365 2.60230703500445e-13
366 2.489493759026994e-13
367 2.381578356383336e-13
368 2.278324518432154e-13
369 2.179524495292949e-13
370 2.0849992133879908e-13
371 1.9945618838829975e-13
372 1.9080711167219014e-13
373 1.8253164644334205e-13
374 1.7461671638757352e-13
375 1.670447105196551e-13
376 1.5980018907577204e-13
377 1.5287139233500533e-13
378 1.4624359515948138e-13
379 1.3990605655382727e-13
380

In [8]:
# Wall-clock seconds spent in the NumPy training loop
elapsed = end - start
print(elapsed)

1.06184983253479


## With pytorch
- can run on GPU


In [18]:
import torch

# Same two-layer network as the NumPy version, now on torch Tensors with
# hand-written gradients.
dtype = torch.float
device = torch.device("cpu")
# device = torch.device("cuda:0") # Uncomment this to run on GPU

# Batch size, input dimension, hidden dimension, output dimension
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and targets
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Random initial weights for both layers
w1 = torch.randn(D_in, H, device=device, dtype=dtype)
w2 = torch.randn(H, D_out, device=device, dtype=dtype)

learning_rate = 1e-6

start = time.time()
for t in range(500):
    # Forward pass: linear -> ReLU -> linear
    h = torch.mm(x, w1)
    h_relu = torch.clamp(h, min=0)
    y_pred = torch.mm(h_relu, w2)

    # Sum-of-squares loss, pulled out as a Python float for printing
    residual = y_pred - y
    loss = torch.sum(residual ** 2).item()
    print(t, loss)

    # Backward pass: chain rule from the loss back to both weight matrices
    grad_y_pred = 2.0 * residual
    grad_w2 = torch.mm(h_relu.t(), grad_y_pred)
    grad_h_relu = torch.mm(grad_y_pred, w2.t())
    # ReLU passes gradient only where its input was non-negative
    grad_h = grad_h_relu.masked_fill(h < 0, 0)
    grad_w1 = torch.mm(x.t(), grad_h)

    # Gradient-descent step (in place)
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

end = time.time()

0 31917916.0
1 26402290.0
2 23695908.0
3 20549122.0
4 16344115.0
5 11710772.0
6 7766012.0
7 4938143.5
8 3152147.0
9 2084836.625
10 1456146.75
11 1077021.0
12 837352.1875
13 676544.875
14 562337.4375
15 476998.875
16 410264.25
17 356498.15625
18 312170.65625
19 275046.5625
20 243562.1875
21 216602.71875
22 193366.96875
23 173190.46875
24 155593.890625
25 140163.375
26 126573.984375
27 114575.34375
28 103927.21875
29 94456.75
30 86002.8203125
31 78436.0625
32 71650.484375
33 65550.3125
34 60058.1875
35 55105.0078125
36 50622.53125
37 46560.1875
38 42875.91796875
39 39527.34375
40 36478.51953125
41 33691.078125
42 31144.595703125
43 28815.125
44 26681.830078125
45 24724.5546875
46 22926.53125
47 21274.2109375
48 19754.67578125
49 18355.90625
50 17066.638671875
51 15876.2265625
52 14776.34765625
53 13759.8076171875
54 12819.32421875
55 11948.283203125
56 11141.80078125
57 10394.3134765625
58 9701.33203125
59 9058.2041015625
60 8461.359375
61 7907.71240234375
62 7392.884765625
63 6913.86425

499 8.515968511346728e-05


In [12]:
# Wall-clock seconds spent in the manual-gradient torch loop
elapsed = end - start
print(elapsed)

0.3610689640045166


### Autograd

In [20]:
import torch

# Two-layer network trained with autograd: only the forward pass is written
# by hand, gradients come from loss.backward().
dtype = torch.float
device = torch.device("cpu")

N, D_in, H, D_out = 64, 1000, 100, 10

x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# requires_grad=True makes autograd track operations on the weights
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)
learning_rate = 1e-6

start = time.time()
for t in range(500):
    # Forward pass: linear -> ReLU -> linear
    hidden = torch.clamp(torch.mm(x, w1), min=0)
    y_pred = torch.mm(hidden, w2)

    # Sum-of-squares loss kept as a 0-dim tensor so it can be differentiated
    loss = torch.sum((y_pred - y) ** 2)
    print(t, loss.item())

    # Populates w1.grad and w2.grad
    loss.backward()

    # The update itself must not be recorded in the autograd graph
    with torch.no_grad():
        for weight in (w1, w2):
            weight.sub_(learning_rate * weight.grad)
            # Gradients accumulate across backward() calls, so reset them
            weight.grad.zero_()

end = time.time()

0 25371550.0
1 21053832.0
2 21971140.0
3 25627446.0
4 29209076.0
5 29664214.0
6 24749534.0
7 16762698.0
8 9449980.0
9 4887234.5
10 2557847.5
11 1472386.375
12 961422.5
13 702634.875
14 555161.75
15 459618.625
16 390663.75
17 337101.8125
18 293605.28125
19 257349.703125
20 226688.859375
21 200502.21875
22 177943.78125
23 158423.578125
24 141466.0625
25 126680.4609375
26 113747.3828125
27 102372.84375
28 92347.9375
29 83492.1015625
30 75642.8828125
31 68658.3828125
32 62434.171875
33 56877.4609375
34 51900.32421875
35 47439.53125
36 43427.48046875
37 39819.828125
38 36560.3828125
39 33612.32421875
40 30940.904296875
41 28516.30078125
42 26311.958984375
43 24304.9765625
44 22474.501953125
45 20803.673828125
46 19275.9375
47 17876.798828125
48 16595.078125
49 15420.8115234375
50 14341.728515625
51 13349.9580078125
52 12437.8544921875
53 11598.8037109375
54 10824.310546875
55 10108.2861328125
56 9446.2197265625
57 8832.943359375
58 8264.3984375
59 7737.1357421875
60 7247.626953125
61 6793.4

449 0.00023205381876323372
450 0.00022754658129997551
451 0.00022226481814868748
452 0.00021712458692491055
453 0.00021273009770084172
454 0.00020843357197009027
455 0.00020320600015111268
456 0.0001993592013604939
457 0.00019545300165191293
458 0.0001911478175316006
459 0.00018757075304165483
460 0.0001839060423662886
461 0.00018035862012766302
462 0.00017648986249696463
463 0.000173527019796893
464 0.00017025452689267695
465 0.00016700492415111512
466 0.0001638047833694145
467 0.0001606057194294408
468 0.00015747225552331656
469 0.00015434007218573242
470 0.00015141193580348045
471 0.00014856721099931747
472 0.00014649862714577466
473 0.00014363742957357317
474 0.0001408530806656927
475 0.00013785088958684355
476 0.00013570203736890107
477 0.00013338029384613037
478 0.00013123526878189296
479 0.00012856523972004652
480 0.00012610614066943526
481 0.0001239880220964551
482 0.00012149430403951555
483 0.00011949388863286003
484 0.00011824194371001795
485 0.00011611099762376398
486 0.0001

In [21]:
# Wall-clock seconds spent in the autograd training loop
elapsed = end - start
print(elapsed)

0.4639310836791992


## New Autograd Function
- implementing `backward()`
- subclassing `torch.autograd.Function`
- implement `forward()` and `backward()`


In [23]:
class MyReLU(torch.autograd.Function):
    """ReLU implemented as a custom autograd Function.

    Use it through ``MyReLU.apply(tensor)``.
    """

    @staticmethod
    def forward(ctx, input):
        """Return ``input`` with every negative entry replaced by zero.

        ``ctx`` is the context object used to stash data for the backward
        pass; the input tensor is saved on it via ``save_for_backward``.
        """
        ctx.save_for_backward(input)
        return torch.clamp(input, min=0)

    @staticmethod
    def backward(ctx, grad_output):
        """Map the gradient w.r.t. the output to the gradient w.r.t. the input.

        ``grad_output`` is the gradient of the loss with respect to the value
        returned by ``forward``. The result is a new tensor equal to
        ``grad_output`` except that it is zero wherever the saved input was
        negative.
        """
        (saved_input,) = ctx.saved_tensors
        return grad_output.masked_fill(saved_input < 0, 0)

In [26]:
# Same autograd training run, with the built-in clamp replaced by the custom
# MyReLU Function.
dtype = torch.float
device = torch.device("cpu")
N, D_in, H, D_out = 64, 1000, 100, 10

x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

learning_rate = 1e-6

# Custom Functions are invoked through Function.apply
relu = MyReLU.apply

start = time.time()
for t in range(500):
    # Forward pass: linear -> custom ReLU -> linear
    hidden = relu(torch.mm(x, w1))
    y_pred = torch.mm(hidden, w2)

    loss = torch.sum((y_pred - y) ** 2)
    print(t, loss.item())

    loss.backward()
    with torch.no_grad():
        for weight in (w1, w2):
            weight.sub_(learning_rate * weight.grad)
            # Reset the accumulated gradient once the weight has been updated
            weight.grad.zero_()
end = time.time()

0 34141752.0
1 31585304.0
2 31296080.0
3 28545016.0
4 21951470.0
5 14098627.0
6 7968132.5
7 4406336.0
8 2595890.75
9 1703445.625
10 1235400.875
11 962778.0
12 784514.375
13 656643.125
14 558690.0
15 480540.3125
16 416463.34375
17 363216.6875
18 318457.875
19 280537.96875
20 248171.3125
21 220409.484375
22 196453.8125
23 175683.46875
24 157610.875
25 141794.234375
26 127909.625
27 115672.28125
28 104850.328125
29 95252.296875
30 86705.3203125
31 79074.7578125
32 72243.4765625
33 66115.1953125
34 60612.63671875
35 55641.94921875
36 51154.6484375
37 47092.65625
38 43406.5234375
39 40056.2734375
40 37008.59765625
41 34228.421875
42 31690.173828125
43 29368.720703125
44 27240.880859375
45 25290.359375
46 23499.921875
47 21854.390625
48 20341.958984375
49 18948.46875
50 17664.03125
51 16478.08203125
52 15382.720703125
53 14369.4267578125
54 13431.224609375
55 12561.6728515625
56 11755.5458984375
57 11007.6552734375
58 10312.958984375
59 9667.0146484375
60 9066.083984375
61 8506.6826171875
62

In [27]:
# Wall-clock seconds spent in the MyReLU training loop
elapsed = end - start
print(elapsed)

0.5832967758178711


### Computational  Graphs
- TensorFlow: **static**
    - define the computational graph once
    - execute the same graph over and over again
    - possibly feeding different input data to the graph
- Pytorch: **dynamic**
    - each forward pass defines a new computational graph

- Static
    - can optimize the graph up front
    - for RNNs: the loop construct needs to be part of the graph
    - TensorFlow: `tf.scan`, for embedding loops into the graph
- Dynamic
    - builds the graph on the fly
    - uses normal imperative flow control to perform computation that differs for each input


In [33]:
import tensorflow as tf
import numpy as np

# Static-graph version of the two-layer network: the graph (forward pass,
# loss, gradients and weight updates) is defined once below and then executed
# 500 times inside a session.
# NOTE(review): tf.placeholder / tf.Session / tf.random_normal look like
# TensorFlow 1.x graph-mode APIs; confirm the installed TensorFlow version
# before re-running this cell.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create placeholders for the input and target data; these will be filled
# with real data when we execute the graph.
x = tf.placeholder(tf.float32, shape=(None, D_in))
y = tf.placeholder(tf.float32, shape=(None, D_out))

# Create Variables for the weights and initialize them with random data.
# A TensorFlow Variable persists its value across executions of the graph.
w1 = tf.Variable(tf.random_normal((D_in, H)))
w2 = tf.Variable(tf.random_normal((H, D_out)))

# Forward pass: Compute the predicted y using operations on TensorFlow Tensors.
# Note that this code does not actually perform any numeric operations; it
# merely sets up the computational graph that we will later execute.
h = tf.matmul(x, w1)
h_relu = tf.maximum(h, tf.zeros(1))
y_pred = tf.matmul(h_relu, w2)

# Compute loss using operations on TensorFlow Tensors
loss = tf.reduce_sum((y - y_pred) ** 2.0)

# Compute gradient of the loss with respect to w1 and w2.
grad_w1, grad_w2 = tf.gradients(loss, [w1, w2])

# Update the weights using gradient descent. To actually update the weights
# we need to evaluate new_w1 and new_w2 when executing the graph. Note that
# in TensorFlow the act of updating the value of the weights is part of
# the computational graph; in PyTorch this happens outside the computational
# graph.
learning_rate = 1e-6
new_w1 = w1.assign(w1 - learning_rate * grad_w1)
new_w2 = w2.assign(w2 - learning_rate * grad_w2)

## Graph setup done

# The timed region starts here, so it covers session creation, variable
# initialization and input generation as well as the 500 training steps.
start = time.time()

# Now we have built our computational graph, so we enter a TensorFlow session to
# actually execute the graph.
with tf.Session() as sess:
    # Run the graph once to initialize the Variables w1 and w2.
    sess.run(tf.global_variables_initializer())
    
    # Create numpy arrays holding the actual data for the inputs x and targets
    # y
    x_value = np.random.randn(N, D_in)
    y_value = np.random.randn(N, D_out)
    
    for _ in range(500):
        # Execute the graph many times. Each time it executes we want to bind
        # x_value to x and y_value to y, specified with the feed_dict argument.
        # Each time we execute the graph we want to compute the values for loss,
        # new_w1, and new_w2; the values of these Tensors are returned as numpy
        # arrays.
        # Only the loss is kept; the two updated-weight results are bound to
        # `_` and discarded (new_w1/new_w2 are fetched so the updates run).
        loss_value, _, _ = sess.run([loss, new_w1, new_w2],
                                    feed_dict={x: x_value, y: y_value})
        print(loss_value)
        
end = time.time()

34019770.0
30104250.0
27189500.0
22231564.0
15842158.0
9962604.0
5889683.0
3505511.5
2217551.0
1520865.0
1124000.5
878713.25
713511.4
593677.94
501879.75
428841.7
369278.1
319979.0
278724.06
243729.78
214009.84
188631.97
166860.81
148094.47
131888.0
117805.805
105551.984
94845.7
85427.33
77120.69
69755.39
63227.0
57421.445
52245.594
47621.42
43480.617
39764.508
36423.54
33412.062
30693.674
28234.959
26007.719
23986.434
22149.285
20476.064
18950.842
17557.848
16284.088
15118.213
14049.707
13068.563
12167.805
11339.167
10576.09
9872.16
9222.627
8622.723
8067.5527
7553.603
7077.286
6635.54
6225.24
5843.8535
5489.0986
5158.8013
4851.086
4564.2207
4296.5723
4046.64
3813.2227
3595.0625
3390.8896
3199.6638
3020.5933
2852.7046
2695.2356
2547.504
2408.8403
2278.5276
2156.0127
2040.838
1932.4507
1830.4502
1734.3832
1643.8945
1558.5823
1478.142
1402.2406
1330.6555
1263.0735
1199.2277
1138.9478
1081.9312
1028.0444
977.10547
928.90063
883.27246
840.05853
799.14136
760.374
723.6332
688.8002
655.7798

In [34]:
# Wall-clock seconds spent in the TensorFlow session block
elapsed = end - start
print(elapsed)

0.5760607719421387


### PyTorch.nn
- when building nn, frequently think of arranging the computation into **layers**
- some have **learnable parameters** which will be optimized during training
- TensorFlow:
    - Keras, TensorFlow-Slim, TFLearn
    - higher-level abstractions over raw computational graphs
   
- Pytorch: `nn` package

    

In [35]:
import torch
# Two-layer network expressed with the nn package; the weights live inside
# the Linear modules and are updated by hand after each backward pass.
N, D_in, H, D_out = 64, 1000, 100, 10

x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# Layers are applied in sequence: linear -> ReLU -> linear
layers = [
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
]
model = torch.nn.Sequential(*layers)

# Sum-reduced squared error
loss_fn = torch.nn.MSELoss(reduction='sum')

learning_rate = 1e-4
start = time.time()
for t in range(500):
    y_pred = model(x)
    loss = loss_fn(y_pred, y)
    print(t, loss.item())

    # Clear the gradients from the previous step before computing new ones
    model.zero_grad()
    loss.backward()

    # Plain gradient-descent step on every parameter, outside autograd
    with torch.no_grad():
        for param in model.parameters():
            param.sub_(learning_rate * param.grad)

end = time.time()

0 654.0012817382812
1 609.2283935546875
2 569.9983520507812
3 535.0026245117188
4 503.347412109375
5 474.4626159667969
6 447.98992919921875
7 423.7221984863281
8 401.0868835449219
9 379.7489318847656
10 359.7193908691406
11 340.91094970703125
12 323.1837463378906
13 306.4001770019531
14 290.3970947265625
15 275.1534729003906
16 260.6966857910156
17 246.9180908203125
18 233.741943359375
19 221.18447875976562
20 209.2122802734375
21 197.73983764648438
22 186.80838012695312
23 176.38758850097656
24 166.43136596679688
25 156.9395751953125
26 147.90553283691406
27 139.31692504882812
28 131.17318725585938
29 123.4544906616211
30 116.12548828125
31 109.16812896728516
32 102.59530639648438
33 96.38241577148438
34 90.51556396484375
35 84.97615814208984
36 79.76026153564453
37 74.8453369140625
38 70.22955322265625
39 65.89469146728516
40 61.82470703125
41 57.988460540771484
42 54.39303207397461
43 51.02285385131836
44 47.866737365722656
45 44.91701889038086
46 42.152587890625
47 39.5529594421386

351 0.00033791441819630563
352 0.00032875710166990757
353 0.0003198659687768668
354 0.0003112079866696149
355 0.00030279604834504426
356 0.000294612895231694
357 0.00028665614081546664
358 0.0002789169957395643
359 0.0002713857393246144
360 0.00026407386758364737
361 0.00025695539079606533
362 0.0002500317932572216
363 0.00024330909946002066
364 0.0002367628476349637
365 0.00023039523512125015
366 0.0002241988986497745
367 0.00021817479864694178
368 0.0002123190788552165
369 0.00020662341557908803
370 0.00020107839372940361
371 0.0001956838386831805
372 0.00019044171494897455
373 0.00018536495917942375
374 0.00018051530059892684
375 0.0001759418228175491
376 0.0001714994286885485
377 0.00016717899416107684
378 0.00016297277761623263
379 0.00015888261259533465
380 0.000154903216753155
381 0.00015103277110029012
382 0.00014726394147146493
383 0.0001435996819054708
384 0.00014002736133988947
385 0.00013655137445311993
386 0.00013316881086211652
387 0.00012987790978513658
388 0.00012667162

In [36]:
# Wall-clock seconds spent in the nn-package training loop
elapsed = end - start
print(elapsed)

0.553321361541748


### optim
- tell the model how to update the weight

In [37]:
import torch
# Same nn model, with the weight update delegated to an Adam optimizer.
N, D_in, H, D_out = 64, 1000, 100, 10
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# Layers are applied in sequence: linear -> ReLU -> linear
layers = [
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
]
model = torch.nn.Sequential(*layers)
loss_fn = torch.nn.MSELoss(reduction='sum')

learning_rate = 1e-4
# The optimizer is handed the parameters it is responsible for updating
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

start = time.time()
for t in range(500):
    y_pred = model(x)
    loss = loss_fn(y_pred, y)
    print(t, loss.item())

    # Reset old gradients, compute new ones, then let Adam apply the update
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
end = time.time()

0 762.9730224609375
1 744.7385864257812
2 726.9492797851562
3 709.6425170898438
4 692.8268432617188
5 676.5465087890625
6 660.7701416015625
7 645.3900146484375
8 630.3855590820312
9 615.78564453125
10 601.5917358398438
11 587.7503662109375
12 574.23486328125
13 561.05224609375
14 548.2566528320312
15 535.7800903320312
16 523.628662109375
17 511.72369384765625
18 500.080322265625
19 488.7501220703125
20 477.6652526855469
21 466.8039245605469
22 456.2086181640625
23 445.8610534667969
24 435.7585144042969
25 425.9405517578125
26 416.3385314941406
27 406.94427490234375
28 397.78839111328125
29 388.84991455078125
30 380.0923767089844
31 371.527099609375
32 363.1761779785156
33 355.01641845703125
34 347.03741455078125
35 339.2467956542969
36 331.62554931640625
37 324.1261291503906
38 316.79461669921875
39 309.6190185546875
40 302.5705871582031
41 295.6624755859375
42 288.8970642089844
43 282.2479248046875
44 275.7345886230469
45 269.3438415527344
46 263.0760192871094
47 256.9371643066406
48 

484 2.3600109670951497e-06
485 2.246973735964275e-06
486 2.1388896129792556e-06
487 2.0359595964691835e-06
488 1.9379765490157297e-06
489 1.8437430071571725e-06
490 1.7552715689816978e-06
491 1.6698904801160097e-06
492 1.5889481801423244e-06
493 1.5121763681236189e-06
494 1.4380871107277926e-06
495 1.367958020637161e-06
496 1.3011454029765446e-06
497 1.2372214541755966e-06
498 1.1767990599764744e-06
499 1.1185830999238533e-06


In [38]:
# Wall-clock seconds spent in the Adam training loop
elapsed = end - start
print(elapsed)

0.84619140625


## Custom nn Modules
- define `__init__()`
- define `forward()`
- `backward()` will be done by autograd

In [39]:
import torch

class TwoLayerNet(torch.nn.Module):
    """Two linear layers with a ReLU (clamp at zero) in between."""

    def __init__(self, D_in, H, D_out):
        """
        Create the two nn.Linear sub-modules: ``linear1`` maps D_in -> H and
        ``linear2`` maps H -> D_out. Both are stored as attributes.
        """
        super().__init__()
        self.linear1 = torch.nn.Linear(D_in, H)
        self.linear2 = torch.nn.Linear(H, D_out)

    def forward(self, x):
        """
        Apply ``linear1``, clamp the result at zero from below, then apply
        ``linear2`` and return its output.
        """
        hidden = torch.clamp(self.linear1(x), min=0)
        return self.linear2(hidden)

In [40]:
# Train the custom TwoLayerNet module with SGD for 500 steps.
N, D_in, H, D_out = 64, 1000, 100, 10
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

model = TwoLayerNet(D_in, H, D_out)

# Sum-reduced squared error, minimised with plain SGD
criterion = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4)

for t in range(500):
    # Forward pass and loss
    y_pred = model(x)
    loss = criterion(y_pred, y)
    print(t, loss.item())

    # Reset old gradients, backpropagate, then apply the update
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

0 657.0488891601562
1 608.1629638671875
2 565.1637573242188
3 527.3429565429688
4 493.5360412597656
5 462.9828796386719
6 435.14764404296875
7 409.7709655761719
8 386.45050048828125
9 364.8515625
10 344.582763671875
11 325.4694519042969
12 307.5060119628906
13 290.3721923828125
14 274.11181640625
15 258.672119140625
16 244.06272888183594
17 230.20310974121094
18 217.01451110839844
19 204.5237274169922
20 192.59664916992188
21 181.3201446533203
22 170.63494873046875
23 160.53695678710938
24 151.00222778320312
25 141.9997100830078
26 133.5238037109375
27 125.53817749023438
28 118.01799774169922
29 110.92809295654297
30 104.27371215820312
31 98.027587890625
32 92.17721557617188
33 86.67762756347656
34 81.51738739013672
35 76.66770935058594
36 72.12227630615234
37 67.83963775634766
38 63.82606506347656
39 60.06318283081055
40 56.537052154541016
41 53.23059844970703
42 50.13134765625
43 47.22095489501953
44 44.496307373046875
45 41.94125747680664
46 39.544776916503906
47 37.29912185668945
4

383 7.142260437831283e-05
384 6.938476872164756e-05
385 6.740630487911403e-05
386 6.549151294166222e-05
387 6.362585554597899e-05
388 6.18185949861072e-05
389 6.006030162097886e-05
390 5.835699994349852e-05
391 5.6701752328081056e-05
392 5.5095435527618974e-05
393 5.353281449060887e-05
394 5.202245301916264e-05
395 5.055128349340521e-05
396 4.912534495815635e-05
397 4.7739424189785495e-05
398 4.638968675862998e-05
399 4.5086813770467415e-05
400 4.3816737161250785e-05
401 4.25818761868868e-05
402 4.138570511713624e-05
403 4.022397843073122e-05
404 3.909366569132544e-05
405 3.799872501986101e-05
406 3.693133476190269e-05
407 3.589770130929537e-05
408 3.489062873995863e-05
409 3.391467180335894e-05
410 3.2966148864943534e-05
411 3.20453145832289e-05
412 3.115273284493014e-05
413 3.0284012609627098e-05
414 2.9437471312121488e-05
415 2.8619830118259415e-05
416 2.7822808988275938e-05
417 2.7047941330238245e-05
418 2.629656228236854e-05
419 2.5564220777596347e-05
420 2.48539618041832e-05
421 

## Control Flow + Weight Sharing
- a fully-connected ReLU network
- on each forward pass
- pick a random number between 1 and 4
- apply that many hidden layers
- reusing the same weights multiple times


In [42]:
import random
import torch


class DynamicNet(torch.nn.Module):
    """Fully connected ReLU network whose depth is re-drawn on every call."""

    def __init__(self, D_in, H, D_out):
        """
        Build the three nn.Linear modules used by the forward pass:
        ``input_linear`` (D_in -> H), ``middle_linear`` (H -> H) and
        ``output_linear`` (H -> D_out).
        """
        super().__init__()
        self.input_linear = torch.nn.Linear(D_in, H)
        self.middle_linear = torch.nn.Linear(H, H)
        self.output_linear = torch.nn.Linear(H, D_out)

    def forward(self, x):
        """
        Run the input layer, then apply ``middle_linear`` 0, 1, 2 or 3 times
        (drawn with ``random.randint(0, 3)`` on every call), clamping at zero
        after each layer, and finish with the output layer.

        The same ``middle_linear`` module, and therefore the same weights, is
        reused for every repetition. Because the graph is built during the
        forward pass, an ordinary Python loop is enough to express this.
        """
        hidden = torch.clamp(self.input_linear(x), min=0)
        repeats = random.randint(0, 3)
        for _ in range(repeats):
            hidden = torch.clamp(self.middle_linear(hidden), min=0)
        return self.output_linear(hidden)

In [43]:
# Train DynamicNet with SGD plus momentum for 500 steps.
N, D_in, H, D_out = 64, 1000, 100, 10

x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

model = DynamicNet(D_in, H, D_out)

# Sum-reduced squared error, minimised by SGD with momentum
criterion = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4, momentum=0.9)

for t in range(500):
    # Forward pass and loss
    y_pred = model(x)
    loss = criterion(y_pred, y)
    print(t, loss.item())

    # Reset old gradients, backpropagate, then apply the update
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()


0 694.6466064453125
1 646.4335327148438
2 645.0548095703125
3 561.909423828125
4 648.8294677734375
5 639.1426391601562
6 418.19232177734375
7 636.1817016601562
8 337.19384765625
9 295.2964782714844
10 590.8641967773438
11 630.7500610351562
12 187.41159057617188
13 555.1864013671875
14 534.2852172851562
15 627.0146484375
16 475.89984130859375
17 438.9692687988281
18 587.6603393554688
19 610.5687866210938
20 317.7987060546875
21 522.2824096679688
22 567.4202880859375
23 214.12020874023438
24 189.3125
25 245.04861450195312
26 151.50636291503906
27 134.69302368164062
28 414.22076416015625
29 274.21697998046875
30 239.8072509765625
31 207.57752990722656
32 261.1435852050781
33 218.958740234375
34 230.8821258544922
35 186.8773956298828
36 177.83045959472656
37 205.00611877441406
38 162.72999572753906
39 136.8452911376953
40 111.85853576660156
41 161.79898071289062
42 110.72850799560547
43 132.17123413085938
44 159.05950927734375
45 131.398681640625
46 155.9126434326172
47 122.51481628417969


394 0.8815687298774719
395 1.3410292863845825
396 0.8944778442382812
397 1.0850512981414795
398 1.7017433643341064
399 3.0892183780670166
400 1.3535157442092896
401 1.6275274753570557
402 2.683032274246216
403 1.506862998008728
404 2.1087522506713867
405 0.6940479278564453
406 0.5549706220626831
407 1.4495928287506104
408 0.7407151460647583
409 0.7576965093612671
410 1.4795684814453125
411 2.0082645416259766
412 0.7420099377632141
413 1.0977014303207397
414 0.49077051877975464
415 0.557947039604187
416 2.0735280513763428
417 1.6248900890350342
418 0.16130948066711426
419 0.6673615574836731
420 0.3137296140193939
421 0.45968109369277954
422 1.4217034578323364
423 1.0025177001953125
424 0.24871474504470825
425 0.19954638183116913
426 0.2242172509431839
427 0.47583186626434326
428 0.7102601528167725
429 1.267356276512146
430 1.1205428838729858
431 0.35200873017311096
432 0.5062118768692017
433 0.25318238139152527
434 0.5557329058647156
435 0.12978878617286682
436 0.7733047604560852
437 1.