<a href="https://colab.research.google.com/github/flecue/ml-back-to-basics/blob/master/pytorch_examples.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Fundamental concepts of PyTorch through self-contained examples

Warm-up: numpy -- https://github.com/jcjohnson/pytorch-examples

In [None]:
# Code in file tensor/two_layer_net_numpy.py
import numpy as np

# Problem sizes: N samples per batch, D_in input features,
# H hidden units and D_out output features.
N, D_in, H, D_out = 64, 1000, 100, 10

# Inputs and regression targets are drawn from a standard normal distribution.
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)

# Both weight matrices also start out as standard-normal noise.
w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)

learning_rate = 1e-6
for t in range(500):
  # Forward pass: linear layer -> ReLU -> linear layer.
  h = np.dot(x, w1)
  h_relu = np.maximum(h, 0)
  y_pred = np.dot(h_relu, w2)

  # Sum-of-squares loss between prediction and target; report it every step.
  loss = np.sum(np.square(y_pred - y))
  print(t, loss)

  # Backward pass, written out by hand with the chain rule.
  grad_y_pred = 2.0 * (y_pred - y)
  grad_w2 = np.dot(h_relu.T, grad_y_pred)
  grad_h_relu = np.dot(grad_y_pred, w2.T)
  # ReLU passes the gradient through except where its input was negative.
  grad_h = np.where(h < 0, 0.0, grad_h_relu)
  grad_w1 = np.dot(x.T, grad_h)

  # Plain gradient-descent step, applied in place.
  w1 -= learning_rate * grad_w1
  w2 -= learning_rate * grad_w2

0 26396433.366881922
1 19406879.909012705
2 16653276.924716596
3 15245853.985543879
4 13914702.48469096
5 12179987.408520795
6 10002971.499850051
7 7725800.404073164
8 5651702.375165768
9 4006258.6371294837
10 2792040.5411846302
11 1953555.8646161852
12 1387554.7702781004
13 1011998.6282687776
14 759885.8046352663
15 588657.5614096628
16 469142.0774703522
17 383495.233209769
18 320182.8341931361
19 271921.7999252254
20 234072.340200108
21 203607.53342268558
22 178600.03846830328
23 157677.4043844068
24 139932.18624905054
25 124710.38073991331
26 111541.4552549654
27 100062.5765821276
28 89993.75238172355
29 81119.35793833488
30 73265.57023051355
31 66291.67236983686
32 60083.81244547553
33 54542.50792725481
34 49580.595442194426
35 45129.3452619702
36 41131.62282564335
37 37532.87371310646
38 34289.708054634015
39 31361.302833944777
40 28712.592745691723
41 26312.272091706975
42 24137.119273131262
43 22162.801399625983
44 20367.7689784387
45 18733.448556694137
46 17244.79767504126
47 1

PyTorch: Tensors

In [None]:
# Code in file tensor/two_layer_net_tensor.py
import torch

# Pick the compute device automatically: use the GPU when CUDA is available and
# fall back to the CPU otherwise, so this cell runs on any machine instead of
# failing when no GPU is present.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random input and output data directly on the selected device
x = torch.randn(N, D_in, device=device)
y = torch.randn(N, D_out, device=device)

# Randomly initialize weights on the same device as the data
w1 = torch.randn(D_in, H, device=device)
w2 = torch.randn(H, D_out, device=device)

learning_rate = 1e-6
for t in range(500):
  # Forward pass: compute predicted y (linear -> ReLU -> linear)
  h = x.mm(w1)
  h_relu = h.clamp(min=0)
  y_pred = h_relu.mm(w2)

  # Compute and print loss; loss is a scalar, and is stored in a PyTorch Tensor
  # of shape (); we can get its value as a Python number with loss.item().
  loss = (y_pred - y).pow(2).sum()
  print(t, loss.item())

  # Backprop by hand to compute gradients of w1 and w2 with respect to loss
  grad_y_pred = 2.0 * (y_pred - y)
  grad_w2 = h_relu.t().mm(grad_y_pred)
  grad_h_relu = grad_y_pred.mm(w2.t())
  # Clone before masking so grad_h_relu itself is left untouched.
  grad_h = grad_h_relu.clone()
  # ReLU blocks the gradient wherever its input was negative.
  grad_h[h < 0] = 0
  grad_w1 = x.t().mm(grad_h)

  # Update weights using gradient descent
  w1 -= learning_rate * grad_w1
  w2 -= learning_rate * grad_w2

0 30102632.0
1 28439188.0
2 30658804.0
3 31875492.0
4 28609304.0
5 20695148.0
6 12320428.0
7 6502527.0
8 3461634.5
9 2029829.0
10 1357580.0
11 1010783.5625
12 807150.6875
13 670921.0
14 570298.0625
15 491094.96875
16 426441.4375
17 372475.375
18 326799.0625
19 287887.5625
20 254493.75
21 225730.28125
22 200818.203125
23 179118.015625
24 160151.875
25 143526.03125
26 128891.1796875
27 115965.1015625
28 104521.140625
29 94364.09375
30 85331.84375
31 77282.828125
32 70087.3125
33 63641.66796875
34 57859.890625
35 52664.4375
36 47989.25
37 43776.0703125
38 40004.171875
39 36600.11328125
40 33517.34375
41 30723.609375
42 28191.78515625
43 25891.8515625
44 23797.97265625
45 21889.044921875
46 20149.28125
47 18557.76171875
48 17104.119140625
49 15775.53515625
50 14559.466796875
51 13445.4404296875
52 12425.216796875
53 11489.4677734375
54 10630.1513671875
55 9840.4453125
56 9114.052734375
57 8445.5361328125
58 7830.951171875
59 7264.85791015625
60 6742.9853515625
61 6261.34765625
62 5816.7358

PyTorch: Autograd

In [None]:
# Code in file autograd/two_layer_net_autograd.py
import torch

# Pick the compute device automatically: use the GPU when CUDA is available and
# fall back to the CPU otherwise, so this cell runs on any machine instead of
# failing when no GPU is present.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold input and outputs
x = torch.randn(N, D_in, device=device)
y = torch.randn(N, D_out, device=device)

# Create random Tensors for weights; setting requires_grad=True means that we
# want to compute gradients for these Tensors during the backward pass.
w1 = torch.randn(D_in, H, device=device, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, requires_grad=True)

learning_rate = 1e-6
for t in range(500):
  # Forward pass: compute predicted y using operations on Tensors. Since w1 and
  # w2 have requires_grad=True, operations involving these Tensors will cause
  # PyTorch to build a computational graph, allowing automatic computation of
  # gradients. Since we are no longer implementing the backward pass by hand we
  # don't need to keep references to intermediate values.
  y_pred = x.mm(w1).clamp(min=0).mm(w2)

  # Compute and print loss. Loss is a Tensor of shape (), and loss.item()
  # is a Python number giving its value.
  loss = (y_pred - y).pow(2).sum()
  print(t, loss.item())

  # Use autograd to compute the backward pass. This call will compute the
  # gradient of loss with respect to all Tensors with requires_grad=True.
  # After this call w1.grad and w2.grad will be Tensors holding the gradient
  # of the loss with respect to w1 and w2 respectively.
  loss.backward()

  # Update weights using gradient descent. For this step we just want to mutate
  # the values of w1 and w2 in-place; we don't want to build up a computational
  # graph for the update steps, so we use the torch.no_grad() context manager
  # to prevent PyTorch from building a computational graph for the updates
  with torch.no_grad():
    w1 -= learning_rate * w1.grad
    w2 -= learning_rate * w2.grad

    # Manually zero the gradients after the update: backward() accumulates
    # into .grad, so stale values would otherwise leak into the next step.
    w1.grad.zero_()
    w2.grad.zero_()

0 29901312.0
1 24276404.0
2 21379234.0
3 18269816.0
4 14404308.0
5 10353708.0
6 6951422.5
7 4501437.0
8 2923629.5
9 1957224.5
10 1374035.625
11 1015237.5625
12 785718.25
13 631379.8125
14 521910.09375
15 440064.625
16 376591.4375
17 325652.65625
18 283796.1875
19 248801.0
20 219143.328125
21 193801.90625
22 172023.5
23 153199.015625
24 136825.265625
25 122523.8203125
26 109991.0859375
27 98962.265625
28 89228.15625
29 80620.6328125
30 72977.21875
31 66176.5625
32 60130.17578125
33 54723.45703125
34 49879.953125
35 45534.15625
36 41620.23828125
37 38092.8125
38 34906.20703125
39 32022.734375
40 29408.8828125
41 27036.8046875
42 24880.587890625
43 22918.744140625
44 21131.50390625
45 19501.2109375
46 18011.775390625
47 16649.27734375
48 15401.81640625
49 14258.16015625
50 13208.6455078125
51 12244.94921875
52 11359.0625
53 10544.08984375
54 9795.62109375
55 9105.908203125
56 8469.1298828125
57 7881.25830078125
58 7338.1513671875
59 6836.0703125
60 6370.5224609375
61 5939.52490234375
62 5

PyTorch: Defining new autograd functions

In [None]:
# Code in file autograd/two_layer_net_custom_function.py
import torch

class MyReLU(torch.autograd.Function):
  """
  A hand-written ReLU expressed as a custom autograd Function: subclass
  torch.autograd.Function and provide static forward and backward methods
  that work on Tensors.
  """
  @staticmethod
  def forward(ctx, x):
    """
    Compute the ReLU of the input Tensor x and return it. The input is stashed
    on the context object ctx so that the backward pass can look at it later.
    """
    ctx.save_for_backward(x)
    return torch.clamp(x, min=0)

  @staticmethod
  def backward(ctx, grad_output):
    """
    Given grad_output, the gradient of the loss with respect to the output of
    forward, return the gradient of the loss with respect to the input of
    forward. The input saved during the forward pass is read back from ctx.
    """
    (saved_input,) = ctx.saved_tensors
    # The gradient is blocked wherever the forward input was negative and is
    # passed through unchanged everywhere else.
    blocked = saved_input < 0
    return grad_output.masked_fill(blocked, 0)


# This cell runs on the CPU; switch to the commented line below for the GPU.
device = torch.device('cpu')
# device = torch.device('cuda') # Uncomment this to run on GPU

# Problem sizes: N samples per batch, D_in input features,
# H hidden units and D_out output features.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and targets, created on the chosen device.
x = torch.randn(N, D_in, device=device)
y = torch.randn(N, D_out, device=device)

# Random weights; requires_grad=True asks autograd to track them.
w1 = torch.randn(D_in, H, device=device, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, requires_grad=True)

learning_rate = 1e-6
for t in range(500):
  # Forward pass: the hidden activation goes through our custom ReLU, which is
  # invoked through MyReLU.apply rather than by instantiating the class.
  y_pred = torch.mm(MyReLU.apply(torch.mm(x, w1)), w2)

  # Sum-of-squares loss; print its Python-number value every step.
  loss = torch.sum((y_pred - y) ** 2)
  print(t, loss.item())

  # Let autograd fill in w1.grad and w2.grad.
  loss.backward()

  with torch.no_grad():
    # Gradient-descent step, applied in place.
    w1.sub_(learning_rate * w1.grad)
    w2.sub_(learning_rate * w2.grad)

    # Reset the gradients by hand once the step has been taken.
    w1.grad.zero_()
    w2.grad.zero_()

0 26606482.0
1 21883448.0
2 24876296.0
3 33014348.0
4 42192448.0
5 44394816.0
6 33858748.0
7 17776160.0
8 7077691.0
9 2743524.5
10 1324100.375
11 843156.875
12 639978.25
13 524843.8125
14 443950.21875
15 380744.28125
16 329170.125
17 286224.65625
18 250076.140625
19 219426.40625
20 193269.375
21 170817.078125
22 151455.1875
23 134679.15625
24 120098.6640625
25 107375.3984375
26 96227.734375
27 86419.015625
28 77775.3125
29 70142.859375
30 63387.9296875
31 57396.8671875
32 52056.94140625
33 47289.26171875
34 43024.2734375
35 39200.859375
36 35764.73046875
37 32672.12109375
38 29881.958984375
39 27361.029296875
40 25080.3125
41 23014.44140625
42 21139.830078125
43 19436.875
44 17887.41015625
45 16477.517578125
46 15191.7236328125
47 14017.404296875
48 12944.09765625
49 11962.0908203125
50 11062.43359375
51 10237.611328125
52 9480.6123046875
53 8785.421875
54 8146.5048828125
55 7558.525390625
56 7017.1591796875
57 6518.49365234375
58 6058.32421875
59 5633.6513671875
60 5241.5048828125
61 

TensorFlow: Static Graphs

In [None]:
# Code in file autograd/tf_two_layer_net.py

# A plain `import tensorflow as tf` causes problems on TensorFlow version 2,
# so the v1 compatibility module is imported and v2 behavior is disabled.
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

import numpy as np

# First we set up the computational graph:

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create placeholders for the input and target data; these will be filled
# with real data when we execute the graph. The leading dimension is None,
# so only the feature dimension is fixed here.
x = tf.placeholder(tf.float32, shape=(None, D_in))
y = tf.placeholder(tf.float32, shape=(None, D_out))

# Create Variables for the weights and initialize them with random data.
# A TensorFlow Variable persists its value across executions of the graph.
w1 = tf.Variable(tf.random_normal((D_in, H)))
w2 = tf.Variable(tf.random_normal((H, D_out)))

# Forward pass: Compute the predicted y using operations on TensorFlow Tensors.
# Note that this code does not actually perform any numeric operations; it
# merely sets up the computational graph that we will later execute.
h = tf.matmul(x, w1)
# ReLU written as an elementwise maximum against zero.
h_relu = tf.maximum(h, tf.zeros(1))
y_pred = tf.matmul(h_relu, w2)

# Compute loss using operations on TensorFlow Tensors (sum of squared errors)
loss = tf.reduce_sum((y - y_pred) ** 2.0)

# Compute gradient of the loss with respect to w1 and w2.
grad_w1, grad_w2 = tf.gradients(loss, [w1, w2])

# Update the weights using gradient descent. To actually update the weights
# we need to evaluate new_w1 and new_w2 when executing the graph. Note that
# in TensorFlow the act of updating the value of the weights is part of
# the computational graph; in PyTorch this happens outside the computational
# graph.
learning_rate = 1e-6
new_w1 = w1.assign(w1 - learning_rate * grad_w1)
new_w2 = w2.assign(w2 - learning_rate * grad_w2)

# Now we have built our computational graph, so we enter a TensorFlow session to
# actually execute the graph.
with tf.Session() as sess:
  # Run the graph once to initialize the Variables w1 and w2.
  sess.run(tf.global_variables_initializer())

  # Create numpy arrays holding the actual data for the inputs x and targets y
  x_value = np.random.randn(N, D_in)
  y_value = np.random.randn(N, D_out)
  for _ in range(500):
    # Execute the graph many times. Each time it executes we want to bind
    # x_value to x and y_value to y, specified with the feed_dict argument.
    # Each time we execute the graph we want to compute the values for loss,
    # new_w1, and new_w2; the values of these Tensors are returned as numpy
    # arrays. Only the loss is kept; the two updated-weight results are
    # discarded into `_` (which also rebinds the unused loop variable).
    loss_value, _, _ = sess.run([loss, new_w1, new_w2],
                                feed_dict={x: x_value, y: y_value})
    print(loss_value)

Instructions for updating:
non-resource variables are not supported in the long term
36746828.0
37427600.0
41852824.0
41194744.0
31328228.0
17369508.0
7893663.0
3596880.0
1969379.4
1316112.0
998219.5
804908.75
667738.1
562249.3
478070.22
409398.94
352635.3
305343.94
265623.0
232039.17
203495.42
179122.66
158230.03
140221.11
124618.76
111066.24
99248.6
88892.2
79792.88
71803.87
64747.734
58496.11
52944.133
48000.844
43596.113
39656.805
36127.027
32957.582
30106.213
27536.127
25215.887
23117.84
21219.732
19497.906
17934.797
16513.098
15219.135
14039.068
12960.589
11975.982
11077.616
10255.603
9502.255
8811.008
8176.5137
7593.152
7056.631
6562.373
6106.8237
5686.4697
5298.3525
4939.795
4608.3027
4301.551
4017.4
3754.0723
3509.862
3283.358
3072.944
2877.472
2695.828
2526.8596
2369.5056
2222.9763
2086.4307
1959.137
1840.3256
1729.4548
1625.9266
1529.226
1438.7639
1354.1215
1274.9205
1200.7833
1131.339
1066.2633
1005.2842
948.09503
894.4601
844.1236
796.8481
752.4327
710.70416
671.4794
634.5

PyTorch: nn

In [None]:
# Code in file nn/two_layer_net_nn.py
import torch

device = torch.device('cpu')
# device = torch.device('cuda') # Uncomment this to run on GPU

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold inputs and outputs
x = torch.randn(N, D_in, device=device)
y = torch.randn(N, D_out, device=device)

# Use the nn package to define our model as a sequence of layers. nn.Sequential
# is a Module which contains other Modules, and applies them in sequence to
# produce its output. Each Linear Module computes output from input using a
# linear function, and holds internal Tensors for its weight and bias.
# After constructing the model we use the .to() method to move it to the
# desired device.
model = torch.nn.Sequential(
          torch.nn.Linear(D_in, H),
          torch.nn.ReLU(),
          torch.nn.Linear(H, D_out),
        ).to(device)

# The nn package also contains definitions of popular loss functions; in this
# case we will use Mean Squared Error (MSE) as our loss function. Setting
# reduction='sum' means that we are computing the *sum* of squared errors rather
# than the mean; this is for consistency with the examples above where we
# manually compute the loss, but in practice it is more common to use mean
# squared error as a loss by setting reduction='mean'.
loss_fn = torch.nn.MSELoss(reduction='sum')

learning_rate = 1e-4
for t in range(500):
  # Forward pass: compute predicted y by passing x to the model. Module objects
  # override the __call__ operator so you can call them like functions. When
  # doing so you pass a Tensor of input data to the Module and it produces
  # a Tensor of output data.
  y_pred = model(x)

  # Compute and print loss. We pass Tensors containing the predicted and true
  # values of y, and the loss function returns a Tensor containing the loss.
  loss = loss_fn(y_pred, y)
  print(t, loss.item())

  # Zero the gradients before running the backward pass.
  model.zero_grad()

  # Backward pass: compute gradient of the loss with respect to all the learnable
  # parameters of the model. Internally, the parameters of each Module are stored
  # in Tensors with requires_grad=True, so this call will compute gradients for
  # all learnable parameters in the model.
  loss.backward()

  # Update the weights using gradient descent. Each parameter is a Tensor, so
  # we can update it in place with its gradient like we did before. The update
  # runs under torch.no_grad() so it is not recorded in the computational
  # graph; the parameter is modified directly rather than through .data.
  with torch.no_grad():
    for param in model.parameters():
      param -= learning_rate * param.grad

0 723.328857421875
1 665.3406372070312
2 616.8114013671875
3 575.0645751953125
4 538.4151611328125
5 505.92498779296875
6 476.14300537109375
7 448.91864013671875
8 423.5191345214844
9 400.162353515625
10 378.4400939941406
11 358.0506591796875
12 338.9394836425781
13 320.8081359863281
14 303.6729431152344
15 287.35821533203125
16 271.7994384765625
17 256.917236328125
18 242.74771118164062
19 229.25833129882812
20 216.4097900390625
21 204.17379760742188
22 192.49496459960938
23 181.36318969726562
24 170.7843017578125
25 160.73829650878906
26 151.20437622070312
27 142.17727661132812
28 133.63644409179688
29 125.5472412109375
30 117.91654968261719
31 110.72642517089844
32 103.95173645019531
33 97.58236694335938
34 91.59589385986328
35 85.97016906738281
36 80.69068908691406
37 75.73841857910156
38 71.07051849365234
39 66.68684387207031
40 62.57154083251953
41 58.71418762207031
42 55.10502624511719
43 51.723365783691406
44 48.55567932128906
45 45.586090087890625
46 42.80587387084961
47 40.19

PyTorch: optim

In [None]:
# Code in file nn/two_layer_net_optim.py
import torch

# Problem sizes: N samples per batch, D_in input features,
# H hidden units and D_out output features.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and regression targets.
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# Two linear layers with a ReLU in between, plus a summed squared-error loss,
# both taken from the nn package.
model = torch.nn.Sequential(
  torch.nn.Linear(D_in, H),
  torch.nn.ReLU(),
  torch.nn.Linear(H, D_out),
)
loss_fn = torch.nn.MSELoss(reduction='sum')

# Rather than updating the weights by hand, hand the model's parameters to an
# Optimizer from the optim package. Adam is used here; the first constructor
# argument tells it which Tensors it is responsible for updating.
learning_rate = 1e-4
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
for t in range(500):
  # Forward pass through the model.
  y_pred = model(x)

  # Evaluate the loss and report its value for this step.
  loss = loss_fn(y_pred, y)
  print(t, loss.item())

  # Clear the gradients of every Tensor the optimizer manages (the learnable
  # weights of the model) before the backward pass.
  optimizer.zero_grad()

  # Backward pass: gradients of the loss with respect to the parameters.
  loss.backward()

  # One optimizer step updates all of the parameters it was given.
  optimizer.step()

0 702.3942260742188
1 684.8390502929688
2 667.8131103515625
3 651.3367919921875
4 635.3045654296875
5 619.7864990234375
6 604.7274780273438
7 590.101806640625
8 575.9267578125
9 562.240966796875
10 548.9314575195312
11 536.0692749023438
12 523.56689453125
13 511.43505859375
14 499.64971923828125
15 488.17987060546875
16 477.03741455078125
17 466.2320251464844
18 455.71337890625
19 445.4958801269531
20 435.5445251464844
21 425.8295593261719
22 416.3707275390625
23 407.1593017578125
24 398.134765625
25 389.37554931640625
26 380.8812561035156
27 372.6099853515625
28 364.52880859375
29 356.6188049316406
30 348.8829650878906
31 341.3634948730469
32 334.0481262207031
33 326.9001159667969
34 319.8994140625
35 313.0061340332031
36 306.2316589355469
37 299.58013916015625
38 293.09735107421875
39 286.7384338378906
40 280.48046875
41 274.3565673828125
42 268.3641357421875
43 262.46307373046875
44 256.67413330078125
45 251.00267028808594
46 245.44061279296875
47 239.99072265625
48 234.651382446289

PyTorch: Custom nn Modules

In [None]:
# Code in file nn/two_layer_net_module.py
import torch

class TwoLayerNet(torch.nn.Module):
  def __init__(self, D_in, H, D_out):
    """
    Build the two nn.Linear layers of the network (D_in -> H and H -> D_out)
    and store them as attributes of the module.
    """
    super().__init__()
    self.linear1 = torch.nn.Linear(D_in, H)
    self.linear2 = torch.nn.Linear(H, D_out)

  def forward(self, x):
    """
    Map a Tensor of input data to a Tensor of output data. The layers created
    in the constructor are combined with an ordinary Tensor operation: the
    output of the first layer is clamped at zero before the second layer.
    """
    hidden = torch.clamp(self.linear1(x), min=0)
    return self.linear2(hidden)

# Problem sizes: N samples per batch, D_in input features,
# H hidden units and D_out output features.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and regression targets.
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# Instantiate the custom Module defined above.
model = TwoLayerNet(D_in, H, D_out)

# Summed squared-error loss and a plain SGD optimizer. model.parameters()
# hands the optimizer the learnable parameters of the two nn.Linear members.
loss_fn = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4)
for t in range(500):
  # Forward pass through the custom model.
  y_pred = model(x)

  # Evaluate the loss and report its value for this step.
  loss = loss_fn(y_pred, y)
  print(t, loss.item())

  # Clear old gradients, backpropagate, then take one optimizer step.
  optimizer.zero_grad()
  loss.backward()
  optimizer.step()

0 643.819091796875
1 599.687255859375
2 560.9725952148438
3 526.6018676757812
4 495.7556457519531
5 467.88623046875
6 442.6900939941406
7 419.27117919921875
8 397.63714599609375
9 377.43939208984375
10 358.4281311035156
11 340.54150390625
12 323.5330810546875
13 307.39959716796875
14 292.02978515625
15 277.2976989746094
16 263.1766052246094
17 249.75746154785156
18 236.948974609375
19 224.7375030517578
20 213.08682250976562
21 201.95425415039062
22 191.31700134277344
23 181.15957641601562
24 171.436279296875
25 162.0989532470703
26 153.20748901367188
27 144.70875549316406
28 136.62063598632812
29 128.9405975341797
30 121.66677856445312
31 114.76039123535156
32 108.20635223388672
33 101.99005126953125
34 96.11670684814453
35 90.56095886230469
36 85.2976303100586
37 80.33368682861328
38 75.67613983154297
39 71.27505493164062
40 67.1375961303711
41 63.240657806396484
42 59.57960510253906
43 56.12895202636719
44 52.88474655151367
45 49.84014892578125
46 46.9820442199707
47 44.2975578308105

PyTorch: Control Flow + Weight Sharing

In [None]:
# Code in file nn/dynamic_net.py
import random
import torch

class DynamicNet(torch.nn.Module):
  def __init__(self, D_in, H, D_out):
    """
    Create the three nn.Linear layers used by the forward pass: an input layer
    (D_in -> H), a middle layer (H -> H) and an output layer (H -> D_out).
    """
    super().__init__()
    self.input_linear = torch.nn.Linear(D_in, H)
    self.middle_linear = torch.nn.Linear(H, H)
    self.output_linear = torch.nn.Linear(H, D_out)

  def forward(self, x):
    """
    Run the input layer, then apply the middle_linear Module a random number
    of times (0, 1, 2 or 3, drawn afresh on every call), then the output layer.
    Each hidden representation is clamped at zero.

    The computation graph is built dynamically on every forward pass, so
    ordinary Python control flow such as loops and conditionals can be used
    to define the model.

    Applying the same Module several times within one graph is perfectly
    safe. This is a big improvement from Lua Torch, where each Module could
    be used only once.
    """
    hidden = self.input_linear(x).clamp(min=0)
    remaining = random.randint(0, 3)
    while remaining > 0:
      # The same middle layer (and therefore the same weights) is reused.
      hidden = self.middle_linear(hidden).clamp(min=0)
      remaining -= 1
    return self.output_linear(hidden)


# Problem sizes: N samples per batch, D_in input features,
# H hidden units and D_out output features.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and regression targets.
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# Instantiate the dynamic model defined above.
model = DynamicNet(D_in, H, D_out)

# Summed squared-error loss and an SGD optimizer. This unusual model is hard
# to train with vanilla stochastic gradient descent, so momentum is enabled.
criterion = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4, momentum=0.9)
for t in range(500):
  # Forward pass through the model.
  y_pred = model(x)

  # Evaluate the loss and report its value for this step.
  loss = criterion(y_pred, y)
  print(t, loss.item())

  # Clear old gradients, backpropagate, then take one optimizer step.
  optimizer.zero_grad()
  loss.backward()
  optimizer.step()

0 629.9264526367188
1 584.5685424804688
2 510.84954833984375
3 584.625244140625
4 367.223876953125
5 308.50213623046875
6 254.86587524414062
7 578.38427734375
8 174.58868408203125
9 574.5233764648438
10 125.04602813720703
11 545.2431030273438
12 566.2965087890625
13 560.2883911132812
14 483.48681640625
15 538.585205078125
16 420.34722900390625
17 121.79531860351562
18 114.53099822998047
19 476.7185363769531
20 299.75146484375
21 270.26708984375
22 412.21954345703125
23 487.9044189453125
24 177.90216064453125
25 131.008056640625
26 281.44879150390625
27 130.5889129638672
28 234.35610961914062
29 103.15802764892578
30 284.6755065917969
31 171.997314453125
32 155.11770629882812
33 76.00270080566406
34 128.7870330810547
35 56.348228454589844
36 97.01957702636719
37 42.49695587158203
38 75.67996215820312
39 85.03910064697266
40 35.33122253417969
41 71.96217346191406
42 66.67933654785156
43 37.17229461669922
44 112.19108581542969
45 92.86408233642578
46 31.457950592041016
47 87.7069320678711