<a href="https://colab.research.google.com/github/dkanzariya/AI-Notebooks/blob/master/pytorch_tutorial.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# PyTorch tutorial with comments

In [None]:
import time
import numpy as np

# Reference point for the wall-clock timing reported on the last line.
start_time = time.time()

# Problem size: N samples per batch, D_in input features,
# H hidden units, D_out output features.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and random regression targets.
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)

# Random starting point for the weights of both layers.
w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)

learning_rate = 1e-6
for t in range(500):
  # Forward pass: linear -> ReLU -> linear.
  h = np.dot(x, w1)
  h_relu = np.maximum(h, 0)
  y_pred = np.dot(h_relu, w2)

  # Sum-of-squared-errors loss, reported at every step.
  residual = y_pred - y
  loss = np.square(residual).sum()
  print(t, loss)

  # Backward pass: hand-derived gradients of the loss, walking the forward
  # computation in reverse. The ReLU passes gradient only where its input
  # was not negative.
  grad_y_pred = 2.0 * residual
  grad_w2 = np.dot(h_relu.T, grad_y_pred)
  grad_h_relu = np.dot(grad_y_pred, w2.T)
  grad_h = np.where(h < 0, 0.0, grad_h_relu)
  grad_w1 = np.dot(x.T, grad_h)

  # Plain gradient-descent step, applied in place.
  w1 -= learning_rate * grad_w1
  w2 -= learning_rate * grad_w2

elapsed = time.time() - start_time
print("--- %s seconds ---" % elapsed)

0 27424503.68954306
1 22796186.19930524
2 22290647.247438625
3 22529733.11563731
4 21567054.225588135
5 18304373.13435915
6 13681652.10933756
7 9023658.428783735
8 5552507.393473214
9 3326808.898340334
10 2052531.562645676
11 1336297.6591190365
12 932946.9798675845
13 694880.9440833789
14 545975.5995274353
15 445993.2676850022
16 374433.95773795166
17 320287.39727445255
18 277474.34489664895
19 242629.280256963
20 213599.97373665933
21 188986.047554332
22 167935.79617007222
23 149793.89436627546
24 134049.44921262594
25 120305.46964135017
26 108253.22933253362
27 97651.48915711534
28 88286.26755746198
29 79981.57677789562
30 72593.58150096031
31 66004.7406061747
32 60115.53014929258
33 54839.55511576931
34 50096.95509351269
35 45828.28474747749
36 41979.89629635101
37 38501.02151508067
38 35350.717483951295
39 32497.408922039216
40 29904.987916647893
41 27547.529343494556
42 25399.844423457558
43 23441.491564363452
44 21652.35338723719
45 20016.443612119598
46 18517.43304781964
47 1714

In [None]:
import torch
import time

start_time = time.time()
# Run on the GPU when one is available and fall back to the CPU otherwise, so
# this cell works on any machine instead of failing without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random input and output data
x = torch.randn(N, D_in, device=device)
y = torch.randn(N, D_out, device=device)

# Randomly initialize weights
w1 = torch.randn(D_in, H, device=device)
w2 = torch.randn(H, D_out, device=device)

learning_rate = 1e-6
for t in range(500):
  # Forward pass: compute predicted y
  h = x.mm(w1)
  h_relu = h.clamp(min=0)
  y_pred = h_relu.mm(w2)

  # Compute and print loss; loss is a scalar, and is stored in a PyTorch Tensor
  # of shape (); we can get its value as a Python number with loss.item().
  loss = (y_pred - y).pow(2).sum()
  print(t, loss.item())

  # Backprop to compute gradients of w1 and w2 with respect to loss
  grad_y_pred = 2.0 * (y_pred - y)
  grad_w2 = h_relu.t().mm(grad_y_pred)
  grad_h_relu = grad_y_pred.mm(w2.t())
  # The ReLU only passes gradient where its input was not negative.
  grad_h = grad_h_relu.clone()
  grad_h[h < 0] = 0
  grad_w1 = x.t().mm(grad_h)

  # Update weights using gradient descent
  w1 -= learning_rate * grad_w1
  w2 -= learning_rate * grad_w2

# CUDA kernels are launched asynchronously; wait for the last weight update to
# finish so the elapsed time below covers all of the work.
if device.type == 'cuda':
  torch.cuda.synchronize()
print("--- %s seconds ---" % (time.time() - start_time))

0 27960376.0
1 25626336.0
2 31732964.0
3 43080680.0
4 52237240.0
5 47865232.0
6 29363112.0
7 12200625.0
8 4313265.5
9 1854965.25
10 1118208.5
11 839636.125
12 688886.5
13 583534.0625
14 500954.5625
15 433426.75
16 377231.0
17 329972.1875
18 289927.90625
19 255771.671875
20 226506.734375
21 201320.75
22 179554.28125
23 160622.0625
24 144106.015625
25 129633.03125
26 116908.4609375
27 105680.4765625
28 95728.6875
29 86879.328125
30 78989.25
31 71939.265625
32 65624.796875
33 59952.90625
34 54855.1015625
35 50259.6171875
36 46108.73828125
37 42355.3359375
38 38950.21875
39 35856.796875
40 33044.18359375
41 30480.90234375
42 28141.421875
43 26004.26171875
44 24050.671875
45 22263.41015625
46 20624.4609375
47 19120.42578125
48 17738.744140625
49 16467.451171875
50 15296.638671875
51 14218.0732421875
52 13223.6396484375
53 12306.06640625
54 11458.646484375
55 10675.623046875
56 9951.4765625
57 9281.259765625
58 8660.35546875
59 8085.18359375
60 7553.8466796875
61 7060.806640625
62 6603.51074

In [None]:
import torch

# Run on the GPU when one is available and fall back to the CPU otherwise, so
# this cell works on any machine instead of failing without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold input and outputs
x = torch.randn(N, D_in, device=device)
y = torch.randn(N, D_out, device=device)

# Create random Tensors for weights; setting requires_grad=True means that we
# want to compute gradients for these Tensors during the backward pass.
w1 = torch.randn(D_in, H, device=device, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, requires_grad=True)

learning_rate = 1e-6
for t in range(500):
  # Forward pass: compute predicted y using operations on Tensors. Since w1 and
  # w2 have requires_grad=True, operations involving these Tensors will cause
  # PyTorch to build a computational graph, allowing automatic computation of
  # gradients. Since we are no longer implementing the backward pass by hand we
  # don't need to keep references to intermediate values.
  y_pred = x.mm(w1).clamp(min=0).mm(w2)

  # Compute and print loss. Loss is a Tensor of shape (), and loss.item()
  # is a Python number giving its value.
  loss = (y_pred - y).pow(2).sum()
  print(t, loss.item())

  # Use autograd to compute the backward pass. This call will compute the
  # gradient of loss with respect to all Tensors with requires_grad=True.
  # After this call w1.grad and w2.grad will be Tensors holding the gradient
  # of the loss with respect to w1 and w2 respectively.
  loss.backward()

  # Update weights using gradient descent. For this step we just want to mutate
  # the values of w1 and w2 in-place; we don't want to build up a computational
  # graph for the update steps, so we use the torch.no_grad() context manager
  # to prevent PyTorch from building a computational graph for the updates
  with torch.no_grad():
    w1 -= learning_rate * w1.grad
    w2 -= learning_rate * w2.grad

    # Manually zero the gradients after running the backward pass; backward()
    # accumulates into .grad, so stale values would leak into the next step.
    w1.grad.zero_()
    w2.grad.zero_()


0 28808532.0
1 28853480.0
2 32873520.0
3 35459864.0
4 32038916.0
5 22263456.0
6 12282948.0
7 5912540.0
8 2965759.5
9 1723818.75
10 1181984.5
11 907094.375
12 740251.5625
13 622973.0
14 532763.0625
15 459982.5
16 399810.65625
17 349395.84375
18 306816.9375
19 270553.84375
20 239518.1875
21 212782.515625
22 189641.890625
23 169572.21875
24 152110.59375
25 136762.375
26 123273.140625
27 111384.078125
28 100857.109375
29 91523.234375
30 83223.5
31 75823.609375
32 69202.59375
33 63267.5703125
34 57934.8203125
35 53135.9375
36 48815.09375
37 44906.5859375
38 41366.5390625
39 38151.8828125
40 35229.02734375
41 32567.04296875
42 30140.28515625
43 27922.708984375
44 25893.830078125
45 24034.01171875
46 22328.13671875
47 20763.24609375
48 19324.8203125
49 18000.74609375
50 16779.49609375
51 15652.2265625
52 14610.40625
53 13647.171875
54 12754.78125
55 11927.78125
56 11160.904296875
57 10449.6328125
58 9789.3310546875
59 9175.58203125
60 8604.751953125
61 8073.23779296875
62 7578.18359375
63 711

In [None]:
# Code in file autograd/two_layer_net_custom_function.py
import torch

class MyReLU(torch.autograd.Function):
  """
  A hand-written ReLU for autograd: subclassing torch.autograd.Function and
  supplying static forward and backward methods that work on Tensors is all
  that is needed to define a custom differentiable operation.
  """
  @staticmethod
  def forward(ctx, x):
    """
    Compute the output Tensor from the input Tensor x. The context object ctx
    is used to stash x so the backward pass can look at it later.
    """
    ctx.save_for_backward(x)
    return torch.clamp(x, min=0)

  @staticmethod
  def backward(ctx, grad_output):
    """
    Given the gradient of the loss with respect to the forward output, return
    the gradient of the loss with respect to the forward input. The input
    saved on ctx decides where the gradient is let through.
    """
    (saved_input,) = ctx.saved_tensors
    # Gradient is blocked wherever the input was negative and passed through
    # unchanged everywhere else; masked_fill returns a new Tensor.
    return grad_output.masked_fill(saved_input < 0, 0)


# Where the Tensors live; swap the two lines to train on a GPU instead.
device = torch.device('cpu')
# device = torch.device('cuda') # Uncomment this to run on GPU

# Problem size: N samples per batch, D_in input features,
# H hidden units, D_out output features.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and random regression targets.
x = torch.randn(N, D_in, device=device)
y = torch.randn(N, D_out, device=device)

# Random starting weights, tracked by autograd.
w1 = torch.randn(D_in, H, device=device, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, requires_grad=True)

learning_rate = 1e-6
for t in range(500):
  # Forward pass; the non-linearity is our own MyReLU, invoked through
  # the Function's apply entry point.
  hidden = x.mm(w1)
  y_pred = MyReLU.apply(hidden).mm(w2)

  # Sum-of-squared-errors loss, reported at every step.
  loss = torch.sum((y_pred - y) ** 2)
  print(t, loss.item())

  # Let autograd fill in w1.grad and w2.grad.
  loss.backward()

  with torch.no_grad():
    for weight in (w1, w2):
      # Gradient-descent step, then clear the gradient so the next
      # backward pass starts from zero.
      weight -= learning_rate * weight.grad
      weight.grad.zero_()



0 32271960.0
1 25066348.0
2 21325046.0
3 17906532.0
4 14105932.0
5 10297919.0
6 7092155.5
7 4740865.0
8 3178338.75
9 2184725.75
10 1561216.75
11 1163023.875
12 900449.625
13 719737.1875
14 590081.5
15 492998.46875
16 417776.25
17 357897.40625
18 309063.9375
19 268452.75
20 234397.421875
21 205536.890625
22 180873.59375
23 159682.046875
24 141363.015625
25 125457.578125
26 111607.046875
27 99501.484375
28 88905.15625
29 79587.859375
30 71375.90625
31 64119.83984375
32 57669.015625
33 51942.640625
34 46851.6953125
35 42320.0078125
36 38282.05078125
37 34673.12890625
38 31445.240234375
39 28549.654296875
40 25950.248046875
41 23619.654296875
42 21520.70703125
43 19627.96875
44 17919.224609375
45 16374.98046875
46 14978.576171875
47 13714.033203125
48 12567.388671875
49 11526.396484375
50 10580.1142578125
51 9719.0703125
52 8935.1064453125
53 8220.6962890625
54 7568.78125
55 6973.53369140625
56 6429.53564453125
57 5932.03173828125
58 5476.916015625
59 5059.87548828125
60 4677.52099609375
6

In [None]:
import numpy as np
import tensorflow.compat.v1 as tf
# Turn off TF2 behavior; the placeholder / Session API used below is the
# TF1-style graph API exposed through tensorflow.compat.v1.
tf.disable_v2_behavior()

# First we set up the computational graph:

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create placeholders for the input and target data; these will be filled
# with real data when we execute the graph. The leading dimension is None,
# so it is not fixed when the graph is built.
x = tf.placeholder(tf.float32, shape=(None, D_in))
y = tf.placeholder(tf.float32, shape=(None, D_out))

# Create Variables for the weights and initialize them with random data.
# A TensorFlow Variable persists its value across executions of the graph.
w1 = tf.Variable(tf.random_normal((D_in, H)))
w2 = tf.Variable(tf.random_normal((H, D_out)))

# Forward pass: Compute the predicted y using operations on TensorFlow Tensors.
# Note that this code does not actually perform any numeric operations; it
# merely sets up the computational graph that we will later execute.
h = tf.matmul(x, w1)
# Elementwise maximum against a one-element zero Tensor acts as the ReLU.
h_relu = tf.maximum(h, tf.zeros(1))
y_pred = tf.matmul(h_relu, w2)

# Compute loss using operations on TensorFlow Tensors
loss = tf.reduce_sum((y - y_pred) ** 2.0)

# Compute gradient of the loss with respect to w1 and w2.
grad_w1, grad_w2 = tf.gradients(loss, [w1, w2])

# Update the weights using gradient descent. To actually update the weights
# we need to evaluate new_w1 and new_w2 when executing the graph. Note that
# in TensorFlow the act of updating the value of the weights is part of
# the computational graph; in PyTorch this happens outside the computational
# graph.
learning_rate = 1e-6
new_w1 = w1.assign(w1 - learning_rate * grad_w1)
new_w2 = w2.assign(w2 - learning_rate * grad_w2)

# Now we have built our computational graph, so we enter a TensorFlow session to
# actually execute the graph.
with tf.Session() as sess:
  # Run the graph once to initialize the Variables w1 and w2.
  sess.run(tf.global_variables_initializer())

  # Create numpy arrays holding the actual data for the inputs x and targets y
  x_value = np.random.randn(N, D_in)
  y_value = np.random.randn(N, D_out)
  for _ in range(500):
    # Execute the graph many times. Each time it executes we want to bind
    # x_value to x and y_value to y, specified with the feed_dict argument.
    # Each time we execute the graph we want to compute the values for loss,
    # new_w1, and new_w2; the values of these Tensors are returned as numpy
    # arrays. Only the loss is kept; the two updated-weight arrays are
    # requested so that the assignments run, and are then discarded.
    loss_value, _, _ = sess.run([loss, new_w1, new_w2],
                                feed_dict={x: x_value, y: y_value})
    print(loss_value)


Instructions for updating:
non-resource variables are not supported in the long term
24055768.0
18145598.0
15820517.0
14731917.0
13695217.0
12229802.0
10228829.0
8027734.0
5943121.0
4237078.0
2954795.8
2060482.6
1453772.6
1051056.2
782081.94
600458.5
474856.03
385727.78
320295.3
270790.5
232190.31
201299.7
176022.03
154953.36
137171.42
121965.125
108824.88
97392.836
87405.11
78631.086
70888.43
64023.71
57924.88
52491.254
47638.367
43294.02
39397.957
35897.4
32745.914
29906.727
27347.195
25031.816
22936.73
21036.992
19313.191
17745.256
16322.555
15026.805
13845.873
12768.758
11783.783
10882.549
10057.24
9300.641
8606.568
7969.7275
7384.207
6845.995
6350.6016
5894.5635
5474.3765
5086.777
4729.031
4398.752
4093.3494
3810.902
3549.4983
3307.4758
3083.204
2875.3354
2682.7002
2503.8123
2337.7615
2183.4202
2040.087
1906.732
1782.6833
1667.2246
1559.7977
1459.667
1366.3677
1279.386
1198.3021
1122.6503
1052.0425
986.1434
924.6432
867.1301
813.3966
763.16125
716.2132
672.27094
631.1687
592.68414

In [None]:
# Code in file nn/two_layer_net_nn.py
import torch

device = torch.device('cpu')
# device = torch.device('cuda') # Uncomment this to run on GPU

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold inputs and outputs
x = torch.randn(N, D_in, device=device)
y = torch.randn(N, D_out, device=device)

# Use the nn package to define our model as a sequence of layers. nn.Sequential
# is a Module which contains other Modules, and applies them in sequence to
# produce its output. Each Linear Module computes output from input using a
# linear function, and holds internal Tensors for its weight and bias.
# After constructing the model we use the .to() method to move it to the
# desired device.
model = torch.nn.Sequential(
          torch.nn.Linear(D_in, H),
          torch.nn.ReLU(),
          torch.nn.Linear(H, D_out),
        ).to(device)

# The nn package also contains definitions of popular loss functions; in this
# case we will use Mean Squared Error (MSE) as our loss function. Setting
# reduction='sum' means that we are computing the *sum* of squared errors rather
# than the mean; this is for consistency with the examples above where we
# manually compute the loss, but in practice it is more common to use mean
# squared error as a loss by setting reduction='mean'.
loss_fn = torch.nn.MSELoss(reduction='sum')

learning_rate = 1e-4
for t in range(500):
  # Forward pass: compute predicted y by passing x to the model. Module objects
  # override the __call__ operator so you can call them like functions. When
  # doing so you pass a Tensor of input data to the Module and it produces
  # a Tensor of output data.
  y_pred = model(x)

  # Compute and print loss. We pass Tensors containing the predicted and true
  # values of y, and the loss function returns a Tensor containing the loss.
  loss = loss_fn(y_pred, y)
  print(t, loss.item())

  # Zero the gradients before running the backward pass.
  model.zero_grad()

  # Backward pass: compute gradient of the loss with respect to all the learnable
  # parameters of the model. Internally, the parameters of each Module are stored
  # in Tensors with requires_grad=True, so this call will compute gradients for
  # all learnable parameters in the model.
  loss.backward()

  # Update the weights using gradient descent. Each parameter is a Tensor, so
  # we can update it in place like we did before; torch.no_grad() keeps the
  # update itself out of the computational graph, so there is no need to go
  # through param.data.
  with torch.no_grad():
    for param in model.parameters():
      param -= learning_rate * param.grad


0 648.663818359375
1 602.0015258789062
2 560.98779296875
3 524.5716552734375
4 492.0588684082031
5 462.7075500488281
6 435.7606201171875
7 411.0263671875
8 388.2406921386719
9 366.93011474609375
10 346.8517761230469
11 327.9144287109375
12 310.1330871582031
13 293.2542724609375
14 277.2796936035156
15 262.0508117675781
16 247.5782470703125
17 233.80870056152344
18 220.71478271484375
19 208.28204345703125
20 196.43655395507812
21 185.15931701660156
22 174.45498657226562
23 164.31695556640625
24 154.71592712402344
25 145.58160400390625
26 136.92271423339844
27 128.73655700683594
28 121.0045394897461
29 113.7004623413086
30 106.8165512084961
31 100.33067321777344
32 94.22438049316406
33 88.47716522216797
34 83.07918548583984
35 78.01973724365234
36 73.27088165283203
37 68.81683349609375
38 64.63436889648438
39 60.71023178100586
40 57.03742218017578
41 53.597267150878906
42 50.37464141845703
43 47.3487663269043
44 44.51903533935547
45 41.87025833129883
46 39.392127990722656
47 37.076076507

In [None]:
# Code in file nn/two_layer_net_optim.py
import torch

# Problem size: N samples per batch, D_in input features,
# H hidden units, D_out output features.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and random regression targets.
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# Model: linear -> ReLU -> linear, assembled from nn building blocks.
layers = [
  torch.nn.Linear(D_in, H),
  torch.nn.ReLU(),
  torch.nn.Linear(H, D_out),
]
model = torch.nn.Sequential(*layers)

# Loss: sum (not mean) of squared errors.
loss_fn = torch.nn.MSELoss(reduction='sum')

# The optim package supplies ready-made update rules; Adam is used here, and
# many others are available. The optimizer is handed the Tensors it is
# responsible for updating, i.e. the model's parameters.
learning_rate = 1e-4
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)
for t in range(500):
  # Forward pass through the model.
  y_pred = model(x)

  # Loss for this step, reported as a plain Python number.
  loss = loss_fn(y_pred, y)
  loss_value = loss.item()
  print(t, loss_value)

  # Clear the gradients of every Tensor the optimizer manages (the learnable
  # weights of the model) before computing new ones.
  optimizer.zero_grad()

  # Backward pass: gradients of the loss with respect to the parameters.
  loss.backward()

  # Let the optimizer apply one update to its parameters.
  optimizer.step()


0 672.701904296875
1 655.5008544921875
2 638.7713012695312
3 622.5156860351562
4 606.7432861328125
5 591.3767700195312
6 576.5172119140625
7 562.1378173828125
8 548.2099609375
9 534.6563720703125
10 521.5331420898438
11 508.85589599609375
12 496.5869445800781
13 484.68341064453125
14 473.1648254394531
15 461.9825439453125
16 451.0685119628906
17 440.3684997558594
18 429.89447021484375
19 419.7164306640625
20 409.9297790527344
21 400.4506530761719
22 391.3001708984375
23 382.42352294921875
24 373.75079345703125
25 365.2677307128906
26 356.9698791503906
27 348.86553955078125
28 340.9283447265625
29 333.197509765625
30 325.63446044921875
31 318.20428466796875
32 310.9620056152344
33 303.875244140625
34 296.94696044921875
35 290.20416259765625
36 283.6206970214844
37 277.1614074707031
38 270.817138671875
39 264.62249755859375
40 258.53271484375
41 252.5489044189453
42 246.6917724609375
43 240.9436492919922
44 235.31121826171875
45 229.82797241210938
46 224.44276428222656
47 219.15188598632

In [None]:
# Code in file nn/two_layer_net_module.py
import torch

class TwoLayerNet(torch.nn.Module):
  """Two nn.Linear layers with a clamp-at-zero non-linearity in between."""

  def __init__(self, D_in, H, D_out):
    """
    Build the two nn.Linear modules and store them as attributes so that the
    Module registers them.
    """
    super().__init__()
    self.linear1 = torch.nn.Linear(D_in, H)
    self.linear2 = torch.nn.Linear(H, D_out)

  def forward(self, x):
    """
    Map a Tensor of input data to a Tensor of output data. Any Modules created
    in the constructor may be used here, along with arbitrary (differentiable)
    Tensor operations.
    """
    hidden = self.linear1(x)
    activated = torch.clamp(hidden, min=0)
    return self.linear2(activated)

# Problem size: N samples per batch, D_in input features,
# H hidden units, D_out output features.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and random regression targets.
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# Instantiate the Module subclass defined above.
model = TwoLayerNet(D_in=D_in, H=H, D_out=D_out)

# Sum-of-squared-errors loss and a plain SGD optimizer. model.parameters()
# hands the optimizer the learnable parameters of the two nn.Linear members.
loss_fn = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.SGD(params=model.parameters(), lr=1e-4)
for t in range(500):
  # Forward pass through the model.
  y_pred = model(x)

  # Loss for this step, reported as a plain Python number.
  loss = loss_fn(y_pred, y)
  loss_value = loss.item()
  print(t, loss_value)

  # Clear old gradients, backpropagate, then take one optimizer step.
  optimizer.zero_grad()
  loss.backward()
  optimizer.step()



0 686.425048828125
1 637.8709716796875
2 595.5647583007812
3 558.3999633789062
4 525.0234985351562
5 494.70196533203125
6 467.11773681640625
7 441.477783203125
8 417.7107849121094
9 395.42681884765625
10 374.4598693847656
11 354.8149108886719
12 336.1886901855469
13 318.4610595703125
14 301.67523193359375
15 285.6275329589844
16 270.2740173339844
17 255.5702362060547
18 241.58978271484375
19 228.23941040039062
20 215.54721069335938
21 203.4690704345703
22 191.9642333984375
23 181.03236389160156
24 170.6521453857422
25 160.758056640625
26 151.3758544921875
27 142.50108337402344
28 134.10289001464844
29 126.12176513671875
30 118.57012939453125
31 111.43146514892578
32 104.66985321044922
33 98.28768920898438
34 92.25708770751953
35 86.58206176757812
36 81.25836181640625
37 76.27629089355469
38 71.5999526977539
39 67.21029663085938
40 63.09667205810547
41 59.25386428833008
42 55.6488151550293
43 52.26586151123047
44 49.08864212036133
45 46.112117767333984
46 43.32244873046875
47 40.7096099

In [None]:
# Code in file nn/dynamic_net.py
import random
import torch

class DynamicNet(torch.nn.Module):
  """Fully connected net whose depth is drawn at random on every forward call."""

  def __init__(self, D_in, H, D_out):
    """
    Create the three nn.Linear instances used by the forward pass: an input
    layer, a hidden-to-hidden layer and an output layer.
    """
    super().__init__()
    self.input_linear = torch.nn.Linear(D_in, H)
    self.middle_linear = torch.nn.Linear(H, H)
    self.output_linear = torch.nn.Linear(H, D_out)

  def forward(self, x):
    """
    Draw a number from 0 to 3 (inclusive) and apply middle_linear that many
    times between the input and output layers.

    The computation graph is rebuilt on each forward pass, so ordinary Python
    control flow (loops, conditionals) can shape the model.

    Applying the same Module several times within one graph is safe. This is
    a big improvement over Lua Torch, where each Module could be used only
    once.
    """
    hidden = torch.clamp(self.input_linear(x), min=0)
    remaining = random.randint(0, 3)
    while remaining > 0:
      hidden = torch.clamp(self.middle_linear(hidden), min=0)
      remaining -= 1
    return self.output_linear(hidden)


# Problem size: N samples per batch, D_in input features,
# H hidden units, D_out output features.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and random regression targets.
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# Instantiate the Module subclass defined above.
model = DynamicNet(D_in=D_in, H=H, D_out=D_out)

# Sum-of-squared-errors loss. Vanilla stochastic gradient descent struggles
# with this strange model, so momentum is switched on.
criterion = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.SGD(params=model.parameters(), lr=1e-4, momentum=0.9)
for t in range(500):
  # Forward pass through the model.
  y_pred = model(x)

  # Loss for this step, reported as a plain Python number.
  loss = criterion(y_pred, y)
  loss_value = loss.item()
  print(t, loss_value)

  # Clear old gradients, backpropagate, then take one optimizer step.
  optimizer.zero_grad()
  loss.backward()
  optimizer.step()


0 638.4155883789062
1 637.5550537109375
2 671.3807373046875
3 611.5127563476562
4 633.3062744140625
5 543.3452758789062
6 483.456298828125
7 568.3768310546875
8 357.78594970703125
9 300.0804443359375
10 620.122314453125
11 529.1466064453125
12 626.7066650390625
13 148.12681579589844
14 124.63733673095703
15 100.6673812866211
16 446.47235107421875
17 71.17749786376953
18 395.25048828125
19 72.82600402832031
20 601.0855102539062
21 308.0509338378906
22 513.1746215820312
23 558.686279296875
24 216.6508331298828
25 407.7015380859375
26 168.76882934570312
27 338.3005676269531
28 307.76361083984375
29 149.88157653808594
30 159.98379516601562
31 137.91024780273438
32 306.4300842285156
33 97.96807098388672
34 86.84786987304688
35 161.40643310546875
36 62.953975677490234
37 64.7838134765625
38 109.91879272460938
39 116.98856353759766
40 90.85096740722656
41 83.77637481689453
42 70.49765014648438
43 50.698734283447266
44 51.126075744628906
45 39.4654541015625
46 84.21495056152344
47 103.39818572