Tensors

Warm-up: numpy

In [2]:
# -*- coding: utf-8 -*-
import numpy as np

In [3]:
# Problem dimensions for the two-layer network:
#   N     - number of samples in the batch
#   D_in  - length of each input vector
#   H     - width of the hidden layer
#   D_out - length of each output vector
N = 64
D_in = 1000
H = 100
D_out = 10

In [4]:
# Draw a random batch from the standard normal distribution:
# inputs x with shape (N, D_in) and targets y with shape (N, D_out).
x = np.random.standard_normal(size=(N, D_in))
y = np.random.standard_normal(size=(N, D_out))

In [5]:
# Start both weight matrices from standard-normal noise:
# w1 maps input -> hidden (D_in, H), w2 maps hidden -> output (H, D_out).
w1 = np.random.standard_normal(size=(D_in, H))
w2 = np.random.standard_normal(size=(H, D_out))

In [6]:
learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear layer -> ReLU -> linear layer.
    h = np.dot(x, w1)
    h_relu = np.maximum(h, 0)
    y_pred = np.dot(h_relu, w2)

    # Sum-of-squared-errors loss; the residual is reused by the backward pass.
    residual = y_pred - y
    loss = np.square(residual).sum()
    print(t, loss)

    # Backward pass: apply the chain rule by hand, from the loss back to the
    # two weight matrices.
    grad_y_pred = 2.0 * residual
    grad_w2 = np.dot(h_relu.T, grad_y_pred)
    grad_h_relu = np.dot(grad_y_pred, w2.T)
    # ReLU passes no gradient where its input was negative.
    grad_h = np.where(h < 0, 0.0, grad_h_relu)
    grad_w1 = np.dot(x.T, grad_h)

    # Gradient-descent step, applied in place to both weight matrices.
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 38956746.569864005
1 32209686.75252662
2 24228914.0350871
3 15877890.182624102
4 9390301.733084235
5 5500074.000594686
6 3429778.419421806
7 2347416.4113436947
8 1740288.1416685295
9 1363861.3405861554
10 1105924.107644165
11 915907.3113529782
12 768880.6503582762
13 651888.1636764943
14 556774.7404462438
15 478433.55473524507
16 413231.1274069061
17 358514.60874253185
18 312282.424728881
19 273034.184990944
20 239515.28431043698
21 210783.82489751207
22 186056.75162117707
23 164650.63321717555
24 146060.69451449992
25 129853.09941936375
26 115686.55633238208
27 103267.38607898887
28 92375.10215935568
29 82773.67749056035
30 74291.48310559729
31 66778.14052885468
32 60109.62261424337
33 54182.053139679694
34 48926.53919496742
35 44237.96176475592
36 40050.63966261402
37 36299.572447005616
38 32936.56455236734
39 29915.734905231024
40 27198.579551215975
41 24750.632281321363
42 22542.59531729658
43 20548.134208887015
44 18746.00962203733
45 17115.095687602763
46 15638.901942616223
47 

382 4.369941987325113e-06
383 4.1191702356581006e-06
384 3.882908081977591e-06
385 3.6602344828104382e-06
386 3.4503465051284558e-06
387 3.2525655061379575e-06
388 3.0661535172582894e-06
389 2.8905212616744145e-06
390 2.724980963161972e-06
391 2.5689351273182524e-06
392 2.4218514742927367e-06
393 2.2832396741362797e-06
394 2.152617506915135e-06
395 2.029497657380395e-06
396 1.9134398831434113e-06
397 1.8040364237386476e-06
398 1.7009275994130918e-06
399 1.6037414696076764e-06
400 1.5121306062049893e-06
401 1.425765774686455e-06
402 1.3443573855289933e-06
403 1.2676160099195458e-06
404 1.1952712528876932e-06
405 1.1270809858011692e-06
406 1.0627857743820217e-06
407 1.0021777656231152e-06
408 9.450465464772124e-07
409 8.911758974326834e-07
410 8.403949376867669e-07
411 7.9251496816969e-07
412 7.473827687103561e-07
413 7.048194219864965e-07
414 6.646949497634621e-07
415 6.26868053547059e-07
416 5.911940637453822e-07
417 5.57562518244967e-07
418 5.258520143430821e-07
419 4.959486051848099e

PyTorch: Tensors

In [7]:
# -*- coding: utf-8 -*-
import torch

In [8]:
# Element type and placement used for every tensor created in this section.
dtype, device = torch.float32, torch.device("cpu")
# To run on the first CUDA GPU instead, replace the device with:
# device = torch.device("cuda:0")

In [9]:
# Network and batch sizes.
N = 64        # batch size
D_in = 1000   # input dimension
H = 100       # hidden dimension
D_out = 10    # output dimension

In [10]:
# Random standard-normal batch: inputs x are (N, D_in), targets y are (N, D_out).
x = torch.randn((N, D_in), dtype=dtype, device=device)
y = torch.randn((N, D_out), dtype=dtype, device=device)

In [11]:
# Standard-normal initial weights: w1 is (D_in, H) and w2 is (H, D_out).
w1 = torch.randn((D_in, H), dtype=dtype, device=device)
w2 = torch.randn((H, D_out), dtype=dtype, device=device)

In [12]:
learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear layer -> ReLU (clamp at zero) -> linear layer.
    h = torch.mm(x, w1)
    h_relu = torch.clamp(h, min=0)
    y_pred = torch.mm(h_relu, w2)

    # Sum-of-squared-errors loss, converted to a Python float for printing.
    residual = y_pred - y
    loss = residual.pow(2).sum().item()
    print(t, loss)

    # Backward pass written out by hand with the chain rule.
    grad_y_pred = 2.0 * residual
    grad_w2 = torch.mm(h_relu.t(), grad_y_pred)
    grad_h_relu = torch.mm(grad_y_pred, w2.t())
    # No gradient flows through ReLU where its input was negative.
    grad_h = grad_h_relu.masked_fill(h < 0, 0)
    grad_w1 = torch.mm(x.t(), grad_h)

    # In-place gradient-descent step on both weight matrices.
    w1.sub_(learning_rate * grad_w1)
    w2.sub_(learning_rate * grad_w2)

0 35037344.0
1 30013722.0
2 26158256.0
3 20697328.0
4 14416138.0
5 9037380.0
6 5431960.5
7 3330000.25
8 2177336.5
9 1535701.125
10 1158691.75
11 918837.75
12 753467.9375
13 631650.9375
14 537445.375
15 462089.625
16 400398.6875
17 349123.84375
18 306013.28125
19 269539.40625
20 238406.84375
21 211657.125
22 188549.125
23 168487.421875
24 150980.578125
25 135658.59375
26 122186.4453125
27 110322.359375
28 99836.09375
29 90526.8828125
30 82232.734375
31 74840.328125
32 68227.8671875
33 62296.9140625
34 56965.42578125
35 52164.33203125
36 47828.53125
37 43916.1796875
38 40379.5234375
39 37168.9140625
40 34250.12890625
41 31593.078125
42 29173.2734375
43 26965.13671875
44 24947.4765625
45 23101.013671875
46 21408.40234375
47 19855.486328125
48 18429.369140625
49 17117.390625
50 15910.314453125
51 14798.234375
52 13772.201171875
53 12825.2880859375
54 11952.359375
55 11146.0458984375
56 10399.7587890625
57 9708.3291015625
58 9067.462890625
59 8473.4130859375
60 7922.10400390625
61 7410.1918

385 0.0010982209350913763
386 0.0010588071309030056
387 0.0010231889318674803
388 0.0009895163821056485
389 0.0009552748524583876
390 0.0009240593062713742
391 0.0008950341725721955
392 0.0008656525751575828
393 0.0008377445628866553
394 0.0008103660074993968
395 0.0007850135443732142
396 0.0007593724294565618
397 0.0007350958767347038
398 0.0007129504811018705
399 0.000691111374180764
400 0.0006702313548885286
401 0.0006491619860753417
402 0.0006294778431765735
403 0.0006091604591347277
404 0.0005918012466281652
405 0.0005756137543357909
406 0.0005585511680692434
407 0.0005409617442637682
408 0.0005254005081951618
409 0.0005103641306050122
410 0.0004945972468703985
411 0.0004815948777832091
412 0.00046821613796055317
413 0.00045571260852739215
414 0.0004436075105331838
415 0.00043114591971971095
416 0.0004175901412963867
417 0.00040718630771152675
418 0.00039635534631088376
419 0.00038428520201705396
420 0.00037445040652528405
421 0.00036432623164728284
422 0.00035437417682260275
423 

PyTorch: Tensors and autograd

In [13]:
# -*- coding: utf-8 -*-
import torch

In [14]:
# All tensors below are 32-bit floats living on the CPU.
device = torch.device("cpu")
dtype = torch.float32
# GPU alternative (first CUDA device):
# device = torch.device("cuda:0")

In [15]:
# Sizes used throughout: batch size, input width, hidden width, output width.
N = 64
D_in = 1000
H = 100
D_out = 10

In [16]:
# Random Tensors holding the inputs (N, D_in) and the targets (N, D_out).
# requires_grad is left at its default of False, so autograd will not compute
# gradients with respect to these Tensors during the backward pass.
x = torch.randn((N, D_in), dtype=dtype, device=device)
y = torch.randn((N, D_out), dtype=dtype, device=device)

In [17]:
# Random Tensors for the two weight matrices.
# requires_grad=True asks autograd to compute gradients with respect to these
# Tensors during the backward pass.
w1 = torch.randn((D_in, H), dtype=dtype, device=device, requires_grad=True)
w2 = torch.randn((H, D_out), dtype=dtype, device=device, requires_grad=True)

In [18]:
learning_rate = 1e-6
for t in range(500):
    # Forward pass: the same Tensor operations as the hand-written version
    # (linear -> clamp at zero -> linear). The intermediate values are not
    # bound to names because autograd, not this code, performs the backward
    # pass.
    y_pred = torch.mm(torch.clamp(torch.mm(x, w1), min=0), w2)

    # Sum-of-squared-errors loss. sum() over all elements yields a
    # zero-dimensional Tensor; loss.item() extracts its value as a Python number.
    loss = (y_pred - y).pow(2).sum()
    print(t, loss.item())

    # Let autograd run the backward pass. Afterwards w1.grad and w2.grad hold
    # the gradient of the loss with respect to w1 and w2 respectively.
    loss.backward()

    # Plain gradient-descent step. The update runs under torch.no_grad()
    # because the weights have requires_grad=True and the update itself must
    # not be recorded by autograd. (torch.optim.SGD can do this step as well.)
    with torch.no_grad():
        w1.sub_(learning_rate * w1.grad)
        w2.sub_(learning_rate * w2.grad)

        # Gradients accumulate across backward() calls, so reset them by hand
        # once the weights have been updated.
        w1.grad.zero_()
        w2.grad.zero_()

0 31369276.0
1 31981216.0
2 36917224.0
3 39131140.0
4 33450786.0
5 21331460.0
6 10648509.0
7 4769433.5
8 2316887.5
9 1350306.5
10 935844.25
11 722342.6875
12 589178.0625
13 493631.71875
14 419585.09375
15 359877.59375
16 310793.375
17 270007.34375
18 235800.703125
19 206840.03125
20 182222.953125
21 161145.03125
22 143020.125
23 127354.2890625
24 113743.4609375
25 101872.6171875
26 91496.0234375
27 82392.09375
28 74394.28125
29 67331.8203125
30 61069.046875
31 55496.1328125
32 50521.69921875
33 46073.49609375
34 42083.71484375
35 38499.1484375
36 35271.95703125
37 32358.484375
38 29724.671875
39 27337.791015625
40 25172.046875
41 23204.701171875
42 21416.37109375
43 19787.3359375
44 18300.6484375
45 16947.212890625
46 15709.6796875
47 14575.14453125
48 13533.5986328125
49 12577.2265625
50 11697.5537109375
51 10887.5224609375
52 10140.9755859375
53 9452.099609375
54 8815.8408203125
55 8227.6240234375
56 7683.33642578125
57 7180.84912109375
58 6714.9951171875
59 6282.9951171875
60 5882.1

New Autograd functions

In [19]:
# -*- coding: utf-8 -*-
import torch


class MyReLU(torch.autograd.Function):
    """
    A hand-written ReLU implemented as a custom autograd Function.

    Custom Functions subclass torch.autograd.Function and provide static
    forward and backward methods that operate on Tensors. They are invoked
    through MyReLU.apply.
    """

    @staticmethod
    def forward(ctx, input):
        """
        Compute the output Tensor by clamping input at zero from below.

        ctx is a context object shared with backward; the input Tensor is
        stored on it with ctx.save_for_backward so that backward can tell
        where the input was negative.
        """
        ctx.save_for_backward(input)
        return torch.clamp(input, min=0)

    @staticmethod
    def backward(ctx, grad_output):
        """
        Given the gradient of the loss with respect to the output, return the
        gradient of the loss with respect to the input: grad_output with the
        entries zeroed wherever the saved input was negative.
        """
        (saved_input,) = ctx.saved_tensors
        # masked_fill returns a new Tensor, so grad_output itself is untouched.
        return grad_output.masked_fill(saved_input < 0, 0)

In [20]:
# Tensor settings for this section: float32 elements, stored on the CPU.
dtype = torch.float32
device = torch.device("cpu")
# Swap in the line below to use the first CUDA GPU instead:
# device = torch.device("cuda:0")

In [21]:
# Batch size (N), input width (D_in), hidden width (H) and output width (D_out).
N = 64
D_in = 1000
H = 100
D_out = 10

In [22]:
# Random inputs (N, D_in) and targets (N, D_out); autograd does not track these.
x = torch.randn((N, D_in), dtype=dtype, device=device)
y = torch.randn((N, D_out), dtype=dtype, device=device)

# Random weight matrices, marked so that autograd computes their gradients.
w1 = torch.randn((D_in, H), dtype=dtype, device=device).requires_grad_()
w2 = torch.randn((H, D_out), dtype=dtype, device=device).requires_grad_()

In [23]:
learning_rate = 1e-6

# A custom autograd Function is invoked through its apply method. The alias
# does not change between iterations, so it is bound once, before the loop,
# instead of being re-created on each of the 500 steps.
relu = MyReLU.apply

for t in range(500):
    # Forward pass: compute predicted y using Tensor operations; the ReLU
    # step goes through our custom autograd operation.
    y_pred = relu(x.mm(w1)).mm(w2)

    # Compute and print the sum-of-squared-errors loss.
    loss = (y_pred - y).pow(2).sum()
    print(t, loss.item())

    # Use autograd to compute the backward pass; this populates w1.grad and
    # w2.grad.
    loss.backward()

    # Update weights using gradient descent. The update runs under
    # torch.no_grad() so that it is not recorded by autograd.
    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad

        # Manually zero the gradients after updating weights, since
        # backward() accumulates into .grad.
        w1.grad.zero_()
        w2.grad.zero_()

0 28899014.0
1 22329404.0
2 21632238.0
3 22928274.0
4 23815748.0
5 22158440.0
6 17910940.0
7 12299582.0
8 7553762.0
9 4345823.5
10 2525171.25
11 1543969.75
12 1025549.9375
13 739061.375
14 570473.8125
15 462322.21875
16 387063.125
17 330746.40625
18 286368.65625
19 250209.796875
20 219984.875
21 194306.96875
22 172270.421875
23 153202.28125
24 136613.8125
25 122138.734375
26 109425.9140625
27 98240.2421875
28 88369.4375
29 79629.28125
30 71873.3515625
31 64971.34375
32 58816.6796875
33 53322.42578125
34 48404.53515625
35 43999.5546875
36 40043.0390625
37 36481.76953125
38 33269.35546875
39 30370.365234375
40 27749.763671875
41 25378.154296875
42 23228.984375
43 21279.2421875
44 19508.412109375
45 17898.58984375
46 16433.666015625
47 15099.208984375
48 13882.7099609375
49 12772.1044921875
50 11758.2158203125
51 10831.4150390625
52 9983.5703125
53 9207.6767578125
54 8496.576171875
55 7844.71630859375
56 7246.7294921875
57 6698.11279296875
58 6194.03857421875
59 5730.7587890625
60 5304.63

472 7.324118632823229e-05
473 7.165219722082838e-05
474 7.050712883938104e-05
475 6.930742529220879e-05
476 6.84030310367234e-05
477 6.775839574402198e-05
478 6.673327879980206e-05
479 6.589785334654152e-05
480 6.488296639872715e-05
481 6.381313141901046e-05
482 6.307488365564495e-05
483 6.202980875968933e-05
484 6.107716035330668e-05
485 6.00262574153021e-05
486 5.917789530940354e-05
487 5.8556703152135015e-05
488 5.767331094830297e-05
489 5.685546057065949e-05
490 5.5817443353589624e-05
491 5.4903957789065316e-05
492 5.4349147831089795e-05
493 5.373707244871184e-05
494 5.273582064546645e-05
495 5.194569894229062e-05
496 5.135536412126385e-05
497 5.068121390650049e-05
498 5.015898932470009e-05
499 4.951158189214766e-05
