In [1]:
%matplotlib inline


PyTorch: Tensors and autograd
-------------------------------

A fully-connected ReLU network with one hidden layer and no biases, trained to
predict y from x by minimizing squared Euclidean distance.

This implementation computes the forward pass using operations on PyTorch
Tensors, and uses **PyTorch autograd** to compute gradients.


A PyTorch Tensor represents a node in a computational graph. 

If ``x`` is a Tensor that has ``x.requires_grad=True`` 

then ``x.grad`` is another Tensor holding the gradient of some scalar value (for example, the loss) with respect to ``x``.



In [2]:
import torch

# Every tensor created in this notebook uses the device and dtype chosen here.
device = torch.device("cpu")
# device = torch.device("cuda:0")  # Uncomment this to run on GPU
dtype = torch.float32  # 32-bit floating point; torch.float is an alias for it


In [5]:
# Problem dimensions.
N = 64        # batch size
D_in = 1000   # input dimension
H = 100       # hidden dimension
D_out = 10    # output dimension

# Random input batch and the random targets the network is fitted to.
# These are fixed data, not learned parameters, so requires_grad is False
# (which is also the default): no gradients are computed with respect to
# them during the backward pass.
x = torch.randn((N, D_in), dtype=dtype, device=device, requires_grad=False)
y = torch.randn((N, D_out), dtype=dtype, device=device, requires_grad=False)

# Randomly initialised weight matrices.
# w1 and w2 are the parameters to learn, so they are created with
# requires_grad=True: we want gradients with respect to these Tensors
# during the backward pass.
w1 = torch.randn((D_in, H), dtype=dtype, device=device, requires_grad=True)
w2 = torch.randn((H, D_out), dtype=dtype, device=device, requires_grad=True)

# Show the shape of every tensor, one per line.
print(x.shape, y.shape, w1.shape, w2.shape, sep="\n")

torch.Size([64, 1000])
torch.Size([64, 10])
torch.Size([1000, 100])
torch.Size([100, 10])


In [6]:
learning_rate = 1e-6

for t in range(500):
    # Forward pass, written with ordinary Tensor operations:
    # linear layer -> ReLU (clamp at zero) -> linear layer.
    # The backward pass is not implemented by hand, so there is no need to
    # keep references to the intermediate values (hidden activations, ReLU
    # output).
    y_pred = torch.mm(torch.clamp(torch.mm(x, w1), min=0), w2)

    # Sum of squared errors between prediction and target.
    # The result is a scalar (0-dimensional) Tensor; loss.item() extracts the
    # value it holds as a plain Python number.
    loss = torch.sum((y_pred - y) ** 2)

    print(t, loss.item())

    # Backward pass via autograd: computes the gradient of loss with respect
    # to all Tensors with requires_grad=True. Afterwards w1.grad and w2.grad
    # hold the gradient of the loss with respect to w1 and w2 respectively.
    loss.backward()

    # Plain gradient-descent step, done by hand.
    # The weights have requires_grad=True, but the update itself must not be
    # tracked by autograd, hence the torch.no_grad() context.
    # Alternatives: operate on weight.data / weight.grad.data (tensors that
    # share storage but do not track history), or use torch.optim.SGD.
    with torch.no_grad():
        for weight in (w1, w2):
            weight.sub_(learning_rate * weight.grad)

            # backward() accumulates (adds) into .grad instead of
            # overwriting it, so the gradient is reset to zero here;
            # otherwise the next iteration's gradient would be summed with
            # this one.
            weight.grad.zero_()

0 46927512.0
1 60032444.0
2 76812616.0
3 69671048.0
4 35941444.0
5 10474588.0
6 3325159.25
7 1904509.125
8 1461474.0
9 1201787.0
10 1007274.625
11 852703.5
12 727583.125
13 625124.375
14 540337.4375
15 469663.5
16 410427.21875
17 360341.5
18 317774.9375
19 281407.65625
20 250136.15625
21 223125.359375
22 199669.0
23 179207.0625
24 161281.421875
25 145516.25
26 131615.984375
27 119318.484375
28 108397.453125
29 98673.59375
30 89997.3046875
31 82232.9609375
32 75268.4609375
33 69004.734375
34 63357.2890625
35 58255.51171875
36 53642.1171875
37 49458.63671875
38 45659.48828125
39 42206.44140625
40 39060.51953125
41 36192.4296875
42 33572.06640625
43 31173.400390625
44 28975.78515625
45 26958.248046875
46 25103.654296875
47 23397.26171875
48 21824.580078125
49 20373.853515625
50 19033.31640625
51 17793.849609375
52 16647.544921875
53 15585.228515625
54 14599.814453125
55 13685.3408203125
56 12837.3388671875
57 12048.8193359375
58 11314.486328125
59 10631.3046875
60 9995.5732421875
61 9401.

387 0.013214805163443089
388 0.012761476449668407
389 0.01232107076793909
390 0.011899544857442379
391 0.01149759441614151
392 0.011105232872068882
393 0.010726908221840858
394 0.010365379974246025
395 0.010015957988798618
396 0.009675554931163788
397 0.0093547273427248
398 0.009034995920956135
399 0.00873071514070034
400 0.008439025841653347
401 0.0081570940092206
402 0.00788429006934166
403 0.007625418249517679
404 0.00737238023430109
405 0.007127926219254732
406 0.00689314492046833
407 0.006669469177722931
408 0.006445478182286024
409 0.006234436761587858
410 0.006030440330505371
411 0.005836357828229666
412 0.005644408520311117
413 0.0054638502188026905
414 0.0052838376723229885
415 0.005116951651871204
416 0.004951616283506155
417 0.004790734965354204
418 0.004640566185116768
419 0.004493196494877338
420 0.00435195118188858
421 0.004212355241179466
422 0.0040805758908391
423 0.003949015401303768
424 0.0038274277467280626
425 0.0037074682768434286
426 0.0035892988089472055
427 0.00