# Example Notebook
Examples taken from: http://pytorch.org/tutorials/

In [14]:
import torch
from torch.autograd import Variable

# Train a two-layer fully connected network (linear -> ReLU -> linear) on
# random data with plain gradient descent, letting autograd compute the
# gradients.

# Use the CUDA tensor type when a GPU is available and fall back to the CPU
# tensor type otherwise, so the script runs on either kind of machine without
# editing this line.
dtype = torch.cuda.FloatTensor if torch.cuda.is_available() else torch.FloatTensor

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold input and outputs, and wrap them in Variables.
# Setting requires_grad=False indicates that we do not need to compute
# gradients with respect to these Variables during the backward pass.
x = Variable(torch.randn(N, D_in).type(dtype), requires_grad=False)
y = Variable(torch.randn(N, D_out).type(dtype), requires_grad=False)

# Create random Tensors for weights, and wrap them in Variables.
# Setting requires_grad=True indicates that we want to compute gradients with
# respect to these Variables during the backward pass.
w1 = Variable(torch.randn(D_in, H).type(dtype), requires_grad=True)
w2 = Variable(torch.randn(H, D_out).type(dtype), requires_grad=True)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: compute predicted y using operations on Variables. We do
    # not need to keep references to intermediate values since we are not
    # implementing the backward pass by hand. clamp(min=0) is the ReLU.
    y_pred = x.mm(w1).clamp(min=0).mm(w2)

    # Compute and print the loss (sum of squared errors). The loss holds a
    # single element; loss.item() returns it as a Python number. Indexing
    # with loss.data[0] fails on PyTorch >= 0.4, where the summed loss is
    # 0-dimensional.
    loss = (y_pred - y).pow(2).sum()
    print(t, loss.item())

    # Use autograd to compute the backward pass. This call computes the
    # gradient of loss with respect to all Variables with requires_grad=True.
    # After this call w1.grad and w2.grad hold the gradient of the loss with
    # respect to w1 and w2 respectively.
    loss.backward()

    # Update weights using gradient descent. The update goes through .data so
    # it is applied to the underlying tensor values in place rather than being
    # recorded as an autograd operation.
    w1.data -= learning_rate * w1.grad.data
    w2.data -= learning_rate * w2.grad.data

    # Manually zero the gradients after updating weights; backward()
    # accumulates into .grad, so stale values would otherwise leak into the
    # next iteration.
    w1.grad.data.zero_()
    w2.grad.data.zero_()

0 23700218.0
1 19281312.0
2 19016230.0
3 20467160.0
4 21750358.0
5 21108908.0
6 17953568.0
7 13156043.0
8 8521544.0
9 5098736.0
10 2990187.75
11 1797250.125
12 1146099.5
13 784846.875
14 576307.6875
15 447975.1875
16 363376.6875
17 303576.09375
18 258762.078125
19 223716.5
20 195330.21875
21 171756.453125
22 151844.375
23 134823.6875
24 120145.6796875
25 107410.984375
26 96311.8359375
27 86585.2890625
28 78016.3515625
29 70461.0625
30 63770.73046875
31 57819.734375
32 52510.5703125
33 47764.09375
34 43519.28515625
35 39717.90625
36 36295.75
37 33208.71484375
38 30424.568359375
39 27906.6875
40 25625.291015625
41 23555.4609375
42 21673.716796875
43 19960.46875
44 18395.9296875
45 16970.30859375
46 15660.5478515625
47 14462.619140625
48 13366.1298828125
49 12361.34375
50 11439.5126953125
51 10593.1689453125
52 9815.6357421875
53 9100.517578125
54 8442.255859375
55 7836.0517578125
56 7277.25244140625
57 6761.7822265625
58 6286.0322265625
59 5847.04150390625
60 5441.58642578125
61 5066.495

478 5.47622621525079e-05
479 5.3903713705949485e-05
480 5.304428850649856e-05
481 5.240848258836195e-05
482 5.177933053346351e-05
483 5.093953586765565e-05
484 5.0163645937573165e-05
485 4.9312409828417e-05
486 4.8600682930555195e-05
487 4.79378504678607e-05
488 4.7204550355672836e-05
489 4.671800707001239e-05
490 4.609403185895644e-05
491 4.541334419627674e-05
492 4.462415017769672e-05
493 4.407084270496853e-05
494 4.352055839262903e-05
495 4.2979147110600024e-05
496 4.23368692281656e-05
497 4.1748371586436406e-05
498 4.1328919905936345e-05
499 4.0825019823387265e-05
