In [62]:
%matplotlib inline

## Tensors and variables

In [1]:
import torch
from torch import autograd

In [2]:
t = torch.rand(1,5)

In [3]:
t

tensor([[0.5366, 0.6205, 0.9969, 0.4679, 0.4508]])

In [4]:
v = torch.rand(1,5)

In [5]:
v

tensor([[0.9170, 0.8589, 0.1134, 0.7630, 0.3665]])

Notice that `t` is different than `v`. 

Some important observations:

- The tensor `t` cannot be operated with `x`
- A variable has a `.grad` property (or field) not found in a tensor.
    - This starts as None and then it gets asigned a tensor object after backpropagation is applied to the variable


In [23]:
grad_output = torch.ones(5,1)

In [24]:
a = torch.tensor(torch.rand(5,1), requires_grad=True)

  """Entry point for launching an IPython kernel.


In [25]:
b = a + 1

In [26]:
b.grad

In [27]:
b.requires_grad

True

In [28]:
a.grad

In [29]:
b.grad

In [30]:
b.backward(grad_output)

In [31]:
a.grad

tensor([[1.],
        [1.],
        [1.],
        [1.],
        [1.]])

In [32]:
a.grad.data.zero_()

tensor([[0.],
        [0.],
        [0.],
        [0.],
        [0.]])

In [33]:
a.grad

tensor([[0.],
        [0.],
        [0.],
        [0.],
        [0.]])

In [34]:
b.backward(grad_output)

## torch.Autograd.variable

In [35]:
torch.__version__

'1.1.0'

In [36]:
b.__dict__

{}


Autograd: automatic differentiation
===================================

Central to all neural networks in PyTorch is the ``autograd`` package.
Let’s first briefly visit this, and we will then go to training our
first neural network.


The ``autograd`` package provides automatic differentiation for all operations
on Tensors. It is a define-by-run framework, which means that your backprop is
defined by how your code is run, and that every single iteration can be
different.

Let us see this in more simple terms with some examples.

Variable
--------

``autograd.Variable`` is the central class of the package. It wraps a
Tensor, and supports nearly all of operations defined on it. Once you
finish your computation you can call ``.backward()`` and have all the
gradients computed automatically.

You can access the raw tensor through the ``.data`` attribute, while the
gradient w.r.t. this variable is accumulated into ``.grad``.

.. figure:: /_static/img/Variable.png
   :alt: Variable

   Variable

There’s one more class which is very important for autograd
implementation - a ``Function``.

``Variable`` and ``Function`` are interconnected and build up an acyclic
graph, that encodes a complete history of computation. Each variable has
a ``.grad_fn`` attribute that references a ``Function`` that has created
the ``Variable`` (except for Variables created by the user - their
``grad_fn is None``).

If you want to compute the derivatives, you can call ``.backward()`` on
a ``Variable``. If ``Variable`` is a scalar (i.e. it holds a one element
data), you don’t need to specify any arguments to ``backward()``,
however if it has more elements, you need to specify a ``grad_output``
argument that is a tensor of matching shape.



In [37]:
import torch
from torch.autograd import Variable

In [38]:
x = torch.tensor(torch.ones(2, 2), requires_grad=True)
print(x)
y = x + 2
print(y)

tensor([[1., 1.],
        [1., 1.]], requires_grad=True)
tensor([[3., 3.],
        [3., 3.]], grad_fn=<AddBackward0>)


  """Entry point for launching an IPython kernel.


``y`` was created as a result of an operation, so it has a ``grad_fn``.


In [39]:
print(y.grad_fn)

<AddBackward0 object at 0x118b49898>


Do more operations on y


In [40]:
z = y * y * 3
out = z.mean()

print(z, out)

tensor([[27., 27.],
        [27., 27.]], grad_fn=<MulBackward0>) tensor(27., grad_fn=<MeanBackward0>)


Gradients
---------
let's backprop now
``out.backward()`` is equivalent to doing ``out.backward(torch.Tensor([1.0]))``


In [41]:
out.backward()

In [42]:
print(x.grad)

tensor([[4.5000, 4.5000],
        [4.5000, 4.5000]])


You should have got a matrix of ``4.5``. Let’s call the ``out``
*Variable* “$o$”.
We have that $o = \frac{1}{4}\sum_i z_i$, since
$z_i = 3(x_i+2)^2$ and $z_i\bigr\rvert_{x_i=1} = 27$.
Therefore,

$$\frac{\partial o}{\partial x_i} = \frac{3}{2}(x_i+2)$$ hence
$$\frac{\partial o}{\partial x_i}\bigr\rvert_{x_i=1} = \frac{9}{2} = 4.5$$



In [43]:
x = torch.randn(3)
x = torch.tensor(x, requires_grad=True)

y = x * 2
while y.data.norm() < 1000:
    y = y * 2

print(y)

tensor([-1053.1802,   688.7786, -1495.9871], grad_fn=<MulBackward0>)


  


In [44]:
help(x.grad_fn)

Help on NoneType object:

class NoneType(object)
 |  Methods defined here:
 |  
 |  __bool__(self, /)
 |      self != 0
 |  
 |  __new__(*args, **kwargs) from builtins.type
 |      Create and return a new object.  See help(type) for accurate signature.
 |  
 |  __repr__(self, /)
 |      Return repr(self).



In [45]:
help(x.grad)

Help on NoneType object:

class NoneType(object)
 |  Methods defined here:
 |  
 |  __bool__(self, /)
 |      self != 0
 |  
 |  __new__(*args, **kwargs) from builtins.type
 |      Create and return a new object.  See help(type) for accurate signature.
 |  
 |  __repr__(self, /)
 |      Return repr(self).



## Models in torch.nn

In [51]:
POLY_DEGREE = 2
W_target = torch.randn(POLY_DEGREE, 1) * 5
b_target = torch.randn(1) * 5
max_iterations = 10000

In [52]:
#?torch.nn.LogSigmoid

In [53]:
fc = torch.nn.Linear(W_target.size(0), 1)

In [54]:
fc.share_memory()

Linear(in_features=2, out_features=1, bias=True)

In [55]:
x = torch.Tensor([12.3])

## Polinomial coefficient

Many examples of models in Pytortch can be found here:

- https://github.com/pytorch/examples

In [56]:
import torch
import torch.autograd
import torch.nn.functional as F
from torch.autograd import Variable

In [57]:
POLY_DEGREE = 2
W_target = torch.randn(POLY_DEGREE, 1) * 5
b_target = torch.randn(1) * 5
max_iterations = 10000

In [58]:
print(W_target)

tensor([[ 3.1340],
        [-6.2177]])


In [60]:
def make_features(x):
    """Builds features i.e. a matrix with columns [x, x^2, x^3, x^4]."""
    x = x.unsqueeze(1)
    return torch.cat([x ** i for i in range(1, POLY_DEGREE+1)], 1)


def f(x):
    """Approximated function."""
    return x.mm(W_target) + b_target[0]


def poly_desc(W, b):
    """Creates a string description of a polynomial."""
    result = 'y = '
    for i, w in enumerate(W):
        result += '{:+.2f} x^{} '.format(w, len(W) - i)
    result += '{:+.2f}'.format(b[0])
    return result


def get_batch(batch_size=32):
    """Builds a batch i.e. (x, f(x)) pair."""
    random = torch.randn(batch_size)
    x = make_features(random)
    y = f(x)
    return Variable(x), Variable(y)


# Define model
fc = torch.nn.Linear(W_target.size(0), 1)

for batch_idx in range(max_iterations):
    
    # Get data
    batch_x, batch_y = get_batch()

    # Reset gradients
    fc.zero_grad()

    # Forward pass
    output = F.smooth_l1_loss(fc(batch_x), batch_y)
    loss = output.data

    # Backward pass
    output.backward()

    # Apply gradients
    for param in fc.parameters():
        param.data.add_(-0.1 * param.grad.data)

    # Stop criterion
    if loss < 1e-3:
        break

In [62]:
print('Loss: {:.6f} after {} batches'.format(loss, batch_idx))
print('==> Learned function:\t' + poly_desc(fc.weight.data.view(-1), fc.bias.data))
print('==> Actual function:\t' + poly_desc(W_target.view(-1), b_target))

Loss: 0.000961 after 132 batches
==> Learned function:	y = +3.12 x^2 -6.19 x^1 +0.76
==> Actual function:	y = +3.13 x^2 -6.22 x^1 +0.81
