In [4]:
%matplotlib inline
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"


Introduction to PyTorch
***********************

Introduction to Torch's tensor library
======================================

- All of deep learning is computations on tensors
- tensors: generalizations of a matrix that can be indexed in more than 2 dimensions


In [3]:
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

torch.manual_seed(1)

<torch._C.Generator at 0x7fee26e38210>

#### `torch.Tensor` create tensors

- Tensors can be created from Python lists with the torch.Tensor()
function.




In [5]:
# torch.Tensor(data) builds a tensor of the default (float) type from a Python list.
# A flat list of numbers gives a vector.
V_data = [1.0, 2.0, 3.0]
V = torch.Tensor(V_data)
print(V)

tensor([1., 2., 3.])


In [6]:
# A list of equal-length rows becomes a matrix (2 rows x 3 columns here).
# Every entry is written as a float literal so the input data is uniformly typed.
M_data = [[1., 2., 3.], [4., 5., 6.]]
M = torch.Tensor(M_data)
print(M)

tensor([[1., 2., 3.],
        [4., 5., 6.]])


In [7]:
# Nesting lists three levels deep gives a 3D tensor; this one has size 2x2x2
# (two 2x2 matrices stacked along the first axis).
T_data = [
    [[1.0, 2.0], [3.0, 4.0]],
    [[5.0, 6.0], [7.0, 8.0]],
]
T = torch.Tensor(T_data)
print(T)

tensor([[[1., 2.],
         [3., 4.]],

        [[5., 6.],
         [7., 8.]]])


What is a 3D tensor anyway? Think about it like this. If you have a
vector, indexing into the vector gives you a scalar. If you have a
matrix, indexing into the matrix gives you a vector. If you have a 3D
tensor, then indexing into the tensor gives you a matrix!

A note on terminology:
- "tensor" in this tutorial refers to any torch.Tensor object. Matrices and vectors are special cases of
torch.Tensors, where their dimension is 2 and 1 respectively. When I am talking about 3D tensors, I will explicitly use the term "3D tensor".

#### Index into tensors

In [28]:
# Indexing a vector with a single integer returns a scalar (0-dimensional tensor).
# Each bare expression below is echoed because ast_node_interactivity is "all".
V
V[0]
V[1]

tensor([1., 2., 3.])

tensor(1.)

tensor(2.)

In [27]:
# .item() turns a one-element tensor into a plain Python number
# (compare the echoed tensor with the bare 1.0 in the output).
V
V[0].item()

tensor([1., 2., 3.])

1.0

In [26]:
# Indexing a matrix with a single integer returns one row, i.e. a vector
M
M[0]

tensor([[1., 2., 3.],
        [4., 5., 6.]])

tensor([1., 2., 3.])

In [29]:
# Indexing a 3D tensor with a single integer returns a matrix
# (here the first of the two 2x2 slices).
T
print(T[0])

tensor([[[1., 2.],
         [3., 4.]],

        [[5., 6.],
         [7., 8.]]])

tensor([[1., 2.],
        [3., 4.]])


#### Datatypes of tensors
You can also create tensors of other datatypes. The default, as you can
see, is Float. To create a tensor of integer types, try
torch.LongTensor(). Check the documentation for more data types, but
Float and Long will be the most common.




In [33]:
# Build two tensors from the same nested list with different element types;
# compare `1.` (float) with `1` (integer) in the echoed outputs.
torch.Tensor(T_data) # creates a tensor of float datatype
torch.LongTensor(T_data) # creates a tensor of integer datatype

tensor([[[1., 2.],
         [3., 4.]],

        [[5., 6.],
         [7., 8.]]])

tensor([[[1, 2],
         [3, 4]],

        [[5, 6],
         [7, 8]]])

#### `torch.randn` create a tensor with random data and the supplied dimensionality

In [None]:
# Draw a tensor of random values with the requested shape: 3 x 4 x 5
x = torch.randn(3, 4, 5)
print(x)

#### Operations with Tensors
- operate on tensors in the ways you would expect

In [35]:
# Two vectors of equal length to use as operands
x = torch.tensor([1.0, 2.0, 3.0])
y = torch.tensor([4.0, 5.0, 6.0])

In [36]:
# Addition is element-wise: z[i] = x[i] + y[i]
z = x + y
z

tensor([5., 7., 9.])

See [the documentation](https://pytorch.org/docs/stable/torch.html) for a
complete list of the massive number of operations available to you. They
expand beyond just mathematical operations.

#### `torch.cat([tensor_1, tensor_2])` concatenation

In [38]:
# By default torch.cat joins along the first axis, i.e. it stacks rows:
# a 2x5 tensor followed by a 3x5 tensor gives a 5x5 result.
x_1 = torch.randn(2, 5)
x_1

y_1 = torch.randn(3, 5)
y_1

# No axis argument given, so the rows of y_1 are appended below those of x_1
z_1 = torch.cat([x_1, y_1])
z_1

tensor([[-0.4791,  1.3790,  2.5286,  0.4107, -0.9880],
        [-0.9081,  0.5423,  0.1103, -2.2590,  0.6067]])

tensor([[-0.1383,  0.8310, -0.2477, -0.8029,  0.2366],
        [ 0.2857,  0.6898, -0.6331,  0.8795, -0.6842],
        [ 0.4533,  0.2912, -0.8317, -0.5525,  0.6355]])

tensor([[-0.4791,  1.3790,  2.5286,  0.4107, -0.9880],
        [-0.9081,  0.5423,  0.1103, -2.2590,  0.6067],
        [-0.1383,  0.8310, -0.2477, -0.8029,  0.2366],
        [ 0.2857,  0.6898, -0.6331,  0.8795, -0.6842],
        [ 0.4533,  0.2912, -0.8317, -0.5525,  0.6355]])

In [40]:
# Operands for column-wise concatenation: both have 2 rows,
# but 3 and 5 columns respectively.
x_2 = torch.randn(2, 3)
x_2
y_2 = torch.randn(2, 5)
y_2

tensor([[-1.1115,  0.3501, -0.7703],
        [-0.1473,  0.6272,  1.0935]])

tensor([[ 0.0939,  1.2381, -1.3459,  0.5119, -0.6933],
        [-0.1668, -0.9999, -1.6476,  0.8098,  0.0554]])

In [41]:
# The second argument (dim) selects the axis to join along. With dim=1 the
# columns are appended, so the 2x3 and 2x5 inputs give a 2x8 result.
z_2 = torch.cat((x_2, y_2), dim=1)
z_2

tensor([[-1.1115,  0.3501, -0.7703,  0.0939,  1.2381, -1.3459,  0.5119, -0.6933],
        [-0.1473,  0.6272,  1.0935, -0.1668, -0.9999, -1.6476,  0.8098,  0.0554]])

In [None]:
# If your tensors are not compatible, torch will complain: x_1 is 2x5 and
# x_2 is 2x3, so they cannot be stacked row-wise.
# Uncomment the next line to see the error:
# torch.cat([x_1, x_2])

#### `.view()` - Reshape Tensors
- .view() method to reshape a tensor.
- many neural network components expect their inputs to have
a certain shape. Often you will need to reshape before passing your data
to the component.




In [None]:
# Start from a 2x3x4 tensor (24 elements in total)
x = torch.randn(2, 3, 4)
print(x)

# Merge the last two axes: 3 * 4 = 12, giving 2 rows of 12 columns
print(x.view(2, 12))

# Passing -1 for one dimension lets view() infer its size from the element
# count, so this prints the same 2x12 tensor as the line above
print(x.view(2, -1))

Computation Graphs and Automatic Differentiation
================================================

- concept of a computation graph is essential to efficient deep learning programming
- it allows you to not have to write the backpropagation gradients yourself. 
- a computation graph is simply a specification of how your data is combined to give you the output. 
- the graph totally specifies what parameters were involved with which operations, it contains enough information to compute derivatives. 

- fundamental flag ``requires_grad``.

- What is stored in the torch.Tensor objects we were creating above? 
    - Obviously the data and the shape, and maybe a few other things. 
    - But when we added two tensors together, we got an output tensor. 
    - All this output tensor knows is its data and shape. It has no idea that it was the sum of two other tensors (it could have been read in from a file, it could be the result of some other operation, etc.)

#### ``requires_grad=True`` - tensor object keeps track of how it was created

In [44]:
# Tensor factory methods accept a ``requires_grad`` flag; when it is True the
# flag also shows up in the tensor's repr (see the echoed output).
x = torch.tensor([1., 2., 3.], requires_grad=True)
x

tensor([1., 2., 3.], requires_grad=True)

In [45]:
# Tensors created with requires_grad=True support all the usual operations;
# the result additionally reports the operation that produced it (grad_fn).
y = torch.tensor([4.0, 5.0, 6.0], requires_grad=True)
z = x + y
z

tensor([5., 7., 9.], grad_fn=<AddBackward0>)

In [46]:
# But z knows something extra: grad_fn refers to the addition that created it.
z.grad_fn

<AddBackward0 at 0x7fee260166d8>

Tensors know what created them. 
- z knows:
    - that it wasn't read in from a file
    - it wasn't the result of a multiplication or exponential
    - if you keep following z.grad_fn, you will find yourself at x and y.

But how does that help us compute a gradient?

In [47]:
# Let's sum up all the entries in z. The scalar result s records the sum
# operation as its own grad_fn.
z
s = z.sum()
s
s.grad_fn

tensor([5., 7., 9.], grad_fn=<AddBackward0>)

tensor(21., grad_fn=<SumBackward0>)

<SumBackward0 at 0x7fee25b8bb00>

What is the derivative of this sum with respect to the first component of x?

\begin{align}\frac{\partial s}{\partial x_0}\end{align}



- s knows that it was created as a sum of the tensor z
- z knows that it was the sum x + y. So

\begin{align}s = \overbrace{x_0 + y_0}^\text{$z_0$} + \overbrace{x_1 + y_1}^\text{$z_1$} + \overbrace{x_2 + y_2}^\text{$z_2$}\end{align}

And so s contains enough information to determine that the derivative
we want is 1!

- this glosses over the challenge of how to actually compute that derivative
- the point here is that s is carrying along enough information that it is possible to compute it
- the developers of PyTorch programmed the sum() and + operations to know how to compute their gradients and to run the backpropagation algorithm.

Let's have PyTorch compute the gradient, and see that we were right.
(Note: if you run this block multiple times, the gradient will increment.
That is because PyTorch *accumulates* the gradient into the .grad
property, since for many models this is very convenient.)




#### `.backward()` will run backprop

In [51]:
# Calling .backward() on any variable will run backprop, starting from it.
# The gradient of s with respect to x is then available as x.grad.
# NOTE(review): the recorded x.grad shows 4s rather than 1s, presumably because
# this cell was executed several times and the gradients accumulated. Re-run
# from a fresh kernel to confirm.
s
s.backward()
x.grad

tensor(21., grad_fn=<SumBackward0>)

tensor([4., 4., 4.])

Understanding what is going on in the block below is crucial for being a
successful programmer in deep learning.




In [None]:
# Fresh 2x2 random tensors, created without passing requires_grad.
# These rebind the names x and y.
x, y = torch.randn(2, 2), torch.randn(2, 2)

In [52]:
# By default, user-created Tensors have ``requires_grad=False``.
# NOTE(review): the recorded output is (True, True), which contradicts this.
# Presumably x and y still referred to earlier tensors built with
# requires_grad=True when this cell ran. Re-run from a fresh kernel to confirm.
x.requires_grad, y.requires_grad
z = x + y

(True, True)

In [53]:
# When neither input requires grad, z gets no grad_fn (it is None), so there
# is nothing to backprop through.
# NOTE(review): the recorded output shows an AddBackward0 instead, presumably
# from stale kernel state (x and y still required grad when this ran).
# Re-run from a fresh kernel to confirm.
# TODO (Lee): this explanation was flagged as unclear.
z.grad_fn

<AddBackward0 at 0x7fee259e0630>

#### `.requires_grad_()` - changes existing Tensor's `requires_grad`  flag in-place.

In [None]:
# ``.requires_grad_(flag)`` flips the ``requires_grad`` flag of an existing
# Tensor in-place. The flag defaults to ``True``; it is spelled out here
# for clarity.
x = x.requires_grad_(True)
y = y.requires_grad_(True)

In [55]:
# Recompute z now that x and y require grad: the new z carries a grad_fn,
# i.e. enough information to compute gradients.
z = x + y
z.grad_fn

<AddBackward0 at 0x7fee25aa15c0>

In [56]:
# If any input to an operation has ``requires_grad=True``, so will the output.
# Here both x and y do, so this prints True.
print(z.requires_grad)

True


#### `.detach()` breaks Tensor away from its past history

In [57]:
# z now carries the computation history that relates it to x and y.
# Can we take just its values and **detach** them from that history?
# .detach() returns such a tensor; it is bound to a new name so z is untouched.
new_z = z.detach()

In [None]:
# Does new_z have the information needed to backprop to x and y?
# No: its grad_fn is expected to print as None.

print(new_z.grad_fn)
# And how could it? ``z.detach()`` returns a tensor that shares the same storage
# as ``z``, but with the computation history forgotten. It doesn't know anything
# about how it was computed.
# In essence, we have broken the Tensor away from its past history.

#### Wrapping a code block in ``with torch.no_grad():`` also stops autograd from tracking history on Tensors with ``requires_grad=True``



In [58]:
# x tracks gradients, and so does a result computed from it ...
print(x.requires_grad)
print((x ** 2).requires_grad)

# ... unless the computation runs inside torch.no_grad(), where autograd
# does not record history for the result.
with torch.no_grad():
    print((x ** 2).requires_grad)

True
True
False
