# PyTorch Intro

### Why PyTorch?
[the Gradient](https://thegradient.pub/state-of-ml-frameworks-2019-pytorch-dominates-research-tensorflow-dominates-industry/)

## Tensor Properties

In [2]:
import torch

In [3]:
example_tensor = torch.Tensor(
    [
     [[1, 2], [3, 4]], 
     [[5, 6], [7, 8]], 
     [[9, 0], [1, 2]]
    ]
)
example_tensor

tensor([[[1., 2.],
         [3., 4.]],

        [[5., 6.],
         [7., 8.]],

        [[9., 0.],
         [1., 2.]]])

In [4]:
example_tensor.device

device(type='cpu')

In [5]:
example_tensor.shape

torch.Size([3, 2, 2])

In [6]:
print("shape[0] =", example_tensor.shape[0])
print("size(1) =", example_tensor.size(1))

shape[0] = 3
size(1) = 2


In [7]:
print("Rank =", len(example_tensor.shape))
print("Number of elements =", example_tensor.numel())

Rank = 3
Number of elements = 12


### Initializing Tensors

In [8]:
torch.ones_like(example_tensor)

tensor([[[1., 1.],
         [1., 1.]],

        [[1., 1.],
         [1., 1.]],

        [[1., 1.],
         [1., 1.]]])

In [9]:
torch.zeros_like(example_tensor)

tensor([[[0., 0.],
         [0., 0.]],

        [[0., 0.],
         [0., 0.]],

        [[0., 0.],
         [0., 0.]]])

In [10]:
torch.randn_like(example_tensor)

tensor([[[-0.7398, -1.0145],
         [ 0.0854, -0.5984]],

        [[-0.2352, -0.2661],
         [-0.8248, -0.7447]],

        [[-0.8401, -0.3857],
         [-1.6909,  0.4557]]])

In [11]:
torch.randn(2, 2, device='cpu') # Alternatively, for a GPU tensor, you'd use device='cuda'

tensor([[ 0.8383,  1.7842],
        [-0.1754,  2.1677]])

### Basic Functions

In [12]:
(example_tensor - 5) * 2

tensor([[[ -8.,  -6.],
         [ -4.,  -2.]],

        [[  0.,   2.],
         [  4.,   6.]],

        [[  8., -10.],
         [ -8.,  -6.]]])

In [13]:
print("Mean:", example_tensor.mean())
print("Stdev:", example_tensor.std())

Mean: tensor(4.)
Stdev: tensor(2.9848)


In [16]:
example_tensor

tensor([[[1., 2.],
         [3., 4.]],

        [[5., 6.],
         [7., 8.]],

        [[9., 0.],
         [1., 2.]]])

In [14]:
example_tensor.mean(0)

tensor([[5.0000, 2.6667],
        [3.6667, 4.6667]])

In [17]:
example_tensor.mean(1)

tensor([[2., 3.],
        [6., 7.],
        [5., 1.]])

In [18]:
example_tensor.mean(2)

tensor([[1.5000, 3.5000],
        [5.5000, 7.5000],
        [4.5000, 1.5000]])

#### Equivalently, you could also write:
- example_tensor.mean(dim=0)
- example_tensor.mean(axis=0)
- torch.mean(example_tensor, 0)
- torch.mean(example_tensor, dim=0)
- torch.mean(example_tensor, axis=0)

## PyTorch NN Module

In [20]:
import torch.nn as nn

### nn.Linear
- To create a linear layer, pass the input dimension and output dimension
- A linear object initialized as `nn.Linear(10, 2)` will take in a n x 10 matrix and return n x 2 matrix
-  Where, all n elements have had the same linear transformation performed

In [28]:
linear = nn.Linear(10, 2)
example_input = torch.randn(3, 10)
example_output = linear(example_input)
example_output

tensor([[-0.0120,  0.6726],
        [ 0.0487, -0.5285],
        [-0.1308, -0.4747]], grad_fn=<AddmmBackward>)

In [29]:
linear = nn.Linear(10, 1)
example_input = torch.ones_like(example_input)
example_output_2 = linear(example_input)
example_output_2

tensor([[-0.3423],
        [-0.3423],
        [-0.3423]], grad_fn=<AddmmBackward>)

In [30]:
example_input

tensor([[1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]])

### nn.ReLU

In [37]:
relu = nn.ReLU()
relu_output = relu(example_output)
relu_output

tensor([[0.0000, 0.6726],
        [0.0487, 0.0000],
        [0.0000, 0.0000]], grad_fn=<ReluBackward0>)

In [38]:
relu_output_2 = relu(example_output_2)
relu_output_2

tensor([[0.],
        [0.],
        [0.]], grad_fn=<ReluBackward0>)

In [39]:
nn.ELU(example_output)

ELU(
  alpha=tensor([[-0.0120,  0.6726],
          [ 0.0487, -0.5285],
          [-0.1308, -0.4747]], grad_fn=<AddmmBackward>)
)

In [40]:
nn.ELU(example_output_2)

ELU(
  alpha=tensor([[-0.3423],
          [-0.3423],
          [-0.3423]], grad_fn=<AddmmBackward>)
)

### nn.BatchNorm1d
- Normalization technique that will rescale a batch of n inputs to have a constant mean and std between batches
- 1d - for situtations where one expects a set of inputs - each of them is a flat list of numbers 
- ie Input vector not a matrix or higher rank tensor
- `nn.BatchNorm2d` is used for images

In [41]:
batch_norm = nn.BatchNorm1d(2)
batch_norm_output = batch_norm(relu_output)
batch_norm_output

tensor([[-0.7005,  1.4141],
        [ 1.4010, -0.7071],
        [-0.7005, -0.7071]], grad_fn=<NativeBatchNormBackward>)

In [42]:
### nn.Sequential

In [45]:
mlp_layer = nn.Sequential(
    nn.Linear(5, 2),
    nn.BatchNorm1d(2),
    nn.ReLU()
)

test_example = torch.randn(5, 5) + 1
print("input: ")
print(test_example)
print("output: ")
print(mlp_layer(test_example))

input: 
tensor([[ 0.0232,  1.0344,  1.0607,  1.5545,  1.9843],
        [ 0.4536,  3.6048,  0.6400,  1.7168,  2.7016],
        [-0.4710,  0.9583,  2.8970,  1.9307,  2.5374],
        [ 1.4215,  0.9669,  0.6048, -0.5217,  0.0539],
        [ 0.0519,  1.7444,  0.8946,  1.4093,  2.7785]])
output: 
tensor([[0.0000, 0.1471],
        [1.6699, 0.0000],
        [0.0000, 0.0000],
        [0.0000, 1.8640],
        [0.4609, 0.0000]], grad_fn=<ReluBackward0>)


## Optimization
One of the most important aspects of essentially any machine learning framework is its automatic differentiation library.

### Optimizers
`torch.optim` module offers the optimization functions

In [46]:
import torch.optim as optim
adam_opt = optim.Adam(mlp_layer.parameters(), lr=1e-1)

### Training Loop
A (basic) training step in PyTorch consists of four basic parts:


1.   Set all of the gradients to zero using `opt.zero_grad()`
2.   Calculate the loss, `loss`
3.   Calculate the gradients with respect to the loss using `loss.backward()`
4.   Update the parameters being optimized using `opt.step()`

That might look like the following code (and you'll notice that if you run it several times, the loss goes down):


In [47]:
train_example = torch.randn(100,5) + 1
adam_opt.zero_grad()

# We'll use a simple loss function of mean distance from 1
# torch.abs takes the absolute value of a tensor
cur_loss = torch.abs(1 - mlp_layer(train_example)).mean()

cur_loss.backward()
adam_opt.step()
print(cur_loss)

tensor(0.7670, grad_fn=<MeanBackward0>)


In [52]:

adam_opt.step()
print(cur_loss)

tensor(0.7670, grad_fn=<MeanBackward0>)


## New `nn` Classes

In [53]:
class ExampleModule(nn.Module):
    def __init__(self, input_dims, output_dims):
        super(ExampleModule, self).__init__()
        self.linear = nn.Linear(input_dims, output_dims)
        self.exponent = nn.Parameter(torch.tensor(1.))

    def forward(self, x):
        x = self.linear(x)

        # This is the notation for element-wise exponentiation, 
        # which matches python in general
        x = x ** self.exponent 
        
        return x

In [54]:
example_model = ExampleModule(10, 2)
list(example_model.parameters())

[Parameter containing:
 tensor(1., requires_grad=True),
 Parameter containing:
 tensor([[ 0.2270, -0.2372,  0.0722,  0.0638, -0.2157, -0.0459,  0.0849, -0.2476,
           0.0833,  0.2897],
         [-0.2389, -0.0873, -0.1533, -0.2492, -0.2355, -0.0893, -0.0969,  0.3143,
           0.2190,  0.3067]], requires_grad=True),
 Parameter containing:
 tensor([-0.0011,  0.1198], requires_grad=True)]

In [55]:
list(example_model.named_parameters())

[('exponent',
  Parameter containing:
  tensor(1., requires_grad=True)),
 ('linear.weight',
  Parameter containing:
  tensor([[ 0.2270, -0.2372,  0.0722,  0.0638, -0.2157, -0.0459,  0.0849, -0.2476,
            0.0833,  0.2897],
          [-0.2389, -0.0873, -0.1533, -0.2492, -0.2355, -0.0893, -0.0969,  0.3143,
            0.2190,  0.3067]], requires_grad=True)),
 ('linear.bias',
  Parameter containing:
  tensor([-0.0011,  0.1198], requires_grad=True))]

In [56]:
input = torch.randn(2, 10)
example_model(input)

tensor([[-0.5685,  0.6811],
        [-0.1088,  0.2884]], grad_fn=<PowBackward1>)