# Neural Network Example
- Tutorial: http://pytorch.org/tutorials/beginner/blitz/neural_networks_tutorial.html

In [8]:
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F

class Net(nn.Module):
    """Small convolutional network: two conv/pool stages, then three linear layers.

    The first linear layer takes 16*5*5 flattened features, which is what the
    two 5x5 convolutions and two 2x2 poolings yield for a 1-channel 32x32 input.
    The network produces 10 output values per sample.
    """

    def __init__(self):
        super(Net, self).__init__()
        # Convolutions: 1 -> 6 channels, then 6 -> 16 channels, both with 5x5 kernels.
        self.conv1 = nn.Conv2d(1, 6, 5)
        self.conv2 = nn.Conv2d(6, 16, 5)
        # Fully connected (affine, y = Wx + b) layers: 400 -> 120 -> 84 -> 10.
        self.fc1 = nn.Linear(16*5*5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        """Run a mini-batch `x` through the network and return the raw fc3 output."""
        # Stage 1: conv -> ReLU -> max pool; the window is spelled out as a tuple.
        stage = self.conv1(x)
        stage = F.relu(stage)
        stage = F.max_pool2d(stage, (2, 2))
        # Stage 2: same pattern; a single number is enough for a square window.
        stage = self.conv2(stage)
        stage = F.relu(stage)
        stage = F.max_pool2d(stage, 2)
        # Flatten everything except the batch dimension before the linear layers.
        flat = stage.view(-1, self.num_flat_features(stage))
        hidden = F.relu(self.fc1(flat))
        hidden = F.relu(self.fc2(hidden))
        return self.fc3(hidden)

    def num_flat_features(self, x):
        """Return the product of all dimensions of `x` except the first (batch) one."""
        count = 1
        for extent in x.size()[1:]:
            count = count * extent
        return count

# Instantiate the network and print its layer summary.
net = Net()
print(net)

Net(
  (conv1): Conv2d(1, 6, kernel_size=(5, 5), stride=(1, 1))
  (conv2): Conv2d(6, 16, kernel_size=(5, 5), stride=(1, 1))
  (fc1): Linear(in_features=400, out_features=120, bias=True)
  (fc2): Linear(in_features=120, out_features=84, bias=True)
  (fc3): Linear(in_features=84, out_features=10, bias=True)
)


## Return learnable parameters

In [9]:
# Materialise the parameter iterator so it can be both counted and looped over.
params = list(net.parameters())
print(len(params))
# One line per learnable tensor, in registration order.
for param in params:
    print(param.size())

10
torch.Size([6, 1, 5, 5])
torch.Size([6])
torch.Size([16, 6, 5, 5])
torch.Size([16])
torch.Size([120, 400])
torch.Size([120])
torch.Size([84, 120])
torch.Size([84])
torch.Size([10, 84])
torch.Size([10])


# Forward
- The input must be an `autograd.Variable`, and the output is one as well.
- **`torch.nn` only supports mini-batches, not single samples. For example, `nn.Conv2d` takes a 4D Tensor of shape `nSamples x nChannels x Height x Width`. If you have a single sample, use `input.unsqueeze(0)` to add a fake batch dimension.**

In [23]:
# A random mini-batch holding one 1-channel 32x32 sample.
# NOTE(review): `input` shadows the Python builtin; later cells reuse this name.
input = Variable(torch.randn(1, 1, 32, 32))
out = net(input)
print(out)

Variable containing:
 0.0580 -0.0482 -0.0119  0.1147  0.1726  0.0611 -0.0192 -0.0216  0.0960 -0.0617
[torch.FloatTensor of size 1x10]



# Backward
- It is done automatically with autograd

In [24]:
# Clear the parameters' gradient buffers, then backpropagate a random
# 1x10 gradient from the network output.
net.zero_grad()
out.backward(torch.randn(1, 10))

# Loss Function
- All available loss functions are listed in the [`torch.nn` documentation](http://pytorch.org/docs/nn.html#loss-functions)
- Following `loss` backward through the network gives this graph of computations:
```
input -> conv2d -> relu -> maxpool2d -> conv2d -> relu -> maxpool2d
      -> view -> linear -> relu -> linear -> relu -> linear
      -> MSELoss
      -> loss
```
- With `loss.backward()`, the whole graph is differentiated w.r.t. the loss, and all Variables in the graph will have their `.grad` Variable accumulated with the gradient.

In [33]:
output = net(input)
# Dummy target holding the values 1..10. It is converted to float and reshaped
# to (1, 10) so that its dtype and shape match the network output exactly,
# instead of relying on implicit broadcasting or type promotion inside the loss.
target = Variable(torch.arange(1, 11).float().view(1, -1))
criterion = nn.MSELoss()

# Mean squared error between the network output and the dummy target.
loss = criterion(output, target)
print(loss)

Variable containing:
 38.1958
[torch.FloatTensor of size 1]



In [34]:
# Walk the autograd graph backward from the loss, always following the first input.
print(loss.grad_fn)  # backward node of the MSE loss
print(loss.grad_fn.next_functions[0][0])  # first input of the loss node: the last Linear layer (recorded as AddmmBackward)
print(loss.grad_fn.next_functions[0][0].next_functions[0][0])  # first input of that Linear node (recorded as ExpandBackward, not ReLU)

<MseLossBackward object at 0x7f6f57f2b780>
<AddmmBackward object at 0x7f6f57f2ba58>
<ExpandBackward object at 0x7f6f57f2b780>


# Backprop

In [35]:
# Zero the gradient buffers of all parameters so the values printed after
# backward() come from this loss only.
net.zero_grad()

print('conv1.bias.grad before backward')
print(net.conv1.bias.grad)

# Differentiate the whole graph w.r.t. the loss; gradients land in each parameter's .grad.
loss.backward()

# Same label format as the "before" line, matching the attribute name conv1.bias.grad.
print('conv1.bias.grad after backward')
print(net.conv1.bias.grad)

conv1.bias.grad before backward
Variable containing:
 0
 0
 0
 0
 0
 0
[torch.FloatTensor of size 6]

conv.1.bias.grad after backward
Variable containing:
-0.0157
 0.0361
-0.0150
-0.0073
-0.0048
-0.1162
[torch.FloatTensor of size 6]



# Update the weights
- `weight = weight - learning_rate * gradient`
- To use other **update rules** such as SGD, Nesterov-SGD, Adam, RMSProp, etc., use `torch.optim`, which implements all of these methods.

In [36]:
learning_rate = 0.01
# Manual SGD step, applied in place: weight <- weight - learning_rate * gradient.
for param in net.parameters():
    param.data.sub_(param.grad.data * learning_rate)

In [39]:
import torch.optim as optim

# Create the optimizer: SGD over all of the network's parameters.
optimizer = optim.SGD(net.parameters(), lr=0.01)

# One training iteration (this sequence belongs inside the training loop).
optimizer.zero_grad()  # zero the gradient buffers so gradients do not accumulate across iterations
output = net(input)
loss = criterion(output, target)
loss.backward()  # compute gradients of the loss w.r.t. the parameters
optimizer.step()  # apply the update using the gradients just computed