In [1]:
import torch
from torch.autograd import Variable
import torch.nn as nn

In [2]:
nn.Conv2d?

In [2]:
import torch.nn.functional as F

In [3]:
class Net(nn.Module):
    """Small LeNet-style convolutional network.

    Two 5x5 convolutions (1 -> 6 and 6 -> 16 channels), each followed by a
    ReLU and a 2x2 max pool, feed three fully connected layers
    (400 -> 120 -> 84 -> 10). ``fc1`` expects 16 * 5 * 5 features, which is
    what a single-channel 32x32 input yields after the two conv/pool stages.
    """

    def __init__(self):
        # nn.Module must be initialised before any layer is attached so the
        # layers below get registered as sub-modules / parameters.
        super(Net, self).__init__()

        # Convolutional stages: (in_channels, out_channels, square kernel size).
        self.conv1 = nn.Conv2d(1, 6, 5)
        self.conv2 = nn.Conv2d(6, 16, 5)

        # Affine stages, y = Wx + b. The first one consumes the 16 feature
        # maps of size 5x5 flattened into a single 400-long vector per sample.
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        """Run a batch through the network and return the 10 raw scores per sample.

        ``x`` is a 4D batch (samples x channels x height x width); the result
        has one row per sample and 10 columns. No activation is applied to
        the final layer.
        """
        # Stage 1: convolve, rectify, then max-pool over a (2, 2) window.
        stage1 = F.relu(self.conv1(x))
        stage1 = F.max_pool2d(stage1, (2, 2))

        # Stage 2: same pattern; a bare int means a square pooling window.
        stage2 = F.relu(self.conv2(stage1))
        stage2 = F.max_pool2d(stage2, 2)

        # Flatten everything except the batch dimension for the linear layers.
        flat = stage2.view(-1, self.num_flat_features(stage2))

        hidden = F.relu(self.fc1(flat))
        hidden = F.relu(self.fc2(hidden))
        return self.fc3(hidden)

    def num_flat_features(self, x):
        """Return the number of elements per sample, i.e. the product of all
        dimensions of ``x`` except the first (batch) dimension."""
        per_sample_dims = x.size()[1:]
        count = 1
        for extent in per_sample_dims:
            count = count * extent
        return count
    
    
        

#### create the network

In [4]:
net = Net()

#### parameters randomly initialized?

In [5]:
# Show shape, mean, variance and norm of every parameter tensor of the freshly
# built network. print() is called as a function so the cell runs on both
# Python 2 and Python 3 and matches the other cells in this notebook.
for p in net.parameters():
    print(p.size())
    print(p.mean().data.numpy())
    print(p.var().data.numpy())
    print(p.norm().data.numpy())

torch.Size([6, 1, 5, 5])
[-0.00510966]
[ 0.01352612]
[ 1.42102361]
torch.Size([6])
[-0.08028553]
[ 0.01111766]
[ 0.30702263]
torch.Size([16, 6, 5, 5])
[ 0.00123969]
[ 0.00230243]
[ 2.35100484]
torch.Size([16])
[ 0.00986653]
[ 0.00289939]
[ 0.21224613]
torch.Size([120, 400])
[ 0.00010932]
[ 0.00083616]
[ 6.33524752]
torch.Size([120])
[-0.00048936]
[ 0.00085036]
[ 0.31815371]
torch.Size([84, 120])
[ 0.00046801]
[ 0.00275739]
[ 5.27199364]
torch.Size([84])
[-0.0009875]
[ 0.00272932]
[ 0.47604105]
torch.Size([10, 84])
[  6.97920041e-05]
[ 0.00405315]
[ 1.8440702]
torch.Size([10])
[-0.01046631]
[ 0.00427368]
[ 0.19889325]


In [6]:
print(net)

Net (
  (conv1): Conv2d(1, 6, kernel_size=(5, 5), stride=(1, 1))
  (conv2): Conv2d(6, 16, kernel_size=(5, 5), stride=(1, 1))
  (fc1): Linear (400 -> 120)
  (fc2): Linear (120 -> 84)
  (fc3): Linear (84 -> 10)
)


You only have to define the forward function; the backward function (which computes the gradients) is defined for you automatically by autograd

The learnable parameters of a model are returned by net.parameters()

In [7]:
# Collect the learnable parameters into a list so they can be counted and indexed.
params = list(net.parameters())
print(len(params))
# The first entry is conv1's weight: 6 output channels x 1 input channel x 5x5 kernel.
print(params[0].size())

10
torch.Size([6, 1, 5, 5])


Input to the forward is an autograd.Variable and so is the output

Note that the input has to be a mini-batch: for a 2D image, for example, Conv2d takes a 4D tensor of nSamples x nChannels x Height x Width

* making a pseudo batch dimension for a single sample is as simple as input.unsqueeze(0), which adds a dummy dimension of size 1 as the first dimension

In [8]:
# Build a random input batch: 1 sample x 1 channel x 32 x 32 pixels.
# NOTE: the name `input` shadows Python's built-in input(); it is kept because
# later cells refer to it.
input = Variable(torch.randn(1,1,32,32))
out = net(input)
print(out)

Variable containing:
-0.0443  0.0938 -0.1499 -0.0734  0.0874 -0.0139  0.0066 -0.1026 -0.0442  0.0010
[torch.FloatTensor of size 1x10]



Zero the gradient buffers of all parameters, then backpropagate with an explicitly supplied output gradient (all ones in the cell below)

In [9]:
net.zero_grad()

In [10]:
# `out` is a vector, not a scalar, so backward() needs an explicit gradient
# argument: a gradient dM/dw is only defined for a scalar M.
# If M is a scalar function of the output vector L, then handing backward()
# the vector dM/dL (same shape as L) is enough, because by the chain rule
# every parameter w that influences the elements L_i of L gets
#     dM/dw = sum_i (dM/dL)_i * dL_i/dw.
# Passing all ones corresponds to M = sum_i L_i.
out.backward(torch.ones((1,10)))

In [11]:
net.conv1.bias.grad

Variable containing:
-0.1712
 0.0667
-0.0032
 0.0088
 0.0935
-0.0895
[torch.FloatTensor of size 6]

### Recap
* torch.Tensor
    * multi-dimensional array
* autograd.Variable
    * wraps a Tensor and records history of operations applied to it
    * has same API as Tensor, but with backward() and a few other things
* nn.Module
    * nice way of encapsulating parameters and has helpers for moving to a GPU, exporting, loading, etc.
* nn.Parameter
    * A kind of Variable that is automatically registered as a parameter when assigned as an attribute to a Module class. WOW!
* autograd.Function
    * implements the forward and backward definitions of an autograd operation. Every Variable operation creates at least one Function node, which connects to the functions that created the Variable and encodes its history


### Loss Function

In [12]:
# Reminder: `input` is still the random 1x1x32x32 batch created earlier, so
# this reproduces the same forward pass as before.
output = net(input)


In [13]:
# print() as a function call: valid on Python 2 and Python 3, and consistent
# with the other cells in this notebook.
print(output)

Variable containing:
-0.0443  0.0938 -0.1499 -0.0734  0.0874 -0.0139  0.0066 -0.1026 -0.0442  0.0010
[torch.FloatTensor of size 1x10]



In [14]:
# Dummy regression target holding the values 1..10.
# It is made explicitly floating point and shaped (1, 10) so that it matches
# the network output in both dtype and shape, instead of depending on
# torch.arange's default dtype and on MSELoss reconciling a (10,) target
# with a (1, 10) prediction.
target = Variable(torch.arange(1, 11).float().view(1, -1))
# Mean squared error between prediction and target.
criterion = nn.MSELoss()

In [15]:
loss = criterion(output, target)
print(loss)

Variable containing:
 38.7876
[torch.FloatTensor of size 1]



In [16]:
print(loss.grad_fn)

<torch.autograd.function.MSELossBackward object at 0x11749a050>


In [17]:
print(loss.grad_fn.next_functions[0])

(<torch.autograd.function.AddmmBackward object at 0x117483ed8>, 0)


In [18]:
print(loss.grad_fn.next_functions[0][0].next_functions[0][0])

<AccumulateGrad object at 0x114b19d50>


### Backprop
To backpropagate the error, we just call loss.backward().
First, however, we must clear the existing gradients; otherwise the new gradients are accumulated onto (added to) the ones already stored.

In [19]:
net.zero_grad() #zeroes buffers

In [20]:
print('conv1.bias.grad before backward')
print(net.conv1.bias.grad)
# Prints six zeros: conv1 has one bias per output feature map (6 of them), and
# the gradient buffers were just cleared, so nothing has been accumulated yet.
print(net.conv1.weight.grad)
# The weight gradient is all zeros for the same reason.

conv1.bias.grad before backward
Variable containing:
 0
 0
 0
 0
 0
 0
[torch.FloatTensor of size 6]

Variable containing:
(0 ,0 ,.,.) = 
  0  0  0  0  0
  0  0  0  0  0
  0  0  0  0  0
  0  0  0  0  0
  0  0  0  0  0

(1 ,0 ,.,.) = 
  0  0  0  0  0
  0  0  0  0  0
  0  0  0  0  0
  0  0  0  0  0
  0  0  0  0  0

(2 ,0 ,.,.) = 
  0  0  0  0  0
  0  0  0  0  0
  0  0  0  0  0
  0  0  0  0  0
  0  0  0  0  0

(3 ,0 ,.,.) = 
  0  0  0  0  0
  0  0  0  0  0
  0  0  0  0  0
  0  0  0  0  0
  0  0  0  0  0

(4 ,0 ,.,.) = 
  0  0  0  0  0
  0  0  0  0  0
  0  0  0  0  0
  0  0  0  0  0
  0  0  0  0  0

(5 ,0 ,.,.) = 
  0  0  0  0  0
  0  0  0  0  0
  0  0  0  0  0
  0  0  0  0  0
  0  0  0  0  0
[torch.FloatTensor of size 6x1x5x5]



In [21]:
# computes dL/dw for every w, where w is all trainable parameters
loss.backward()

In [22]:
print('conv1.bias.grad after backward')
print(net.conv1.bias.grad)

conv1.bias.grad after backward
Variable containing:
 0.1673
-0.0275
 0.0086
-0.0254
-0.0884
 0.1040
[torch.FloatTensor of size 6]



full list of loss functions here: http://pytorch.org/docs/nn

### Update the weights

the very simple SGD: w' = w - lr*dL/dw

In [23]:
learning_rate = 0.01

In [24]:
# Reminder: methods whose name ends in an underscore operate in place, so this
# subtracts from each parameter its gradient scaled by the learning rate.
for f in net.parameters():
    f.data.sub_(f.grad.data * learning_rate)

#### what if we want to do SGD with momentum or Nesterov or Adam, RMSProp, torch.optim has it for us!

so let's just do the same as above but using the optim library

In [28]:
import torch.optim as optim

In [29]:
# create optimizer

optimizer = optim.SGD(net.parameters(), lr=0.01)

In [35]:
# Run 100 optimization steps on the same single (input, target) pair.
for i in range(100):
    # in your training loop:
    optimizer.zero_grad() # clear the gradients left over from the previous step
    output = net(input) # forward pass: compute the prediction
    loss = criterion(output, target) # forward pass: compute the loss
    loss.backward() # backprop: compute the gradient of the loss for every parameter
    optimizer.step() # update the weights using those gradients

In [36]:
loss.data


1.00000e-04 *
  7.5026
[torch.FloatTensor of size 1]