In [1]:
## Neural nets in pytorch
 # defined in torch.nn
 # relies on autograd to define models and differentiate them
 # nn.Module contains layers and a forward(input) method that returns the net's output

In [2]:
## Typical training procedure (we know most of this already, but always good to review)
 # Define the net that has some weights (learnable parameters)
 # Process some inputs through the network
 # Compute the loss for those inputs
 # Propogate gradients back to the paramters
 # Update the weights of the network using your sample rule (e.g. new_weight = old_weight - learning_rate * gradient)

In [29]:
## We're going to build the classic digit image classifier :)
import torch
import torch.nn as nn
import torch.nn.functional as F

# Note: this is all but copy and pasted from the tutorial for the sake of time
class Net(nn.Module):
    
    def __init__(self):
        super(Net, self).__init__()
        # 1 input image channel, 6 output channels, 3x3 square convolution kernel
        # Note: Nice review on kernel sizing here: https://stats.stackexchange.com/questions/296679/what-does-kernel-size-mean/296701
        self.conv1 = nn.Conv2d(1,6,3)
        self.conv2 = nn.Conv2d(6, 16, 3)
        
        # an affine operation: y = Wx + b
        self.fc1 = nn.Linear(16 * 6 * 6, 120)  # Output * Conv2 input * Conv1 output
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)  # 10 output features representing the digit being 0...9
    
    def forward(self, x):
        # Max pooling over a (2,2) window
        # Note: https://machinelearningmastery.com/rectified-linear-activation-function-for-deep-learning-neural-networks/
        x = F.max_pool2d(F.relu(self.conv1(x)), (2,2))
        # If the size is a square you can only specify a single number
        x = F.max_pool2d(F.relu(self.conv2(x)), 2)
        x = x.view(-1, self.num_flat_features(x))
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x
    
    def num_flat_features(self, x):
        size = x.size()[1:] # All features except the batch dimension
        num_features = 1
        for s in size:
            num_features *= s
        return num_features        

In [31]:
net = Net()
params = list(net.parameters())
print(len(params))
print(params[0].size())  # Conv1's .weight

10
torch.Size([6, 1, 3, 3])


In [32]:
# Let's try a random input of 32x32
input = torch.randn(1, 32, 32).unsqueeze(0)
out = net.forward(input)

# Zero the gradient buffers and backprop random gradients
net.zero_grad()
out.backward(torch.randn(1,10))

In [33]:
## !! Important !!
 #  
 # torch.nn only supports mini-batches. The entire torch.nn package only supports inputs that are a mini-batch of samples, and ##  not a single sample.
 # For example, nn.Conv2d will take in a 4D Tensor of nSamples x nChannels x Height x Width.
 # If you have a single sample, just use input.unsqueeze(0) to add a fake batch dimension.

In [39]:
# Now we need to compute the loss and update the weights
output = net(input)
target = torch.randn(10)  # A dummy desired output (10 out features due to our net design)
target = target.view(1, -1)  # Convert to match output shape
criterion = nn.MSELoss()  # Use the built in mean squared error loss function
loss = criterion(output, target)
print(loss)

tensor(1.0706, grad_fn=<MseLossBackward>)


In [42]:
# We can actually follow the differentiation back, starting at the loss
print(loss.grad_fn)
print(loss.grad_fn.next_functions[0][0])
print(loss.grad_fn.next_functions[0][0].next_functions[0][0])

<MseLossBackward object at 0x7f7e2021ca00>
<AddmmBackward object at 0x7f7e2021ca60>
<AccumulateGrad object at 0x7f7e2021ca00>


In [43]:
# Now we can examine gradient changes before and after backprop
print(f"conv1.bias.grad initial:\n{net.conv1.bias.grad}")

loss.backward()

print(f"conv2.bias.grad after backprop:\n{net.conv1.bias.grad}")

conv1.bias.grad initial:
tensor([-0.0059, -0.0016, -0.0104,  0.0126,  0.0205,  0.0118])
conv2.bias.grad after backprop:
tensor([ 0.0147,  0.0099, -0.0152,  0.0056,  0.0343,  0.0246])


In [44]:
# We can manually update the weights. This is easy for SGD simple rule: old_weight = new_weight - learning_rate * gradient
learning_rate = 0.01
for f in net.parameters():
    f.data.sub_(f.grad.data * learning_rate)  # Updating the weight in place  '_'

In [45]:
# But, of course, torch makes this easy for us. Especially when we want more complex update rules.
import torch.optim as optim
optimizer = optim.SGD(net.parameters(), lr=0.01)

# within the training loop
output = net(input)
loss = criterion(output, target)
loss.backward()
optimizer.step()