## Losses in PyTorch

In [1]:
import torch
from torch import nn
from torchvision import datasets, transforms

# Define a transform to normalize the data
transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize((0.5,), (0.5,)),
    ],
)
# Download and load the training data
trainset = datasets.MNIST("~/.pytorch/MNIST_data/", download=True, train=True, transform=transform)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=64, shuffle=True)

In [2]:
# Build a feed-forward network
model = nn.Sequential(nn.Linear(784, 128), nn.ReLU(), nn.Linear(128, 64), nn.ReLU(), nn.Linear(64, 10))

# Define the loss
criterion = nn.CrossEntropyLoss()

# Get our data
dataiter = iter(trainloader)

images, labels = next(dataiter)

# Flatten images
images = images.view(images.shape[0], -1)

# Forward pass, get our logits
logits = model(images)
# Calculate the loss with the logits and the labels
loss = criterion(logits, labels)

print(loss)

tensor(2.3047, grad_fn=<NllLossBackward0>)


In [3]:
# TODO: Build a feed-forward network
model = nn.Sequential(nn.Linear(784, 128), nn.ReLU(), nn.Linear(128, 64), nn.ReLU(), nn.Linear(64, 10), nn.LogSoftmax(dim=1))

# TODO: Define the loss
criterion = nn.NLLLoss()

### Run this to check your work
# Get our data
dataiter = iter(trainloader)

images, labels = next(dataiter)

# Flatten images
images = images.view(images.shape[0], -1)

# Forward pass, get our logits
logits = model(images)
# Calculate the loss with the logits and the labels
loss = criterion(logits, labels)

print(loss)


tensor(2.2762, grad_fn=<NllLossBackward0>)


## Autograd


In [4]:
x = torch.randn(2, 2, requires_grad=True)
print(x)

tensor([[0.5072, 1.9234],
        [0.0815, 1.4944]], requires_grad=True)


In [5]:
y = x**2
print(y)

tensor([[0.2573, 3.6994],
        [0.0066, 2.2332]], grad_fn=<PowBackward0>)


In [None]:
## grad_fn shows the function that generated this variable
print(y.grad_fn)

<PowBackward0 object at 0x7f3cb420ef50>


In [9]:
z = y.mean()
print(z)

tensor(1.5491, grad_fn=<MeanBackward0>)


In [10]:
print(x.grad)

None


In [11]:
z.backward()
print(x.grad)
print(2 * x / torch.numel(x))

tensor([[0.2536, 0.9617],
        [0.0408, 0.7472]])
tensor([[0.2536, 0.9617],
        [0.0408, 0.7472]], grad_fn=<DivBackward0>)


## Loss and Autograd together

In [12]:
# Build a feed-forward network
model = nn.Sequential(
    nn.Linear(784, 128),
    nn.ReLU(),
    nn.Linear(128, 64),
    nn.ReLU(),
    nn.Linear(64, 10),
    nn.LogSoftmax(dim=1),
)

criterion = nn.NLLLoss()
dataiter = iter(trainloader)
images, labels = next(dataiter)
images = images.view(images.shape[0], -1)

logits = model(images)
loss = criterion(logits, labels)

In [13]:
print("Before backward pass: \n", model[0].weight.grad)

loss.backward()

print("After backward pass: \n", model[0].weight.grad)

Before backward pass: 
 None
After backward pass: 
 tensor([[-0.0016, -0.0016, -0.0016,  ..., -0.0016, -0.0016, -0.0016],
        [-0.0002, -0.0002, -0.0002,  ..., -0.0002, -0.0002, -0.0002],
        [-0.0016, -0.0016, -0.0016,  ..., -0.0016, -0.0016, -0.0016],
        ...,
        [ 0.0021,  0.0021,  0.0021,  ...,  0.0021,  0.0021,  0.0021],
        [-0.0018, -0.0018, -0.0018,  ..., -0.0018, -0.0018, -0.0018],
        [-0.0018, -0.0018, -0.0018,  ..., -0.0018, -0.0018, -0.0018]])


## Training the network!

In [14]:
from torch import optim

# Optimizers require the parameters to optimize and a learning rate
optimizer = optim.SGD(model.parameters(), lr=0.01)

In [15]:
print("Initial weights - ", model[0].weight)

dataiter = iter(trainloader)
images, labels = next(dataiter)
images.resize_(64, 784)

# Clear the gradients, do this because gradients are accumulated
optimizer.zero_grad()

# Forward pass, then backward pass, then update weights
output = model(images)
loss = criterion(output, labels)
loss.backward()
print("Gradient -", model[0].weight.grad)

Initial weights -  Parameter containing:
tensor([[ 0.0271, -0.0034,  0.0091,  ...,  0.0349, -0.0027, -0.0252],
        [-0.0322, -0.0232, -0.0256,  ...,  0.0073,  0.0304, -0.0209],
        [-0.0232,  0.0309,  0.0314,  ...,  0.0326, -0.0052,  0.0186],
        ...,
        [-0.0213,  0.0322,  0.0149,  ..., -0.0302,  0.0339,  0.0168],
        [ 0.0334, -0.0277, -0.0090,  ...,  0.0075, -0.0044,  0.0138],
        [-0.0298,  0.0242,  0.0016,  ..., -0.0182, -0.0073,  0.0222]],
       requires_grad=True)
Gradient - tensor([[-3.9921e-05, -3.9921e-05, -3.9921e-05,  ..., -3.9921e-05,
         -3.9921e-05, -3.9921e-05],
        [-1.3228e-04, -1.3228e-04, -1.3228e-04,  ..., -1.3228e-04,
         -1.3228e-04, -1.3228e-04],
        [ 2.0258e-03,  2.0258e-03,  2.0258e-03,  ...,  2.0258e-03,
          2.0258e-03,  2.0258e-03],
        ...,
        [-7.7204e-04, -7.7204e-04, -7.7204e-04,  ..., -7.7204e-04,
         -7.7204e-04, -7.7204e-04],
        [-4.4808e-03, -4.4808e-03, -4.4808e-03,  ..., -4.4808e

In [16]:
# Take an update step and view the new weights
optimizer.step()
print("Updated weights - ", model[0].weight)

Updated weights -  Parameter containing:
tensor([[ 0.0271, -0.0034,  0.0091,  ...,  0.0349, -0.0027, -0.0252],
        [-0.0322, -0.0232, -0.0256,  ...,  0.0073,  0.0304, -0.0209],
        [-0.0232,  0.0309,  0.0314,  ...,  0.0325, -0.0052,  0.0186],
        ...,
        [-0.0213,  0.0322,  0.0149,  ..., -0.0302,  0.0339,  0.0168],
        [ 0.0334, -0.0277, -0.0090,  ...,  0.0075, -0.0043,  0.0139],
        [-0.0297,  0.0242,  0.0016,  ..., -0.0182, -0.0073,  0.0223]],
       requires_grad=True)


## Training for real

In [None]:
## Your solution here

model = nn.Sequential(
    nn.Linear(784, 128),
    nn.ReLU(),
    nn.Linear(128, 64),
    nn.ReLU(),
    nn.Linear(64, 10),
    nn.LogSoftmax(dim=1),
)

criterion = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.003)

epochs = 5
for _ in range(epochs):
    running_loss = 0
    for images, labels in trainloader:  # noqa: B007
        # Flatten MNIST images into a 784 long vector
        images = images.view(images.shape[0], -1)

        # TODO: Training pass
        optimizer.zero_grad()
        output = model(images)
        loss = criterion(output, labels)
        loss.backward()
        running_loss += loss.item()
    else:
        print(f"Training loss: {running_loss / len(trainloader)}")

Training loss: 2.308278787873193
Training loss: 2.308262900248774
Training loss: 2.3082751973605613
Training loss: 2.3082752692928192


In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import helper

dataiter = iter(trainloader)
images, labels = next(dataiter)

img = images[0].view(1, 784)
# Turn off gradients to speed up this part
with torch.no_grad():
    logps = model(img)

# Output of the network are log-probabilities, need to take exponential for probabilities
ps = torch.exp(logps)
helper.view_classify(img.view(1, 28, 28), ps)
plt.show()