In [84]:
import os
import sys
import numpy as np
import einops
from typing import Union, Optional, Tuple, List, Dict
import torch as t
from torch import Tensor
import torch.nn as nn
import torch.nn.functional as F
from jaxtyping import Float, Int
import functools
from pathlib import Path
from torchvision import datasets, transforms, models
from torch.utils.data import DataLoader, Subset
from tqdm.notebook import tqdm
from dataclasses import dataclass
from PIL import Image
import json

# Make sure exercises are in the path.
# The exercises directory is located relative to the current working directory: everything
# before the first occurrence of the chapter name in os.getcwd() is kept, and
# "<chapter>/exercises" is appended to it.
chapter = r"chapter0_fundamentals"
exercises_dir = Path(f"{os.getcwd().split(chapter)[0]}/{chapter}/exercises").resolve()
section_dir = exercises_dir / "part2_cnns"
if str(exercises_dir) not in sys.path: sys.path.append(str(exercises_dir))

# Local helpers; these imports only resolve once `exercises_dir` is on sys.path (see above).
from plotly_utils import imshow, line, bar
import part2_cnns.tests as tests
from part2_cnns.utils import print_param_count

# True when this code runs as the top-level script / notebook rather than being imported.
MAIN = __name__ == "__main__"

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = t.device("cuda" if t.cuda.is_available() else "cpu")

# Making your own modules

## ReLU

In [4]:
class ReLU(nn.Module):
    '''Elementwise rectifier: values below zero become zero, everything else passes through.'''

    def forward(self, x: t.Tensor) -> t.Tensor:
        '''
        x: tensor of any shape
        Return: tensor of the same shape, with every entry clamped from below at 0
        '''
        return t.clamp(x, min=0.)

# Run the `part2_cnns.tests` check against the ReLU implementation above.
tests.test_relu(ReLU)

All tests in `test_relu` passed!


## Linear

#### Question - what type should these variables be?

The variables should be two tensors wrapped in `nn.Parameter`s.  The weight tensor should have shape (D_out, D_in), matching PyTorch's convention (the forward pass multiplies by its transpose), and the bias tensor should have shape (D_out,).

In [141]:
class Linear(nn.Module):
    def __init__(self, in_features: int, out_features: int, bias=True):
        '''
        A simple linear (technically, affine) transformation.

        The fields are named `weight` and `bias` for compatibility with PyTorch.
        If `bias` is False, `self.bias` is set to None.

        in_features: size of the last dimension of the input
        out_features: size of the last dimension of the output
        bias: whether to add a learnable offset of shape (out_features,)
        '''
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features

        # Weights are drawn uniformly from [-1/sqrt(in_features), 1/sqrt(in_features)).
        # They are stored as (out_features, in_features), hence the transpose in `forward`.
        # `float(...)` keeps NumPy scalars out of the tensor arithmetic.
        bound = float(1 / np.sqrt(in_features))
        weights = bound * (2 * t.rand(out_features, in_features) - 1)
        self.weight = nn.Parameter(weights)
        # The bias, when present, starts at zero.
        self.bias = nn.Parameter(t.zeros(out_features)) if bias else None

    def forward(self, x: t.Tensor) -> t.Tensor:
        '''
        x: shape (*, in_features)
        Return: shape (*, out_features)
        '''
        z = x @ self.weight.T
        if self.bias is None:
            return z
        # Out-of-place add: the matmul result is never mutated after it is created.
        return z + self.bias

    def extra_repr(self) -> str:
        '''Show the layer sizes and whether a bias is present in the module's repr.'''
        return (
            f"in_features={self.in_features}, "
            f"out_features={self.out_features}, "
            f"bias={self.bias is not None}"
        )


# Run the `part2_cnns.tests` checks against the Linear implementation above.
tests.test_linear_forward(Linear)
tests.test_linear_parameters(Linear)
tests.test_linear_no_bias(Linear)

All tests in `test_linear_forward` passed!
All tests in `test_linear_parameters` passed!
All tests in `test_linear_no_bias` passed!


### Flatten

In [142]:
class Flatten(nn.Module):
    def __init__(self, start_dim: int = 1, end_dim: int = -1) -> None:
        '''
        start_dim: first dimension to merge (negative values count from the end)
        end_dim: last dimension to merge, inclusive (negative values count from the end)
        '''
        super().__init__()
        self.start_dim = start_dim
        self.end_dim = end_dim

    @staticmethod
    def _resolve_dim(dim: int, n_dims: int) -> int:
        '''Map a possibly negative dimension index onto [0, n_dims), or raise IndexError.'''
        if not -n_dims <= dim < n_dims:
            raise IndexError(
                f"Dimension out of range (expected to be in range of [{-n_dims}, {n_dims - 1}], but got {dim})"
            )
        return dim % n_dims

    def forward(self, x: t.Tensor) -> t.Tensor:
        '''
        Flatten out dimensions from start_dim to end_dim, inclusive of both.

        Raises IndexError if either dimension is out of range for `x`, and ValueError
        if start_dim resolves to a later dimension than end_dim.
        '''
        in_shape = list(x.shape)
        n_dims = len(in_shape)
        if n_dims == 0:
            # A scalar has no dimensions to merge; like torch.flatten, return a 1-element vector.
            return x.reshape(1)

        ds = self._resolve_dim(self.start_dim, n_dims)
        de = self._resolve_dim(self.end_dim, n_dims)
        if ds > de:
            raise ValueError("start_dim cannot come after end_dim")

        flat_dim_shape = 1
        for size in in_shape[ds:de + 1]:
            flat_dim_shape *= size
        out_shape = in_shape[:ds] + [flat_dim_shape] + in_shape[de + 1:]
        # reshape (rather than view) also accepts non-contiguous inputs such as transposes.
        return x.reshape(out_shape)

    def extra_repr(self) -> str:
        '''Show the flattened range in the module's repr.'''
        return f"start_dim={self.start_dim}, end_dim={self.end_dim}"
        



# Run the `part2_cnns.tests` check against the Flatten implementation above.
tests.test_flatten(Flatten)

All tests in `test_flatten` passed!


### Simple Multi-Layer Perceptron

##### Question - can you see what makes logits non-unique (i.e. why any given set of probabilities might correspond to several different possible sets of logits)?

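The softmax function is invariant to translation of its input: adding the same constant $s$ to every logit leaves the probabilities unchanged.  We can see this by multiplying the function by 1 written as $\frac{e^s}{e^s}$, which results in

\begin{align}
    \mathrm{softmax}(x)_i = \frac{e^{x_i}}{\sum_j e^{x_j}} = \frac{e^{x_i + s}}{\sum_j e^{x_j + s}}
\end{align}

This invariance can be used to make softmax numerically stable (for example by choosing $s = -\max_j x_j$, so that no exponent is positive).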

In [143]:
class SimpleMLP(nn.Module):
    '''
    Two-layer perceptron: Flatten -> Linear(784, 100) -> ReLU -> Linear(100, 10).
    '''

    def __init__(self):
        super().__init__()
        n_inputs, n_hidden, n_outputs = 28**2, 100, 10

        # Submodules are registered in the order they are applied in `forward`.
        self.flatten = Flatten()
        self.linear1 = Linear(n_inputs, n_hidden)
        self.relu = ReLU()
        self.linear2 = Linear(n_hidden, n_outputs)

    def forward(self, x: t.Tensor) -> t.Tensor:
        '''
        x: tensor whose dimensions after the first flatten to 784 values per example
        Return: shape (batch, 10), the output of the second linear layer
        '''
        hidden = self.relu(self.linear1(self.flatten(x)))
        return self.linear2(hidden)
        


# Run the `part2_cnns.tests` check against the SimpleMLP implementation above.
tests.test_mlp(SimpleMLP)

All tests in `test_mlp` passed!


# Training Neural Networks

### Transforms, Datasets & DataLoaders

In [144]:
# Preprocessing applied to every MNIST image: convert to a tensor, then normalize the single
# channel with mean 0.1307 and std 0.3081
# (presumably the MNIST training-set statistics — TODO confirm).
MNIST_TRANSFORM = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,))
])

def get_mnist(subset: int = 1):
    '''
    Builds the MNIST train and test datasets (stored under ./data, with download enabled)
    with `MNIST_TRANSFORM` applied.

    subset: stride used to thin the data. When greater than 1, only the examples at
        indices 0, subset, 2*subset, ... of each split are kept.

    Returns: (trainset, testset)
    '''
    splits = [
        datasets.MNIST(root="./data", train=is_train, download=True, transform=MNIST_TRANSFORM)
        for is_train in (True, False)
    ]

    if subset > 1:
        splits = [Subset(split, indices=range(0, len(split), subset)) for split in splits]

    trainset, testset = splits
    return trainset, testset


# Full (un-subsampled) datasets, since `subset` defaults to 1.
mnist_trainset, mnist_testset = get_mnist()
# Only the training loader shuffles; the test loader keeps the dataset order.
mnist_trainloader = DataLoader(mnist_trainset, batch_size=64, shuffle=True)
mnist_testloader = DataLoader(mnist_testset, batch_size=64, shuffle=False)

#### Question - can you explain why we include a data normalization function in torchvision.transforms?

Generally, we care about learning the pattern of variation in the data. If we use unnormalized data, the same pattern of variation is present, but the scale can be anything. For this kind of analysis, where we only have a single input layer, normalization is probably not that important. Often, however, we have input features with radically different scales, and this can make training with gradient descent quite difficult because the differing scales of the features distort the loss surface.

#### Question - what is the benefit of using shuffle=True when defining our dataloaders? What might the problem be if we didn't do this?

This ensures we get different subsets of data in each training batch.  This is required if we want to be able to mathematically reason about gradient descent and to get the implicit regularization effects to be unbiased.


### Training loop

In [145]:
# Fresh model, placed on `device`.
model = SimpleMLP().to(device)

batch_size = 64
epochs = 3

# Rebinds `mnist_trainset` / `mnist_trainloader` to a subsample that keeps every 10th
# training example; the test split is discarded here.
mnist_trainset, _ = get_mnist(subset = 10)
mnist_trainloader = DataLoader(mnist_trainset, batch_size=batch_size, shuffle=True)

optimizer = t.optim.Adam(model.parameters(), lr=1e-3)
loss_list = []  # training loss of every batch, in the order the batches were seen

for epoch in tqdm(range(epochs)):
    for imgs, labels in mnist_trainloader:
        imgs = imgs.to(device)
        labels = labels.to(device)
        logits = model(imgs)
        loss = F.cross_entropy(logits, labels)
        loss.backward()
        optimizer.step()
        # Reset the gradients so the next backward pass does not accumulate onto them.
        optimizer.zero_grad()
        loss_list.append(loss.item())   

# Plot the per-batch training loss.
line(
    loss_list, 
    yaxis_range=[0, max(loss_list) + 0.1],
    labels={"x": "Num batches seen", "y": "Cross entropy loss"}, 
    title="SimpleMLP training on MNIST",
    width=700
)

  0%|          | 0/3 [00:00<?, ?it/s]

In [146]:
@dataclass
class SimpleMLPTrainingArgs:
    '''
    Hyperparameters consumed by `train`.

    The dataclass decorator generates an __init__ that stores each field below on the
    instance, using the listed value as the default. Any field can be overridden by
    keyword at construction time, e.g. SimpleMLPTrainingArgs(batch_size=128).
    '''
    # Number of examples per DataLoader batch.
    batch_size: int = 64
    # Number of passes over the training loader.
    epochs: int = 3
    # Step size handed to the Adam optimizer.
    learning_rate: float = 1e-3
    # Forwarded to get_mnist(subset=...).
    subset: int = 10


def _evaluate(model: nn.Module, loader: DataLoader) -> Tuple[float, float]:
    '''
    Returns (mean cross-entropy loss, accuracy) of `model` over every example in `loader`,
    both as plain Python floats. Gradients are disabled while evaluating.
    '''
    total_loss = 0.0
    num_correct = 0
    num_seen = 0
    with t.no_grad():
        for imgs, labels in loader:
            imgs = imgs.to(device)
            labels = labels.to(device)
            logits = model(imgs)
            # Sum (rather than average) within the batch so that a smaller final batch
            # is not over-weighted when the total is divided by the example count.
            total_loss += F.cross_entropy(logits, labels, reduction="sum").item()
            # .item() keeps the counter a Python int instead of a (possibly CUDA) tensor.
            num_correct += (logits.argmax(dim=1) == labels).sum().item()
            num_seen += labels.size(0)
    return total_loss / num_seen, num_correct / num_seen


def train(args: SimpleMLPTrainingArgs):
    '''
    Trains a fresh SimpleMLP, using training parameters from the `args` object, and plots
    the per-batch training loss plus the test loss and test accuracy.

    The test metrics are recorded once before training and once after every epoch, so the
    test curves have `args.epochs + 1` points (index 0 is the untrained model).
    '''
    model = SimpleMLP().to(device)

    mnist_trainset, mnist_testset = get_mnist(subset=args.subset)
    mnist_trainloader = DataLoader(mnist_trainset, batch_size=args.batch_size, shuffle=True)
    mnist_testloader = DataLoader(mnist_testset, batch_size=args.batch_size, shuffle=False)

    optimizer = t.optim.Adam(model.parameters(), lr=args.learning_rate)
    loss_list = []
    val_loss_list = []
    val_accuracy_list = []

    # How does the model do initially?
    val_loss, val_accuracy = _evaluate(model, mnist_testloader)
    val_loss_list.append(val_loss)
    val_accuracy_list.append(val_accuracy)

    for epoch in tqdm(range(args.epochs)):
        for imgs, labels in mnist_trainloader:
            imgs = imgs.to(device)
            labels = labels.to(device)
            logits = model(imgs)
            loss = F.cross_entropy(logits, labels)
            loss.backward()
            optimizer.step()
            # Reset the gradients so the next backward pass does not accumulate onto them.
            optimizer.zero_grad()
            loss_list.append(loss.item())

        # Re-measure test performance at the end of every epoch.
        val_loss, val_accuracy = _evaluate(model, mnist_testloader)
        val_loss_list.append(val_loss)
        val_accuracy_list.append(val_accuracy)

    line(
        loss_list, 
        yaxis_range=[0, max(loss_list) + 0.1],
        labels={"x": "Num batches seen", "y": "Cross entropy loss (train)"}, 
        title="SimpleMLP training on MNIST",
        width=700
    )

    line(
        val_loss_list,
        yaxis_range=[0, max(val_loss_list) + 0.1],
        labels={"x": "Num epochs", "y": "Cross entropy loss (test)"}, 
        width=700
    )

    line(
        val_accuracy_list,
        yaxis_range=[0, 1],
        labels={"x": "Num epochs", "y": "Test Accuracy"}, 
        width=700
    )


# Train with the default hyperparameters declared on SimpleMLPTrainingArgs.
args = SimpleMLPTrainingArgs()
train(args)

  0%|          | 0/3 [00:00<?, ?it/s]

# Convolutions

#### Why would convolutional layers be less likely to overfit data than standard linear (fully connected) layers?

Convolutional layers have far, far fewer parameters and a strong inductive bias. Both of these make it significantly less likely to overfit a problem (though they may also make it useless for the problem if the inductive bias is a bad one)

#### Suppose you fixed some random permutation of the pixels in an image, and applied this to all images in your dataset, before training a convolutional neural network for classifying images. Do you expect this to be less effective, or equally effective?

This should be less effective (may be task dependent, I'd expect the permutation to have a larger effect on segmentation tasks than on classification ones). The model may be able to learn (and reverse) the random permutation, but we've broken the spatial correlation in the data with the random permutation. 

#### If you have a 28x28 image, and you apply a 3x3 convolution with stride 1, padding 1, what shape will the output be?

Presuming the "padding" here means we add an extra row/column to each side of the image (ie a bounding rectangle), the resulting image will be the same shape.  

## Conv2d

In [147]:
class Conv2d(nn.Module):
    def __init__(
        self, in_channels: int, out_channels: int, kernel_size: int, stride: int = 1, padding: int = 0
    ):
        '''
        2D convolution without a bias term (the counterpart of torch.nn.Conv2d with bias=False).

        The kernel lives in `self.weight`, shaped
        (out_channels, in_channels, kernel_size, kernel_size), so the parameter name is
        compatible with the PyTorch version.
        '''
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        self.stride = stride
        self.padding = padding

        # Uniform initialisation on [-sqrt_k, sqrt_k), where 1/sqrt_k**2 is the number of
        # inputs feeding each output value (in_channels * kernel_size**2).
        fan_in = in_channels * kernel_size**2
        sqrt_k = np.sqrt(1 / fan_in)
        unit_noise = t.rand(out_channels, in_channels, kernel_size, kernel_size)
        self.weight = nn.Parameter(unit_noise * (2*sqrt_k) - sqrt_k)

    def forward(self, x: t.Tensor) -> t.Tensor:
        '''Convolve `x` with `self.weight` using the stored stride and padding.'''
        return F.conv2d(x, self.weight, stride=self.stride, padding=self.padding)

    def extra_repr(self) -> str:
        '''Expose the constructor arguments in the module's repr.'''
        return (
            f"in_channels={self.in_channels}, "
            f"out_channels={self.out_channels}, "
            f"kernel_size={self.kernel_size}, "
            f"stride={self.stride}, "
            f"padding={self.padding}"
        )


# Run the `part2_cnns.tests` check, then print a sample module so its repr
# (built from `extra_repr`) can be inspected by eye.
tests.test_conv2d_module(Conv2d)
m = Conv2d(in_channels=24, out_channels=12, kernel_size=3, stride=2, padding=1)
print(f"Manually verify that this is an informative repr: {m}")

All tests in `test_conv2d_module` passed!
Manually verify that this is an informative repr: Conv2d(in_channels=24, out_channels=12, kernel_size=3, stride=2, padding=1)


## MaxPool2d

In [148]:
class MaxPool2d(nn.Module):
    def __init__(self, kernel_size: int, stride: Optional[int] = None, padding: int = 1):
        '''
        Stores the pooling hyperparameters; `forward` hands them unchanged to the
        functional max_pool2d.

        Note that `padding` defaults to 1 in this class, whereas torch.nn.MaxPool2d
        defaults to 0. `stride` is passed through as given, including None.
        '''
        super().__init__()
        self.kernel_size = kernel_size
        self.stride = stride
        self.padding = padding

    def forward(self, x: t.Tensor) -> t.Tensor:
        '''Call the functional version of max_pool2d with the stored settings.'''
        return F.max_pool2d(
            x,
            kernel_size=self.kernel_size,
            stride=self.stride,
            padding=self.padding,
        )

    def extra_repr(self) -> str:
        '''Expose the constructor arguments in the module's repr.'''
        return f"kernel_size={self.kernel_size}, stride={self.stride}, padding={self.padding}"


# Run the `part2_cnns.tests` check, then print a sample module so its repr
# (built from `extra_repr`) can be inspected by eye.
tests.test_maxpool2d_module(MaxPool2d)
m = MaxPool2d(kernel_size=3, stride=2, padding=1)
print(f"Manually verify that this is an informative repr: {m}")

All tests in `test_maxpool2d_module` passed!
Manually verify that this is an informative repr: MaxPool2d(kernel_size=3, stride=2, padding=1)


# ResNets

### Reading


#### "Batch Normalization allows us to be less careful about initialization." Explain this statement.

So batch normalization occurs between the linear piece of the model and the activation function in a neuron.  The reason we generally have to be careful about initialization is not about any particular layer, but about the propagation of variance through multiple layers.  When we batchnorm, we stabilize the variance layer by layer, so the effects of a poorly initialized layer are never propagated to downstream layers.

#### Give three reasons why batch norm improves the performance of neural networks.

- It makes the network less sensitive to initialization
- It _may_ make the model less sensitive to internal covariate shift. I'd need more convincing about how this impacts generalization.
- It acts as a regularizer
- It allows for training deeper networks.

#### If you have an input tensor of size (batch, channels, width, height), and you apply a batchnorm layer, how many learned parameters will there be?

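2 * channels, as we learn a scale (`weight`) and a shift (`bias`) for each channel. The per-channel running mean and variance are also stored, but they are tracked statistics (buffers) rather than learned parameters.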

#### In the paper, the diagram shows additive skip connections (i.e. F(x) + x). One can also form concatenated skip connections, by "gluing together" F(x) and x into a single tensor. Give one advantage and one disadvantage of these, relative to additive connections.

An advantage of concatenation is that you end up parallelizing computation through the network. This can be thought of as a form of ensembling where each channel carries information from a subnetwork of different depth.  This also gives backprop more direct access to many layers of computation, potentially improving training speeds.  A disadvantage is that you've got more output, so you're dealing with more data. You're also teaching the network something pretty different (incrementally adjusting the residual is a different logical model than the ensembling picture that concatenation gives you).

### Sequential 

In [149]:
class Sequential(nn.Module):
    '''
    Container that applies its child modules one after another.

    Children are registered under the string keys "0", "1", ... in the order given.
    '''
    _modules: Dict[str, nn.Module]

    def __init__(self, *modules: nn.Module):
        super().__init__()
        for index, mod in enumerate(modules):
            self.add_module(str(index), mod)

    def _normalize_index(self, index: int) -> int:
        '''
        Map `index` (negative values count from the end) onto [0, len), raising
        IndexError when it does not refer to an existing child. Without the range check,
        a plain modulo would silently wrap e.g. index 5 of a 3-module container to 2.
        '''
        n_modules = len(self._modules)
        if not -n_modules <= index < n_modules:
            raise IndexError(f"index {index} is out of range for Sequential with {n_modules} modules")
        return index % n_modules

    def __getitem__(self, index: int) -> nn.Module:
        '''Return the child at position `index`; negative indices are supported.'''
        return self._modules[str(self._normalize_index(index))]

    def __setitem__(self, index: int, module: nn.Module) -> None:
        '''Replace the child at position `index`; negative indices are supported.'''
        self._modules[str(self._normalize_index(index))] = module

    def forward(self, x: t.Tensor) -> t.Tensor:
        '''Chain each module together, with the output from one feeding into the next one.'''
        for mod in self._modules.values():
            x = mod(x)
        return x

### BatchNorm2d

In [150]:
class BatchNorm2d(nn.Module):
    # The type hints below aren't functional, they're just for documentation.
    # They are written as strings so that defining the class never evaluates them.
    running_mean: "Float[Tensor, 'num_features']"
    running_var: "Float[Tensor, 'num_features']"
    num_batches_tracked: "Int[Tensor, '']" # This is how we denote a scalar tensor

    def __init__(self, num_features: int, eps=1e-05, momentum=0.1):
        '''
        Like nn.BatchNorm2d with track_running_stats=True and affine=True.

        The learnable affine parameters are named `weight` and `bias`, in that order.

        num_features: number of channels of the expected input
        eps: constant added to the variance before taking the square root
        momentum: weight given to the current batch when updating the running statistics
        '''
        super().__init__()
        self.num_features = num_features
        # Shape that broadcasts a per-channel vector against (batch, channels, height, width).
        self.math_shape = (self.num_features, 1, 1)
        self.eps = eps
        self.momentum = momentum

        self.register_buffer("running_mean", t.zeros(num_features))
        self.register_buffer("running_var", t.ones(num_features))
        self.register_buffer("num_batches_tracked", t.tensor(0))

        self.weight = nn.Parameter(t.ones(num_features))
        self.bias = nn.Parameter(t.zeros(num_features))

    def forward(self, x: t.Tensor) -> t.Tensor:
        '''
        Normalize each channel.

        In training mode the statistics of the current batch are used, and the running
        statistics are updated as an exponential moving average. In eval mode the stored
        running statistics are used and nothing is updated.

        x: shape (batch, channels, height, width)
        Return: shape (batch, channels, height, width)
        '''
        if self.training:
            mean, var = t.mean(x, (0, 2, 3)), t.var(x, (0, 2, 3), unbiased=False)
            # Update the buffers outside of autograd: otherwise they would carry a grad_fn,
            # keep every previous step's graph reachable, and could not be deep-copied.
            # NOTE(review): the running variance uses the same biased estimate as the
            # normalisation; torch.nn.BatchNorm2d documents an unbiased estimate for its
            # running_var — confirm whether exact parity during training is required.
            with t.no_grad():
                self.running_mean = (1 - self.momentum)*self.running_mean + self.momentum*mean
                self.running_var = (1 - self.momentum)*self.running_var + self.momentum*var
                self.num_batches_tracked += 1
        else:
            mean, var = self.running_mean, self.running_var

        z = (x - mean.view(self.math_shape)) / t.sqrt(var.view(self.math_shape) + self.eps)
        return self.weight.view(self.math_shape) * z + self.bias.view(self.math_shape)

    def extra_repr(self) -> str:
        '''Expose the constructor arguments in the module's repr.'''
        return f"num_features={self.num_features}, eps={self.eps}, momentum={self.momentum}"


# Run the `part2_cnns.tests` checks against the BatchNorm2d implementation above.
tests.test_batchnorm2d_module(BatchNorm2d)
tests.test_batchnorm2d_forward(BatchNorm2d)
tests.test_batchnorm2d_running_mean(BatchNorm2d)

All tests in `test_batchnorm2d_module` passed!
All tests in `test_batchnorm2d_forward` passed!
All tests in `test_batchnorm2d_running_mean` passed!


### AveragePool

In [151]:
class AveragePool(nn.Module):
    '''Averages over dimensions 2 and 3 of its input, leaving one value per (batch, channel).'''

    def forward(self, x: t.Tensor) -> t.Tensor:
        '''
        x: shape (batch, channels, height, width)
        Return: shape (batch, channels)
        '''
        return x.mean(dim=(2, 3))

## Building ResNet

#### Question: there would be no advantage to enabling biases on the convolutional layers. Why?

With batch norm, the biases get sucked out into the batch mean.  

#### Question: why is it necessary for the output of the left and right computational tracks in ResidualBlock to be the same shape?

So we can add them together...?

### ResidualBlock

In [152]:
class ResidualBlock(nn.Module):
    def __init__(self, in_feats: int, out_feats: int, first_stride=1):
        '''
        A single residual block with optional downsampling.

        For compatibility with the pretrained model, the left branch is declared first
        and the right (shortcut) branch second.

        The right branch is a (1x1 conv + bn) projection whenever the left branch changes
        the shape of its input, i.e. when first_stride > 1 or in_feats != out_feats.
        Otherwise it is the identity, so the block adds the left branch's output to the
        unmodified input.
        '''
        super().__init__()
        self.in_feats = in_feats
        self.out_feats = out_feats
        self.first_stride = first_stride

        # Left branch: conv -> bn -> relu -> conv -> bn. Only the first conv is strided.
        self.model_block = nn.Sequential(
            Conv2d(in_feats, out_feats, 3, stride=first_stride, padding=1),
            BatchNorm2d(out_feats),
            ReLU(),
            Conv2d(out_feats, out_feats, 3, padding=1),
            BatchNorm2d(out_feats),
        )

        # Without a projection, a change in channel count at stride 1 would leave the two
        # branches with different shapes and the addition in `forward` would fail.
        needs_projection = first_stride > 1 or in_feats != out_feats
        if needs_projection:
            self.residual_block = nn.Sequential(
                Conv2d(in_feats, out_feats, 1, stride=first_stride),
                BatchNorm2d(out_feats),
            )
        else:
            self.residual_block = nn.Identity()
        self.activation = ReLU()

    def forward(self, x: t.Tensor) -> t.Tensor:
        '''
        Compute the forward pass.

        x: shape (batch, in_feats, height, width)

        Return: shape (batch, out_feats, height / stride, width / stride)

        If no projection is present, the addition just adds the left branch's output to the input.
        '''
        y = self.model_block(x)
        res = self.residual_block(x)
        return self.activation(y + res)

### BlockGroup

In [153]:
class BlockGroup(nn.Module):
    def __init__(self, n_blocks: int, in_feats: int, out_feats: int, first_stride=1):
        '''
        A chain of `n_blocks` ResidualBlocks.

        Only the first block receives `in_feats` and `first_stride`; every following
        block maps out_feats -> out_feats with the default stride.
        '''
        super().__init__()
        self.n_blocks = n_blocks
        self.in_feats = in_feats
        self.out_feats = out_feats
        self.first_stride = first_stride

        assert n_blocks >= 1

        head = ResidualBlock(in_feats, out_feats, first_stride)
        tail = [ResidualBlock(out_feats, out_feats) for _ in range(n_blocks - 1)]
        self.blocks = Sequential(head, *tail)

    def forward(self, x: t.Tensor) -> t.Tensor:
        '''
        Run `x` through every block in order.

        x: shape (batch, in_feats, height, width)

        Return: shape (batch, out_feats, height / first_stride, width / first_stride)
        '''
        return self.blocks(x)

### ResNet34

In [154]:
class ResNet34(nn.Module):
    def __init__(
        self,
        n_blocks_per_group=[3, 4, 6, 3],
        out_features_per_group=[64, 128, 256, 512],
        first_strides_per_group=[1, 2, 2, 2],
        n_classes=1000,
    ):
        '''
        Stem (7x7 conv, bn, relu, 3x3 max-pool) followed by one BlockGroup per entry of the
        per-group arguments, then global average pooling and a linear classifier.

        n_blocks_per_group: number of residual blocks in each group
        out_features_per_group: number of output channels of each group
        first_strides_per_group: stride of the first block of each group
        n_classes: size of the output layer

        The three per-group sequences must be non-empty and of equal length (ValueError
        otherwise). The default lists are only read, never mutated.
        '''
        super().__init__()
        n_groups = len(n_blocks_per_group)
        # zip() would silently drop the extra entries of a longer sequence, so check up front.
        if not (n_groups == len(out_features_per_group) == len(first_strides_per_group)):
            raise ValueError("The per-group arguments must all have the same length.")
        if n_groups == 0:
            raise ValueError("At least one block group is required.")

        # Each group consumes the channels produced by the previous one; the first group
        # consumes the channels produced by the stem convolution.
        stem_channels = 64
        in_features_per_group = [stem_channels] + list(out_features_per_group[:-1])
        params = zip(
            n_blocks_per_group,
            in_features_per_group,
            out_features_per_group,
            first_strides_per_group,
        )

        self.layers = Sequential(
            Conv2d(in_channels=3, out_channels=stem_channels, kernel_size=7, stride=2, padding=3),
            BatchNorm2d(num_features=stem_channels),
            ReLU(),
            # padding=1 is spelled out rather than left to MaxPool2d's default.
            MaxPool2d(kernel_size=3, stride=2, padding=1),
            *[BlockGroup(*p) for p in params],
            AveragePool(),
            Flatten(),
            Linear(in_features=out_features_per_group[-1], out_features=n_classes)
        )

    def forward(self, x: t.Tensor) -> t.Tensor:
        '''
        x: shape (batch, channels, height, width)
        Return: shape (batch, n_classes)
        '''
        return self.layers(x)


# Build the model with its default architecture; the weights are still randomly initialised here.
my_resnet = ResNet34()

In [155]:
def copy_weights(my_resnet: ResNet34, pretrained_resnet: models.resnet.ResNet) -> ResNet34:
    '''
    Load the parameters and buffers of `pretrained_resnet` into `my_resnet` and return it.

    The two state dicts are paired up purely by position (the i-th entry of one with the
    i-th entry of the other), so they must list their entries in the same order; only
    their lengths are checked here.
    '''
    mydict = my_resnet.state_dict()
    pretraineddict = pretrained_resnet.state_dict()
    assert len(mydict) == len(pretraineddict), "Mismatching state dictionaries."

    # Keep our own names, but take the values from the pretrained model.
    state_dict_to_load = dict(zip(mydict.keys(), pretraineddict.values()))

    my_resnet.load_state_dict(state_dict_to_load)
    return my_resnet


# Fetch torchvision's ResNet34 with its IMAGENET1K_V1 weights and copy them into our model.
pretrained_resnet = models.resnet34(weights=models.ResNet34_Weights.IMAGENET1K_V1)
my_resnet = copy_weights(my_resnet, pretrained_resnet)

In [156]:
# Names of the sample images used to compare the two models.
IMAGE_FILENAMES = [
    "chimpanzee.jpg",
    "golden_retriever.jpg",
    "platypus.jpg",
    "frogs.jpg",
    "fireworks.jpg",
    "astronaut.jpg",
    "iguana.jpg",
    "volcano.jpg",
    "goofy.jpg",
    "dragonfly.jpg",
]

# The images are read from the `resnet_inputs` folder of this section.
IMAGE_FOLDER = section_dir / "resnet_inputs"

# One PIL image per filename, in the same order as IMAGE_FILENAMES.
images = [Image.open(IMAGE_FOLDER / filename) for filename in IMAGE_FILENAMES]

In [157]:
# Side length (in pixels) every image is resized to.
IMAGE_SIZE = 224
# Per-channel mean and std handed to Normalize
# (presumably the usual ImageNet statistics, as the names suggest — TODO confirm).
IMAGENET_MEAN = [0.485, 0.456, 0.406]
IMAGENET_STD = [0.229, 0.224, 0.225]

# Convert to a tensor, resize to IMAGE_SIZE x IMAGE_SIZE, then normalize each channel.
IMAGENET_TRANSFORM = transforms.Compose([
    transforms.ToTensor(),
    transforms.Resize((IMAGE_SIZE, IMAGE_SIZE)),
    transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
])

# Stack the transformed images into a single batch along a new leading dimension.
prepared_images = t.stack([IMAGENET_TRANSFORM(img) for img in images], dim=0)

# Every image must have come out with 3 channels for this shape check to hold.
assert prepared_images.shape == (len(images), 3, IMAGE_SIZE, IMAGE_SIZE)

In [158]:
def predict(model, images: t.Tensor) -> t.Tensor:
    '''
    Returns the predicted class for each image (as a 1D array of ints).

    The forward pass runs in eval mode with gradients disabled, so layers such as batch
    norm use their stored statistics and leave their buffers untouched. Every submodule
    that was in training mode beforehand is switched back afterwards.

    model: callable mapping a batch of images to logits of shape (batch, n_classes)
    images: batch of inputs accepted by `model`
    '''
    # Remember exactly which submodules were training, so mixed train/eval setups are restored as found.
    was_training = [m for m in model.modules() if m.training] if isinstance(model, nn.Module) else []
    if was_training:
        model.eval()
    try:
        with t.no_grad():
            return model(images).argmax(dim=1)
    finally:
        for module in was_training:
            module.training = True


# The label names are the values of the JSON object, kept in file order.
with open(section_dir / "imagenet_labels.json") as f:
    imagenet_labels = list(json.load(f).values())

# Check your predictions match those of the pretrained model
my_predictions = predict(my_resnet, prepared_images)
pretrained_predictions = predict(pretrained_resnet, prepared_images)
# Elementwise comparison of the two 1D prediction tensors; every entry must agree.
assert all(my_predictions == pretrained_predictions)
print("All predictions match!")

# # Print out your predictions, next to the corresponding images
# for img, label in zip(images, my_predictions):
#     print(f"Class {label}: {imagenet_labels[label]}")
#     display(img)
#     print()

All predictions match!
