# Loading pre-trained network using TorchVision


## 1. Load the pre-trained model


Import model and look at the deep learning architectures implemented in the torch vision library.


In [47]:
from torchvision import models
import torch

dir(models)

['AlexNet',
 'AlexNet_Weights',
 'ConvNeXt',
 'ConvNeXt_Base_Weights',
 'ConvNeXt_Large_Weights',
 'ConvNeXt_Small_Weights',
 'ConvNeXt_Tiny_Weights',
 'DenseNet',
 'DenseNet121_Weights',
 'DenseNet161_Weights',
 'DenseNet169_Weights',
 'DenseNet201_Weights',
 'EfficientNet',
 'EfficientNet_B0_Weights',
 'EfficientNet_B1_Weights',
 'EfficientNet_B2_Weights',
 'EfficientNet_B3_Weights',
 'EfficientNet_B4_Weights',
 'EfficientNet_B5_Weights',
 'EfficientNet_B6_Weights',
 'EfficientNet_B7_Weights',
 'EfficientNet_V2_L_Weights',
 'EfficientNet_V2_M_Weights',
 'EfficientNet_V2_S_Weights',
 'GoogLeNet',
 'GoogLeNetOutputs',
 'GoogLeNet_Weights',
 'Inception3',
 'InceptionOutputs',
 'Inception_V3_Weights',
 'MNASNet',
 'MNASNet0_5_Weights',
 'MNASNet0_75_Weights',
 'MNASNet1_0_Weights',
 'MNASNet1_3_Weights',
 'MaxVit',
 'MaxVit_T_Weights',
 'MobileNetV2',
 'MobileNetV3',
 'MobileNet_V2_Weights',
 'MobileNet_V3_Large_Weights',
 'MobileNet_V3_Small_Weights',
 'RegNet',
 'RegNet_X_16GF_Weights'

Create an instance of the network. When "pretrained" argument is set to True, program will download
the weights of the network (in this example trained on the ImageNet dataset).


In [48]:
alexnet = models.alexnet(pretrained=True)

Print the network architecture.


In [49]:
print(alexnet)

AlexNet(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(11, 11), stride=(4, 4), padding=(2, 2))
    (1): ReLU(inplace=True)
    (2): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(64, 192, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
    (4): ReLU(inplace=True)
    (5): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
    (6): Conv2d(192, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (7): ReLU(inplace=True)
    (8): Conv2d(384, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (9): ReLU(inplace=True)
    (10): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (avgpool): AdaptiveAvgPool2d(output_size=(6, 6))
  (classifier): Sequential(
    (0): Dropout(p=0.5, inplace=False)
    (1): Linear(in_features=9216, out_features=4096, bias=True)
 

## 2. Specify image transformations


Transform the input image so it has the right shape and other characteristics like mean and standard deviation.


In [50]:
from torchvision import transforms

transform = transforms.Compose([  # [1]
    transforms.Resize(256),  # [2]
    transforms.CenterCrop(224),  # [3]
    transforms.ToTensor(),  # [4]
    transforms.Normalize(  # [5]
        mean=[0.485, 0.456, 0.406],  # [6]
        std=[0.229, 0.224, 0.225]  # [7]
    )])

- Line [1]: Defina a variable transform which is a combination of all the image transformations to be carried out on the input image.
- Line [2]: Resize the image to 256×256 pixels.
- Line [3]: Crop the image to 224×224 pixels about the center.
- Line [4]: Convert the image to PyTorch Tensor data type.
- Line [5-7]: Normalize the image by setting its mean and standard deviation to the specified values.


## 3. Load the input image and pre-process it


Use **Pillow** (PIL) module to load the image from the disk. PIL is the default image backend supported by TorchVision.


In [51]:
from PIL import Image
img = Image.open("images/dog.jpg")

Pre-process the image using the "transform" defined above and prepare a batch to be passed through the network.


In [52]:
img_t = transform(img)
batch_t = torch.unsqueeze(img_t, 0)

## 4. Model inference


Now it's time to use pre-trained model to classify the input image. First step is to put model in the **eval** mode.


In [53]:
alexnet.eval()

AlexNet(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(11, 11), stride=(4, 4), padding=(2, 2))
    (1): ReLU(inplace=True)
    (2): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(64, 192, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
    (4): ReLU(inplace=True)
    (5): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
    (6): Conv2d(192, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (7): ReLU(inplace=True)
    (8): Conv2d(384, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (9): ReLU(inplace=True)
    (10): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (avgpool): AdaptiveAvgPool2d(output_size=(6, 6))
  (classifier): Sequential(
    (0): Dropout(p=0.5, inplace=False)
    (1): Linear(in_features=9216, out_features=4096, bias=True)
 

Next step is to pass the prepared batch through the network and get the output which will be a vector of size 1000.


In [54]:
out = alexnet(batch_t)
print(out.shape)

torch.Size([1, 1000])


Read image labels (also named classes) from the text file and store them in variable.


In [55]:
with open('imagenet_labels.txt') as f:
    labels = [line.strip() for line in f.readlines()]

Find the index where the maximum score in output vector out occurs and use this index to find the predicted label. Also calculate the probability of the predicted label.


In [56]:
value, index = torch.max(out, 1)

percentage = torch.nn.functional.softmax(out, dim=1)[0] * 100

print(labels[index[0]], percentage[index[0]].item())

Labrador retriever 42.4673957824707


Now find 5 most probable classes predicted by the network.


In [57]:
values, indexes = torch.sort(out, descending=True)
[(labels[idx], percentage[idx].item()) for idx in indexes[0][:5]]

[('Labrador retriever', 42.4673957824707),
 ('golden retriever', 16.60862922668457),
 ('Saluki, gazelle hound', 15.473831176757812),
 ('whippet', 2.7881901264190674),
 ('Ibizan hound, Ibizan Podenco', 2.3617072105407715)]

These are all dog breeds. So the model managed to predict that it was a dog with a fairly high confidence but it was not very sure about the breed of the dog.


## 4.1 Test the model on a different images


In [58]:
from pprint import pprint


def print_labels(image_path: str):
    img = Image.open(image_path)

    img_t = transform(img)
    batch_t = torch.unsqueeze(img_t, 0)

    out = alexnet(batch_t)

    values, indexes = torch.sort(out, descending=True)
    results = [(labels[idx], percentage[idx].item()) for idx in indexes[0][:5]]

    pprint(results)

Correct prediction for cat.


In [61]:
print_labels("images/cat.jpg")

[('tabby, tabby cat', 5.015179340261966e-05),
 ('tiger cat', 0.0008578496635891497),
 ('Egyptian cat', 8.540306589566171e-05),
 ('plastic bag', 0.00017712454427964985),
 ('ping-pong ball', 0.00014264888886827976)]


Also pretty accurate prediction for a car.


In [62]:
print_labels("images/car.jpg")

[('convertible', 6.501838925032644e-07),
 ('sports car, sport car', 5.185696068110701e-07),
 ('car wheel', 6.273152393987402e-06),
 ('minivan', 1.08923325115029e-06),
 ('beach wagon, station wagon, wagon, estate car, beach waggon, station '
  'waggon, waggon',
  6.559849339282664e-07)]


However in case of a tree, the output is not much accurate.


In [67]:
print_labels("images/tree.jpeg")

[('rapeseed', 0.0003924946067854762),
 ('hay', 1.1074106623709667e-05),
 ('golfcart, golf cart', 7.680908424845256e-07),
 ('park bench', 4.2394549382152036e-05),
 ('golf ball', 0.0001426295784767717)]


And with the forest, the model is completely wrong.


In [59]:
print_labels("images/forest.jpg")

[('viaduct', 1.1651597731088259e-07),
 ('worm fence, snake fence, snake-rail fence, Virginia fence',
  0.0055572097189724445),
 ('mountain tent', 1.0924984053417575e-05),
 ('park bench', 4.2394549382152036e-05),
 ('maze, labyrinth', 6.897549610584974e-05)]
