# Single image inference

This notebook contains the code to perform inference on a single image, extracting the classification score for that image and producing its Class Activation Maps (CAMs).
To do this, the image must be provided as input to a model, whose architecture and weights must be specified in the `model` and `checkpoint_path` variables respectively; these are assigned in different cells of this notebook. The final cell visualises the outputs produced by this inference process.

***Note:*** the classification scores and CAMs obtained from the inference proposed in this notebook are not saved to disk but only displayed in the notebook.

In [None]:
# Import libraries
import json
import matplotlib.pyplot as plt
import numpy as np
import os
import torch
import torchvision.transforms.v2 as transforms
from PIL import Image
from nets import ResNet, SwinT
from misc.inferutils import get_reshape_transform, get_target_layers
from pytorch_grad_cam import GradCAM
from pytorch_grad_cam.utils.image import show_cam_on_image

In [None]:
# User-defined parameters
checkpoint_path = 'nets/weights/swint-rsp-20-210.pth' # File containing the weights of the model
cat_src_file = 'AerialWaste3.0/testing-binary.json' # File containing a list of categories for which to compute CAMs and predictions
gpu = 1 # ID of the GPU to use for performing inference
img_path = "AerialWaste3.0/images/23.png" # Image path
resize_size = (1048,1048) # Size the image is resized to before being fed to the model
# Check parameter file existence; each message names the offending path so a
# failure can be diagnosed without reading the traceback
assert os.path.isfile(checkpoint_path), f"Checkpoint file not found: {checkpoint_path}"
assert os.path.isfile(cat_src_file), f"Category source file not found: {cat_src_file}"
assert os.path.isfile(img_path), f"Image file not found: {img_path}"

In [None]:
# Collect the name of every entry under the 'categories' key of the category source file
with open(cat_src_file) as file:
    cats = list(map(lambda cat: cat['name'], json.load(file)['categories']))

In [None]:
# Load the image from disk with PIL, converting it to RGB mode
image = Image.open(img_path).convert('RGB')
# Preprocessing pipeline: convert to a float32 image tensor (scaled), resize, then normalize
transform = transforms.Compose([
    transforms.ToImage(),
    transforms.ToDtype(torch.float32, scale=True),
    transforms.Resize(size=resize_size),
    transforms.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),  # ImageNet normalization values
])
# Run the pipeline and add a leading dimension of size 1
input_tensor = torch.unsqueeze(transform(image), 0)

In [None]:
# Build the model on the selected GPU [keep uncommented only the line of the model to use for inference]
# model = ResNet('resnet50', head=[2048,1], pretraining_model=checkpoint_path, first_trainable=4).to(f"cuda:{gpu}")
model = SwinT(head=[768, 1], pretraining_model=checkpoint_path, first_trainable=4).to(f"cuda:{gpu}")
# Wrap the model in a GradCAM extractor; the target layers and the reshape
# transform are both obtained from the project helpers for this model
cam = GradCAM(
    model=model,
    target_layers=get_target_layers(model),
    reshape_transform=get_reshape_transform(model),
)

In [None]:
# Compute the CAMs for the input batch. With targets=None pytorch-grad-cam explains,
# for every image, its highest-scoring output
# NOTE(review): head=[768,1] suggests the model has a single output, in which case
# that output is always the one explained - confirm
grayscale_cam = cam(input_tensor=input_tensor, targets=None)
# Extract CAM of the single image
grayscale_cam = grayscale_cam[0, :]
# Create visualization. PIL's resize expects (width, height) whereas torchvision's
# Resize takes (height, width): the tuple is reversed so that the background image
# matches the CAM size also when resize_size is not square. The division by 255
# brings the pixel values into [0, 1]
rgb_img = np.asarray(image.resize(resize_size[::-1]), dtype=np.float32)/255
visualization = show_cam_on_image(rgb_img, grayscale_cam, use_rgb=True)

In [None]:
# Print the classification scores of the image, then show the original image
# and its CAM overlay side by side with the axes hidden
print(f"Classification scores: {cam.outputs[0,:].detach().cpu().numpy()}")
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(16, 10))
for ax, picture in zip(axes, (image, visualization)):
    ax.imshow(picture)
    ax.axis('off')