In [None]:
import cv2
import numpy as np
import matplotlib.pyplot as plt
import copy
import random
from tqdm import tqdm
import torch
import torchvision
from torchvision import datasets
import torchvision.transforms as transforms


In [None]:
%%sh
# Download the example image used by the prediction and attribution cells.
# The download is skipped when a non-empty copy already exists, so re-running
# the notebook does not hit the network again.
if [ ! -s image_dog.jpg ]; then
    wget --no-verbose --output-document=image_dog.jpg https://github.com/chrirupp/cv_course/raw/main/data/image_dog.jpg
fi

In [None]:
class Visualizer():
    """Convenience wrapper around a grid of matplotlib subplots.

    The constructor creates one figure holding ``num_rows x num_cols`` axes;
    every ``add_*`` method draws into the axes at grid position ``(i, j)``.
    """

    def __init__(self, num_rows=1, num_cols=1, figsize=(5,5), axis_off=True, title='', tight=False, cm=None):
        """Create the figure and its grid of axes.

        Args:
            num_rows: number of subplot rows.
            num_cols: number of subplot columns.
            figsize: figure size passed to ``plt.subplots``.
            axis_off: if True, remove x/y ticks from every subplot.
            title: figure-level title (suptitle).
            tight: if True, set the top of the subplot area to 0.88.
            cm: optional colormap passed to ``plt.set_cmap``.
        """
        # squeeze=False keeps self.axs two-dimensional even for a 1x1 grid
        self.fig, self.axs = plt.subplots(num_rows, num_cols, figsize=figsize, squeeze=False)
        # remove ticks (on this figure's own axes, not whatever figure is current)
        if axis_off:
            plt.setp(self.fig.get_axes(), xticks=[], yticks=[])
        # set colormap
        if cm is not None:
            plt.set_cmap(cm)
        # set supertitle
        self.fig.suptitle(title)
        if tight:
            self.fig.subplots_adjust(top=0.88)

    def add_image_subplot(self, i, j, image, normalize=False, title_str=''):
        """Show ``image`` in subplot ``(i, j)``.

        Arrays with three dimensions are treated as BGR images and have their
        last axis reversed before display; callers holding RGB data must
        reverse the channels themselves first.
        """
        if normalize:
            image = self.normalize_image(image)
        if len(image.shape) == 3:
            #BGR -> RGB
            image = image[:, :, ::-1]
        self.axs[i, j].imshow(image)
        self.axs[i, j].set_title(title_str)

    def add_stem_subplot(self, i, j, x, y, title_str=''):
        """Draw a stem plot of ``y`` against ``x`` in subplot ``(i, j)``."""
        self.axs[i, j].stem(x, y)
        self.axs[i, j].set_title(title_str)

    def add_subplot(self, i, j, data, title_str=''):
        """Draw a line plot of ``data`` in subplot ``(i, j)``."""
        self.axs[i, j].plot(data)
        self.axs[i, j].set_title(title_str)

    def add_bar_subplot(self, i, j, x, y, title_str=''):
        """Draw a bar chart of ``y`` at positions ``x`` in subplot ``(i, j)``."""
        self.axs[i, j].bar(x, y)
        self.axs[i, j].set_title(title_str)

    def add_scatter_subplot_with_labels(self, i, j, data, labels, legend=None, title_str=''):
        """Scatter the first two columns of ``data`` coloured by ``labels``.

        If ``legend`` is given, its entries are used as the legend texts for
        the colour classes.
        """
        scatter = self.axs[i, j].scatter(data[:,0], data[:,1], c=labels)
        scatter.set_cmap('jet')
        if legend is not None:
            # Attach the legend to the axes that holds the scatter; plt.legend
            # would target the current axes, which need not be (i, j).
            self.axs[i, j].legend(handles=scatter.legend_elements()[0], labels=legend)
        self.axs[i, j].set_title(title_str)

    def add_scatter_subplot_with_txt(self, i, j, data, txt, title_str=''):
        """Scatter the first two columns of ``data`` and annotate point ``k`` with ``txt[k]``."""
        self.axs[i, j].scatter(data[:,0], data[:,1])
        for idx, label in enumerate(txt):
            self.axs[i, j].annotate(label, (data[idx,0], data[idx,1]))
        self.axs[i, j].set_title(title_str)

    @staticmethod
    def normalize_image(image):
        """Shift and scale ``image`` into [0, 1] as float64.

        A constant image has no range to stretch; it is returned as all zeros
        instead of dividing by zero and producing NaNs.
        """
        img = np.float64(image) - np.min(image)
        peak = np.max(img)
        if peak > 0:
            img /= peak
        return img

In [None]:
# load a resnet18 model
model = torch.hub.load('pytorch/vision', 'resnet18', weights=torchvision.models.resnet.ResNet18_Weights.DEFAULT)

# visualise first layer filters (this is slow)
first_layer_weights = model.conv1.weight.data.cpu()
# move the input-channel axis last so that each filter can be shown as an image
first_layer_weights = first_layer_weights.permute(0, 2, 3, 1).numpy()
second_layer_weights = model.layer1[0].conv1.weight.data.cpu().numpy()
print(second_layer_weights.shape)

# lay the filters out on a square grid
size = int(first_layer_weights.shape[0]**0.5)
vis = Visualizer(num_rows=size, num_cols=size, figsize=(10,10), axis_off=True, title='First layer weights - ResNet18')
for i in range(first_layer_weights.shape[0]):
    w = first_layer_weights[i]
    # stretch each filter to [0, 1] independently
    w = (w - np.min(w)) / (np.max(w) - np.min(w))
    # The network is fed RGB images, so the filter channels are in RGB order.
    # add_image_subplot assumes BGR and reverses the channels of 3-D arrays;
    # reverse them here first so the filters are displayed with true colours.
    vis.add_image_subplot(i // size, i % size, w[:, :, ::-1], normalize=False)

# visualise second layer filters (this is even slower)
# one subplot per (input channel, output channel) pair: rows = inputs, columns = outputs
vis = Visualizer(num_rows=second_layer_weights.shape[1], num_cols=second_layer_weights.shape[0], figsize=(20,10), axis_off=True, title='Second layer weights - ResNet18')
for i in range(second_layer_weights.shape[0]):
    for j in range(second_layer_weights.shape[1]):
        w = second_layer_weights[i, j]
        w = (w - np.min(w)) / (np.max(w) - np.min(w))
        vis.add_image_subplot(j, i, w, normalize=False)


In [None]:
# class names that belong to the pretrained weights
class_names = torchvision.models.resnet.ResNet18_Weights.DEFAULT.meta["categories"]
# weight matrix of the final fully-connected layer, as a numpy array
last_layer_weights = model.fc.weight.data.cpu().numpy()

# project every row of the weight matrix onto its first two principal components
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
pca_weights = pca.fit(last_layer_weights).transform(last_layer_weights)

# scatter the 2-D embedding and annotate each point with its class name
vis = Visualizer(figsize=(20,20), axis_off=True)
vis.add_scatter_subplot_with_txt(
    0, 0, pca_weights, class_names,
    title_str='PCA embedding of class vectors - ResNet18',
)


In [None]:
# load cifar10 dataset
# NOTE(review): the pretrained checkpoint may have been trained with different
# normalisation statistics than (0.5, 0.5, 0.5) - confirm against the model repository.
cifar10 = datasets.CIFAR10(root='./data', train=False, download=True, transform=transforms.Compose(
    [transforms.ToTensor(),
     transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]))
model = torch.hub.load("chenyaofo/pytorch-cifar-models", "cifar10_resnet20", pretrained=True)  # change pretrained to False to load random weights and compare the pca/tsne plots
# Switch to inference mode: otherwise the BatchNorm layers normalise with the
# statistics of each batch (and update their running averages) while we are
# only extracting features.
model.eval()

# create a dataloader
cifar10_loader = torch.utils.data.DataLoader(cifar10, batch_size=100, shuffle=False)

# get last layer activations
last_layer_activations = []

with torch.no_grad():
    for images, _labels in tqdm(cifar10_loader,desc='Computing last layer activations'):
        # run the network step by step and stop before the classifier, so that
        # we keep the pooled features instead of the class scores
        x = model.conv1(images)
        x = model.bn1(x)
        x = model.relu(x)

        x = model.layer1(x)
        x = model.layer2(x)
        x = model.layer3(x)

        x = model.avgpool(x)
        # flatten to one feature vector per image
        activations = x.view(x.size(0), -1).cpu().detach().numpy()
        last_layer_activations.append(activations)
# stack the per-batch arrays into a single (num_images, num_features) array
last_layer_activations = np.concatenate(last_layer_activations, axis=0)

In [None]:
# Label used in the plot titles. The activations above are computed with
# pretrained=True; change this to 'random init' when that flag is set to False.
weights_label = 'pretrained'

pca = PCA(n_components=2)
pca.fit(last_layer_activations)
pca_activations = pca.transform(last_layer_activations)

# visualise pca embedding
vis = Visualizer(figsize=(10,10), axis_off=True)
vis.add_scatter_subplot_with_labels(0, 0, pca_activations, cifar10.targets, legend=cifar10.classes, title_str=f'PCA embedding of last layer activations - {weights_label} ResNet20')

# visualise tsne embedding
from sklearn.manifold import TSNE
# Newer scikit-learn releases call the iteration budget `max_iter`; older ones
# only know `n_iter`. Try the new name first and fall back to the old one.
# A fixed random_state makes the embedding reproducible across runs.
try:
    tsne = TSNE(n_components=2, perplexity=30, max_iter=300, random_state=0)
except TypeError:
    tsne = TSNE(n_components=2, perplexity=30, n_iter=300, random_state=0)
tsne_activations = tsne.fit_transform(last_layer_activations)
vis = Visualizer(figsize=(10,10), axis_off=True)
vis.add_scatter_subplot_with_labels(0, 0, tsne_activations, cifar10.targets, legend=cifar10.classes, title_str=f't-SNE embedding of last layer activations - {weights_label} ResNet20')


In [None]:
# load different models
# Keep exactly one `model = ...` / `class_names = ...` pair active.

# model = torch.hub.load('pytorch/vision', 'resnet18', weights=None)  # random weights, for comparison
model = torch.hub.load('pytorch/vision', 'resnet18', weights=torchvision.models.resnet.ResNet18_Weights.DEFAULT)
class_names = torchvision.models.resnet.ResNet18_Weights.DEFAULT.meta["categories"]

# model = torch.hub.load('pytorch/vision', 'resnet50', weights=torchvision.models.resnet.ResNet50_Weights.DEFAULT)
# class_names = torchvision.models.resnet.ResNet50_Weights.DEFAULT.meta["categories"]

# model = torch.hub.load('pytorch/vision', 'vgg16', weights=torchvision.models.vgg.VGG16_Weights.DEFAULT)
# class_names = torchvision.models.vgg.VGG16_Weights.DEFAULT.meta["categories"]

# model = torch.hub.load('pytorch/vision', 'alexnet', weights=None)
# model = torch.hub.load('pytorch/vision', 'alexnet', weights=torchvision.models.AlexNet_Weights.DEFAULT)
#class_names = torchvision.models.AlexNet_Weights.DEFAULT.meta["categories"]

#model = torch.hub.load('pytorch/vision', 'vit_l_16', weights=torchvision.models.ViT_L_16_Weights.IMAGENET1K_V1)
#class_names = torchvision.models.ViT_L_16_Weights.IMAGENET1K_V1.meta["categories"]

# load image
image = cv2.imread('image_dog.jpg')
if image is None:
    # cv2.imread signals a missing/unreadable file by returning None
    raise FileNotFoundError("Could not read 'image_dog.jpg' - run the download cell first")

# preprocess image
#crop to square on the left
image = image[:, :image.shape[0]]
image = cv2.resize(image, (224, 224))
vis = Visualizer(num_rows=1, num_cols=1, figsize=(5,5), axis_off=True, title='Input image')
vis.add_image_subplot(0, 0, image)
# OpenCV loads BGR; the network is fed RGB, scaled to [0, 1] and then
# standardised per channel with the mean / std constants below
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
image = image / 255.0
image = image - np.array([0.485, 0.456, 0.406])
image = image / np.array([0.229, 0.224, 0.225])
# HWC -> CHW, then add the batch dimension
image = image.transpose(2, 0, 1)
image = np.expand_dims(image, axis=0)
image = torch.from_numpy(image).float()

# get prediction
model.eval()
with torch.no_grad():
    prediction = model(image).detach().cpu().numpy()

# show top 5 predictions
# softmax with the row maximum subtracted first, so that exp() cannot overflow
shifted = prediction - np.max(prediction, axis=1, keepdims=True)
probabilities = np.exp(shifted) / np.sum(np.exp(shifted), axis=1, keepdims=True)
top5 = np.argsort(-prediction, axis=1)[:, :5]
for i in range(5):
    print(f'{i+1}. class: {class_names[top5[0, i]]} probability: {probabilities[0, top5[0, i]]:.2f}')


In [None]:
# occlusion method

def get_response(image, model, x, y, size=50):
    """Return the model output for ``image`` with one square patch zeroed out.

    Args:
        image: batched input (tensor or numpy array) indexed as
            ``[batch, channel, row, col]``; it is not modified.
        model: callable that maps the input tensor to an output tensor.
        x: first row of the occluded patch.
        y: first column of the occluded patch.
        size: side length of the patch; it is clipped at the image border.

    Returns:
        The model output as a numpy array.
    """
    # Work on a detached copy that stays a tensor: this also accepts inputs
    # that require grad or live on another device, which a NumPy round trip
    # cannot handle.
    occ = torch.as_tensor(image).detach().clone()
    # only the first element of the batch is occluded
    occ[0, :, x:x+size, y:y+size] = 0
    with torch.no_grad():
        response = model(occ).detach().cpu().numpy()
    return response

# heatmap accumulator, one value per input pixel
heatmap = np.zeros((224, 224))

# reference prediction on the unmodified image
model.eval()
with torch.no_grad():
    original_prediction = model(image).detach().cpu().numpy()

size=15
top_class = top5[0, 0]
print(f'Class: {class_names[top_class]}')

# slide the occluding patch over the image on a regular grid and record, for
# every patch position, the squared change of the top-class score
patch_origins = range(0, 224, size)
for x in tqdm(patch_origins):
    for y in patch_origins:
        response = get_response(image, model, x, y, size)
        score_change = original_prediction[0, top_class] - response[0, top_class]
        heatmap[x:x+size, y:y+size] = score_change**2

# rescale the heatmap to [0, 1]
heat_min = np.min(heatmap)
heat_max = np.max(heatmap)
heatmap = (heatmap - heat_min) / (heat_max - heat_min)

vis = Visualizer(num_rows=1, num_cols=1, figsize=(10,10), axis_off=True, title='Occlusion heatmap')
vis.add_image_subplot(0, 0, heatmap)


In [None]:
# gradient method

def get_gradient(image, model, top_class, num_samples=1):
    """Gradient of the ``top_class`` score with respect to the input pixels.

    For every sample the gradient of ``model(input)[0, top_class]`` is taken
    with respect to the input and reduced with a maximum over the channel
    axis. When ``num_samples > 1``, Gaussian noise (std 0.1) is added to each
    copy of the input and the per-sample maps are averaged.

    Args:
        image: batched input (tensor or numpy array) indexed as
            ``[batch, channel, row, col]``; it is not modified.
        model: differentiable callable returning ``[batch, class]`` scores.
        top_class: index of the class whose score is differentiated.
        num_samples: number of (noisy) copies to average over.

    Returns:
        2-D numpy array with the averaged gradient map.
    """
    gradients = []
    for _ in range(num_samples):
        # fresh float copy per sample, so that neither the noise nor the
        # gradient bookkeeping touches the caller's data
        sample = torch.as_tensor(image).detach().clone().float()
        if num_samples > 1:
            sample += torch.randn_like(sample) * 0.1
        sample.requires_grad_(True)
        prediction = model(sample)
        # Ask autograd for the input gradient only; calling backward() would
        # also accumulate gradients into every model parameter on each call.
        (grad,) = torch.autograd.grad(prediction[0, top_class], sample)
        # maximum over the channel axis
        gradients.append(grad.detach().cpu().numpy().max(axis=1))
    gradient = np.mean(np.concatenate(gradients, axis=0), axis=0)
    # gradient = np.var(np.concatenate(gradients, axis=0), axis=0)
    return gradient

# saliency of the highest-scoring class with respect to the input image
top_class = top5[0, 0]
gradient = get_gradient(image, model, top_class)

# rescale to [0, 1] for display
grad_min = np.min(gradient)
grad_max = np.max(gradient)
gradient = (gradient - grad_min) / (grad_max - grad_min)

vis = Visualizer(
    num_rows=1, num_cols=1, figsize=(10,10), axis_off=True,
    title='Gradient heatmap',
)
vis.add_image_subplot(0, 0, gradient)

In [None]:
# input maximisation

def normalise_image(x):
    """Turn the first item of a ``[batch, channel, row, col]`` tensor into a
    ``[row, col, channel]`` numpy image rescaled to the range [0, 1]."""
    hwc = x.detach().cpu().numpy()[0].transpose(1, 2, 0)
    lowest = np.min(hwc)
    highest = np.max(hwc)
    return (hwc - lowest) / (highest - lowest)

# start from low-contrast noise around mid-grey and optimise the pixels directly
image = np.random.rand(1, 3, 224, 224)*0.1+0.45
image = torch.from_numpy(image).float()
image = torch.nn.Parameter(image, requires_grad=True)
optimizer = torch.optim.Adam([image], lr=0.1)

top_class = 207
model.eval()

num_iterations = 300
num_jitter = 3   # randomly shifted copies of the image per step
imgnet_mean = torch.tensor([0.485, 0.456, 0.406])
imgnet_std = torch.tensor([0.229, 0.224, 0.225])
tv_losses = []
class_scores = []
for step in tqdm(range(num_iterations)):
    # build a small batch of randomly shifted (wrap-around) copies
    inputs = []
    for _ in range(num_jitter):
        roll = (random.randint(-8, 8), random.randint(-8, 8))
        inputs.append(torch.roll(image, shifts=roll, dims=(2,3)))
    input = torch.cat(inputs, dim=0)
    # standardise per channel, as done for real input images
    input = (input - imgnet_mean[None, :, None, None]) / imgnet_std[None, :, None, None]
    prediction = model(input)
    # total variation: summed absolute differences between neighbouring pixels
    tv_loss = torch.sum(torch.abs(input[:, :, :, :-1] - input[:, :, :, 1:])) + torch.sum(torch.abs(input[:, :, :-1, :] - input[:, :, 1:, :]))
    # Average the class score over ALL shifted copies; using only copy 0 would
    # leave the other forward passes without any influence on the class term.
    class_score = prediction[:, top_class].mean()
    tv_losses.append(tv_loss.item())
    class_scores.append(class_score.item())
    loss = -class_score + 0.06 * tv_loss
    optimizer.zero_grad()
    loss.backward()
    # gradient clipping
    torch.nn.utils.clip_grad_norm_(image, 0.1)
    optimizer.step()
    # keep the pixels in the valid range without recording the clamp in autograd
    with torch.no_grad():
        image.clamp_(0, 1)

vis = Visualizer(num_rows=1, num_cols=3, figsize=(15,5), axis_off=False, title=f'Input maximisation class {class_names[top_class]}')
# The optimised image is RGB, but add_image_subplot assumes BGR and reverses
# the channels; reverse them here first so the colours are displayed correctly.
vis.add_image_subplot(0, 0, normalise_image(image)[:, :, ::-1])
vis.add_subplot(0, 1, class_scores, title_str='Class score')
vis.add_subplot(0, 2, tv_losses, title_str='TV loss')

In [None]:
# attention visualisation

# load a vit model
#model = torch.hub.load('pytorch/vision', 'vit_base_patch16_224', pretrained=True)
#class_names = torchvision.models.vit_base_patch16_224(pretrained=True).meta["categories"]
model = torch.hub.load('pytorch/vision', 'vit_b_16', weights=torchvision.models.ViT_B_16_Weights.IMAGENET1K_V1)
class_names = torchvision.models.ViT_B_16_Weights.IMAGENET1K_V1.meta["categories"]
# inference mode, so that the dropout layers called below are inactive
model.eval()

# load image
image = cv2.imread('image_dog.jpg')
if image is None:
    # cv2.imread signals a missing/unreadable file by returning None
    raise FileNotFoundError("Could not read 'image_dog.jpg' - run the download cell first")

# preprocess image
# crop to a square on the left, resize, BGR -> RGB, scale to [0, 1], standardise
image = image[:, :image.shape[0]]
image = cv2.resize(image, (224, 224))
vis = Visualizer(num_rows=1, num_cols=1, figsize=(5,5), axis_off=True, title='Input image')
vis.add_image_subplot(0, 0, image)
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
image = image / 255.0
image = image - np.array([0.485, 0.456, 0.406])
image = image / np.array([0.229, 0.224, 0.225])
# HWC -> CHW, then add the batch dimension
image = image.transpose(2, 0, 1)
image = np.expand_dims(image, axis=0)
image = torch.from_numpy(image).float()


# get prediction
# The forward pass is unrolled by hand so that the attention weights of every
# encoder layer can be captured on the way.
with torch.no_grad():
    x = model._process_input(image)
    batch_size = x.shape[0]
    batch_class_token = model.class_token.expand(batch_size, -1, -1)
    # prepend the class token to the patch tokens
    x = torch.cat([batch_class_token, x], dim=1)

    hidden = x + model.encoder.pos_embedding
    hidden = model.encoder.dropout(hidden)

    attention = []
    for i, layer in enumerate(model.encoder.layers):
        x = layer.ln_1(hidden)
        x, atn_weights = layer.self_attention(x, x, x, need_weights=True, average_attn_weights=True)
        # number of patches per image side (the class token is excluded)
        n = int(np.sqrt(atn_weights.shape[2]-1))
        # Drop the class-token row and column and unfold both token axes into
        # (row, col) patch coordinates. This assumes a batch of one image.
        atn_weights = atn_weights[:, 1:, 1:].reshape(n, n, n, n)
        attention.append(atn_weights.cpu().detach().numpy())
        x = layer.dropout(x)
        x = x + hidden

        y = layer.ln_2(x)
        y = layer.mlp(y)
        hidden = x + y
    x = model.encoder.ln(hidden)
    # the class token carries the representation used for classification
    x = x[:, 0]
    prediction = model.heads(x).detach().cpu().numpy()

# show top 5 predictions
# softmax with the row maximum subtracted first, so that exp() cannot overflow
shifted = prediction - np.max(prediction, axis=1, keepdims=True)
probabilities = np.exp(shifted) / np.sum(np.exp(shifted), axis=1, keepdims=True)
top5 = np.argsort(-prediction, axis=1)[:, :5]
for i in range(5):
    print(f'{i+1}. class: {class_names[top5[0, i]]} probability: {probabilities[0, top5[0, i]]:.2f}')

# visualise attention maps
# One figure per reference patch, one subplot per encoder layer. The last two
# axes of each attention array are fixed to the reference patch.
# NOTE(review): with the (query, key) layout of the returned weights this shows
# the attention the reference patch receives from every other patch - confirm
# this is the intended direction.
for patch_row, patch_col in [(7, 7), (1, 1), (6, 12)]:
    vis = Visualizer(num_rows=1, num_cols=len(model.encoder.layers), figsize=(120,10), axis_off=True, title='Attention maps')
    for i in range(len(attention)):
        vis.add_image_subplot(0, i, attention[i][:, :, patch_row, patch_col], normalize=True, title_str=f'Layer {i+1}')
