<a href="https://colab.research.google.com/github/francesco-vaccari/ProjectDL/blob/fra/Test_grad_cam.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Environment setup (shell escapes): helper packages, CLIP from GitHub, the dataset archive, YOLOv5 requirements.
# NOTE(review): none of these installs pins a version, so a fresh run may pick up different releases.
!pip install ftfy regex tqdm
!pip install git+https://github.com/openai/CLIP.git
# Downloads a Google Drive file by id — presumably the refcocog.tar.gz archive extracted on the next line; confirm.
!gdown 1xijq32XfEm6FPhUb7RsZYWHc2UuwVkiq
!tar -xf /content/refcocog.tar.gz
# NOTE(review): no YOLOv5 code appears in the cells visible here — confirm these requirements are still needed.
!pip install -qr https://raw.githubusercontent.com/ultralytics/yolov5/master/requirements.txt

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting ftfy
  Downloading ftfy-6.1.1-py3-none-any.whl (53 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.1/53.1 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: ftfy
Successfully installed ftfy-6.1.1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /tmp/pip-req-build-2ers5ksp
  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git /tmp/pip-req-build-2ers5ksp
  Resolved https://github.com/openai/CLIP.git to commit a9b1bf5920416aaeaec965c25dd9e8f98c864f16
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: clip
  Building wheel for clip (setup.py) ... [?25l[?25hdone
  Created wheel for clip: filename=clip-1.0-py3-n

In [None]:
# Mount Google Drive; a later cell writes its results to /content/drive/MyDrive/output.txt.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import json
import clip
import torch
import pandas
import numpy as np

from torch.utils.data import Dataset, DataLoader
from typing import Sequence, Union

from PIL import Image, ImageDraw



class RefcocogDataset(Dataset):
    """Torch ``Dataset`` over RefCOCOg referring expressions.

    The annotation table is built by joining ``refs(umd).p`` (image id, split,
    sentences, annotation id) with the ``bbox`` and ``area`` columns of
    ``instances.json``. Each item is one annotated object.

    Bounding boxes are handled as ``[x, y, width, height]``, the same layout
    ``computeAccuracy`` in this notebook assumes for its labels.

    Args:
        base_path: Directory containing ``annotations/`` and ``images/``.
        split: Optional split name (case-insensitive); when given, only rows
            whose ``split`` column matches are kept.
        transform: Optional callable applied to the PIL image in ``__getitem__``.
        tokenization: Optional callable applied to every sentence in ``__getitem__``.
    """

    def __init__(self, base_path, split=None, transform=None, tokenization=None):
        annotation_path = base_path + "/annotations/"

        self.IMAGES_PATH = base_path + "/images/"
        self.transform = transform
        self.tokenization = tokenization

        # NOTE(review): read_pickle executes arbitrary pickled code; only load trusted files.
        tmp_annotations = pandas.read_pickle(annotation_path + "refs(umd).p")
        # Context manager so the JSON file handle is closed deterministically.
        with open(annotation_path + "instances.json", "r") as instances_file:
            tmp_instances = json.load(instances_file)

        annotations_dt = pandas.DataFrame.from_records(tmp_annotations) \
            .filter(items=["image_id", "split", "sentences", "ann_id"])

        instances_dt = pandas.DataFrame.from_records(tmp_instances['annotations'])

        # Attach bbox/area to each referring expression through its annotation id.
        self.annotations = annotations_dt \
            .merge(instances_dt[["id", "bbox", "area"]], left_on="ann_id", right_on="id") \
            .drop(columns="id")

        if split is not None:
            self.annotations = self.__get_annotations_by_split(split.lower())

    def __row_for_sample(self, sample):
        """Return the annotation row addressed by ``sample['idx'][0]`` (a batched index)."""
        row_index = sample['idx'][0].item()
        return self.annotations.iloc[row_index]

    def getImage(self, sample):
        """Open and return the untransformed PIL image for a batched sample dict."""
        return self.__getimage(self.__row_for_sample(sample).image_id)

    def getSentences(self, sample):
        """Return the prompt strings ("a photo of ...") for a batched sample dict."""
        return self.__extract_sentences(self.__row_for_sample(sample).sentences)

    def showImage(self, train_features, train_bbox):
        """Display the sample's image with ``train_bbox`` outlined in red.

        ``train_bbox`` holds four values exposing ``.item()`` in
        ``[x, y, width, height]`` order; the rectangle's far corner is therefore
        ``(x + width, y + height)``.
        """
        img = self.getImage(train_features)
        x, y, width, height = (train_bbox[i].item() for i in range(4))
        draw = ImageDraw.Draw(img)
        draw.rectangle([(x, y), (x + width, y + height)], outline="red")
        img.show()

    def __get_annotations_by_split(self, split):
        # reset_index() keeps the previous index as an extra "index" column.
        return self.annotations[self.annotations.split == split].reset_index()

    def __getimage(self, image_id):
        # File names embed the image id zero-padded to 12 digits.
        return Image.open(self.IMAGES_PATH + "COCO_train2014_" + str(image_id).zfill(12) + ".jpg")

    def __extract_sentences(self, sentences):
        return [f"a photo of {s['sent']}" for s in sentences]

    def __tokenize_sents(self, sentences):
        return [self.tokenization(s) for s in sentences]

    def __len__(self):
        return self.annotations.shape[0]

    def __getitem__(self, idx):
        """Return ``({'idx', 'image', 'sentences'}, bbox)`` for row ``idx``."""
        item = self.annotations.iloc[idx]
        image = self.__getimage(item.image_id)
        sentences = self.__extract_sentences(item.sentences)

        if self.transform:
            image = self.transform(image)

        if self.tokenization:
            sentences = self.__tokenize_sents(sentences)

        sample = {'idx': idx, 'image': image, 'sentences': sentences}

        return sample, item.bbox

In [None]:
import numpy as np
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
import clip
from PIL import Image
from scipy.ndimage import gaussian_filter
from torch import nn


def normalize(x: np.ndarray) -> np.ndarray:
    """Shift ``x`` so its minimum is 0 and, if it is not constant, scale its maximum to 1."""
    shifted = x - x.min()
    peak = shifted.max()
    # A constant input has peak 0 and is returned as all zeros (no division).
    return shifted / peak if peak > 0 else shifted

def getAttMap(img, attn_map, blur=True):
    """Blend a 'jet'-coloured attention map over ``img``.

    ``attn_map`` is optionally Gaussian-blurred (sigma = 2% of the larger of the
    first two image dimensions), normalised, and colour-mapped. Each pixel is
    then mixed as ``(1 - a**0.7) * img + a**0.7 * colour`` where ``a`` is the
    normalised attention value.
    """
    if blur:
        sigma = 0.02 * max(img.shape[:2])
        attn_map = gaussian_filter(attn_map, sigma)
    heat = normalize(attn_map)
    jet = plt.get_cmap('jet')
    # The colormap yields RGBA; channel 3 (alpha) is dropped.
    heat_rgb = np.delete(jet(heat), 3, 2)
    # Per-pixel blend weight with a trailing axis so it broadcasts over the colour channels.
    weight = (heat ** 0.7).reshape(heat.shape + (1,))
    return (1 - weight) * img + weight * heat_rgb

def getCmap(img, attn_map, blur=True):
    """Return the 'jet' RGB colouring of the normalised attention map.

    ``img`` is used only to size the optional Gaussian blur (sigma = 2% of the
    larger of its first two dimensions); it is not blended into the result.
    """
    if blur:
        sigma = 0.02 * max(img.shape[:2])
        attn_map = gaussian_filter(attn_map, sigma)
    jet = plt.get_cmap('jet')
    rgba = jet(normalize(attn_map))
    # Remove the alpha channel (index 3 on the last of the three axes).
    return np.delete(rgba, 3, axis=2)

def viz_attn(img, attn_map, blur=True):
    """Plot ``img`` (left) next to its attention overlay from ``getAttMap`` (right), axes hidden."""
    _, panels = plt.subplots(1, 2, figsize=(10, 5))
    original_panel, overlay_panel = panels[0], panels[1]
    original_panel.imshow(img)
    overlay_panel.imshow(getAttMap(img, attn_map, blur))
    for panel in panels:
        panel.axis("off")
    plt.show()
    
def load_image(image, resize=None):
    """Convert ``image`` to RGB and return it as a float32 array scaled by 1/255.

    When ``resize`` is given the image is first resized to ``(resize, resize)``.
    """
    rgb = image.convert("RGB")
    if resize is not None:
        rgb = rgb.resize((resize, resize))
    pixels = np.asarray(rgb).astype(np.float32)
    return pixels / 255.


class Hook:
    """Context manager that captures a module's forward output and its gradient.

    A forward hook is registered on construction and removed on exit from the
    ``with`` block. The captured output is flagged to keep its gradient, so
    after a backward pass ``gradient`` exposes ``output.grad``.
    """

    def __init__(self, module: nn.Module):
        # Holds the most recent forward output; None until the module has run.
        self.data = None
        self.hook = module.register_forward_hook(self.save_grad)

    def save_grad(self, module, input, output):
        """Forward-hook callback: make ``output`` keep its gradient and remember it."""
        output.requires_grad_(True).retain_grad()
        self.data = output

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, exc_traceback):
        # Always detach the hook; returning None lets any exception propagate.
        self.hook.remove()

    @property
    def gradient(self) -> torch.Tensor:
        """``.grad`` of the captured output."""
        return self.data.grad

    @property
    def activation(self) -> torch.Tensor:
        """The captured forward output."""
        return self.data


def gradCAM(model: nn.Module, input: torch.Tensor, target: torch.Tensor, layer: nn.Module) -> torch.Tensor:
    """Compute a Grad-CAM map of ``layer`` for ``model(input)`` weighted by ``target``.

    The model parameters are temporarily frozen, ``model(input)`` is run with a
    forward hook on ``layer``, and ``output.backward(target)`` supplies the
    gradient of the hooked activation. Channel weights are the gradient's mean
    over dims 2 and 3; the weighted activation sum is clamped at 0 and resized
    (bicubic) to ``input.shape[2:]``.

    Args:
        model: Network to explain.
        input: Input batch; its trailing dimensions give the output map size.
        target: Tensor passed to ``output.backward`` (same shape as the model output).
        layer: Sub-module of ``model`` whose output is indexed as (N, C, H, W) here.

    Returns:
        Tensor of shape ``(N, 1, *input.shape[2:])``.
    """
    assert isinstance(layer, nn.Module)

    if input.grad is not None:
        input.grad.data.zero_()

    # Remember each parameter's flag so it can be restored exactly.
    requires_grad = {}
    for name, param in model.named_parameters():
        requires_grad[name] = param.requires_grad
        param.requires_grad_(False)

    try:
        with Hook(layer) as hook:
            output = model(input)
            output.backward(target)

            grad = hook.gradient.float()
            act = hook.activation.float()

            # Global-average-pool the gradient to get one weight per channel.
            alpha = grad.mean(dim=(2, 3), keepdim=True)
            gradcam = torch.sum(act * alpha, dim=1, keepdim=True)
            # Keep only positive contributions.
            gradcam = torch.clamp(gradcam, min=0)

        gradcam = F.interpolate(gradcam, input.shape[2:], mode='bicubic', align_corners=False)
    finally:
        # Restore the flags even when the forward/backward pass raises, so a
        # failure never leaves the model silently frozen.
        for name, param in model.named_parameters():
            param.requires_grad_(requires_grad[name])

    return gradcam


class FeatureCouple:
    """One embedding dimension: its index, raw image/text values and similarity term.

    ``similarity`` is the product of the two normalised values; the normalised
    values themselves are not stored.
    """

    def __init__(self, index, image_feature, sentence_feature, norm_image_feature, norm_sentence_feature):
        self.index, self.image_feature, self.sentence_feature = index, image_feature, sentence_feature
        contribution = norm_image_feature * norm_sentence_feature
        self.similarity = contribution


def getSalientEncodedFeatures(preprocessed_image, encoded_text, model):
    """Keep only the embedding dimensions that explain 95% of the image-text similarity.

    The image is encoded with ``model.encode_image``; both embeddings are
    L2-normalised and their per-dimension products are ranked in descending
    order. Dimensions are accumulated until their summed contribution reaches
    95% of the full similarity; every other dimension is set to 0 in the
    un-normalised embeddings.

    Only batch element 0 is processed. ``encoded_text`` is modified in place
    and also returned.

    Args:
        preprocessed_image: Input forwarded to ``model.encode_image``.
        encoded_text: Text embedding indexed as ``[0][dim]``.
        model: Object exposing ``encode_image``.

    Returns:
        ``(encoded_image, encoded_text)`` with the non-salient dimensions zeroed.
    """
    with torch.no_grad():
        encoded_image = model.encode_image(preprocessed_image).float()
        norm_encoded_image = encoded_image / encoded_image.norm(dim=-1, keepdim=True)
        norm_encoded_text = encoded_text / encoded_text.norm(dim=-1, keepdim=True)
        original_sim = norm_encoded_text.cpu().numpy() @ norm_encoded_image.cpu().numpy().T

        # Embedding width comes from the tensor itself rather than a fixed 1024.
        n_features = encoded_image.shape[-1]

        # One bulk conversion per vector instead of one .item() call per element.
        columns = zip(
            encoded_image[0].tolist(),
            encoded_text[0].tolist(),
            norm_encoded_image[0].tolist(),
            norm_encoded_text[0].tolist(),
        )
        features = [
            FeatureCouple(index, img, txt, norm_img, norm_txt)
            for index, (img, txt, norm_img, norm_txt) in enumerate(columns)
        ]
        features.sort(key=lambda x: x.similarity, reverse=True)

        reconstruct_indexes = set()
        reconstruct_sim = 0
        for elem in features:
            # Once the ratio reaches 0.95 the running sum no longer changes,
            # so stopping here selects the same dimensions as scanning on.
            if not (reconstruct_sim / original_sim < 0.95):
                break
            reconstruct_sim += elem.similarity
            reconstruct_indexes.add(elem.index)

        for index in range(n_features):
            if index not in reconstruct_indexes:
                encoded_text[0][index] = 0
                encoded_image[0][index] = 0

    return encoded_image, encoded_text

In [None]:
def computeIntersection(fx1, fy1, fx2, fy2, sx1, sy1, sx2, sy2):
    """Overlap area of two axis-aligned boxes given as (x1, y1, x2, y2) corner pairs.

    Returns 0 when the boxes do not overlap along either axis.
    """
    overlap_w = min(fx2, sx2) - max(fx1, sx1)
    overlap_h = min(fy2, sy2) - max(fy1, sy1)
    return overlap_w * overlap_h if (overlap_w >= 0 and overlap_h >= 0) else 0

def computeAccuracy(bbox, label):
    """Intersection-over-union between a predicted box and a ground-truth label.

    Args:
        bbox: Predicted box as ``[x1, y1, x2, y2]`` corner coordinates.
        label: Ground-truth box as four values exposing ``.item()``, in
            ``[x, y, width, height]`` order.

    Returns:
        ``intersection / union``, or ``0.0`` when the union is not positive
        (both boxes degenerate), which would otherwise divide by zero.
    """
    x, y, width, height = (label[i].item() for i in range(4))
    intersection = computeIntersection(bbox[0], bbox[1], bbox[2], bbox[3],
                                       x, y, x + width, y + height)
    area1 = (bbox[2] - bbox[0]) * (bbox[3] - bbox[1])
    area2 = width * height
    union = area1 + area2 - intersection
    if union <= 0:
        return 0.0
    return intersection / union

In [None]:
# Prefer the GPU when one is available; the CLIP RN50 weights are loaded onto that device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
modelCLIP, preprocessCLIP = clip.load("RN50", device=device)

REFCOCOG_PATH = "refcocog"

# One dataset per split, all sharing CLIP's preprocessing and tokenizer.
train_dataset, val_dataset, test_dataset = (
    RefcocogDataset(REFCOCOG_PATH, split=split_name, transform=preprocessCLIP, tokenization=clip.tokenize)
    for split_name in ("train", "val", "test")
)

# Default DataLoader settings apart from the explicit shuffle=False (fixed sample order).
train_dataloader, val_dataloader, test_dataloader = (
    DataLoader(split_dataset, shuffle=False)
    for split_dataset in (train_dataset, val_dataset, test_dataset)
)

100%|████████████████████████████████████████| 244M/244M [00:02<00:00, 117MiB/s]


In [15]:
# Results log on the mounted Drive; `baseline` writes to this handle through the
# module-level name `f`, and it is closed at the end of this cell.
f = open("/content/drive/MyDrive/output.txt", "w")

def baseline(loader, dataset, modelCLIP, preprocessCLIP, threshold=0.5):
    """Evaluate the Grad-CAM bounding-box baseline and return its mean IoU.

    For each sample, one Grad-CAM map over ``modelCLIP.visual.layer4`` is
    computed per referring sentence. Pixels whose 'jet' red channel exceeds
    ``threshold`` define a box, which is rescaled to the original image size.
    The per-sentence boxes are averaged coordinate-wise and scored with
    ``computeAccuracy`` against the ground truth. Samples where no sentence
    yields a box are logged as ``error`` and count as 0 towards the mean.

    Per-sample results are printed and written to the module-level file ``f``.

    Args:
        loader: Iterable of ``(features, bbox)`` pairs (batch size 1).
        dataset: Dataset providing ``getSentences``/``getImage`` and ``len()``.
        modelCLIP: CLIP model (``encode_text``, ``visual``).
        preprocessCLIP: Image preprocessing callable matching ``modelCLIP``.
        threshold: Red-channel cut-off for a pixel to belong to the box.

    Returns:
        Sum of per-sample scores divided by the number of samples, or ``0.0``
        when ``loader`` yields nothing.
    """
    n_samples = 0
    tot_accuracy = 0
    blur = False
    resolution = modelCLIP.visual.input_resolution
    for data_features, data_bbox in loader:
        sentences = dataset.getSentences(data_features)

        # The image does not depend on the sentence: open and preprocess it once per sample.
        image = dataset.getImage(data_features)
        original_size = image.size
        preprocessed_image = preprocessCLIP(image).unsqueeze(0).to(device)
        resized_image = load_image(image, resolution)
        # NOTE(review): boxes are rescaled as if the whole image had been resized to the map's
        # size; confirm preprocessCLIP does not crop, otherwise the coordinates are offset.

        minxs = []
        minys = []
        maxxs = []
        maxys = []
        for sent in sentences:
            encoded_text = modelCLIP.encode_text(clip.tokenize([sent]).to(device)).float()
            # encoded_image, encoded_text = getSalientEncodedFeatures(preprocessed_image, encoded_text, modelCLIP)

            attn_map = gradCAM(modelCLIP.visual, preprocessed_image, encoded_text, getattr(modelCLIP.visual, "layer4"))
            attn_map = attn_map.squeeze().detach().cpu().numpy()

            #viz_attn(resized_image, attn_map, blur)
            cmap = getCmap(resized_image, attn_map, blur)

            # Red channel only; float32 matches the precision of the tensor it was
            # previously copied into, so the threshold selects the same pixels.
            red = np.asarray(cmap)[:, :, 0].astype(np.float32)
            ys, xs = np.nonzero(red > threshold)

            if xs.size > 0:
                # Map heat-map pixel indices back to original image coordinates.
                map_height, map_width = red.shape
                scaling_factor_x = original_size[0] / map_width
                scaling_factor_y = original_size[1] / map_height
                minxs.append(float(xs.min()) * scaling_factor_x)
                minys.append(float(ys.min()) * scaling_factor_y)
                maxxs.append(float(xs.max()) * scaling_factor_x)
                maxys.append(float(ys.max()) * scaling_factor_y)

        n_samples += 1
        if minxs:
            # Average the per-sentence boxes coordinate by coordinate.
            bbox = [sum(minxs)/len(minxs), sum(minys)/len(minys), sum(maxxs)/len(maxxs), sum(maxys)/len(maxys)]
            accuracy = computeAccuracy(bbox, data_bbox)
            tot_accuracy += accuracy
            line = f'Image {n_samples:^8}/{len(dataset):^8}\t{accuracy}'
        else:
            line = f'Image {n_samples:^8}/{len(dataset):^8}\terror'
        f.write(line + '\n')
        print(line)

    # An empty loader has no samples to average over.
    mean_accuracy = tot_accuracy / n_samples if n_samples else 0.0
    f.write(f'Final accuracy\t{mean_accuracy}')
    return mean_accuracy

# Evaluate on the validation split. The log file is closed in `finally` so the
# handle is released (and buffered lines flushed) even if evaluation raises.
try:
    print(baseline(val_dataloader, val_dataset, modelCLIP, preprocessCLIP))
finally:
    f.close()

Image    1    /  2573  	0.11640592620357353
Image    2    /  2573  	0.09317950912418302
Image    3    /  2573  	0.18925824245041642
Image    4    /  2573  	0.4403179678598696
Image    5    /  2573  	0.2728777144223261
Image    6    /  2573  	0.4727027566956363
Image    7    /  2573  	0.15454730474843675
Image    8    /  2573  	0.1572190847674305
Image    9    /  2573  	0.38430624185066803
Image    10   /  2573  	0.017191140001649032
Image    11   /  2573  	0.3868090559775686
Image    12   /  2573  	0.2580537794932998
Image    13   /  2573  	0.026602960184765975
Image    14   /  2573  	0.5650156368422193
Image    15   /  2573  	0.13621563047640936
Image    16   /  2573  	0.19720713450389926
Image    17   /  2573  	0.0
Image    18   /  2573  	0.11883316657492815
Image    19   /  2573  	0.14622647867035754
Image    20   /  2573  	0.22270753332188603
Image    21   /  2573  	0.1993756919353232
Image    22   /  2573  	0.8236357275023996
Image    23   /  2573  	0.19485687174887895
Image    24

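Note: the results above were obtained WITHOUT the saliency filtering of the final feature vectors (the `getSalientEncodedFeatures` call in `baseline` is commented out).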

TODO: also report the percentage of samples whose accuracy is exactly zero.