In [None]:
# Mount Google Drive inside the Colab VM; the project paths used below live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# !pip install inflect==6.0.0
# !pip install matplotlib==3.5.1
# !pip install numpy==1.22.3
# !pip install pandas==1.4.2
# !pip install Pillow==9.4.0
# !pip install SceneGraphParser==0.1.0
# !pip install submitit==1.4.5
# !pip install tensorboard==2.9.1
# !pip install tqdm==4.64.0

In [None]:
!pip install git+https://github.com/openai/CLIP.git

Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /tmp/pip-req-build-js9viqqa
  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git /tmp/pip-req-build-js9viqqa
  Resolved https://github.com/openai/CLIP.git to commit a1d071733d7111c9c014f024669f959182114e33
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting ftfy (from clip==1.0)
  Downloading ftfy-6.1.1-py3-none-any.whl (53 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.1/53.1 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: clip
  Building wheel for clip (setup.py) ... [?25l[?25hdone
  Created wheel for clip: filename=clip-1.0-py3-none-any.whl size=1369501 sha256=4793907e313c11d2fcd9dc39faa954bd9904e3ce5ed3525c9dd0a01751f0575e
  Stored in directory: /tmp/pip-ephem-wheel-cache-ejqx5j12/wheels/da/2b/4c/d6691fa9597aac8bb85d2ac13b112deb897d5b50f5ad9a37e4
Successfully built clip
Inst

In [None]:
import torch
import torchvision.models as models
import torchvision.transforms as transforms
import clip

from PIL import Image

import numpy as np

import os
from os.path import exists, join, isfile, realpath, isdir
from os import listdir, makedirs, walk

import shutil as sh

from tqdm import tqdm

In [None]:
# Candidate project roots on the mounted Drive, in priority order; the first one that exists is used.
_WORK_DIR_CANDIDATES = (
    '/content/drive/MyDrive/ORT/Master/Codes',
    '/content/drive/MyDrive/ORT/Tesis/Codes',
)

WORK_DIR = next((path for path in _WORK_DIR_CANDIDATES if exists(path)), None)
if WORK_DIR is None:
    # Fail here with an actionable message instead of a NameError in a later cell.
    raise FileNotFoundError(
        'None of the expected project folders exist (is Google Drive mounted?): '
        + ', '.join(_WORK_DIR_CANDIDATES)
    )

WORK_DIR

'/content/drive/MyDrive/ORT/Master/Codes'

In [None]:
# Make the project root the working directory; IPython interpolates {WORK_DIR} from the Python variable.
%cd {WORK_DIR}

/content/drive/.shortcut-targets-by-id/1OcSwcT_BqQPziJvgS9FWGbmkXwMFmj7A/Tesis/Codes


In [None]:
def get_dir_files(dir_path: str):
    """Return the names of the regular files located directly inside ``dir_path``.

    Only entry names are returned (not full paths), in the order reported by
    ``listdir``; sub-directories are left out.
    """
    file_names = []
    for entry in listdir(dir_path):
        if isfile(join(dir_path, entry)):
            file_names.append(entry)
    return file_names


def get_dirs(dir_path: str):
    """Return the names of the sub-directories located directly inside ``dir_path``.

    Only entry names are returned (not full paths), in the order reported by
    ``listdir``; regular files are left out.
    """
    dir_names = []
    for entry in listdir(dir_path):
        if isdir(join(dir_path, entry)):
            dir_names.append(entry)
    return dir_names

In [None]:
def strip_state_dict(state_dict: dict, strip_key: str = 'module.'):

    """
    Strip strip_key from start of state_dict keys
    Useful if model has been trained as DDP model

    The mapping is modified in place and the same object is returned.

    :param state_dict: mapping from parameter names to values (e.g. a loaded checkpoint)
    :param strip_key: prefix to remove once from every key that starts with it;
        an empty prefix leaves the mapping untouched
    :return: the same ``state_dict`` object, with the prefixed keys renamed
    """

    # An empty prefix matches every key; renaming a key to itself and then deleting it
    # would wipe the whole mapping, so there is nothing to do in that case.
    if not strip_key:
        return state_dict

    prefix_len = len(strip_key)

    # Iterate over a snapshot of the keys because the mapping is mutated inside the loop.
    for k in list(state_dict.keys()):
        if k.startswith(strip_key):
            state_dict[k[prefix_len:]] = state_dict.pop(k)

    return state_dict

In [None]:
# Copyright (c) Meta Platforms, Inc. and affiliates.
# Modified by Sagar Vaze from https://github.com/ABaldrati/CLIP4CirDemo/blob/main/model.py

import torch
import torch.nn.functional as F
from torch import nn
import numpy as np

"""
Code from: https://github.com/ABaldrati/CLIP4CirDemo/blob/main/model.py
"""

class Combiner(nn.Module):
    """
    Combiner module which once trained fuses textual and visual information
    """

    def __init__(self, clip_feature_dim: int, projection_dim: int, hidden_dim: int):
        """
        :param clip_feature_dim: CLIP input feature dimension
        :param projection_dim: projection dimension
        :param hidden_dim: hidden dimension
        """
        super().__init__()
        # Independent projections of the text and image embeddings.
        self.text_projection_layer = nn.Linear(clip_feature_dim, projection_dim)
        self.image_projection_layer = nn.Linear(clip_feature_dim, projection_dim)

        self.dropout1 = nn.Dropout(0.5)
        self.dropout2 = nn.Dropout(0.5)

        # MLP over the concatenated projections, mapped back to the CLIP feature size.
        self.combiner_layer = nn.Linear(projection_dim * 2, hidden_dim)
        self.output_layer = nn.Linear(hidden_dim, clip_feature_dim)

        self.dropout3 = nn.Dropout(0.5)
        # Gate in (0, 1) that mixes the raw text and image features in forward().
        self.dynamic_scalar = nn.Sequential(nn.Linear(projection_dim * 2, hidden_dim), nn.ReLU(), nn.Dropout(0.5),
                                            nn.Linear(hidden_dim, 1),
                                            nn.Sigmoid())

        # Not used inside this class; kept as an attribute for code that reads it.
        self.logit_scale = 100

    @torch.jit.export
    def forward(self, image_features, text_features):
        """
        Combine the reference image features and the caption features. It outputs the predicted features
        :param image_features: CLIP reference image features, last dimension of size clip_feature_dim
        :param text_features: CLIP relative caption features, same shape as image_features
        :return: predicted features, L2-normalized along the last (feature) dimension
        """

        text_projected_features = self.dropout1(F.relu(self.text_projection_layer(text_features)))
        image_projected_features = self.dropout2(F.relu(self.image_projection_layer(image_features)))

        raw_combined_features = torch.cat((text_projected_features, image_projected_features), -1)
        combined_features = self.dropout3(F.relu(self.combiner_layer(raw_combined_features)))
        dynamic_scalar = self.dynamic_scalar(raw_combined_features)
        # Learned residual plus a gated convex combination of the two input embeddings.
        output = self.output_layer(combined_features) + dynamic_scalar * text_features + (
                1 - dynamic_scalar) * image_features

        # Normalize over the feature axis explicitly: identical to the default (dim=1) for
        # (batch, dim) inputs, and also correct for unbatched or higher-rank inputs.
        return F.normalize(output, dim=-1)

In [None]:
# Set to path of model to evaluate (combiner head)  (set to 'None' if using image_only etc.)
# Checkpoint file for the combiner head, resolved relative to WORK_DIR.
# NOTE(review): the earlier hint said to set this to 'None' for image-only style runs, but the
# loading cell in this notebook calls torch.load on it unconditionally — confirm before using None.
COMBINER_PRETRAIN_PATH = join(WORK_DIR, "clustering/GeneCIS/vitb16_combiner_head.pt")
# Checkpoint file for the CLIP backbone weights, resolved relative to WORK_DIR.
# NOTE(review): same caveat as above — the loading cell does not handle None for this path.
BACKBONE_PRETRAIN_PATH = join(WORK_DIR, "clustering/GeneCIS/vitb16_backbone.pt")

In [None]:
# Architecture name handed to clip.load().
# NOTE(review): the checkpoint filenames configured in this notebook contain "vitb16" while this
# selects "RN50x4" — confirm the weights match the architecture, otherwise load_state_dict will fail.
model = "RN50x4" # ViT-B/16

In [None]:
# Load the pretrained CLIP model together with its image preprocessing transform.
clip_model, preprocess = clip.load(model)
# Cast the weights to float32 and switch to inference mode.
clip_model.float().eval()
# Input resolution and embedding size are read from the visual encoder.
input_dim = clip_model.visual.input_resolution
feature_dim = clip_model.visual.output_dim

# Combiner head on top of the CLIP embeddings; projection / hidden sizes are fixed at 2560 / 5120.
combiner = Combiner(clip_feature_dim=feature_dim, projection_dim=2560, hidden_dim=2 * 2560)

100%|███████████████████████████████████████| 402M/402M [00:09<00:00, 42.2MiB/s]


In [None]:
def _load_checkpoint(module, checkpoint_path):
    """Load the state dict stored at ``checkpoint_path`` into ``module``.

    A leading 'module.' (left by DDP training) is stripped from the keys first.
    """
    # NOTE(security): torch.load unpickles the file — only load checkpoints from a trusted source.
    weights = torch.load(checkpoint_path, map_location='cpu')
    module.load_state_dict(strip_state_dict(state_dict=weights, strip_key='module.'))


_load_checkpoint(combiner, COMBINER_PRETRAIN_PATH)
_load_checkpoint(clip_model, BACKBONE_PRETRAIN_PATH)

# --------------
# To cuda
# --------------
# A single transfer is enough; validate() moves its batches to CUDA as well.
clip_model, combiner = clip_model.cuda(), combiner.cuda()

In [None]:
# Put both networks in inference mode (this disables the Combiner's dropout layers).
clip_model.eval()
combiner.eval()



In [None]:
class _AverageMeter:
    """Running weighted mean of a scalar metric (latest value, sum, count, average)."""

    def __init__(self):
        self.val = 0.0
        self.sum = 0.0
        self.count = 0
        self.avg = 0.0

    def update(self, val, n=1):
        """Record ``val`` observed over ``n`` samples and refresh the running average."""
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count if self.count else 0.0


@torch.no_grad()
def validate(clip_model, combiner, valloader, topk=(1, 2, 3), save_path=None):
    """
    Evaluate CLIP + combiner retrieval and report Recall @ K.

    For every batch the reference image and caption are encoded with CLIP, fused by the
    combiner, and compared (dot product against L2-normalized gallery features) with each
    gallery image; the gallery is ranked by decreasing similarity.

    :param clip_model: CLIP model exposing ``encode_image`` / ``encode_text``
    :param combiner: module fusing reference-image and caption features
    :param valloader: iterable of batches whose first four items are
        (ref_img, caption, gallery_set, target_rank) tensors; gallery_set is 5-D (B, N, C, H, W)
    :param topk: values of K for which recall is accumulated
    :param save_path: if not None, the per-batch similarity matrices are concatenated and saved here
    :return: dict mapping each K to a meter whose ``avg`` is the batch-size-weighted mean recall

    NOTE: ``get_recall`` is not defined in this part of the notebook; it must be available
    in the notebook namespace when this function is called.
    """

    print('Computing eval with combiner...')

    clip_model.eval()
    combiner.eval()

    # Fresh meters on every call, so results never depend on (or leak into) notebook globals.
    meters = {k: _AverageMeter() for k in topk}
    sims_to_save = []

    # Gradients are already disabled by the decorator; no nested no_grad context is needed.
    for batch in tqdm(valloader):

        ref_img, caption, gallery_set, target_rank = [x.cuda(non_blocking=True) for x in batch[:4]]
        bsz, n_gallery, _, h, w = gallery_set.size()
        # Flatten to (B, tokens). Unlike a bare squeeze(), this keeps the batch axis when B == 1.
        # Assumes caption's leading axis is the batch axis — TODO confirm against the dataset.
        caption = caption.reshape(bsz, -1)

        # Forward pass in CLIP
        imgs_ = torch.cat([ref_img, gallery_set.view(-1, 3, h, w)], dim=0)
        all_img_feats = clip_model.encode_image(imgs_).float()
        caption_feats = clip_model.encode_text(caption).float()

        # L2 normalize and view into correct shapes
        ref_feats, gallery_feats = all_img_feats.split((bsz, bsz * n_gallery), dim=0)
        gallery_feats = gallery_feats.view(bsz, n_gallery, -1)
        gallery_feats = torch.nn.functional.normalize(gallery_feats, dim=-1)

        # Forward pass in combiner
        combined_feats = combiner(ref_feats, caption_feats)

        # Compute similarity
        similarities = combined_feats[:, None, :] * gallery_feats       # B x N x D
        similarities = similarities.sum(dim=-1)                         # B x N

        # Sort the similarities in descending order (most similar gallery item first)
        _, sort_idxs = similarities.sort(dim=-1, descending=True)       # B x N

        # Compute recall at K
        for k in topk:

            recall_k = get_recall(sort_idxs[:, :k], target_rank)
            meters[k].update(recall_k, bsz)

        sims_to_save.append(similarities.cpu())

    if save_path is not None:
        sims_to_save = torch.cat(sims_to_save)
        print(f'Saving text only preds to: {save_path}')
        torch.save(sims_to_save, save_path)

    # Print results
    print_str = '\n'.join([f'Recall @ {k} = {v.avg:.4f}' for k, v in meters.items()])
    print(print_str)

    return meters