In [None]:
import os.path
from pathlib import Path

import numpy as np
import torch
import torchvision
from torch.nn import functional as F
from torch.utils.data import DataLoader
from torchvision.datasets import CIFAR100, CIFAR10
from tqdm import tqdm

from utils.factory import create_model_and_transforms, get_tokenizer

import os
os.environ["CUDA_VISIBLE_DEVICES"] = "7"

import torch.nn as nn
from load import *
import random
from loss import *

In [2]:
def compute_similarity(image_features, text_features, bs = 1000):
    """Compute the image-to-text similarity matrix tile by tile.

    The product ``image_features @ text_features.T`` is evaluated in tiles of
    at most ``bs`` x ``bs`` entries so that only one tile of logits is
    materialised at a time.

    Args:
        image_features: 2-D tensor with one image embedding per row.
        text_features: 2-D tensor with one text embedding per row. The number
            of rows may differ from ``image_features``.
        bs: Tile edge length (rows of each input handled per step).

    Returns:
        A float32 tensor of shape ``(num_images, num_texts)`` allocated with
        ``torch.zeros`` (i.e. on the default device) holding the dot product
        of every image/text pair.
    """
    num_images = image_features.shape[0]
    # Size the text axis from the text features themselves; using the image
    # count for both axes breaks as soon as the two sets differ in length.
    num_texts = text_features.shape[0]
    similarity_scores = torch.zeros(num_images, num_texts)
    for v in range(0, num_images, bs):
        # The image tile does not depend on the inner loop, so cast it once.
        batch_visual_emb = image_features[v:v+bs].to(torch.float32)
        for t in range(0, num_texts, bs):
            print('Processing Visual '+str(v)+' Text '+str(t), end='\r')
            batch_caption_emb = text_features[t:t+bs].to(torch.float32)

            logits = batch_visual_emb @ batch_caption_emb.t()
            similarity_scores[v:v+bs,t:t+bs] = logits

    print('Done similarity')
    return similarity_scores

def compute_retrieval(a2b_sims, return_ranks=True):
    """Compute retrieval metrics from a square similarity matrix.

    Row ``i`` is treated as a query whose correct match is column ``i``.

    Args:
        a2b_sims: Result of computing similarity between two sets of embeddings
            (emb1 @ emb2.T) with shape (num_datapoints, num_datapoints). May be
            a NumPy array, a torch tensor, or anything ``np.asarray`` accepts.
        return_ranks: When True, also return the per-query ranks and the
            index of the top-1 retrieved item.

    Returns:
        A dict with recall@1/5/10/50 (in percent), the 1-based median rank
        ("medr"), the 1-based mean rank ("meanr") and "sum" (= r1 + r5 + r10).
        If ``return_ranks`` is True, a tuple ``(report_dict, (ranks, top1))``
        where ``ranks`` are 0-based.
    """
    # Work on a NumPy array: the reversed-argsort below relies on negative
    # slicing, which torch tensors do not support.
    if hasattr(a2b_sims, "detach"):
        a2b_sims = a2b_sims.detach().cpu().numpy()
    else:
        a2b_sims = np.asarray(a2b_sims)

    npts = a2b_sims.shape[0]
    ranks = np.zeros(npts)
    top1 = np.zeros(npts)
    # loop source embedding indices
    for index in range(npts):
        # order target embeddings from most to least similar
        inds = np.argsort(a2b_sims[index])[::-1]
        # position of the correct embedding in that ordering (0-based)
        ranks[index] = np.where(inds == index)[0][0]
        # save the top1 result as well
        top1[index] = inds[0]

    def recall_at(k):
        # Percentage of queries whose correct match is among the first k.
        return 100.0 * np.count_nonzero(ranks < k) / npts

    # Compute metrics
    r1 = recall_at(1)
    r5 = recall_at(5)
    r10 = recall_at(10)
    r50 = recall_at(50)
    medr = np.floor(np.median(ranks)) + 1
    meanr = ranks.mean() + 1

    report_dict = {"r1": r1, "r5": r5, "r10": r10, "r50": r50, "medr": medr, "meanr": meanr, "sum": r1 + r5 + r10}

    if return_ranks:
        return report_dict, (ranks, top1)
    else:
        return report_dict

In [3]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Architecture name handed to the model factory and to the tokenizer lookup.
model_name = "ViT-B-32"

# Weights to load. Alternative kept for reference: pretrained = "openai"
pretrained = "./DCP-ViT-B-32.pt"

# Batch size used by every DataLoader in this notebook.
batch_size = 64

In [None]:
# Build the model and its preprocessing transform; the middle element of the
# returned triple is discarded into `_`.
model, _, preprocess = create_model_and_transforms(model_name, pretrained=pretrained)
model.to(device)
model.eval()
context_length = model.context_length
vocab_size = model.vocab_size

# Total number of scalar parameters, summed over every parameter tensor.
param_sizes = [int(np.prod(p.shape)) for p in model.parameters()]
print("Model parameters:", f"{np.sum(param_sizes):,}")
print("Context length:", context_length)
print("Vocab size:", vocab_size)
print("Len of res:", len(model.visual.transformer.resblocks))

# prs = hook_prs_logger(model, device)

In [5]:
# Tokenizer looked up by the same `model_name` used to build the model; it is
# read as a global by `zeroshot_classifier` to tokenize the class prompts.
tokenizer = get_tokenizer(model_name)

In [6]:
def zeroshot_classifier(classnames, templates):
    """Build zero-shot classifier weights from class names and prompt templates.

    For every class, each template is formatted with the class name, tokenized
    with the global ``tokenizer`` and encoded with the global ``model``. The
    per-prompt embeddings are L2-normalised, averaged, and the average is
    normalised again.

    Args:
        classnames: Iterable of class-name strings.
        templates: Iterable of format strings with one ``{}`` placeholder.

    Returns:
        A tensor whose columns are the per-class embeddings (stacked along
        ``dim=1``), placed on the notebook-wide ``device``.
    """
    with torch.no_grad():
        zeroshot_weights = []
        for classname in tqdm(classnames):
            texts = [template.format(classname) for template in templates] #format with class
            # Follow the notebook-wide `device` instead of forcing CUDA, so
            # the CPU fallback chosen at configuration time actually works.
            texts = tokenizer(texts).to(device) #tokenize
            class_embeddings = model.encode_text(texts) #embed with text encoder
            class_embeddings /= class_embeddings.norm(dim=-1, keepdim=True)
            class_embedding = class_embeddings.mean(dim=0)
            class_embedding /= class_embedding.norm()
            zeroshot_weights.append(class_embedding)
        zeroshot_weights = torch.stack(zeroshot_weights, dim=1).to(device)
    return zeroshot_weights

In [8]:
def accuracy(output, target, topk=(1,)):
    """Count top-k correct predictions in a batch.

    Args:
        output: 2-D tensor of scores with one row per sample and one column
            per class.
        target: 1-D tensor of ground-truth class indices, one per sample.
        topk: Iterable of k values to evaluate.

    Returns:
        A list of floats, one per entry of ``topk``, giving the NUMBER of
        samples (not a percentage) whose target is among the k highest-scoring
        classes.
    """
    # Never ask for more predictions than there are classes; `topk` would
    # raise otherwise (e.g. top-5 on a 3-class problem).
    maxk = min(max(topk), output.size(1))
    pred = output.topk(maxk, 1, True, True)[1].t()
    correct = pred.eq(target.view(1, -1).expand_as(pred))
    # `.item()` yields a Python scalar directly, avoiding float() on a
    # one-element NumPy array (deprecated in recent NumPy releases).
    return [float(correct[:k].sum().item()) for k in topk]

In [9]:
# CIFAR-10 evaluation data (train=False), run through the `preprocess` transform.
dataset_cifar10_val = CIFAR10(
    root=CIFAR10_DIR,
    train=False,
    transform=preprocess,
)
dataloader_cifar10_val = DataLoader(
    dataset_cifar10_val,
    batch_size=batch_size,
    shuffle=True,
    num_workers=16,
    pin_memory=True,
)

In [None]:
# Text-side classifier for CIFAR-10, built from its class names and the prompt templates.
zeroshot_weights = zeroshot_classifier(
    classnames=label_to_classname_cifar10,
    templates=imagenet_templates,
)

In [None]:
# Zero-shot evaluation on `dataloader_cifar10_val`: accumulate top-1/top-5 hit
# counts over all batches, then convert them to percentages. Uses whichever
# `zeroshot_weights` was computed last, so run the CIFAR-10 classifier cell first.
with torch.no_grad():
    top1, top5, n = 0., 0., 0.
    for images, target in tqdm(dataloader_cifar10_val):
        # Follow the notebook-wide `device` instead of forcing CUDA so the
        # cell also runs when the CPU fallback was selected.
        images = images.to(device)
        target = target.to(device)

        # predict
        image_features = model.encode_image(images, attn_method="direct")
        image_features /= image_features.norm(dim=-1, keepdim=True)
        logits = 100. * image_features @ zeroshot_weights

        # measure accuracy (`accuracy` returns hit counts, not percentages)
        acc1, acc5 = accuracy(logits, target, topk=(1, 5))
        top1 += acc1
        top5 += acc5
        n += images.size(0)

top1 = (top1 / n) * 100
top5 = (top5 / n) * 100

print(f"Top-1 accuracy: {top1:.2f}")
print(f"Top-5 accuracy: {top5:.2f}")

In [12]:
# CIFAR-100 evaluation data (train=False), run through the `preprocess` transform.
dataset_cifar100_val = CIFAR100(
    root=CIFAR100_DIR,
    train=False,
    transform=preprocess,
)
dataloader_cifar100_val = DataLoader(
    dataset_cifar100_val,
    batch_size=batch_size,
    shuffle=True,
    num_workers=16,
    pin_memory=True,
)

In [None]:
# Text-side classifier for CIFAR-100, built from its class names and the prompt templates.
zeroshot_weights = zeroshot_classifier(
    classnames=label_to_classname_cifar100,
    templates=imagenet_templates,
)

In [None]:
# Zero-shot evaluation on `dataloader_cifar100_val`: accumulate top-1/top-5 hit
# counts over all batches, then convert them to percentages. Uses whichever
# `zeroshot_weights` was computed last, so run the CIFAR-100 classifier cell first.
with torch.no_grad():
    top1, top5, n = 0., 0., 0.
    for images, target in tqdm(dataloader_cifar100_val):
        # Follow the notebook-wide `device` instead of forcing CUDA so the
        # cell also runs when the CPU fallback was selected.
        images = images.to(device)
        target = target.to(device)

        # predict
        image_features = model.encode_image(images, attn_method="direct")
        image_features /= image_features.norm(dim=-1, keepdim=True)
        logits = 100. * image_features @ zeroshot_weights

        # measure accuracy (`accuracy` returns hit counts, not percentages)
        acc1, acc5 = accuracy(logits, target, topk=(1, 5))
        top1 += acc1
        top5 += acc5
        n += images.size(0)

top1 = (top1 / n) * 100
top5 = (top5 / n) * 100

print(f"Top-1 accuracy: {top1:.2f}")
print(f"Top-5 accuracy: {top5:.2f}")

In [15]:
# CUB evaluation data (train=False), run through the `preprocess` transform.
dataset_cub_val = CUBDataset(
    CUB_DIR,
    train=False,
    transform=preprocess,
)
dataloader_cub_val = DataLoader(
    dataset_cub_val,
    batch_size=batch_size,
    shuffle=True,
    num_workers=16,
    pin_memory=True,
)

In [None]:
# Text-side classifier for CUB, built from its class names and the prompt templates.
zeroshot_weights = zeroshot_classifier(
    classnames=label_to_classname_cub,
    templates=imagenet_templates,
)

In [None]:
# Zero-shot evaluation on `dataloader_cub_val`: accumulate top-1/top-5 hit
# counts over all batches, then convert them to percentages. Uses whichever
# `zeroshot_weights` was computed last, so run the CUB classifier cell first.
with torch.no_grad():
    top1, top5, n = 0., 0., 0.
    for images, target in tqdm(dataloader_cub_val):
        # Follow the notebook-wide `device` instead of forcing CUDA so the
        # cell also runs when the CPU fallback was selected.
        images = images.to(device)
        target = target.to(device)

        # predict
        image_features = model.encode_image(images, attn_method="direct")
        image_features /= image_features.norm(dim=-1, keepdim=True)
        logits = 100. * image_features @ zeroshot_weights

        # measure accuracy (`accuracy` returns hit counts, not percentages)
        acc1, acc5 = accuracy(logits, target, topk=(1, 5))
        top1 += acc1
        top5 += acc5
        n += images.size(0)

top1 = (top1 / n) * 100
top5 = (top5 / n) * 100

print(f"Top-1 accuracy: {top1:.2f}")
print(f"Top-5 accuracy: {top5:.2f}")

In [18]:
# Caltech-101 data run through the `preprocess` transform; no split argument is
# passed, so the loader iterates over whatever the dataset class returns by default.
dataset_caltech101 = torchvision.datasets.Caltech101(
    root=CALTECH101_DIR,
    transform=preprocess,
)
dataloader_caltech101 = DataLoader(
    dataset_caltech101,
    batch_size=batch_size,
    shuffle=True,
    num_workers=16,
    pin_memory=True,
)

In [None]:
# Text-side classifier for Caltech-101, built from its class names and the prompt templates.
zeroshot_weights = zeroshot_classifier(
    classnames=label_to_classname_caltech101,
    templates=imagenet_templates,
)

In [None]:
# Zero-shot evaluation on `dataloader_caltech101`: accumulate top-1/top-5 hit
# counts over all batches, then convert them to percentages. Uses whichever
# `zeroshot_weights` was computed last, so run the Caltech-101 classifier cell first.
with torch.no_grad():
    top1, top5, n = 0., 0., 0.
    for images, target in tqdm(dataloader_caltech101):
        # Follow the notebook-wide `device` instead of forcing CUDA so the
        # cell also runs when the CPU fallback was selected.
        images = images.to(device)
        target = target.to(device)

        # predict
        image_features = model.encode_image(images, attn_method="direct")
        image_features /= image_features.norm(dim=-1, keepdim=True)
        logits = 100. * image_features @ zeroshot_weights

        # measure accuracy (`accuracy` returns hit counts, not percentages)
        acc1, acc5 = accuracy(logits, target, topk=(1, 5))
        top1 += acc1
        top5 += acc5
        n += images.size(0)

top1 = (top1 / n) * 100
top5 = (top5 / n) * 100

print(f"Top-1 accuracy: {top1:.2f}")
print(f"Top-5 accuracy: {top5:.2f}")

In [21]:
# Oxford-IIIT Pet test split, run through the `preprocess` transform.
dataset_oxfordpets_tst = torchvision.datasets.OxfordIIITPet(
    root=OXFORDPET_DIR,
    transform=preprocess,
    split='test',
)
dataloader_oxfordpets_tst = DataLoader(
    dataset_oxfordpets_tst,
    batch_size=batch_size,
    shuffle=True,
    num_workers=16,
    pin_memory=True,
)

In [None]:
# Text-side classifier for Oxford-IIIT Pet, built from its class names and the prompt templates.
zeroshot_weights = zeroshot_classifier(
    classnames=label_to_classname_oxfordpets,
    templates=imagenet_templates,
)

In [None]:
# Zero-shot evaluation on `dataloader_oxfordpets_tst`: accumulate top-1/top-5 hit
# counts over all batches, then convert them to percentages. Uses whichever
# `zeroshot_weights` was computed last, so run the Oxford-IIIT Pet classifier cell first.
with torch.no_grad():
    top1, top5, n = 0., 0., 0.
    for images, target in tqdm(dataloader_oxfordpets_tst):
        # Follow the notebook-wide `device` instead of forcing CUDA so the
        # cell also runs when the CPU fallback was selected.
        images = images.to(device)
        target = target.to(device)

        # predict
        image_features = model.encode_image(images, attn_method="direct")
        image_features /= image_features.norm(dim=-1, keepdim=True)
        logits = 100. * image_features @ zeroshot_weights

        # measure accuracy (`accuracy` returns hit counts, not percentages)
        acc1, acc5 = accuracy(logits, target, topk=(1, 5))
        top1 += acc1
        top5 += acc5
        n += images.size(0)

top1 = (top1 / n) * 100
top5 = (top5 / n) * 100

print(f"Top-1 accuracy: {top1:.2f}")
print(f"Top-5 accuracy: {top5:.2f}")

In [24]:
# Food-101 test split, run through the `preprocess` transform.
dataset_food101_tst = torchvision.datasets.Food101(
    root=FOOD101_DIR,
    transform=preprocess,
    split='test',
)
dataloader_food101_tst = DataLoader(
    dataset_food101_tst,
    batch_size=batch_size,
    shuffle=True,
    num_workers=16,
    pin_memory=True,
)

In [None]:
# Text-side classifier for Food-101, built from its class names and the prompt templates.
zeroshot_weights = zeroshot_classifier(
    classnames=label_to_classname_food101,
    templates=imagenet_templates,
)

In [None]:
# Zero-shot evaluation on `dataloader_food101_tst`: accumulate top-1/top-5 hit
# counts over all batches, then convert them to percentages. Uses whichever
# `zeroshot_weights` was computed last, so run the Food-101 classifier cell first.
with torch.no_grad():
    top1, top5, n = 0., 0., 0.
    for images, target in tqdm(dataloader_food101_tst):
        # Follow the notebook-wide `device` instead of forcing CUDA so the
        # cell also runs when the CPU fallback was selected.
        images = images.to(device)
        target = target.to(device)

        # predict
        image_features = model.encode_image(images, attn_method="direct")
        image_features /= image_features.norm(dim=-1, keepdim=True)
        logits = 100. * image_features @ zeroshot_weights

        # measure accuracy (`accuracy` returns hit counts, not percentages)
        acc1, acc5 = accuracy(logits, target, topk=(1, 5))
        top1 += acc1
        top5 += acc5
        n += images.size(0)

top1 = (top1 / n) * 100
top5 = (top5 / n) * 100

print(f"Top-1 accuracy: {top1:.2f}")
print(f"Top-5 accuracy: {top5:.2f}")

In [27]:
# SUN397 has no split argument here, so a fixed-size subset is held out for evaluation.
dataset_sun397 = torchvision.datasets.SUN397(root=SUN397_DIR, transform=preprocess)
# Seed the split: without an explicit generator the held-out subset (and hence
# the reported accuracy) changes on every run of the notebook.
dataset_sun397_trn, dataset_sun397_tst = torch.utils.data.random_split(
    dataset_sun397,
    [100000, 8754],
    generator=torch.Generator().manual_seed(0),
)
dataloader_sun397_tst = DataLoader(
    dataset_sun397_tst,
    batch_size=batch_size,
    shuffle=True,
    num_workers=16,
    pin_memory=True,
)

In [None]:
# Text-side classifier for SUN397, built from its class names and the prompt templates.
zeroshot_weights = zeroshot_classifier(
    classnames=label_to_classname_sun397,
    templates=imagenet_templates,
)

In [None]:
# Zero-shot evaluation on `dataloader_sun397_tst`: accumulate top-1/top-5 hit
# counts over all batches, then convert them to percentages. Uses whichever
# `zeroshot_weights` was computed last, so run the SUN397 classifier cell first.
with torch.no_grad():
    top1, top5, n = 0., 0., 0.
    for images, target in tqdm(dataloader_sun397_tst):
        # Follow the notebook-wide `device` instead of forcing CUDA so the
        # cell also runs when the CPU fallback was selected.
        images = images.to(device)
        target = target.to(device)

        # predict
        image_features = model.encode_image(images, attn_method="direct")
        image_features /= image_features.norm(dim=-1, keepdim=True)
        logits = 100. * image_features @ zeroshot_weights

        # measure accuracy (`accuracy` returns hit counts, not percentages)
        acc1, acc5 = accuracy(logits, target, topk=(1, 5))
        top1 += acc1
        top5 += acc5
        n += images.size(0)

top1 = (top1 / n) * 100
top5 = (top5 / n) * 100

print(f"Top-1 accuracy: {top1:.2f}")
print(f"Top-5 accuracy: {top5:.2f}")

In [30]:
# NOTE(review): `dataset_stanfordcars` is not defined in any visible cell of this
# notebook; presumably it comes from a star import or a removed cell. Confirm
# it exists on a fresh kernel.
# Seed the split: without an explicit generator the held-out subset (and hence
# the reported accuracy) changes on every run of the notebook.
dataset_stanfordcars_trn, dataset_stanfordcars_tst = torch.utils.data.random_split(
    dataset_stanfordcars,
    [6000, 2144],
    generator=torch.Generator().manual_seed(0),
)
dataloader_stanfordcars_tst = DataLoader(
    dataset_stanfordcars_tst,
    batch_size=batch_size,
    shuffle=True,
    num_workers=16,
    pin_memory=True,
)

In [None]:
# Text-side classifier for Stanford Cars, built from its class names and the prompt templates.
zeroshot_weights = zeroshot_classifier(
    classnames=label_to_classname_stanfordcars,
    templates=imagenet_templates,
)

In [None]:
# Zero-shot evaluation on `dataloader_stanfordcars_tst`: accumulate top-1/top-5 hit
# counts over all batches, then convert them to percentages. Uses whichever
# `zeroshot_weights` was computed last, so run the Stanford Cars classifier cell first.
with torch.no_grad():
    top1, top5, n = 0., 0., 0.
    for images, target in tqdm(dataloader_stanfordcars_tst):
        # Follow the notebook-wide `device` instead of forcing CUDA so the
        # cell also runs when the CPU fallback was selected.
        images = images.to(device)
        target = target.to(device)

        # predict
        image_features = model.encode_image(images, attn_method="direct")
        image_features /= image_features.norm(dim=-1, keepdim=True)
        logits = 100. * image_features @ zeroshot_weights

        # measure accuracy (`accuracy` returns hit counts, not percentages)
        acc1, acc5 = accuracy(logits, target, topk=(1, 5))
        top1 += acc1
        top5 += acc5
        n += images.size(0)

top1 = (top1 / n) * 100
top5 = (top5 / n) * 100

print(f"Top-1 accuracy: {top1:.2f}")
print(f"Top-5 accuracy: {top5:.2f}")

In [None]:
# NOTE(review): `dataset_dtd` is not defined in any visible cell of this
# notebook; presumably it comes from a star import or a removed cell. Confirm
# it exists on a fresh kernel.
dataloader_dtd = DataLoader(
    dataset_dtd,
    batch_size=batch_size,
    shuffle=True,
    num_workers=16,
    pin_memory=True,
)
# Text-side classifier for DTD, built from its class names and the prompt templates.
zeroshot_weights = zeroshot_classifier(
    classnames=label_to_classname_dtd,
    templates=imagenet_templates,
)

In [None]:
# Zero-shot evaluation on `dataloader_dtd`: accumulate top-1/top-5 hit
# counts over all batches, then convert them to percentages. Uses whichever
# `zeroshot_weights` was computed last, so run the DTD classifier cell first.
with torch.no_grad():
    top1, top5, n = 0., 0., 0.
    for images, target in tqdm(dataloader_dtd):
        # Follow the notebook-wide `device` instead of forcing CUDA so the
        # cell also runs when the CPU fallback was selected.
        images = images.to(device)
        target = target.to(device)

        # predict
        image_features = model.encode_image(images, attn_method="direct")
        image_features /= image_features.norm(dim=-1, keepdim=True)
        logits = 100. * image_features @ zeroshot_weights

        # measure accuracy (`accuracy` returns hit counts, not percentages)
        acc1, acc5 = accuracy(logits, target, topk=(1, 5))
        top1 += acc1
        top5 += acc5
        n += images.size(0)

top1 = (top1 / n) * 100
top5 = (top5 / n) * 100

print(f"Top-1 accuracy: {top1:.2f}")
print(f"Top-5 accuracy: {top5:.2f}")