In [1]:
#exclude - GPU setup
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID" 
os.environ["CUDA_VISIBLE_DEVICES"]="1"

In [2]:
import subprocess

CUDA_version = [s for s in subprocess.check_output(["nvcc", "--version"]).decode("UTF-8").split(", ") if s.startswith("release")][0].split(" ")[-1]
print("CUDA version:", CUDA_version)

if CUDA_version == "10.0":
    torch_version_suffix = "+cu100"
elif CUDA_version == "10.1":
    torch_version_suffix = "+cu101"
elif CUDA_version == "10.2":
    torch_version_suffix = ""
else:
    torch_version_suffix = "+cu110"

CUDA version: 11.1


In [3]:
# ! pip install torch==1.7.1{torch_version_suffix} torchvision==0.8.2{torch_version_suffix} -f https://download.pytorch.org/whl/torch_stable.html ftfy regex

# The following command installs the `clip` module from its source:
 # ! pip install git+https://github.com/openai/CLIP.git

In [None]:
import numpy as np
import torch
import clip
from tqdm.notebook import tqdm

from torchvision import transforms
import matplotlib.pyplot as plt
import cv2

print("Torch version:", torch.__version__)

# Loading the model

Download and instantiate a CLIP model using the `clip` module that we just installed.

In [None]:
print("Avaliable Models: ", clip.available_models())
model, preprocess = clip.load("ViT-B/32") #clip.load("RN50", jit=False) #

input_resolution = model.input_resolution #.item()
context_length = model.context_length #.item()
vocab_size = model.vocab_size #.item()

print("Model parameters:", f"{np.sum([int(np.prod(p.shape)) for p in model.parameters()]):,}")
print("Input resolution:", input_resolution)
print("Context length:", context_length)
print("Vocab size:", vocab_size)

## Load the text descriptions

In [None]:
# Load inputs
act_map = {}
with open("../../../act_idx_to_name_new.txt") as f:
	for line in f:
		(key, val) = line.split()
		act_map[int(key)] = val

with open("../../../activity_descriptions.txt", 'r') as f:
    act_des = f.readlines()

act_dict = {}
for i in range(len(act_des)//2):
    act_dict[act_des[2*i].strip()] = act_des[2*i + 1].strip()
    
classnames = [act_map[i] for i in range(1, 38)]#sorted(list(act_dict.keys()))
classnames_str = {x:x.replace('_', ' ') for x in classnames}

In [None]:
inv_act_map = {val:key for key, val in act_map.items()}

In [None]:
preprocess

In [None]:
def accuracy(output, target, topk=(1,)):
    pred = output.topk(max(topk), 1, True, True)[1].t()
    correct = pred.eq(target.view(1, -1).expand_as(pred))
    return [float(correct[:k].reshape(-1).float().sum(0, keepdim=True).cpu().numpy()) for k in topk]

In [None]:
def zeroshot_classifier(classnames, act_descriptions):
    with torch.no_grad():
        zeroshot_weights = []
        for classname in tqdm(classnames):
            des_size = len(act_descriptions[classname])
            texts =  [ act_descriptions[classname][x : x+100] for x in range(0, des_size, 100)]#format with class
            # print("\n\n".join(texts), "\n###################################\n")
            texts = clip.tokenize(texts).cuda() #tokenize
            # print(texts.shape, "\n###################################\n")
            class_embeddings = model.encode_text(texts) #embed with text encoder
            class_embeddings /= class_embeddings.norm(dim=-1, keepdim=True)
            class_embedding = class_embeddings.mean(dim=0)
            
            class_embedding /= class_embedding.norm()
            zeroshot_weights.append(class_embedding)
        zeroshot_weights = torch.stack(zeroshot_weights, dim=1).cuda()
    return zeroshot_weights


zeroshot_weights = zeroshot_classifier(classnames, act_dict)

## Visual embeddings

In [None]:
mdata = np.load('data.npy',allow_pickle=True).item()

In [None]:
from MEVA.code.utils.vattr import VAttr, MAttr
from PIL import Image

In [None]:
mattr = MAttr("")
vattr = VAttr(mdata[0]['Video_Path'][5:], mattr = mattr)
count = 0
images_full = []
targets = []
logits_full = []
top1, top3, top5, n = 0., 0., 0., 0.
with torch.no_grad():
    for key, val in mdata.items():
        vattr.set_vidname(val['Video_Path'][5:])
        frame_path = vattr.frame_path
        start_frame = int(val['Start_Frame'])
        end_frame = int(val['End_Frame'])
        mid_frame = (end_frame + start_frame)//2
        bbox = val['BBox']
        for i in range(mid_frame-2, mid_frame+ 3):
            if  not os.path.exists(frame_path): continue
            
            full_path = os.path.join(frame_path, '{0:06}.jpg'.format(i))
            try:
                img =  Image.open(full_path)
            except:
                continue
            img2 = img.crop(tuple(bbox))
            images = preprocess(img2)
            images_full.append(images)
            images = torch.unsqueeze(images, 0)
            target = torch.Tensor([val['Activity_Id'] - 1])
            images = images.cuda()
            target = target.cuda()
            image_features = model.encode_image(images)
            image_features /= image_features.norm(dim=-1, keepdim=True)
            logits = 100. * image_features @ zeroshot_weights
            acc1, acc3, acc5 = accuracy(logits, target, topk=(1, 3, 5))
            top1 += acc1
            top3 += acc3
            top5 += acc5
            n += 1
            logits_full.append(logits)
            targets.append(val['Activity_Id']- 1)
            count+= 1
            print(count, end='\r')
top1 = (top1 / n) * 100
top3 = (top3 / n) * 100
top5 = (top5 / n) * 100 

print(f"Top-1 accuracy: {top1:.2f}")
print(f"Top-3 accuracy: {top3:.2f}")
print(f"Top-5 accuracy: {top5:.2f}")


In [None]:
preds = [np.argmax(x.cpu().numpy()) for x in logits_full]
fig = plt.figure(figsize=(15, 5))
bins = np.arange(1,38)
plt.hist([targets, preds] , bins=37, range=(1,38), label=['gt', 'preds'])
plt.legend(loc='upper right')
# plt.gca().set_xticks(classnames)
_ = plt.xticks(bins+0.5, classnames, rotation = 90)

# Zero-shot prediction

In [None]:
invTrans = transforms.Compose([ transforms.Normalize(mean = [ 0., 0., 0. ],
                                                     std = [1/0.26862954, 1/0.26130258, 1/0.27577711]),
                                transforms.Normalize(mean = [-0.48145466, -0.4578275, -0.40821073],
                                                     std = [ 1., 1., 1. ]),
                               ])



In [None]:
plt.ioff()
with torch.no_grad():
    top1, top5, n = 0., 0., 0.
    for i, (images, target) in enumerate(tqdm(loader)):
        images = images.cuda()
        target = target.cuda()
        
        # predict
        image_features, attention_weights = model.encode_image(images, feat=True)
        image_features /= image_features.norm(dim=-1, keepdim=True)
        logits = 100. * image_features @ zeroshot_weights
        preds = np.argmax(logits.cpu().detach().numpy(), axis=-1)
        visualize(images, attention_weights, i, target, preds)

        # measure accuracy
        acc1, acc5 = accuracy(logits, target, topk=(1, 5))
        top1 += acc1
        top5 += acc5
        n += images.size(0)

top1 = (top1 / n) * 100
top5 = (top5 / n) * 100 

print(f"Top-1 accuracy: {top1:.2f}")
print(f"Top-5 accuracy: {top5:.2f}")
plt.ion()

In [None]:
def visualize(images, attention_weights, batch_id, target, preds):

    img_viz = invTrans(images).cpu().numpy().transpose((0, 2,3, 1))
    attention_weights = attention_weights.detach().cpu().numpy()
    num_images = a.shape[0]

    for idx in range(num_images):
        fig = plt.figure(figsize=(10, 10))
        plt.subplot(2,2, 1)
        plt.imshow(img_viz[idx])
        plt.title(imagenet_classes[target[idx]])

        att_map = attention_weights[idx][:49].reshape(7,7).astype(np.float32)
        plt.subplot(2,2, 2)
        plt.imshow(att_map)
        plt.title(imagenet_classes[preds[idx]])

        att_map_resized = cv2.resize(att_map, dsize=(224, 224), interpolation=cv2.INTER_CUBIC)
        att_map_resized= att_map_resized
        plt.subplot(2,2, 3)
        plt.imshow(att_map_resized)

        frame = img_viz[idx]
        heatmap = att_map_resized #cv2.cvtColor(att_map_resized, cv2.COLOR_GRAY2RGB)
        heatmapshow = None
        heatmapshow = cv2.normalize(heatmap, heatmapshow, alpha=0, beta=255, norm_type=cv2.NORM_MINMAX, dtype=cv2.CV_8U)
        heatmapshow = cv2.applyColorMap(heatmapshow, cv2.COLORMAP_JET)
        heatmapshow = heatmapshow.astype(np.float32)/255
        alpha = 0.4 # set convering image transparency 
        frame = cv2.addWeighted(heatmapshow, alpha, frame, 1-alpha, 0) # overlap background with original image
        plt.subplot(2,2, 4)
        plt.imshow(frame)
        plt.savefig("visuals/{}_{}.png".format(batch_id, idx), bbox_inches='tight')
        plt.close()
