## Code for performing inference with ViFi-CLIP on custom videos

### Please set the corresponding values in the cell below. Afterwards, just run the remaining cells in order to perform inference with the ViFi-CLIP model.

In [1]:
### Set values here ###

# YAML file describing the model / data configuration
config = "configs/zero_shot/train/k400/16_16_vifi_clip.yaml"
# Name of the output folder (passed on as the `output` argument further down)
output_folder_name = 'outputs'
# Checkpoint to resume from (passed on as the `resume` argument further down)
pretrained_model_path = 'ckpts/vifi_clip_10_epochs_k400_full_finetuned.pth'
# CSV file listing the class labels (first row is treated as a header)
labels_csv_path = "../datasets/kinetics_400_labels.csv"
# Video to run inference on
video_path = "../datasets/Kinetics400/test/crying/dNiFeUrLBZM_000009_000019.mp4"

### Read class labels

In [2]:
import csv

# Build both directions of the class-id <-> class-name mapping from the label CSV.
# Ids are kept exactly as read from the file (i.e. as strings).
label2id = {}
id2label = {}
# newline='' is what the csv module asks for so that it can handle line endings
# (and newlines inside quoted fields) itself; the explicit encoding avoids
# depending on the platform default.
with open(labels_csv_path, mode='r', newline='', encoding='utf-8') as file:
    csv_reader = csv.reader(file)
    # Skip the header row; the default keeps an empty file from raising StopIteration.
    next(csv_reader, None)

    for row in csv_reader:
        if not row:
            # Blank lines (e.g. a trailing newline at the end of the file) carry no label.
            continue
        class_id, class_name = row
        label2id[class_name] = class_id
        id2label[class_id] = class_name

# Class names in file order (dicts preserve insertion order).
class_names = list(label2id.keys())

print(f"{len(class_names)} Unique classes: {class_names}.")

400 Unique classes: ['abseiling', 'air drumming', 'answering questions', 'applauding', 'applying cream', 'archery', 'arm wrestling', 'arranging flowers', 'assembling computer', 'auctioning', 'baby waking up', 'baking cookies', 'balloon blowing', 'bandaging', 'barbequing', 'bartending', 'beatboxing', 'bee keeping', 'belly dancing', 'bench pressing', 'bending back', 'bending metal', 'biking through snow', 'blasting sand', 'blowing glass', 'blowing leaves', 'blowing nose', 'blowing out candles', 'bobsledding', 'bookbinding', 'bouncing on trampoline', 'bowling', 'braiding hair', 'breading or breadcrumbing', 'breakdancing', 'brush painting', 'brushing hair', 'brushing teeth', 'building cabinet', 'building shed', 'bungee jumping', 'busking', 'canoeing or kayaking', 'capoeira', 'carrying baby', 'cartwheeling', 'carving pumpkin', 'catching fish', 'catching or throwing baseball', 'catching or throwing frisbee', 'catching or throwing softball', 'celebrating', 'changing oil', 'changing wheel', 'c

### Import libraries 

In [3]:
import torch
import torch.nn as nn
from utils.config import get_config
from utils.logger import create_logger
import time
import numpy as np
from utils.config import get_config
from trainers import vificlip
from datasets.pipeline import *

  from .autonotebook import tqdm as notebook_tqdm


### Setting up configuration, no need to change anything.

In [4]:
# Step 1:
# Configuration class
# `config` holds the YAML path when this cell first runs, but it is rebound to the
# parsed configuration object at the bottom of the cell. Remember the path while
# it is still a string so that re-running this cell keeps passing the path (and
# not the already-parsed object) to get_config.
if isinstance(config, str):
    config_file = config


class parse_option():
    """Attribute container holding the arguments handed to `get_config`."""

    def __init__(self):
        self.config = config_file
        self.output =  output_folder_name   # Name of output folder to store logs and save weights
        self.resume = pretrained_model_path
        # No need to change below args.
        self.only_test = True
        self.opts = None
        self.batch_size = None
        self.pretrained = None
        self.accumulation_steps = None
        self.local_rank = 0


args = parse_option()
config = get_config(args)
# logger
logger = create_logger(output_dir=args.output, name=f"{config.MODEL.ARCH}")
logger.info(f"working dir: {config.OUTPUT}")

=> merge config from configs/zero_shot/train/k400/16_16_vifi_clip.yaml
[32m[2024-03-12 17:52:33 ViT-B/16][0m[33m(588939814.py 19)[0m: INFO working dir: outputs


### Loading ViFi-CLIP and its pretrained weights

In [5]:
# Step 2:
# Build the ViFi-CLIP model for the class names read from the label CSV
# (the checkpoint weights are loaded in the following cells).
model = vificlip.returnCLIP(config, logger=logger, class_names=class_names)
# Cast the parameters to float32 ...
model = model.float()
# ... and move the model to the GPU.
model = model.cuda()


[32m[2024-03-12 17:52:33 ViT-B/16][0m[33m(vificlip.py 203)[0m: INFO Loading CLIP (backbone: ViT-B/16)
[32m[2024-03-12 17:52:34 ViT-B/16][0m[33m(vificlip.py 206)[0m: INFO Building ViFi-CLIP CLIP
[32m[2024-03-12 17:52:34 ViT-B/16][0m[33m(vificlip.py 223)[0m: INFO Turning on gradients for COMPLETE ViFi-CLIP model
[32m[2024-03-12 17:52:34 ViT-B/16][0m[33m(vificlip.py 246)[0m: INFO Parameters to be updated: {'text_encoder.transformer.resblocks.4.mlp.c_fc.weight', 'text_encoder.transformer.resblocks.1.attn.out_proj.weight', 'text_encoder.transformer.resblocks.6.mlp.c_fc.weight', 'image_encoder.transformer.resblocks.1.mlp.c_fc.bias', 'text_encoder.transformer.resblocks.9.ln_2.weight', 'text_encoder.transformer.resblocks.0.attn.out_proj.bias', 'image_encoder.transformer.resblocks.0.ln_1.bias', 'text_encoder.transformer.resblocks.2.ln_2.weight', 'image_encoder.transformer.resblocks.2.ln_2.weight', 'image_encoder.transformer.resblocks.7.mlp.c_proj.bias', 'image_encoder.transforme

In [6]:
logger.info(f"==============> Resuming from {config.MODEL.RESUME}....................")
# NOTE(review): torch.load unpickles the file, which can execute arbitrary code.
# Only load checkpoints that come from a trusted source.
checkpoint = torch.load(config.MODEL.RESUME, map_location='cpu')
load_state_dict = checkpoint['model']

from collections import OrderedDict

# Prefix carried by the checkpoint keys (presumably added by a DataParallel /
# DistributedDataParallel wrapper at training time — TODO confirm).
_MODULE_PREFIX = "module."
# Entries that must not be restored from the checkpoint, named without the prefix.
_UNWANTED_KEYS = {
    "prompt_learner.token_prefix",
    "prompt_learner.token_suffix",
    "prompt_learner.complete_text_embeddings",
}

# Create a new OrderedDict whose keys do not contain `module.` and that leaves out
# the unwanted entries.
new_state_dict = OrderedDict()
for k, v in load_state_dict.items():
    # Strip the prefix only when it is really there, so a checkpoint saved from an
    # unwrapped model keeps its key names intact instead of losing 7 characters.
    name = k[len(_MODULE_PREFIX):] if k.startswith(_MODULE_PREFIX) else k
    if name in _UNWANTED_KEYS:
        continue
    new_state_dict[name] = v



In [7]:
# Copy the cleaned checkpoint entries into the model. strict=False means keys that
# are missing or unexpected are reported in the returned value instead of raising.
msg = model.load_state_dict(state_dict=new_state_dict, strict=False)
# Log the report so any missing / unexpected keys are visible.
logger.info(f"resume model: {msg}")

[32m[2024-03-12 17:52:35 ViT-B/16][0m[33m(3852643250.py 3)[0m: INFO resume model: _IncompatibleKeys(missing_keys=['prompt_learner.complete_text_embeddings'], unexpected_keys=[])


### Preprocessing input video 

In [8]:
# Step 3:
# Build the preprocessing pipeline that turns a video file into the model input
img_norm_cfg = {
    'mean': [123.675, 116.28, 103.53],
    'std': [58.395, 57.12, 57.375],
    'to_bgr': False,
}
scale_resize = int(256 / 224 * config.DATA.INPUT_SIZE)

# Test-time options from the config decide which variant of a step is used.
multi_clip = config.TEST.NUM_CLIP > 1
three_crop = config.TEST.NUM_CROP == 3

val_pipeline = [
    {'type': 'DecordInit'},
    # Frame sampling: with several clips the step receives `multiview` in place
    # of `test_mode`.
    (
        {'type': 'SampleFrames', 'clip_len': 1, 'frame_interval': 1,
         'num_clips': config.DATA.NUM_FRAMES, 'multiview': config.TEST.NUM_CLIP}
        if multi_clip else
        {'type': 'SampleFrames', 'clip_len': 1, 'frame_interval': 1,
         'num_clips': config.DATA.NUM_FRAMES, 'test_mode': True}
    ),
    {'type': 'DecordDecode'},
    # Three-crop testing resizes straight to the input size; otherwise resize to
    # the 256/224-scaled size before the centre crop.
    {'type': 'Resize',
     'scale': (-1, config.DATA.INPUT_SIZE if three_crop else scale_resize)},
    {'type': 'ThreeCrop' if three_crop else 'CenterCrop',
     'crop_size': config.DATA.INPUT_SIZE},
    {'type': 'Normalize', **img_norm_cfg},
    {'type': 'FormatShape', 'input_format': 'NCHW'},
    {'type': 'Collect', 'keys': ['imgs'], 'meta_keys': []},
    {'type': 'ToTensor', 'keys': ['imgs']},
]
pipeline = Compose(val_pipeline)

In [9]:
# Record describing the input video; this is what gets passed to the preprocessing pipeline.
dict_file = dict(
    filename=video_path,
    tar=False,
    modality='RGB',
    start_index=0,
)

### ViFi-CLIP inference with given video

In [10]:
# Run the preprocessing pipeline on the video record.
video = pipeline(dict_file)
# Add a leading dimension (presumably the batch axis — TODO confirm), move the
# tensor to the GPU and cast it to float32.
video_tensor = video['imgs'].unsqueeze(0).cuda().float()
# Inference through ViFi-CLIP
# Switch to evaluation mode first: a freshly built nn.Module is in training mode,
# which changes the behaviour of layers such as dropout.
model.eval()
# No gradients are needed for inference; autocast enables mixed precision on CUDA.
with torch.no_grad(), torch.cuda.amp.autocast():
    logits = model(video_tensor)
# Index of the highest-scoring entry along the last dimension.
pred_index = logits.argmax(-1)

In [11]:
# Uncomment to inspect the raw scores:
# print(f'logits: {logits}')
# Look up the class name that belongs to the predicted index and report it.
predicted_label = class_names[pred_index]
print(f'predicted action category is : {predicted_label}')

predicted action category is : crying
