### Set the values in the cell below, then run the remaining cells in order.

In [1]:
# Path to the CSV file of class labels (the first column of each row is read as the class name)
labels_csv_path = '../datasets/home_labels.csv'
# Path to the example video to classify; the name of its parent directory is used as the ground-truth label
video_path = '../datasets/train/crying/_ceBK5pQTrs_000033_000043.mp4'

### Read class labels

In [2]:
import csv
import numpy as np

# Build the two lookup tables between class names and integer ids.
# Ids are assigned in file order, one per *distinct* class name, so that
# class_labels[i] == id2label[i] always holds.
label2id = {}
id2label = {}
# newline='' is the mode the csv module documents for files handed to csv.reader
# (keeps quoted fields containing line breaks intact).
with open(labels_csv_path, mode='r', newline='') as file:
    csv_reader = csv.reader(file)

    for row in csv_reader:
        # Blank lines come back as [] and would make row[0] raise IndexError;
        # rows whose first column is empty carry no class name either.
        if not row or not row[0].strip():
            continue
        class_name = row[0]
        # A repeated name must not consume a new id, otherwise the ids stop
        # lining up with the positions in class_labels.
        if class_name in label2id:
            continue
        class_id = len(label2id)
        label2id[class_name] = class_id
        id2label[class_id] = class_name

# Ordered list of class names; position in the list is the class id.
class_labels = list(label2id.keys())

print(f"{len(class_labels)} Unique classes: {class_labels}.")

70 Unique classes: ['No people is in the room', 'Studying', 'Typing', 'Using computer', 'Making phone calls', 'Play with phone/tablet', 'Playing with pets', 'Feeding pets', 'Reading', 'Writing', 'Setting table', 'Eating food', 'Dining', 'Cooking', 'Sleeping', 'Brushing teeth', 'Showering', 'Playing music', 'TV', 'Cleaning', 'Doing laundry', 'Mopping floor', 'Vacumning', 'Ironing', 'Organizing space', 'Sewing', 'Knitting', 'Decorating', 'Party', 'Chatting', 'Talking', 'Singing', 'Laughing', 'Speaking', 'Dancing', 'Drinking', 'Stretching', 'Meditating', 'Drawing', 'Painting', 'Playing board games', 'Playing video games', 'Taking photos', 'Potluck', 'Working', 'Exercising', 'Walking', 'Running', 'Celebratin', 'Physical altercations', 'Verbal confrontations', 'Using drug', 'Theft or vandalism', 'Fighting', 'Domestic violence', 'Break in', 'Glass breaking', 'Fire accident', 'Fire alarm', 'Unattended cooking', 'Open flame', 'Smoking', 'Gunshot', 'Making noise', 'Falling down', 'Tripping', 'C

### Import libraries 

In [3]:
import torch
import torch.nn as nn
from utils.config import get_config
from utils.logger import create_logger
import numpy as np
from utils.config import get_config
from trainers import vificlip
from datasets.pipeline import *
import torch.nn.functional as F

  from .autonotebook import tqdm as notebook_tqdm


### Setting up configuration, no need to change anything.

In [4]:
# Path of the YAML configuration file. Kept under its own name: `config` is
# bound to the parsed configuration object further down, and parse_option reads
# this global when it is instantiated, so the two must not share a name.
config_path = 'configs/zero_shot/train/k400/16_16_vifi_clip.yaml'
output_folder_name = "outputs"
pretrained_model_path = "ckpts/vifi_clip_10_epochs_k400_full_finetuned.pth"

# Step 1:
# Configuration class
class parse_option():
    """Plain attribute container handed to ``get_config`` in place of parsed CLI arguments.

    The values are read from the module-level settings defined above at
    instantiation time.
    """
    def __init__(self):
        self.config = config_path
        self.output =  output_folder_name   # Name of output folder to store logs and save weights
        self.resume = pretrained_model_path
        # No need to change below args.
        self.only_test = True
        self.opts = None
        self.batch_size = None
        self.pretrained = None
        self.accumulation_steps = None
        self.local_rank = 0
args = parse_option()
# Parsed configuration object used by all following cells.
config = get_config(args)
# logger
logger = create_logger(output_dir=args.output, name=f"{config.MODEL.ARCH}")
logger.info(f"working dir: {config.OUTPUT}")

=> merge config from configs/zero_shot/train/k400/16_16_vifi_clip.yaml
[32m[2024-04-16 18:30:14 ViT-B/16][0m[33m(1413150786.py 23)[0m: INFO working dir: outputs


### Loading ViFi-CLIP and its pretrained weights

In [5]:
# Step 2:
# Build the ViFi-CLIP model for the class names read from the label CSV and
# convert it with .float() in the same expression. The checkpoint weights are
# loaded in the following cells.
model = vificlip.returnCLIP(
    config,
    logger=logger,
    class_names=class_labels,
).float()


[32m[2024-04-16 18:30:14 ViT-B/16][0m[33m(vificlip.py 203)[0m: INFO Loading CLIP (backbone: ViT-B/16)
[32m[2024-04-16 18:30:17 ViT-B/16][0m[33m(vificlip.py 206)[0m: INFO Building ViFi-CLIP CLIP
[32m[2024-04-16 18:30:17 ViT-B/16][0m[33m(vificlip.py 223)[0m: INFO Turning on gradients for COMPLETE ViFi-CLIP model
[32m[2024-04-16 18:30:17 ViT-B/16][0m[33m(vificlip.py 246)[0m: INFO Parameters to be updated: {'image_encoder.transformer.resblocks.8.attn.in_proj_bias', 'image_encoder.transformer.resblocks.4.ln_2.weight', 'text_encoder.transformer.resblocks.3.ln_2.bias', 'image_encoder.transformer.resblocks.10.mlp.c_proj.weight', 'image_encoder.transformer.resblocks.11.ln_1.bias', 'image_encoder.transformer.resblocks.2.mlp.c_proj.bias', 'text_encoder.transformer.resblocks.8.attn.out_proj.bias', 'text_encoder.transformer.resblocks.8.attn.out_proj.weight', 'image_encoder.transformer.resblocks.2.ln_1.bias', 'image_encoder.transformer.resblocks.6.attn.in_proj_weight', 'text_encoder.

In [6]:
logger.info(f"==============> Resuming from {config.MODEL.RESUME}....................")
# NOTE(review): torch.load unpickles the file, which can execute arbitrary code.
# Only point config.MODEL.RESUME at checkpoints from a trusted source.
checkpoint = torch.load(config.MODEL.RESUME, map_location='cpu')
load_state_dict = checkpoint['model']

# Keys carry a leading `module.` when the checkpoint was saved from a wrapped
# (DataParallel / DistributedDataParallel) model.
ddp_prefix = "module."
# now remove the unwanted keys (prompt-learner entries that are not restored
# from the checkpoint), whether or not they carry the `module.` prefix:
unwanted_keys = (
    "prompt_learner.token_prefix",
    "prompt_learner.token_suffix",
    "prompt_learner.complete_text_embeddings",
)
for unwanted in unwanted_keys:
    load_state_dict.pop(ddp_prefix + unwanted, None)
    load_state_dict.pop(unwanted, None)

# create new OrderedDict that does not contain `module.`
from collections import OrderedDict
new_state_dict = OrderedDict()
for k, v in load_state_dict.items():
    # Strip the prefix only when it is really there; slicing unconditionally
    # would corrupt the keys of a checkpoint saved without the wrapper.
    name = k[len(ddp_prefix):] if k.startswith(ddp_prefix) else k
    new_state_dict[name] = v



In [7]:
# Load the remapped checkpoint weights into the model.
# strict=False lets loading continue when some keys are missing or unexpected;
# the returned report of incompatible keys is logged so any mismatch stays visible.
msg = model.load_state_dict(new_state_dict, strict=False)
logger.info(f"resume model: {msg}")

[32m[2024-04-16 18:30:18 ViT-B/16][0m[33m(3852643250.py 3)[0m: INFO resume model: _IncompatibleKeys(missing_keys=['prompt_learner.complete_text_embeddings'], unexpected_keys=[])


### Preprocessing input video 

In [8]:
# Step 3:
# Preprocessing for video
# Per-channel normalisation statistics passed to the Normalize step.
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
# Resize target used before the single centre crop (256/224 of the input size).
scale_resize = int(256 / 224 * config.DATA.INPUT_SIZE)

# Frame sampling: the multi-view variant is used when more than one test clip
# is requested, otherwise the plain test-mode sampler.
if config.TEST.NUM_CLIP > 1:
    sample_step = dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=config.DATA.NUM_FRAMES, multiview=config.TEST.NUM_CLIP)
else:
    sample_step = dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=config.DATA.NUM_FRAMES, test_mode=True)

# Spatial cropping: three crops at the input size, or one centre crop after
# resizing to scale_resize.
if config.TEST.NUM_CROP == 3:
    resize_step = dict(type='Resize', scale=(-1, config.DATA.INPUT_SIZE))
    crop_step = dict(type='ThreeCrop', crop_size=config.DATA.INPUT_SIZE)
else:
    resize_step = dict(type='Resize', scale=(-1, scale_resize))
    crop_step = dict(type='CenterCrop', crop_size=config.DATA.INPUT_SIZE)

# Ordered list of pipeline step descriptions, assembled from the choices above.
val_pipeline = [
    dict(type='DecordInit'),
    sample_step,
    dict(type='DecordDecode'),
    resize_step,
    crop_step,
    dict(type='Normalize', **img_norm_cfg),
    dict(type='FormatShape', input_format='NCHW'),
    dict(type='Collect', keys=['imgs'], meta_keys=[]),
    dict(type='ToTensor', keys=['imgs']),
]
pipeline = Compose(val_pipeline)

In [9]:
# Input record handed to the preprocessing pipeline. The ground-truth label is
# the name of the directory that contains the video file.
dict_file = dict(
    filename=video_path,
    tar=False,
    modality='RGB',
    start_index=0,
    gt_label=video_path.split('/')[-2],
)

### ViFi-CLIP inference on the given video

In [10]:
# Decode and preprocess the example video, then add a leading batch dimension.
video = pipeline(dict_file)
video_tensor = video['imgs'].unsqueeze(0).float()

# Inference through ViFi-CLIP
# Switch to evaluation mode so that any train-only layer behaviour is disabled.
model.eval()
with torch.no_grad():
    similarities = model(video_tensor)

# Turn the per-class similarity scores into probabilities.
similarity = F.softmax(similarities, dim=1)
# topk raises if k exceeds the number of classes, so cap it for short label lists.
top_k = min(5, similarity[0].shape[-1])
values, indices = similarity[0].topk(top_k)
pred_index = indices[0]

# Print the results
print("Ground Truth: {}".format(video_path.split('/')[-2]))
print("Top predictions:\n")
for value, index in zip(values, indices):
    # .item() converts the 0-dim index tensor to a plain int for list indexing.
    print(f"{class_labels[index.item()]:>16s}: {100 * value.item():.2f}%")

objc[5499]: Class AVFFrameReceiver is implemented in both /Users/felicialuo/opt/anaconda3/envs/vclap/lib/python3.11/site-packages/av/.dylibs/libavdevice.60.1.100.dylib (0x13fa320f8) and /Users/felicialuo/opt/anaconda3/envs/vclap/lib/python3.11/site-packages/decord/.dylibs/libavdevice.58.5.100.dylib (0x155a2d010). One of the two will be used. Which one is undefined.
objc[5499]: Class AVFAudioReceiver is implemented in both /Users/felicialuo/opt/anaconda3/envs/vclap/lib/python3.11/site-packages/av/.dylibs/libavdevice.60.1.100.dylib (0x13fa32148) and /Users/felicialuo/opt/anaconda3/envs/vclap/lib/python3.11/site-packages/decord/.dylibs/libavdevice.58.5.100.dylib (0x155a2d060). One of the two will be used. Which one is undefined.


: 