## Code for performing evaluation with ViFi-CLIP on custom datasets

### Set the values in the two cells below, then run the remaining cells in order to perform inference with the ViFi-CLIP model.

In [1]:
import pathlib
# Path of the class-label CSV; the first column of every row is read as a class name
labels_csv_path = '../datasets/home_labels.csv'
# Dataset root folder; files are discovered as <root>/train/<class folder>/<file>
dataset_root_path = pathlib.Path('../datasets')
# Path of the CSV file that the inference cell writes its per-file results to
output_csv_path = '../outputs/vclip_results.csv'

In [2]:
### Set values here ###
# YAML configuration file handed to get_config (relative path)
config = 'configs/zero_shot/train/k400/16_16_vifi_clip.yaml'
# Output folder name; it is passed to the logger as its output directory
output_folder_name = "outputs"
# Checkpoint file to resume from (stored as args.resume in the configuration cell)
pretrained_model_path = "ckpts/vifi_clip_10_epochs_k400_full_finetuned.pth"

### Import libraries 

In [3]:
import torch
import torch.nn as nn
from utils.config import get_config
from utils.logger import create_logger
import time
import numpy as np
from utils.config import get_config
from trainers import vificlip
from datasets.pipeline import *

### Setting up configuration, no need to change anything.

In [4]:
# Step 1:
# Configuration class
# `config` starts out as the YAML path (set in the values cell) and is rebound
# to the parsed configuration object at the end of this cell. Remember the path
# under its own name so that re-running this cell does not feed the parsed
# object back into get_config.
if isinstance(config, (str, pathlib.PurePath)):
    config_path = config


class parse_option():
    """Plain attribute container that is passed to get_config.

    Presumably mirrors the command-line options of the training script;
    only `config`, `output` and `resume` are meant to be customised.
    """
    def __init__(self):
        self.config = config_path           # YAML file to merge
        self.output =  output_folder_name   # Name of output folder to store logs and save weights
        self.resume = pretrained_model_path
        # No need to change below args.
        self.only_test = True
        self.opts = None
        self.batch_size = None
        self.pretrained = None
        self.accumulation_steps = None
        self.local_rank = 0
args = parse_option()
# From here on `config` is the parsed configuration object, not the YAML path.
config = get_config(args)
# logger
logger = create_logger(output_dir=args.output, name=f"{config.MODEL.ARCH}")
logger.info(f"working dir: {config.OUTPUT}")

=> merge config from configs/zero_shot/train/k400/16_16_vifi_clip.yaml
[32m[2024-04-16 15:52:18 ViT-B/16][0m[33m(588939814.py 19)[0m: INFO working dir: outputs


### Read Class Labels

In [5]:
import csv

# label2id: class name -> integer id; id2label: the inverse mapping.
# Ids are assigned consecutively in file order, so class_labels[id] is always
# the name that belongs to that id.
label2id = {}
id2label = {}
# newline='' is what the csv module requires so that quoted fields containing
# line breaks are parsed correctly.
with open(labels_csv_path, mode='r', newline='') as file:
    csv_reader = csv.reader(file)

    for row in csv_reader:
        # Blank lines come back as [] and would otherwise raise IndexError.
        if not row or not row[0].strip():
            continue
        class_name = row[0].strip()
        # A repeated name must not consume an id, otherwise class_labels
        # (built from the dict keys) would no longer line up with the ids.
        if class_name in label2id:
            continue
        class_id = len(label2id)
        label2id[class_name] = class_id
        id2label[class_id] = class_name

class_labels = list(label2id.keys())

print(f"{len(class_labels)} Unique classes: {class_labels}.")

70 Unique classes: ['No people is in the room', 'Studying', 'Typing', 'Using computer', 'Making phone calls', 'Play with phone/tablet', 'Playing with pets', 'Sitting down', 'Reading', 'Writing', 'Setting table', 'Eating food', 'Dining', 'Cooking', 'Sleeping', 'Laying down', 'Picking up objects', 'Brushing teeth', 'Showering', 'Playing music', 'TV', 'Cleaning', 'Doing laundry', 'Mopping floor', 'Vacumning', 'Organizing space', 'Sewing', 'Knitting', 'Decorating', 'Party', 'Chatting', 'Talking', 'Singing', 'Laughing', 'Speaking', 'Dancing', 'Drinking', 'Meditating', 'Drawing', 'Painting', 'Playing board games', 'Playing video games', 'Taking photos', 'Potluck', 'Working', 'Exercising', 'Walking', 'Running', 'Celebrating', 'Physical altercations', 'Verbal confrontations', 'Using drug', 'Theft or vandalism', 'Fighting', 'Domestic violence', 'Break in', 'Glass breaking', 'Unattended cooking', 'Smoke or Fire or Open flame', 'Smoking', 'Gunshot', 'Making noise', 'Falling down', 'Tripping', 'Cr

### Loading ViFi-CLIP and its pretrained weights

In [6]:
# Step 2:
# Build the ViFi-CLIP model for the class names read from the labels CSV.
# The fine-tuned checkpoint itself is loaded in the cells that follow.
model = vificlip.returnCLIP(config,
                            logger=logger,
                            class_names=class_labels,)
# Cast to float32 and move to the GPU; this requires a CUDA device.
model = model.float().cuda()  # changing to cuda here


[32m[2024-04-16 15:52:18 ViT-B/16][0m[33m(vificlip.py 204)[0m: INFO Loading CLIP (backbone: ViT-B/16)
[32m[2024-04-16 15:52:20 ViT-B/16][0m[33m(vificlip.py 207)[0m: INFO Building ViFi-CLIP CLIP
[32m[2024-04-16 15:52:20 ViT-B/16][0m[33m(vificlip.py 224)[0m: INFO Turning on gradients for COMPLETE ViFi-CLIP model
[32m[2024-04-16 15:52:20 ViT-B/16][0m[33m(vificlip.py 247)[0m: INFO Parameters to be updated: {'image_encoder.transformer.resblocks.5.attn.in_proj_weight', 'image_encoder.transformer.resblocks.2.mlp.c_proj.weight', 'image_encoder.transformer.resblocks.5.attn.out_proj.bias', 'text_encoder.transformer.resblocks.5.mlp.c_proj.weight', 'image_encoder.transformer.resblocks.11.attn.out_proj.bias', 'image_encoder.transformer.resblocks.2.attn.in_proj_weight', 'image_encoder.transformer.resblocks.3.attn.in_proj_bias', 'image_encoder.transformer.resblocks.9.attn.out_proj.bias', 'text_encoder.transformer.resblocks.3.mlp.c_proj.bias', 'text_encoder.transformer.resblocks.6.mlp.

In [7]:
logger.info(f"==============> Resuming form {config.MODEL.RESUME}....................")
# NOTE(review): torch.load unpickles the file, which can execute arbitrary
# code; only load checkpoints that come from a trusted source.
checkpoint = torch.load(config.MODEL.RESUME, map_location='cpu')
load_state_dict = checkpoint['model']

from collections import OrderedDict

# Prefix carried by the checkpoint keys; it is removed so the names match the
# bare model built above.
ddp_prefix = "module."
# Entries that must not be loaded (names given without the prefix).
# Presumably they depend on the class names the checkpoint was trained with -
# TODO confirm against trainers/vificlip.py.
unwanted_keys = {
    "prompt_learner.token_prefix",
    "prompt_learner.token_suffix",
    "prompt_learner.complete_text_embeddings",
}

# create new OrderedDict that does not contain `module.`
new_state_dict = OrderedDict()
for k, v in load_state_dict.items():
    # Strip the prefix only when it is really there; slicing unconditionally
    # would mangle the keys of a checkpoint saved without it.
    name = k[len(ddp_prefix):] if k.startswith(ddp_prefix) else k
    if name in unwanted_keys:
        continue
    new_state_dict[name] = v



In [8]:
# load params
# strict=False: keys that are missing from or unexpected in the checkpoint are
# reported in `msg` (logged below) instead of raising an error.
msg = model.load_state_dict(new_state_dict, strict=False)
logger.info(f"resume model: {msg}")

[32m[2024-04-16 15:52:20 ViT-B/16][0m[33m(3852643250.py 3)[0m: INFO resume model: _IncompatibleKeys(missing_keys=['prompt_learner.complete_text_embeddings'], unexpected_keys=[])


### Preprocessing input video 

In [9]:
# Step 3:
# Preprocessing for video
# Per-channel normalisation constants; presumably the usual ImageNet RGB
# statistics on a 0-255 scale - TODO confirm.
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
# Resize target that keeps the 256:224 ratio relative to the configured input size.
scale_resize = int(256 / 224 * config.DATA.INPUT_SIZE)
# Ordered list of transform specs. The overrides further down address entries
# by position, so keep the order in sync with those indices.
val_pipeline = [
    dict(type='DecordInit'),
    dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=config.DATA.NUM_FRAMES, test_mode=True),
    dict(type='DecordDecode'),
    dict(type='Resize', scale=(-1, scale_resize)),
    dict(type='CenterCrop', crop_size=config.DATA.INPUT_SIZE),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='FormatShape', input_format='NCHW'),
    dict(type='Collect', keys=['imgs'], meta_keys=[]),
    dict(type='ToTensor', keys=['imgs'])
]
# Three-crop testing: replace the Resize (index 3) and CenterCrop (index 4) entries.
if config.TEST.NUM_CROP == 3:
    val_pipeline[3] = dict(type='Resize', scale=(-1, config.DATA.INPUT_SIZE))
    val_pipeline[4] = dict(type='ThreeCrop', crop_size=config.DATA.INPUT_SIZE)
# Multi-clip testing: replace the SampleFrames entry (index 1).
# NOTE(review): this replacement does not pass test_mode=True, unlike the default entry - confirm that is intended.
if config.TEST.NUM_CLIP > 1:
    val_pipeline[1] = dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=config.DATA.NUM_FRAMES, multiview=config.TEST.NUM_CLIP)
# Compose comes from the star import of datasets.pipeline.
pipeline = Compose(val_pipeline)

### Construct dataset

In [10]:
# Every entry exactly two levels below <root>/train, i.e. train/<class folder>/<file>.
# No extension filter is applied here, so non-video files are included as well.
train_paths = list(dataset_root_path.glob("train/*/*"))
# test_paths = list(dataset_root_path.glob("test/*/*.mp4"))


def construct_dataset(file_paths, label_map=None):
    """Build one sample record per file, labelled by its parent folder name.

    Args:
        file_paths: iterable of paths (str or pathlib path objects) laid out
            as .../<class name>/<file>.
        label_map: optional mapping from class name to integer id. Defaults
            to the notebook-level `label2id`.

    Returns:
        A list of dicts with the keys 'filename', 'tar', 'modality',
        'start_index' and 'label'; 'filename' is the path exactly as passed in.

    Raises:
        KeyError: if a parent folder name is not a key of the label mapping.
    """
    if label_map is None:
        label_map = label2id
    dataset = []
    for path in file_paths:
        # parent.name is separator-agnostic; splitting the string on "\\"
        # only worked with Windows-style paths.
        label = pathlib.PurePath(path).parent.name
        dataset.append({'filename': path, 'tar': False, 'modality': 'RGB', 'start_index': 0, 'label': label_map[label]})
    return dataset

train_dataset = construct_dataset(train_paths)
# Sanity check: number of records and the first record (raises IndexError if no files were found).
print('train_dataset', len(train_dataset), train_dataset[0])
# test_dataset = construct_dataset(test_paths)
# print('test_dataset', len(test_dataset), test_dataset[0])

train_dataset 424 {'filename': WindowsPath('../datasets/train/Crying/9q8mf6GUiiE_000009_000019.mp4'), 'tar': False, 'modality': 'RGB', 'start_index': 0, 'label': 64}


### ViFi-CLIP inference on the dataset videos

In [11]:
from tqdm import tqdm
from collections import defaultdict
import csv
import torch.nn.functional as F

def inference(dataset):
    """Run ViFi-CLIP over `dataset` and write the top-5 predictions to a CSV.

    Relies on the notebook-level `model`, `pipeline` and `output_csv_path`.
    The model is switched to eval mode as a side effect.

    Args:
        dataset: list of sample dicts providing at least 'filename' and
            'label' (integer class id), e.g. the output of construct_dataset.

    Returns:
        (correct, total): two defaultdict(int) keyed by ground-truth class id.
        `total` counts every sample, including files that are skipped because
        they are not .mp4; `correct` counts samples whose top-1 prediction
        equals the ground truth.
    """
    correct = defaultdict(int)
    total = defaultdict(int)
    top_k = 5  # the CSV header below has exactly five prediction/confidence pairs

    # Inference only: make sure train-mode behaviour (e.g. dropout, if the
    # model has any) is switched off.
    model.eval()

    # Create the output folder up front so open() does not fail when it is missing.
    pathlib.Path(output_csv_path).parent.mkdir(parents=True, exist_ok=True)

    with open(output_csv_path, mode='w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(["file path", "ground truth", "top1 pred", "confidence", "top2 pred", "confidence", "top3 pred", "confidence", "top4 pred", "confidence", "top5 pred", "confidence"])
        for dict_file in tqdm(dataset, total=len(dataset)):
            label = int(dict_file['label'])
            out_row = [dict_file['filename'], label]
            total[label] += 1

            if not str(dict_file['filename']).endswith(".mp4"):
                # Not an .mp4 video: nothing to run the model on, so write
                # -1 / 0 placeholders for every prediction column.
                out_row.extend([-1, 0] * top_k)
                writer.writerow(out_row)
                continue

            # Inference through ViFi-CLIP
            video = pipeline(dict_file)
            video_tensor = video['imgs'].unsqueeze(0).cuda().float()

            with torch.no_grad():
                with torch.cuda.amp.autocast():
                    similarities, _, _, _ = model(video_tensor)
                # Softmax in float32: under autocast the logits may be half precision.
                similarity = F.softmax(similarities.float(), dim=1)

            # Never ask for more predictions than there are classes.
            k = min(top_k, similarity.shape[1])
            values, indices = similarity[0].topk(k)
            # Move to plain Python numbers once instead of calling .item() per element.
            values = values.tolist()
            indices = indices.tolist()
            pred_index = indices[0]

            # Output the results
            for value, index in zip(values, indices):
                out_row.append(index)
                out_row.append(round(value * 100, 4))
            # Keep the row width constant when fewer than top_k classes exist.
            out_row.extend([-1, 0] * (top_k - k))
            # No print() inside the loop: it floods the cell output and
            # breaks the tqdm progress bar into one line per sample.
            writer.writerow(out_row)

            if label == pred_index:
                correct[label] += 1

    return correct, total
    

In [12]:
# Run inference over the whole training split; this also (re)writes the results CSV.
correct, total = inference(train_dataset)

# Per-class summary: top-1 hits / number of samples, in the order the classes were first seen.
for label in total:
    print(f"{class_labels[label]}: {correct[label]} / {total[label]}")

  0%|          | 1/424 [00:07<49:49,  7.07s/it]




  1%|          | 3/424 [00:07<13:04,  1.86s/it]





  1%|          | 4/424 [00:07<08:12,  1.17s/it]




  1%|▏         | 6/424 [00:08<04:55,  1.41it/s]





  2%|▏         | 8/424 [00:09<02:56,  2.35it/s]





  2%|▏         | 9/424 [00:09<02:32,  2.72it/s]




  2%|▏         | 10/424 [00:10<03:14,  2.13it/s]




  3%|▎         | 13/424 [00:10<01:48,  3.79it/s]






  3%|▎         | 14/424 [00:11<02:25,  2.82it/s]




  4%|▍         | 16/424 [00:11<01:52,  3.63it/s]





  4%|▍         | 17/424 [00:11<01:52,  3.61it/s]




  4%|▍         | 19/424 [00:12<01:44,  3.87it/s]





  5%|▍         | 21/424 [00:12<01:22,  4.87it/s]





  5%|▌         | 23/424 [00:13<01:30,  4.42it/s]





  6%|▌         | 25/424 [00:13<01:11,  5.62it/s]





  6%|▌         | 26/424 [00:13<01:04,  6.21it/s]




  6%|▋         | 27/424 [00:13<01:13,  5.41it/s]




  7%|▋         | 28/424 [00:14<02:04,  3.19it/s]




  7%|▋         | 29/424 [00:14<02:06,  3.13it/s]




  7%|▋         | 30/424 [00:15<02:14,  2.93it/s]




  8%|▊         | 32/424 [00:15<02:15,  2.89it/s]





  8%|▊         | 33/424 [00:16<02:05,  3.11it/s]




  8%|▊         | 35/424 [00:16<01:45,  3.68it/s]





  9%|▊         | 37/424 [00:17<01:26,  4.45it/s]





  9%|▉         | 38/424 [00:17<01:20,  4.78it/s]




  9%|▉         | 39/424 [00:17<01:28,  4.35it/s]




  9%|▉         | 40/424 [00:18<02:49,  2.26it/s]




 10%|▉         | 42/424 [00:18<01:55,  3.30it/s]





 10%|█         | 44/424 [00:19<02:07,  2.98it/s]





 11%|█         | 46/424 [00:20<01:36,  3.90it/s]





 11%|█         | 47/424 [00:20<02:02,  3.07it/s]




 12%|█▏        | 49/424 [00:21<02:12,  2.83it/s]





 12%|█▏        | 51/424 [00:21<01:35,  3.91it/s]





 12%|█▏        | 52/424 [00:21<01:19,  4.69it/s]




 13%|█▎        | 54/424 [00:22<01:11,  5.16it/s]





 13%|█▎        | 56/424 [00:22<00:56,  6.47it/s]





 14%|█▎        | 58/424 [00:22<00:50,  7.29it/s]





 14%|█▍        | 60/424 [00:23<00:58,  6.18it/s]





 15%|█▍        | 63/424 [00:23<00:45,  7.89it/s]






 15%|█▌        | 65/424 [00:23<00:40,  8.76it/s]





 16%|█▌        | 67/424 [00:23<00:50,  7.09it/s]





 16%|█▋        | 69/424 [00:24<00:43,  8.21it/s]





 17%|█▋        | 72/424 [00:24<00:38,  9.26it/s]






 18%|█▊        | 75/424 [00:24<00:42,  8.19it/s]






 18%|█▊        | 77/424 [00:25<00:39,  8.78it/s]





 19%|█▊        | 79/424 [00:25<00:37,  9.12it/s]





 19%|█▉        | 80/424 [00:25<00:46,  7.42it/s]




 30%|██▉       | 127/424 [00:26<00:08, 35.26it/s]




 31%|███       | 130/424 [00:27<00:15, 19.31it/s]




 44%|████▎     | 185/424 [00:28<00:06, 38.12it/s]






 45%|████▍     | 189/424 [00:29<00:09, 24.52it/s]






 45%|████▌     | 192/424 [00:30<00:13, 16.93it/s]






 46%|████▌     | 194/424 [00:30<00:17, 13.07it/s]





 46%|████▌     | 196/424 [00:31<00:18, 12.18it/s]





 47%|████▋     | 198/424 [00:31<00:20, 11.18it/s]




 47%|████▋     | 200/424 [00:32<00:24,  8.96it/s]





 48%|████▊     | 202/424 [00:32<00:27,  8.04it/s]





 48%|████▊     | 203/424 [00:32<00:28,  7.76it/s]




 48%|████▊     | 205/424 [00:32<00:34,  6.29it/s]





 49%|████▉     | 207/424 [00:33<00:36,  5.88it/s]





 49%|████▉     | 209/424 [00:33<00:43,  4.99it/s]





 50%|████▉     | 211/424 [00:34<00:39,  5.39it/s]





 50%|█████     | 212/424 [00:34<00:40,  5.27it/s]




 50%|█████     | 214/424 [00:34<00:38,  5.43it/s]





 51%|█████     | 216/424 [00:35<00:36,  5.68it/s]





 51%|█████▏    | 218/424 [00:35<00:37,  5.46it/s]





 52%|█████▏    | 221/424 [00:35<00:26,  7.53it/s]






 53%|█████▎    | 223/424 [00:36<00:24,  8.22it/s]





 53%|█████▎    | 224/424 [00:36<00:27,  7.39it/s]




 53%|█████▎    | 226/424 [00:36<00:31,  6.32it/s]





 54%|█████▍    | 228/424 [00:36<00:28,  6.91it/s]





 54%|█████▍    | 229/424 [00:36<00:29,  6.65it/s]




 54%|█████▍    | 231/424 [00:37<00:35,  5.45it/s]





 55%|█████▍    | 233/424 [00:37<00:32,  5.95it/s]





 55%|█████▌    | 234/424 [00:37<00:31,  5.99it/s]




 56%|█████▌    | 236/424 [00:38<00:33,  5.58it/s]





 56%|█████▌    | 238/424 [00:39<00:51,  3.64it/s]





 56%|█████▋    | 239/424 [00:40<01:24,  2.20it/s]




 57%|█████▋    | 240/424 [00:40<01:16,  2.41it/s]




 57%|█████▋    | 241/424 [00:41<01:35,  1.91it/s]




 57%|█████▋    | 242/424 [00:41<01:41,  1.79it/s]




 58%|█████▊    | 244/424 [00:42<01:30,  2.00it/s]





 58%|█████▊    | 246/424 [00:43<01:19,  2.25it/s]





 58%|█████▊    | 247/424 [00:43<01:04,  2.74it/s]




 58%|█████▊    | 248/424 [00:44<01:16,  2.31it/s]




 59%|█████▊    | 249/424 [00:44<01:10,  2.48it/s]




 59%|█████▉    | 250/424 [00:45<01:01,  2.83it/s]




 59%|█████▉    | 252/424 [00:45<00:49,  3.44it/s]





 60%|█████▉    | 253/424 [00:45<00:41,  4.12it/s]




 60%|█████▉    | 254/424 [00:46<01:02,  2.73it/s]




 60%|██████    | 256/424 [00:47<01:05,  2.55it/s]





 61%|██████    | 258/424 [00:48<01:10,  2.35it/s]





 61%|██████    | 259/424 [00:48<00:59,  2.75it/s]




 62%|██████▏   | 261/424 [00:49<01:00,  2.70it/s]





 62%|██████▏   | 262/424 [00:49<00:54,  2.98it/s]




 62%|██████▏   | 263/424 [00:50<01:02,  2.59it/s]




 62%|██████▏   | 264/424 [00:50<00:53,  3.00it/s]




 63%|██████▎   | 266/424 [00:50<00:47,  3.30it/s]





 63%|██████▎   | 268/424 [00:51<00:34,  4.50it/s]





 63%|██████▎   | 269/424 [00:51<00:48,  3.17it/s]




 64%|██████▍   | 271/424 [00:52<00:37,  4.05it/s]





 64%|██████▍   | 273/424 [00:52<00:31,  4.83it/s]





 65%|██████▍   | 274/424 [00:53<00:46,  3.23it/s]




 65%|██████▌   | 276/424 [00:53<00:40,  3.66it/s]





 65%|██████▌   | 277/424 [00:53<00:36,  3.99it/s]




 66%|██████▌   | 278/424 [00:54<00:49,  2.94it/s]




 66%|██████▌   | 279/424 [00:55<01:15,  1.92it/s]




 66%|██████▌   | 280/424 [00:55<01:21,  1.76it/s]




 67%|██████▋   | 282/424 [00:56<00:51,  2.76it/s]





 67%|██████▋   | 283/424 [00:56<00:41,  3.40it/s]




 67%|██████▋   | 285/424 [00:57<00:45,  3.09it/s]





 68%|██████▊   | 287/424 [00:57<00:32,  4.24it/s]





 68%|██████▊   | 288/424 [00:57<00:35,  3.88it/s]




 68%|██████▊   | 290/424 [00:58<00:30,  4.44it/s]





 69%|██████▊   | 291/424 [00:58<00:31,  4.21it/s]




 69%|██████▉   | 293/424 [00:58<00:29,  4.40it/s]





 69%|██████▉   | 294/424 [00:59<00:25,  5.01it/s]




 70%|██████▉   | 295/424 [00:59<00:42,  3.02it/s]




 70%|███████   | 297/424 [01:00<00:34,  3.71it/s]





 70%|███████   | 298/424 [01:00<00:28,  4.43it/s]




 71%|███████   | 300/424 [01:00<00:27,  4.56it/s]





 71%|███████   | 302/424 [01:01<00:36,  3.38it/s]





 72%|███████▏  | 304/424 [01:01<00:25,  4.64it/s]





 72%|███████▏  | 306/424 [01:02<00:28,  4.07it/s]





 73%|███████▎  | 308/424 [01:02<00:22,  5.05it/s]





 73%|███████▎  | 309/424 [01:03<00:33,  3.39it/s]




 73%|███████▎  | 311/424 [01:03<00:33,  3.33it/s]





 74%|███████▍  | 313/424 [01:04<00:30,  3.70it/s]





 74%|███████▍  | 314/424 [01:05<00:38,  2.88it/s]




 74%|███████▍  | 315/424 [01:05<00:46,  2.37it/s]




 75%|███████▍  | 316/424 [01:05<00:40,  2.64it/s]




 75%|███████▍  | 317/424 [01:06<00:49,  2.15it/s]




 75%|███████▌  | 318/424 [01:06<00:42,  2.51it/s]




 75%|███████▌  | 319/424 [01:07<00:47,  2.22it/s]




 75%|███████▌  | 320/424 [01:07<00:42,  2.48it/s]




 76%|███████▌  | 321/424 [01:08<00:46,  2.24it/s]




 76%|███████▌  | 323/424 [01:08<00:35,  2.84it/s]





 76%|███████▋  | 324/424 [01:08<00:28,  3.52it/s]




 77%|███████▋  | 326/424 [01:09<00:32,  2.98it/s]





 77%|███████▋  | 327/424 [01:10<00:39,  2.44it/s]




 77%|███████▋  | 328/424 [01:10<00:42,  2.24it/s]




 78%|███████▊  | 329/424 [01:11<00:47,  1.98it/s]




 78%|███████▊  | 330/424 [01:11<00:44,  2.12it/s]




 78%|███████▊  | 331/424 [01:12<00:48,  1.90it/s]




 78%|███████▊  | 332/424 [01:13<00:52,  1.76it/s]




 79%|███████▊  | 333/424 [01:13<00:49,  1.85it/s]




 79%|███████▉  | 335/424 [01:14<00:36,  2.42it/s]





 79%|███████▉  | 336/424 [01:14<00:33,  2.65it/s]




 80%|███████▉  | 338/424 [01:15<00:27,  3.08it/s]





 80%|████████  | 340/424 [01:15<00:21,  3.86it/s]





 81%|████████  | 342/424 [01:15<00:16,  4.84it/s]





 81%|████████  | 344/424 [01:16<00:21,  3.76it/s]





 81%|████████▏ | 345/424 [01:17<00:29,  2.69it/s]




 82%|████████▏ | 346/424 [01:18<00:40,  1.95it/s]




 82%|████████▏ | 348/424 [01:19<00:37,  2.04it/s]





 82%|████████▏ | 349/424 [01:19<00:35,  2.09it/s]




 83%|████████▎ | 351/424 [01:20<00:24,  2.99it/s]





 83%|████████▎ | 352/424 [01:20<00:19,  3.65it/s]




 83%|████████▎ | 353/424 [01:20<00:26,  2.64it/s]




 83%|████████▎ | 354/424 [01:21<00:28,  2.45it/s]




 84%|████████▍ | 356/424 [01:22<00:24,  2.75it/s]





 84%|████████▍ | 358/424 [01:22<00:16,  3.91it/s]





 85%|████████▍ | 359/424 [01:22<00:17,  3.61it/s]




 85%|████████▍ | 360/424 [01:23<00:21,  2.98it/s]




 85%|████████▌ | 361/424 [01:23<00:24,  2.57it/s]




 85%|████████▌ | 362/424 [01:24<00:32,  1.93it/s]




 86%|████████▌ | 363/424 [01:25<00:33,  1.80it/s]




 86%|████████▌ | 364/424 [01:25<00:27,  2.18it/s]




 86%|████████▌ | 365/424 [01:25<00:30,  1.96it/s]




 86%|████████▋ | 366/424 [01:26<00:25,  2.30it/s]




 87%|████████▋ | 367/424 [01:26<00:26,  2.16it/s]




 87%|████████▋ | 368/424 [01:27<00:26,  2.14it/s]




 87%|████████▋ | 369/424 [01:28<00:32,  1.69it/s]




 87%|████████▋ | 370/424 [01:28<00:28,  1.91it/s]




 88%|████████▊ | 371/424 [01:28<00:23,  2.24it/s]




 88%|████████▊ | 372/424 [01:29<00:22,  2.33it/s]




 88%|████████▊ | 374/424 [01:29<00:20,  2.48it/s]





 88%|████████▊ | 375/424 [01:30<00:16,  3.01it/s]




 89%|████████▊ | 376/424 [01:30<00:19,  2.42it/s]




 89%|████████▉ | 378/424 [01:31<00:20,  2.24it/s]





 89%|████████▉ | 379/424 [01:32<00:16,  2.72it/s]




 90%|████████▉ | 380/424 [01:32<00:18,  2.41it/s]





 90%|█████████ | 383/424 [01:32<00:10,  4.03it/s]





 91%|█████████ | 385/424 [01:33<00:09,  3.92it/s]





 91%|█████████▏| 387/424 [01:33<00:06,  5.39it/s]





 92%|█████████▏| 388/424 [01:33<00:06,  5.62it/s]




 92%|█████████▏| 390/424 [01:34<00:06,  5.12it/s]





 92%|█████████▏| 392/424 [01:34<00:05,  6.08it/s]





 93%|█████████▎| 393/424 [01:34<00:04,  6.31it/s]




 93%|█████████▎| 395/424 [01:35<00:05,  5.75it/s]





 94%|█████████▎| 397/424 [01:36<00:07,  3.51it/s]





 94%|█████████▍| 399/424 [01:36<00:07,  3.54it/s]





 94%|█████████▍| 400/424 [01:37<00:07,  3.36it/s]




 95%|█████████▍| 401/424 [01:37<00:07,  3.23it/s]




 95%|█████████▍| 402/424 [01:38<00:12,  1.72it/s]




 95%|█████████▌| 404/424 [01:39<00:08,  2.43it/s]





 96%|█████████▌| 406/424 [01:39<00:06,  2.95it/s]





 96%|█████████▌| 407/424 [01:40<00:09,  1.76it/s]




 96%|█████████▌| 408/424 [01:41<00:10,  1.55it/s]




 96%|█████████▋| 409/424 [01:41<00:07,  1.93it/s]




 97%|█████████▋| 410/424 [01:42<00:06,  2.18it/s]




 97%|█████████▋| 411/424 [01:42<00:07,  1.78it/s]




 97%|█████████▋| 412/424 [01:43<00:05,  2.03it/s]




 97%|█████████▋| 413/424 [01:44<00:06,  1.69it/s]




 98%|█████████▊| 414/424 [01:44<00:04,  2.00it/s]




 98%|█████████▊| 415/424 [01:45<00:05,  1.73it/s]




 98%|█████████▊| 417/424 [01:45<00:03,  2.20it/s]





 99%|█████████▊| 418/424 [01:46<00:02,  2.49it/s]




 99%|█████████▉| 419/424 [01:46<00:01,  2.78it/s]




 99%|█████████▉| 420/424 [01:46<00:01,  2.86it/s]




 99%|█████████▉| 421/424 [01:48<00:01,  1.51it/s]




100%|█████████▉| 422/424 [01:48<00:01,  1.59it/s]




100%|█████████▉| 423/424 [01:49<00:00,  1.84it/s]




100%|██████████| 424/424 [01:49<00:00,  3.87it/s]


Crying: 48 / 50
Falling down: 7 / 30
Glass breaking: 1 / 53
Gunshot: 0 / 51
Normal: 0 / 200
Smoke or Fire or Open flame: 34 / 40





In [14]:
# Re-prints the per-class summary from the existing `correct` / `total`
# variables; nothing is recomputed here.
for label in total:
    print(f"{class_labels[label]}: {correct[label]} / {total[label]}")

Crying: 48 / 50
Falling down: 7 / 30
Glass breaking: 1 / 53
Gunshot: 0 / 51
Normal: 0 / 200
Smoke or Fire or Open flame: 34 / 40
