In [1]:
## SPECIFY THESE ##
# Root folder of the recording to process. Keep the trailing '/': the
# sub-folder paths used by the following cells are derived from this value.
DATASET_FOLDER = '../dataset/20240320_22_33_11_fps1_clip_1_0/' # dataset root; the color/ and depth/ sub-folders are looked up under it
# Run this notebook inside the `vclap` environment.

In [2]:
import os
# Path Configs
# Sub-folders are derived with os.path.join so the results stay correct whether
# or not DATASET_FOLDER ends with a path separator.
PATH_DEPTH = os.path.join(DATASET_FOLDER, 'depth', '')  # the trailing '' keeps the trailing separator this path has always had
PATH_COLOR = os.path.join(DATASET_FOLDER, 'color')
PATH_ACT = os.path.join(DATASET_FOLDER, 'activity_recognition')
# Folder that receives the per-image prediction CSV files; created if missing.
os.makedirs(PATH_ACT, exist_ok=True)

In [3]:
# Numeric and deep-learning dependencies.
import numpy as np
import torch
# NOTE(review): `packaging` is not referenced in any cell visible here — confirm it is still needed.
from pkg_resources import packaging

# Report the installed PyTorch build.
print("Torch version:", torch.__version__)

import clip

# Bare last expression: the notebook displays the list of model names returned by CLIP.
clip.available_models()

Torch version: 2.1.2+cu118


['RN50',
 'RN101',
 'RN50x4',
 'RN50x16',
 'RN50x64',
 'ViT-B/32',
 'ViT-B/16',
 'ViT-L/14',
 'ViT-L/14@336px']

In [4]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the CLIP model together with its matching image preprocessing pipeline.
# Alternative backbone: clip.load("ViT-B/32", device)
model, preprocess = clip.load("RN101", device)
model.eval()

# Model properties reported below.
vocab_size = model.vocab_size
context_length = model.context_length
input_resolution = model.visual.input_resolution

# Total number of scalar weights across all parameter tensors, with thousands separators.
print("Model parameters:", f"{sum(p.numel() for p in model.parameters()):,}")
print("Input resolution:", input_resolution)
print("Context length:", context_length)
print("Vocab size:", vocab_size)

Model parameters: 119,688,033
Input resolution: 224
Context length: 77
Vocab size: 49408


In [5]:
# Class label csv path
labels_csv_path = 'kinetics_400_labels.csv'

# Read Class labels
import csv

# Two lookup tables built from the CSV rows: class name -> id and id -> class name.
# Ids are kept exactly as read from the file (strings).
label2id = {}
id2label = {}
# newline='' is the mode the csv module asks for; the explicit encoding keeps
# decoding independent of the platform default.
with open(labels_csv_path, mode='r', newline='', encoding='utf-8') as file:
    csv_reader = csv.reader(file)
    # Skip the header row; the default avoids StopIteration on an empty file.
    next(csv_reader, None)

    for row in csv_reader:
        if not row:
            # Blank lines are yielded as empty rows and would break the unpacking below.
            continue
        class_id, class_name = row
        label2id[class_name] = class_id
        id2label[class_id] = class_name

# Class names in file order (dict insertion order).
class_labels = list(label2id.keys())

print(f"{len(class_labels)} Unique classes: {class_labels}.")

400 Unique classes: ['abseiling', 'air drumming', 'answering questions', 'applauding', 'applying cream', 'archery', 'arm wrestling', 'arranging flowers', 'assembling computer', 'auctioning', 'baby waking up', 'baking cookies', 'balloon blowing', 'bandaging', 'barbequing', 'bartending', 'beatboxing', 'bee keeping', 'belly dancing', 'bench pressing', 'bending back', 'bending metal', 'biking through snow', 'blasting sand', 'blowing glass', 'blowing leaves', 'blowing nose', 'blowing out candles', 'bobsledding', 'bookbinding', 'bouncing on trampoline', 'bowling', 'braiding hair', 'breading or breadcrumbing', 'breakdancing', 'brush painting', 'brushing hair', 'brushing teeth', 'building cabinet', 'building shed', 'bungee jumping', 'busking', 'canoeing or kayaking', 'capoeira', 'carrying baby', 'cartwheeling', 'carving pumpkin', 'catching fish', 'catching or throwing baseball', 'catching or throwing frisbee', 'catching or throwing softball', 'celebrating', 'changing oil', 'changing wheel', 'c

In [6]:
from tqdm.notebook import tqdm
import cv2
import os
from PIL import Image
import csv

# Zero-shot classification of every image in PATH_COLOR against the class labels.
# For each image the top predictions are printed and written to a CSV file in
# PATH_ACT that carries the image's base name.
# Relies on `model`, `preprocess`, `device` and `class_labels` from the cells above.

# The text prompts are identical for every image, so they are tokenized and
# encoded once here instead of once per image.
with torch.no_grad():
    text_inputs = torch.cat([clip.tokenize(f"a photo of a {c}") for c in class_labels]).to(device)
    text_features = model.encode_text(text_inputs)
    text_features = text_features / text_features.norm(dim=1, keepdim=True)
    # Learned temperature that the model's own forward pass applies to the similarities.
    logit_scale = model.logit_scale.exp()

# topk raises if asked for more entries than there are classes.
top_k = min(5, len(class_labels))

# Sorted for a reproducible processing order; sub-directories are skipped.
image_filenames = sorted(
    name for name in os.listdir(PATH_COLOR)
    if os.path.isfile(os.path.join(PATH_COLOR, name))
)

for filename in tqdm(image_filenames):

    # The context manager closes the file handle once the tensor has been built.
    with Image.open(os.path.join(PATH_COLOR, filename)) as image:
        image_input = preprocess(image).unsqueeze(0).to(device)

    with torch.no_grad():
        image_features = model.encode_image(image_input)
        image_features = image_features / image_features.norm(dim=1, keepdim=True)

        # Scaled cosine similarity between the image and every prompt, as softmax probabilities.
        logits_per_image = logit_scale * image_features @ text_features.t()
        similarity = logits_per_image.softmax(dim=-1)
        values, indices = similarity[0].topk(top_k)
        pred_index = indices[0]  # index of the best class; not used further in this cell

    # (label, probability in percent) pairs shared by the printout and the CSV.
    top_predictions = [
        (class_labels[index], 100 * value)
        for value, index in zip(values.tolist(), indices.tolist())
    ]

    # Print the results
    print("Top predictions:\n")
    for label, percent in top_predictions:
        print(f"{label:>16s}: {percent:.2f}%")

    # save the results to csv
    # splitext swaps the extension correctly whatever its length (.png, .jpeg, ...).
    csv_path = os.path.join(PATH_ACT, os.path.splitext(filename)[0] + '.csv')
    with open(csv_path, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        # Write the header row
        writer.writerow(['Class Label', 'Probability'])
        # Write the class labels and probabilities to the CSV file
        for label, percent in top_predictions:
            writer.writerow([label, f"{percent:.2f}%"])

    




  0%|          | 0/5148 [00:00<?, ?it/s]

Top predictions:

         writing: 15.22%
using remote controller (not gaming): 14.29%
arranging flowers: 5.10%
   setting table: 4.50%
 opening present: 4.50%
Top predictions:

         writing: 12.70%
using remote controller (not gaming): 9.29%
   setting table: 7.95%
arranging flowers: 6.59%
          dining: 5.81%
Top predictions:

         writing: 15.36%
using remote controller (not gaming): 11.23%
arranging flowers: 8.22%
   setting table: 4.83%
  using computer: 3.88%
Top predictions:

         writing: 15.05%
using remote controller (not gaming): 11.36%
arranging flowers: 7.57%
   setting table: 5.37%
  using computer: 4.59%
Top predictions:

         writing: 13.83%
using remote controller (not gaming): 12.20%
arranging flowers: 7.64%
  using computer: 5.25%
   playing poker: 4.63%
Top predictions:

         writing: 15.36%
using remote controller (not gaming): 11.59%
arranging flowers: 7.48%
   setting table: 5.65%
  using computer: 4.83%
Top predictions:

         writing: