In [1]:
## SPECIFY THESE ##
# Root folder of the recording to process. The other cells derive their
# sub-folder paths ('depth/', 'color', 'activity_recognition_clip') from this
# value, so keep the trailing '/'.
DATASET_FOLDER = '../datasets/event_20240405_18_06_48_fps1_clip_1_0/' # folder of images
# Run this notebook in the `vclap` environment.

In [2]:
import os
# Path Configs
# Build every sub-folder path with os.path.join so the result is correct
# whether or not DATASET_FOLDER ends with a path separator (plain string
# concatenation silently produced e.g. '<folder>color' without one).
PATH_DEPTH = os.path.join(DATASET_FOLDER, 'depth/')   # depth frames (trailing '/' kept as before)
PATH_COLOR = os.path.join(DATASET_FOLDER, 'color')    # colour frames read by the inference loop
PATH_ACT = os.path.join(DATASET_FOLDER, 'activity_recognition_clip')  # per-image prediction CSVs
# Create the output folder up front; exist_ok makes re-running this cell safe.
os.makedirs(PATH_ACT, exist_ok=True)

In [3]:
# Core numerical / deep-learning dependencies used throughout the notebook.
import numpy as np
import torch
# NOTE(review): `packaging` appears unused in the cells visible here, and the
# warning fragment in this cell's output points at this import line - confirm
# whether it can be dropped.
from pkg_resources import packaging

# Record the torch build in the notebook output for reproducibility.
print("Torch version:", torch.__version__)

import clip

# Last expression of the cell: displays the model names clip can load.
clip.available_models()

  from pkg_resources import packaging


Torch version: 2.1.2+cu118


['RN50',
 'RN101',
 'RN50x4',
 'RN50x16',
 'RN50x64',
 'ViT-B/32',
 'ViT-B/16',
 'ViT-L/14',
 'ViT-L/14@336px']

In [4]:
# Run on the GPU when torch can see one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the CLIP checkpoint together with its matching image transform and
# switch the network to inference mode.
model, preprocess = clip.load("ViT-B/32", device)
# model, preprocess = clip.load("RN101", device)
model.eval()

# Architecture facts reported below.
input_resolution = model.visual.input_resolution
context_length = model.context_length
vocab_size = model.vocab_size

# Parameter count: element count of every parameter tensor, summed.
_param_sizes = [int(np.prod(p.shape)) for p in model.parameters()]
_model_summary = (
    ("Model parameters:", f"{np.sum(_param_sizes):,}"),
    ("Input resolution:", input_resolution),
    ("Context length:", context_length),
    ("Vocab size:", vocab_size),
)
for _heading, _shown in _model_summary:
    print(_heading, _shown)

Model parameters: 151,277,313
Input resolution: 224
Context length: 77
Vocab size: 49408


In [5]:
# Class label csv path
# labels_csv_path = 'k400+NTU120_labels.csv'
labels_csv_path = 'shared_office_labels.csv'

# Read Class labels
import csv

# label2id / id2label are inverse mappings between a class name and a
# contiguous integer id (0..N-1), assigned in file order.
label2id = {}
id2label = {}
# newline='' is what the csv module expects so quoted fields containing line
# breaks are parsed correctly.
with open(labels_csv_path, mode='r', newline='') as file:
    csv_reader = csv.reader(file)
    # Skip the header row; the default avoids StopIteration on an empty file.
    next(csv_reader, None)

    # The first column of each row is the class name; the id is the row's
    # position among the unique labels.
    # NOTE(review): a previous, commented-out variant of this loop unpacked
    # each row as (class_id, class_name) - presumably for the alternative
    # labels file above; confirm before switching files.
    for row in csv_reader:
        # Ignore blank lines / empty first cells instead of raising IndexError.
        if not row or not row[0].strip():
            continue
        class_name = row[0]
        # Keep the first occurrence of a repeated label so that ids stay
        # contiguous and class_labels[i] always matches id2label[i].
        if class_name in label2id:
            print(f"Skipping duplicate class label: {class_name!r}")
            continue
        class_id = len(label2id)
        label2id[class_name] = class_id
        id2label[class_id] = class_name

# Ordered list of unique class names; index i corresponds to class id i.
class_labels = list(label2id.keys())

print(f"{len(class_labels)} Unique classes: {class_labels}.")

90 Unique classes: ['No people is in the room', 'Studying', 'Group discussions', 'Printing documents', 'Using computers', 'Typing', 'Making phone calls', 'Play with phone/tablet', 'Writing on whiteboards', 'Reading books', 'Taking notes', 'Setting table', 'Eating food', 'Cooking', 'Drinking coffee or tea', 'Resting or napping', 'Listening to music', 'Watching tutorials', 'Watching movies', 'Brainstorming', 'Collaborating on projects', 'Meeting', 'Tutoring', 'Presentations', 'Organizing space', 'Charging electronic devices', 'Surfing the internet', 'Video conferencing', 'Speech', 'Cleaning', 'Taking trash out', 'Mopping floor', 'Decorating', 'Checking emails', 'Socializing with friends', 'Chatting', 'Conducting interviews', 'Writing', 'Reviewing lecture notes', 'Conducting experiments', 'Coding', 'Workshop', 'Reviewing for exams', 'Giving feedback on assignments', 'Conducting surveys', 'Talking', 'Singing', 'Laughing', 'Speaking', 'Dancing', 'Pouring drinks', 'Drinking', 'Stretching', '

In [6]:
from tqdm.notebook import tqdm
import cv2
import os
from PIL import Image
import csv
device = "cuda" if torch.cuda.is_available() else "cpu"

# Zero-shot activity recognition: score every image in PATH_COLOR against all
# class-label prompts with CLIP, print the top predictions and write them to
# one CSV per image in PATH_ACT.

if not class_labels:
    raise ValueError("class_labels is empty - nothing to classify against.")

# Never ask topk for more entries than there are classes.
top_k = min(5, len(class_labels))

# The prompts do not depend on the image, so tokenize and encode them once
# instead of once per image.
text_inputs = torch.cat([clip.tokenize(f"a photo of a {c}")for c in class_labels]).to(device)
with torch.no_grad():
    text_features = model.encode_text(text_inputs)
    # Same normalisation and temperature that the CLIP model's forward pass
    # applies before computing image/text logits.
    text_features = text_features / text_features.norm(dim=1, keepdim=True)
    logit_scale = model.logit_scale.exp()

# Only regular files, in a deterministic order (os.listdir order is arbitrary
# and may include sub-directories).
image_filenames = sorted(
    name for name in os.listdir(PATH_COLOR)
    if os.path.isfile(os.path.join(PATH_COLOR, name))
)

for filename in tqdm(image_filenames):

    # Context manager closes the underlying file once the tensor is built.
    with Image.open(os.path.join(PATH_COLOR, filename)) as image:
        image_input = preprocess(image).unsqueeze(0).to(device)

    with torch.no_grad():
        image_features = model.encode_image(image_input)
        image_features = image_features / image_features.norm(dim=1, keepdim=True)

        # Scaled cosine similarity between the image and every prompt, turned
        # into a probability distribution over the classes.
        logits_per_image = logit_scale * image_features @ text_features.t()
        similarity = logits_per_image.softmax(dim=-1)
        values, indices = similarity[0].topk(top_k)
        pred_index = indices[0]

    # Move the small top-k result to plain Python values once.
    top_predictions = list(zip(values.tolist(), indices.tolist()))

    # Print the results
    print("\nTop predictions:")
    for value, index in top_predictions:
        print(f"{class_labels[index]:>16s}: {100 * value:.2f}%")

    # save the results to csv
    # splitext handles extensions of any length (e.g. '.jpeg'), unlike
    # slicing off the last three characters.
    csv_path = os.path.join(PATH_ACT, os.path.splitext(filename)[0] + '.csv')
    with open(csv_path, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        # Write the header row
        writer.writerow(['Class Label', 'Confidence'])
        # Write the class labels and probabilities to the CSV file
        for value, index in top_predictions:
            writer.writerow([class_labels[index], f"{100 * value:.2f}%"])

    




  0%|          | 0/9799 [00:00<?, ?it/s]


Top predictions:
Organizing space: 40.82%
        Workshop: 16.24%
No people is in the room: 6.77%
Playing board games: 4.51%
Collaborating on projects: 4.37%

Top predictions:
Organizing space: 57.96%
        Workshop: 6.30%
Collaborating on projects: 6.11%
No people is in the room: 4.68%
   Setting table: 3.82%

Top predictions:
        Workshop: 14.62%
Organizing space: 12.31%
         Meeting: 9.15%
   Setting table: 6.29%
Playing board games: 5.82%

Top predictions:
Organizing space: 15.43%
No people is in the room: 11.47%
Playing board games: 7.29%
         Meeting: 6.96%
Collaborating on projects: 6.54%

Top predictions:
         Meeting: 12.01%
Playing board games: 7.17%
Organizing space: 6.95%
        Cleaning: 6.23%
No people is in the room: 5.95%

Top predictions:
Organizing space: 15.61%
Playing board games: 9.77%
 Solving puzzles: 6.71%
Collaborating on projects: 6.61%
   Setting table: 5.48%

Top predictions:
Organizing space: 15.58%
        Workshop: 11.76%
         Mee