In [2]:
import json
import os
import random
from operator import itemgetter
from collections import defaultdict

import torch
import numpy as np
import pandas as pd

from PIL import Image
from tqdm.auto import tqdm

from ego4d.research.readers import TorchAudioStreamReader, PyAvReader
# Alias for the video reader class used by this notebook; PyAvReader is
# imported alongside it and can be assigned here instead.
VideoReader = TorchAudioStreamReader

In [3]:
# Root directory holding the release metadata JSON files.
RELEASE_DIR = "/checkpoint/miguelmartin/egoexo_data/dev"  # NOTE: changeme

# Maps each metadata table name to its JSON path. The loop below replaces
# every path with the parsed JSON content, in place.
egoexo = {
    "takes": os.path.join(RELEASE_DIR, "takes.json"),
    "captures": os.path.join(RELEASE_DIR, "captures.json"),
    "physical_setting": os.path.join(RELEASE_DIR, "physical_setting.json"),
    "participants": os.path.join(RELEASE_DIR, "participants.json"),
    "visual_objects": os.path.join(RELEASE_DIR, "visual_objects.json"),
}

for k, v in egoexo.items():
    # Context manager closes each file handle as soon as it has been parsed,
    # instead of leaving it open until garbage collection.
    with open(v) as f:
        egoexo[k] = json.load(f)

takes = egoexo["takes"]
captures = egoexo["captures"]
# Index the take records by their "take_uid" field for direct lookup.
takes_by_uid = {x["take_uid"]: x for x in takes}

In [4]:
# Directory of pre-extracted feature files, one "<take_uid>_<cam_id>_<stream_id>.pt"
# file per take/camera/stream, plus a config.yaml that is skipped.
features_dir = "/checkpoint/miguelmartin/egoexo_features/maws_clip_2b_public"
features_paths = [
    entry for entry in os.listdir(features_dir) if entry != "config.yaml"
]

# take_uid -> {(cam_id, stream_id): absolute path of the feature file}
features_by_take_cam = {}
for fname in features_paths:
    # Exactly three "_"-separated fields are expected; anything else raises.
    take_uid, cam_id, stream_part = fname.split("_")
    # Keep only the text before the first "." (drops the file extension).
    stream_id = stream_part.partition(".")[0]
    per_take = features_by_take_cam.setdefault(take_uid, {})
    per_take[(cam_id, stream_id)] = os.path.join(features_dir, fname)

features_paths[0]

'629a4ed5-9d3f-4a61-b699-99e77a03b449_aria02_rgb.pt'

In [5]:
from maws.model_builder import build_model

In [5]:
# Build the "maws_clip" model with the "vit_2b14_xlmr_l" configuration, then
# switch it to eval mode, cast it to half precision and move it to the GPU.
model = (
    build_model("vit_2b14_xlmr_l", "maws_clip")
    .eval()
    .half()
    .to("cuda")
)

In [6]:
# Candidate text prompts passed to model.classify as `texts`.
# Order matters: the top-k indices returned by the classifier are mapped back
# to these strings by position, so do not reorder without re-running the
# classification cell.
txt_labels = [
    "A person using a knife",
    "Using a serrated knife, the person grips the handle firmly with their dominant hand and positions the blade at a slight angle to the tomato. They then apply gentle pressure and make a smooth, sweeping motion from the stem end towards the bottom of the tomato, carefully slicing off the top in a circular motion.",
    "A person cutting an onion",
    "A person picking up a spoon",
    "A person picking up an object",
    "A person looking at a chopping board",
    "A person eating an apple",
    "An image of a cat",
    "An image of a dog",
    "A person cooking",
    "A person cutting a vegetable",
    "A person cutting a fruit",
    "A person setting up a camera on a tripod",
    "A person opening a bag of noodles",
    "A person opening a bag",
    "A chopping board",
    "Person using a spoon",
    "Person using a spoon to stir a pot of water",
]


In [15]:
# txt_emb = model.encode_texts(
#     texts=txt_labels,
# )

In [16]:
# Show the take UIDs for which at least one feature file was found.
features_by_take_cam.keys()

dict_keys(['629a4ed5-9d3f-4a61-b699-99e77a03b449', 'd2e07def-3ea4-4c31-b20c-5f5a7ed52fb9', '0b05324a-47a2-456a-9f7a-1c36519d86e0', '53ee9e25-ba9b-429e-9cad-70c223a2f881', 'fd053d4b-ae66-4b8d-880a-bdf1cebc7257', 'a261cc1d-7a45-479f-81a9-7c73eb379e6c', '6bc7a29f-3397-4549-9ee2-d98fa93da873', '154d1224-d0c0-45f9-8e89-33f1ffa04606', '3cbd7070-7c55-4b15-ac31-100ab8c7298a', '515c5c80-f4c2-4909-8bb9-8a11e7d83a91'])

In [11]:
# Load every feature file of one take as (features, (cam_id, stream_id)) pairs.
# NOTE: torch.load unpickles the file, so only load features from trusted sources.
_take_uid = "154d1224-d0c0-45f9-8e89-33f1ffa04606"
_paths_by_cam_stream = features_by_take_cam[_take_uid]
vid_fs = [
    (torch.load(path), cam_stream)
    for cam_stream, path in _paths_by_cam_stream.items()
]
# vid_fs = torch.stack(vid_fs)

In [14]:
# vid_fs[0][0].dtype

In [42]:
# Score the first loaded stream of the take against every text prompt.
_first_features, _first_key = vid_fs[0]
# Drop singleton dimensions and move the features to the GPU.
vfs = _first_features.squeeze().cuda()
with torch.no_grad():  # inference only, no autograd bookkeeping needed
    probs = model.classify(image_features=vfs, texts=txt_labels)

In [43]:
# Summarize the classifier output over fixed-length, non-overlapping windows:
# for each window, average the per-row scores and print the top labels.
fps = 30  # assumes one feature row per frame at 30 fps — TODO confirm against the extraction config
window_sec = 10  # window length in seconds (use 60 for one-minute windows)
ss = window_sec * fps  # window length in feature rows
top_k = min(4, len(txt_labels))  # topk raises if k exceeds the number of labels
n_frames = vfs.shape[0]

for frame in range(0, n_frames, ss):
    # Clamp the end so the final, possibly shorter, window reports its real
    # end time instead of overshooting the stream length.
    end = min(frame + ss, n_frames)
    label_idxs = probs[frame:end].mean(0).topk(top_k).indices.tolist()
    print(frame / fps, end / fps, [txt_labels[idx] for idx in label_idxs])

0.0 10.0 ['A person cooking', 'A person looking at a chopping board', 'A person cutting a vegetable', 'A person setting up a camera on a tripod']
10.0 20.0 ['A person cooking', 'A person looking at a chopping board', 'A person cutting a vegetable', 'A person setting up a camera on a tripod']
20.0 30.0 ['A person cooking', 'A person looking at a chopping board', 'A person cutting a vegetable', 'A person setting up a camera on a tripod']
30.0 40.0 ['A person cooking', 'A person looking at a chopping board', 'A person cutting a vegetable', 'A person setting up a camera on a tripod']
40.0 50.0 ['A person cooking', 'A person setting up a camera on a tripod', 'A person looking at a chopping board', 'A person cutting a vegetable']
50.0 60.0 ['A person cooking', 'A person cutting a vegetable', 'A person looking at a chopping board', 'A person setting up a camera on a tripod']
60.0 70.0 ['A person cooking', 'A person cutting a vegetable', 'A person looking at a chopping board', 'A person settin