In [1]:
import os
import json
import numpy as np
from matplotlib.pyplot import *
import cv2
import torch

CLI_OUTPUT_DIR = "/local/juro4948/data/egoexo4d/egoexo" # Replace with the full path to the --output_directory you pass to the cli
VERSION = "v1"

METADATA_PATH = os.path.join(CLI_OUTPUT_DIR, "takes.json")
ANNOTATIONS_PATH = os.path.join(CLI_OUTPUT_DIR, "annotations")

assert os.path.exists(METADATA_PATH), f"Metadata doesn't exist at {METADATA_PATH}. Is the CLI_OUTPUT_DIR right? Do you satisfy the pre-requisites?"
assert os.path.exists(os.path.join(ANNOTATIONS_PATH, "keystep_train.json")), "Annotation metadata doesn't exist. Did you download it with the CLI?"


In [2]:
RELEASE_DIR = "/local/juro4948/data/egoexo4d/egoexo"  # NOTE: changeme
assert os.path.exists(RELEASE_DIR), "change RELEASE_DIR to where you downloaded the dataset to"

egoexo = {
    "takes": os.path.join(RELEASE_DIR, "takes.json"),
    # "takes_dropped": os.path.join(RELEASE_DIR, "takes_dropped.json"),
    "captures": os.path.join(RELEASE_DIR, "captures.json"),
    "physical_setting": os.path.join(RELEASE_DIR, "physical_setting.json"),
    "participants": os.path.join(RELEASE_DIR, "participants.json"),
    "visual_objects": os.path.join(RELEASE_DIR, "visual_objects.json"),
}

TASK_ID_CAT = {
    0: "Unknown",
    1000: "Cooking",
    2000: "Health",
    4000: "Bike Repair",
    5000: "Music",
    6000: "Basketball",
    7000: "Rock Climbing",
    8000: "Soccer",
    9000: "Dance",
}

for k, v in egoexo.items():
    egoexo[k] = json.load(open(v))

takes = egoexo["takes"] 
captures = egoexo["captures"]
takes_by_uid = {x["take_uid"]: x for x in takes}

Filter takes to only include cooking+narration takes

In [3]:
cooking_takes = [take for take in takes if (take['parent_task_id'] == 1000)]
len(cooking_takes)

636

# Prepare dataset for Bridge Prompt pre-training
- run BrP extract_frames.py on the video files
- extract action labels to npy 


Configs
- toggle between ego and exo cameras

In [31]:
egocentric = True
use_downscaled = False
omnivore_features = True

#  TODO:  we should parse takes.json to get the ego_rgb_code for the rgb aria take. From visual inspection it seems that it's always 214-1, but we should be sure.
if egocentric == True:
    downsample_rate = 1 # this is the sampling factor - take every <rate> samples
    camera = 'aria'
    ego_rgb_code = '214-1' #
    frame_rate = 30  # the raw frame rate ->  4k@60FPS (MP4) for GoPro devices and 1404x1404@30FPS (VRS) for Aria devices (https://docs.ego-exo4d-data.org/overview/#data)

# TODO: fill in values here. Also, how to handle the 60fps vs 30fps issue? Could downsample cam by half
elif egocentric == False and omnivore_features == True:
    camera = 'cam'
    ego_rgb_code = '' 
    frame_rate = 30


elif egocentric == False and omnivore_features == False:
    camera = 'cam'
    ego_rgb_code = '' 
    frame_rate = 60

### Read the Annotation File

See this for metatdata: https://docs.ego-exo4d-data.org/annotations/keystep/

In [32]:
# See raw annotations in this dictionary
keystep_anns = json.load(open(os.path.join(ANNOTATIONS_PATH, "keystep_train.json")))
keystep_anns_val = json.load(open(os.path.join(ANNOTATIONS_PATH, "keystep_val.json")))

anns = keystep_anns["annotations"]
print(f'Length of training items: {len(anns)}')
anns_test = keystep_anns_val["annotations"]

# Add anns_test to anns dictionary
anns.update(anns_test)
print(f'Length of training + val items: {len(anns)}')

Length of training items: 671
Length of training + val items: 852


In [33]:
keystep_anns['taxonomy']

{'Covid-19 Rapid Antigen Test': {'0': {'id': 0,
   'is_leafnode': False,
   'name': 'Covid-19 Rapid Antigen Test',
   'parent_id': None,
   'parent_name': None},
  '1': {'id': 1,
   'is_leafnode': False,
   'name': 'Preparation',
   'parent_id': 0,
   'parent_name': 'Covid-19 Rapid Antigen Test'},
  '2': {'id': 2,
   'is_leafnode': True,
   'name': 'Check the expiration date',
   'parent_id': 1,
   'parent_name': 'Preparation',
   'unique_id': 818},
  '3': {'id': 3,
   'is_leafnode': True,
   'name': 'Locate test tube',
   'parent_id': 1,
   'parent_name': 'Preparation',
   'unique_id': 819},
  '4': {'id': 4,
   'is_leafnode': True,
   'name': 'Locate and unwrap test tube cap',
   'parent_id': 1,
   'parent_name': 'Preparation',
   'unique_id': 820},
  '5': {'id': 5,
   'is_leafnode': False,
   'name': 'Unbox the covid19 kit',
   'parent_id': 1,
   'parent_name': 'Preparation'},
  '6': {'id': 6,
   'is_leafnode': True,
   'name': 'Arrange test material',
   'parent_id': 5,
   'parent_n

# Get video uids for cooking videos with narration annotations
- Get video uids to download from CLI EGoExo downloader


In [34]:
# only get videos with keystep annotations
cooking_uids = []
for item in takes:
    # check if this take is cooking
    cat = item['parent_task_id']
    if cat != 1000:
        continue

    # check if this take has annotations
    has_annotations = False
    for check_if_annotated in anns.keys():
        if anns[check_if_annotated]['take_uid'] == item['take_uid']:
            has_annotations = True
            break
    if not has_annotations:
        continue
    
    cooking_uids.append(item['take_uid'])

print(len(cooking_uids))

# with open('egoexo4d/all_cooking_videos.txt', mode='wt', encoding='utf-8') as myfile:
#     myfile.write('\n'.join(cooking_uids))

344


In [35]:
cooking_annotations = {}
for uid in cooking_uids:
    cooking_annotations[uid] = keystep_anns['annotations'][uid]

# collect all cooking scenarios  
cooking_scenarios = set()
for uid in cooking_annotations.keys():
    cooking_scenarios.add(cooking_annotations[uid]['scenario'])

cooking_scenarios = list(cooking_scenarios)
cooking_scenarios


['Making Sesame-Ginger Asian Salad',
 'Cooking Noodles',
 'Cooking Scrambled Eggs',
 'Cooking Pasta',
 'Making Coffee latte',
 'Cooking an Omelet',
 'Making Chai Tea',
 'Making Cucumber & Tomato Salad',
 'Cooking Tomato & Eggs',
 'Cooking Sushi Rolls',
 'Making Milk Tea']

In [36]:
# filter cooking scenarios
taxonomy_cooking = {}
for scenario in cooking_scenarios:
    taxonomy_cooking[scenario] = keystep_anns['taxonomy'][scenario] 

taxonomy_cooking

{'Making Sesame-Ginger Asian Salad': {'0': {'id': 0,
   'is_leafnode': False,
   'name': 'Making Sesame-Ginger Asian Salad',
   'parent_id': None,
   'parent_name': None},
  '504': {'id': 504,
   'is_leafnode': False,
   'name': 'Get Ingredients',
   'parent_id': 0,
   'parent_name': 'Making Sesame-Ginger Asian Salad'},
  '506': {'id': 506,
   'is_leafnode': True,
   'name': 'Get almonds',
   'parent_id': 504,
   'parent_name': 'Get Ingredients',
   'unique_id': 506},
  '508': {'id': 508,
   'is_leafnode': True,
   'name': 'Get bell peppers',
   'parent_id': 504,
   'parent_name': 'Get Ingredients',
   'unique_id': 508},
  '512': {'id': 512,
   'is_leafnode': True,
   'name': 'Get cardamom',
   'parent_id': 504,
   'parent_name': 'Get Ingredients',
   'unique_id': 512},
  '513': {'id': 513,
   'is_leafnode': True,
   'name': 'Get carrots',
   'parent_id': 504,
   'parent_name': 'Get Ingredients',
   'unique_id': 513},
  '514': {'id': 514,
   'is_leafnode': True,
   'name': 'Get celerie

Notes on annotations:
- just note that there are other interesting text metadata for later
- Taxonomy is interesting, it is a hierarchical action class definition. We could use this later.
- Vocabulary contains all of the leaf node classes.
- I only downloaded the cooking related videos (those with narrations) but vocabulary it contains vocab from other categories. So now lets iterate through the labels for the video samples we downloaded and gather labels.

In [37]:
keystep_anns.keys()

dict_keys(['taxonomy', 'vocabulary', 'ds', 'annotations'])

In [38]:
# keystep_anns['taxonomy']

In [39]:
# build_vocab = {}
# for sample in anns.keys():
#     for segment in anns[sample]['segments']:
#         step_unique_id = segment['step_unique_id']
#         step_name = segment['step_name']

#         if step_unique_id not in build_vocab.keys():
#             build_vocab[step_unique_id] = step_name

# # save
# with open('../../../home/juro4948/gravit/Bridge-Prompt/data/egoexo/egoexo-cooking-json', 'w') as fp:
#     json.dump(build_vocab, fp)

# Preprocess videos - extract frames 
- TODO: Julia: configure cv2 for our GPU

In [40]:
def get_take_id_from_name(take_name):
    for take_id in anns.keys():
        if anns[take_id]['take_name'] == take_name:
            return take_id
    return None

def get_take_name_from_id(take_id):
    if take_id in anns.keys():
        return anns[take_id]['take_name']
    else:
        return None

In [41]:
# list of the filepaths that we are interested in. Format is: [(take_name, path_to_mp4), ...]
paths_to_mp4s = []

# ########## Uncomment to process Takes Images ##########
# take_root = 'egoexo4d/egoexo/takes/'
# take_names = os.listdir(take_root)


# for take_name in take_names:
#     path_to_vids = os.path.join(take_root, take_name, 'frame_aligned_videos')

#     if get_take_id_from_name(take_name) is None:
#         continue

#     if os.path.exists(path_to_vids):
#         if use_downscaled == True:
#             if os.path.exists(os.path.join(path_to_vids, 'downscaled/448')):
#                 all_files = os.listdir(os.path.join(path_to_vids, 'downscaled/448'))
#                 mp4_files = [fn for fn in all_files if fn.endswith('.mp4')]
#                 for fn in mp4_files:
#                     # grab the video that we want
#                     if ego_rgb_code in fn and camera in fn:
#                         paths_to_mp4s.append((take_name, os.path.join(path_to_vids, 'downscaled/448', fn)))
#                         break
#         else:
#             all_files = os.listdir(path_to_vids)
#             mp4_files = [fn for fn in all_files if fn.endswith('.mp4')]
#             for fn in mp4_files:
#                     # grab the video that we want
#                     if ego_rgb_code in fn and camera in fn:
#                         paths_to_mp4s.append((take_name, os.path.join(path_to_vids, fn)))
#                         break
# print(paths_to_mp4s)
# print(f'Length of paths_to_mp4s: {len(paths_to_mp4s)}')


# # dictionary that contains the number of frames for each video
# take_to_vlen = {}

# # first get the number of videos to process
# for take_name, video_file in paths_to_mp4s:
#     # read in the video
#     vid = cv2.VideoCapture(video_file)
#     ret, im = vid.read()
#     n_frames = int(vid.get(cv2.CAP_PROP_FRAME_COUNT))
#     take_to_vlen[take_name] = n_frames
# ######################################################################


######### Uncomment to process EgoExo Omnivore features ##########
unique_take_names = set()
take_root = '/local/juro4948/data/egoexo4d/egoexo/features/omnivore_video/'
for fn in os.listdir(take_root):
    take_id = fn.split('_')[0]
    take_name = get_take_name_from_id(take_id)
    if take_name not in unique_take_names and take_name is not None:
        unique_take_names.add(take_name)
take_names = list(unique_take_names)


for take_name in take_names:
    take_id = get_take_id_from_name(take_name)
    for fn in os.listdir(take_root):
        if take_id in fn and camera in fn:
            paths_to_mp4s.append((take_name, os.path.join(take_root, fn)))


print(paths_to_mp4s)
print(f'Length of paths_to_mp4s: {len(paths_to_mp4s)}')


# dictionary that contains the number of frames for each video
take_to_vlen = {}

for take_name, video_file in paths_to_mp4s:
    # read in the video
    video_data = torch.load(video_file)
    vlen= video_data.shape[0]
    n_frames = vlen * 16 + 32  # upsample to 30fps
    take_to_vlen[take_name] = n_frames
#####################################################################


[('fair_cooking_08_2', '/local/juro4948/data/egoexo4d/egoexo/features/omnivore_video/d2e07def-3ea4-4c31-b20c-5f5a7ed52fb9_aria02_rgb.pt'), ('iiith_cooking_122_2', '/local/juro4948/data/egoexo4d/egoexo/features/omnivore_video/a9902d89-3aab-4bce-aa07-720cf12523f6_aria01_rgb.pt'), ('uniandes_cooking_005_2', '/local/juro4948/data/egoexo4d/egoexo/features/omnivore_video/206e9042-02a3-48ac-bed0-45861fe658cb_aria01_rgb.pt'), ('upenn_0711_Cooking_3_5', '/local/juro4948/data/egoexo4d/egoexo/features/omnivore_video/3ca3a186-a650-4f8b-a9d0-f0bdb19bd860_aria01_rgb.pt'), ('georgiatech_cooking_06_03_2', '/local/juro4948/data/egoexo4d/egoexo/features/omnivore_video/13b85191-0a2c-4d0f-a54c-3118f5a96cbb_aria03_rgb.pt'), ('iiith_cooking_128_4', '/local/juro4948/data/egoexo4d/egoexo/features/omnivore_video/c112d4da-5120-47d7-9b05-a4ef68baa226_aria01_rgb.pt'), ('indiana_cooking_21_2', '/local/juro4948/data/egoexo4d/egoexo/features/omnivore_video/a2fc6bb5-ad56-460e-a8ae-64db8080d5ae_aria03_rgb.pt'), ('iiit

In [42]:
from tqdm.notebook import tqdm

In [43]:
import cv2

print("OpenCV version:", cv2.__version__)
print("OpenCV CUDA enabled devices:", cv2.cuda.getCudaEnabledDeviceCount())


OpenCV version: 4.9.0
OpenCV CUDA enabled devices: 0


In [44]:
import cv2

if cv2.cuda.getCudaEnabledDeviceCount() > 0:
    print("OpenCV is using GPU acceleration")
else:
    print("OpenCV is not using GPU acceleration")


OpenCV is not using GPU acceleration


In [45]:
DATASET = os.path.expanduser("egoexo4d/")
url_imgs = DATASET+"preprocessed_old/imgs/"

# incomplete = 0
# completed = 0
# no_annotations = 0
# take_names_to_download = []
# # first get the number of videos to process
# for take_name, video_file in paths_to_mp4s:
#     take_id = get_take_id_from_name(take_name)
#     if take_id == None:
#         no_annotations += 1
#         continue
    
#     video_id = video_file.split('/')[-1].split('.mp4')[0]
#     if os.path.exists(os.path.join(url_imgs, take_name, video_id)) and len(os.listdir(os.path.join(url_imgs, take_name, video_id))) == take_to_vlen[take_name]:
#             completed += 1 
#     else:
#         take_names_to_download.append(take_name)
#         incomplete += 1

# print(f'Status of frames extracted:')
# print(f'Incomplete: {incomplete}, Completed: {completed}, No annotations: {no_annotations}')
        
# count = 0
# for take_name, video_file in tqdm(paths_to_mp4s):
#     if take_name not in take_names_to_download:
#         continue
#     take_id = get_take_id_from_name(take_name)

#     # read in the video
#     vid = cv2.VideoCapture(video_file)
#     ret, im = vid.read()
#     n_frames = take_to_vlen[take_name]

#     # make directory to save frames
#     if not os.path.exists(os.path.join(url_imgs, take_name, video_id)):
#         if not os.path.exists(os.path.join(url_imgs, take_name)):
#             os.mkdir(os.path.join(url_imgs, take_name))
#         os.mkdir(os.path.join(url_imgs, take_name, video_id))
 
#     print(f'Extracting frames from {video_file}...')
#     count += 1
#     for i in range(int(n_frames/downsample_rate)):
#         if i % 1000 == 0:
#             print(i, "of", n_frames/downsample_rate)

#         if i > n_frames:
#             print("New video")
#             break

#         vid.set(cv2.CAP_PROP_POS_FRAMES, i*downsample_rate)
#         ret, im = vid.read()
#         if not ret:
#             print("No image")
#             break

#         im = cv2.resize(im, (256,256))
#         ret = cv2.waitKey(1)
#         if ret >= 0:
#             break
        
#         # TODO: Blake: modify directory structure
#         # Current: egoexo4d/preprocessed/imgs/fair_cooking_07_4/aria01_214-1
#         # Desired: egoexo4d/preprocessed/imgs/aria-214-1/fair_cooking_07_4

#         # Later: egoexo4d/preprocessed/imgs/gopro/fair_cooking_07_4
#         fn = "{}{}/{}/{}_{}.jpg".format(url_imgs, take_name, video_id, 'img', i*downsample_rate)  # TODO: change path # TODO: later for gopro we will need to downsample by half to match aria frame rate
#         success = cv2.imwrite(fn, im)
#         if success == False:
#             print('error')
#     print('Done')

# # print(f'Fail: {fail_count}, Success: {success_count}')
# print(f'Frames extracted from {count} videos.')

# Preprocess annotations

Todo: format the data so that i can easily run bridge-prompt on it: https://github.com/ttlmh/Bridge-Prompt/blob/master/train.py

Below cell saves labels that look like this in the raw egoexo folder:
- start_frame end_frame action_label
- 65 98 get_noodles
- 98 396 add_the_noodles_in_the_boiling_water
- 433 833 stir_noodles_in_the_pot
- 902 945 get_napkin
- 1235 1509 stir_noodles_in_the_pot

In [31]:
# first, must convert the json labels to framestamp-by-framestamp labels and save in raw egoexo takes folder
# note: must make a symlink from GraVi-T annotations folder to preprocessed/annotations/gravit-groundTruth

DATASET = "egoexo4d/"
preprocessed_output_path = DATASET+"preprocessed/annotations/gravit-groundTruth/"
success_count = 0
fail_count = 0
take_dir = 'egoexo4d/egoexo/takes'

if not os.path.exists(preprocessed_output_path):
    os.makedirs(preprocessed_output_path)

for take_name, video_file in paths_to_mp4s:
    take_id = get_take_id_from_name(take_name)
    take_root = os.path.join(take_dir, take_name, 'frame_aligned_videos')
    if take_id == None:
        print(f'Labels not found for take: {video_file} ... Skipping....')
        fail_count += 1
        continue
    else:
        success_count += 1

    label_data = anns[take_id]['segments']

    # Create a list to store the formatted label entries
    formatted_labels = []
    n_frames = take_to_vlen[take_name]
 
    # Calculate frame numbers for each label and create the formatted entries
    for i, label in enumerate(label_data):
        start_time = label['start_time']
        end_time = label['end_time']
        
        if i == 0: # Handle starting frames without annotations
            if start_time > 0:
                formatted_labels.append(f"0 {int(start_time * frame_rate)} action_start")
        
        # Convert start and end times to frame numbers
        start_frame = int(start_time * frame_rate)
        end_frame = int(end_time * frame_rate)
        
        # Get the action label
        action_label = label['step_name']
        words = action_label.split()
        action_label = "_".join([w.lower() for w in words])
        
        # Create the formatted entry and add it to the list
        formatted_entry = f"{start_frame} {end_frame} {action_label}"
        formatted_labels.append(formatted_entry)

    # Handle ending frames without annotations
    start_of_action_end = formatted_labels[-1].split(' ')[1]
    formatted_labels.append(start_of_action_end + ' ' + str(n_frames) + ' ' + 'action_end')

    # # Define the output file path
    output_file_path = os.path.join(take_root, f"{frame_rate}fps_formatted_labels.txt") # The formatted labels should be matched with all the videos of the take bc they are supposed to be frame-aligned
    
    # Write the formatted labels to the output file
    with open(output_file_path, 'w') as output_file:
        for entry in formatted_labels:
            output_file.write(entry + '\n')
    print(f'Formatted labels saved to {output_file_path}')
    print('-------------')

print(f'Fail: {fail_count}, Success: {success_count}')
print(f"Formatted labels saved to {output_file_path}")

Formatted labels saved to egoexo4d/egoexo/takes/minnesota_cooking_050_2/frame_aligned_videos/30fps_formatted_labels.txt
-------------
Formatted labels saved to egoexo4d/egoexo/takes/indiana_cooking_03_2/frame_aligned_videos/30fps_formatted_labels.txt
-------------
Formatted labels saved to egoexo4d/egoexo/takes/georgiatech_cooking_02_02_6/frame_aligned_videos/30fps_formatted_labels.txt
-------------
Formatted labels saved to egoexo4d/egoexo/takes/iiith_cooking_57_2/frame_aligned_videos/30fps_formatted_labels.txt
-------------
Formatted labels saved to egoexo4d/egoexo/takes/iiith_cooking_120_4/frame_aligned_videos/30fps_formatted_labels.txt
-------------
Formatted labels saved to egoexo4d/egoexo/takes/iiith_cooking_122_4/frame_aligned_videos/30fps_formatted_labels.txt
-------------
Formatted labels saved to egoexo4d/egoexo/takes/iiith_cooking_146_2/frame_aligned_videos/30fps_formatted_labels.txt
-------------
Formatted labels saved to egoexo4d/egoexo/takes/upenn_0714_Cooking_1_3/frame_a

Next, preprocess the labels to npy list where idx of label corresponds to idx of the video frame

In [32]:
# list of text labels
"""
this one iterates through all the annotations, reads the labels created above in 1. and creates an array of length n_frames where each 
element in the array is the string action label for the corresponding frame. 
It also forward fills the action labels so there are no gaps between actions
"""

import pandas as pd
import numpy as np

DATASET = os.path.expanduser("egoexo4d/")
annotation_output_path = DATASET+"preprocessed/annotations/gravit-groundTruth/"
if os.path.exists(annotation_output_path) == False:
    os.makedirs(annotation_output_path)

# also save to Gravi-T repo
annotation_output_path = '/home/juro4948/gravit/GraVi-T/data/annotations/egoexo-omnivore-aria/groundTruth'
os.makedirs(annotation_output_path, exist_ok=True)

success_count = 0
fail_count = 0
fail = 0
for take_name, video_file in paths_to_mp4s:
    take_id = get_take_id_from_name(take_name)
    if take_id == None:
        print(f'Labels not found for take: {video_file} ... Skipping....')
        continue

    take_root = f"egoexo4d/egoexo/takes/{take_name}/frame_aligned_videos/"
    take_root_downscaled = f"egoexo4d/egoexo/takes/{take_name}/frame_aligned_videos/downscaled/448"

    if os.path.exists(take_root) == False:
        continue
    if os.path.exists(os.path.join(take_root, f"{frame_rate}fps_formatted_labels.txt")) == False:
        continue

    # try:
    #     # Blake: Extract video from downscaled/448 directory
    #     video_file = [fn for fn in os.listdir(take_root_downscaled) if (ego_rgb_code in fn and camera in fn)][0] # TODO: same as above, we should parse takes.json to get the ego_rgb_code for this take. From visual inspection it seems that it's always 214-1, but we should be sure.   
    # except:
    #     continue # if we didn't download the take, skip

    # Define the path to the formatted labels text file
    formatted_labels_path = os.path.join(take_root, f"{frame_rate}fps_formatted_labels.txt")

    # Load the formatted labels from the text file
    formatted_labels = []
    with open(formatted_labels_path, 'r') as file:
        for line in file:
            line = line.strip()
            if line:
                parts = line.split()
                start_frame, end_frame = int(parts[0]), int(parts[1])
                action_label = ' '.join(parts[2:])
                formatted_labels.append((start_frame, end_frame, action_label))

    # Calculate the total number of frames - need to lead the video or count num_files in the directory
    if take_name not in take_to_vlen.keys():
        print(f'No video found for take: {take_name} ... Skipping....')
        continue

    total_frames = take_to_vlen[take_name]

    # Create an array to store the labels for each frame
    video_labels = np.empty(total_frames, dtype=object)
    video_labels.fill("")

    # Assign labels to corresponding frames
    this_failed = 0
    for start_frame, end_frame, action_label in formatted_labels:
        if action_label == 'action_start' and start_frame != 0:
            print('Error with action_start label. Not at beginning of video.')
            fail += 1
            this_failed = 1
            break
        elif action_label == 'action_end' and end_frame != total_frames:
            print('Error with action_end label. Not at end of video.')
            print(f'end_frame: {end_frame}, total_frames: {total_frames}')
            fail += 1
            this_failed = 1
            break
    
        video_labels[start_frame:end_frame + 1] = action_label

    if this_failed == 1:
        print(f'Failed for take: {take_name}')
        continue

    #### fill empty strings with a none label

    video_labels[video_labels == ""] = np.nan
    ### forwardfill empty strings with the previous label
    video_labels = pd.Series(video_labels).fillna(method='ffill').values


    # Define the output file path for the .npy file
    # output_file_path = os.path.join(take_root, f"{frame_rate}fps_longform_annotations.npy")
    output_file_path = os.path.join(annotation_output_path, f'{take_name}.txt')


    # Write the formatted labels to the output file
    with open(output_file_path, 'w') as output_file:
        for entry in video_labels:
            output_file.write(entry + '\n')
        

  video_labels = pd.Series(video_labels).fillna(method='ffill').values


In [33]:
save_root = '../../../home/juro4948/gravit/Bridge-Prompt/data/egoexo/action_descriptions_id' # label directory for BrP

if not os.path.exists(save_root):
    os.mkdir(save_root)
    print('Created directory')

save_root = save_root + '/30fps'
if not os.path.exists(save_root):
    os.mkdir(save_root)
    print(f'Created directory: {save_root}')

Check that the label length matches the video length

In [34]:
import os
import numpy as np

# Define the directory where the annotations and frames are saved
annotations_dir = "../../../home/juro4948/gravit/Bridge-Prompt/data/egoexo/action_descriptions_id/30fps"
frames_dir = "egoexo4d/preprocessed_old/imgs"

# Get the list of annotation files
annotation_files = os.listdir(annotations_dir)

# Iterate over each annotation file
for annotation_file in annotation_files:
    # Get the session name from the annotation file name
    session_name = annotation_file.split(".")[0]
    
    # Load the annotations
    annotations = np.load(os.path.join(annotations_dir, annotation_file), allow_pickle=True)
    
    # Get the number of frames saved for the session
    frames_path = os.path.join(frames_dir, session_name)
    if len(os.listdir(frames_path)) != 1:
        print(f"Multiple folders found for session {session_name}. this means there are multiple aria or gopro videos for this session. Choosing the first...")
    
    frames_path = os.path.join(frames_path, os.listdir(frames_path)[0])
    num_frames = len(os.listdir(frames_path))

    # print(len(annotations), num_frames)
    
    # Compare the lengths of the annotations and the frames
    if len(annotations) == num_frames:
        pass
    else:
        print(f"The length of annotations does not match the number of frames for session {session_name}")
print('Done checking')


Multiple folders found for session fair_cooking_05_4. this means there are multiple aria or gopro videos for this session. Choosing the first...
Done checking


# Now we must re-arrange the label vocabulary to start the unique ids at 0



In [35]:
# build a dictionary of the mapping between {int: action class}
# save as json
"""
This goes through all the annotations saved in 1) and gets all of the unique action classes and builds a dictionary to transition between the action string and corresponding integer label
"""

actions = set()
vocab = {}

# TODO: Blake: hange this loop to the loop above
for take_id in anns.keys():
    take_name = anns[take_id]['take_name']
    take_root = f"egoexo4d/egoexo/takes/{take_name}/frame_aligned_videos/"  # TODO: cahnge
    
    if os.path.exists(take_root) == False:
        continue

    # Define the path to the formatted labels text file
    formatted_labels_path = os.path.join(take_root, f"{frame_rate}fps_formatted_labels.txt")

    if os.path.exists(formatted_labels_path) == False:
        print(f'File does not exist: {formatted_labels_path}')
        continue
    with open(formatted_labels_path, 'r') as file:
        for line in file:
            line = line.strip()
            if line:
                parts = line.split()
                start_frame, end_frame = int(parts[0]), int(parts[1])
                action_label = ' '.join(parts[2:])
                if action_label not in actions:
                    actions.add(action_label)

formatted_labels = list(actions)

i = 0
for action in formatted_labels:
    vocab[i] = action
    i += 1

# save
with open('../../../home/juro4948/gravit/Bridge-Prompt/data/egoexo/mapping_adj.json', 'w') as fp:
    json.dump(vocab, fp)

print(vocab)
print('action_start' in vocab.values())
print('action_end' in vocab.values())
print(f'Num classes: {len(vocab)}')

{0: 'add_peanut_butter', 1: 'put_away_garlic_cloves', 2: 'put_away_soy_sauce', 3: 'get_onions', 4: 'cut_scallion', 5: 'whisk_the_mix_of_ingredients', 6: 'get_napkin', 7: 'wait_until_water_boils', 8: 'wash_measuring_tool_(scoop_or_spoon_or_cup)', 9: 'put_away_kitchen_towel', 10: 'toss_the_mixture', 11: 'put_away_nutmeg', 12: 'wash_sieve', 13: 'cut_tomato', 14: 'put_away_oyster_sauce', 15: 'get_white_pepper', 16: 'get_dried_herbs', 17: 'put_away_spoon', 18: 'roll_the_sushi_up_from_the_bottom_using_your_mat', 19: 'add_the_coffee_grounds_to_the_filter_in_the_pour-over_device.', 20: 'get_pot_holder', 21: 'wash_cherry_tomatoes', 22: 'add_flour_to_surface', 23: 'tilt_and_rotate_skillet_to_allow_uncooked_egg_to_flow_into_empty_spaces', 24: 'get_measuring_tool_(scoop_or_spoon_or_cup)', 25: 'get_bell_peppers', 26: 'close_the_instant_coffee_jar', 27: 'wash_fork', 28: 'get_spatula', 29: 'put_away_almonds', 30: 'stir_coffee_with_water_mixture', 31: 'get_vinegar', 32: 'get_plate', 33: 'get_cinnamon_

In [36]:
# converts json to txt file
# Save action mapping as txt file for BrP

with open('../../../home/juro4948/gravit/Bridge-Prompt/data/egoexo/mapping_adj.json', 'r') as fp:
    mapping = json.load(fp)

# Process and convert to the desired format
output_lines = []
for key, value in mapping.items():
    words = value.split()
    action_name = "_".join([w.lower() for w in words])
    output_lines.append(f"{key} {action_name}")

# Save the result to a text file
out = '../../../home/juro4948/gravit/Bridge-Prompt/data/egoexo/mapping.txt'
with open(out, "w") as output_file:
    output_file.write("\n".join(output_lines))

# # Save the result to a text file
# out = '/home/juro4948/gravit/GraVi-T/data/annotations/egoexo-omnivore-aria/mapping.txt'
# with open(out, "w") as output_file:
#     output_file.write("\n".join(output_lines))

print(f"Conversion complete. Output saved as {out}.")

Conversion complete. Output saved as /home/juro4948/gravit/GraVi-T/data/annotations/egoexo-omnivore-aria/mapping.txt.


In [37]:
# build reverse mapping
action_to_int = {}
for key, value in mapping.items():
    action_to_int[value] = key

action_to_int['action_start']

'116'

In [44]:
# save integer labels as npy files for BridgePrompt
url_imgs = DATASET+"preprocessed_old/imgs/"

import numpy as np

for take_name, video_file in paths_to_mp4s:
    take_id = get_take_id_from_name(take_name)
    if take_id == None:
        print(f'Labels not found for take: {video_file} ... Skipping....')
        continue

    take_root = f"egoexo4d/egoexo/takes/{take_name}/frame_aligned_videos/"


    # Define the path to the formatted labels text file
    formatted_labels_path = os.path.join(take_root, f"{frame_rate}fps_formatted_labels.txt")

    # Load the formatted labels from the text files
    if os.path.exists(formatted_labels_path) == False:
        print(f'File does not exist: {formatted_labels_path}')
        continue

    formatted_labels = []
    with open(formatted_labels_path, 'r') as file:
        for line in file:
            line = line.strip()
            if line:
                parts = line.split()
                start_frame, end_frame = int(parts[0]), int(parts[1])
                action_label = ' '.join(parts[2:])
                formatted_labels.append((start_frame, end_frame, action_label))

   
    total_frames = take_to_vlen[take_name] # take num frames based on video
    video_id = video_file.split('.mp4')[0]

    # # now read in list of the frames to choose
    # imgs = os.listdir("{}{}/{}".format(url_imgs, take_name, video_id))
    # frame_nums = [int(fn.split('img_')[1].split('.jpg')[0]) for fn in imgs] 
    # frame_nums = sorted(frame_nums)

    # Create an array to store the labels for each frame
    video_labels = np.empty(total_frames, dtype=object)
    video_labels.fill("")

    # Assign labels to corresponding frames
    for start_frame, end_frame, action_label in formatted_labels:
        video_labels[start_frame:end_frame + 1] = action_label

    # video_labels = video_labels[frame_nums]

    # forwardfill empty strings with the previous label
    video_labels[video_labels == ""] = np.nan
    video_labels = pd.Series(video_labels).fillna(method='ffill').values

    # Take integer labels
    video_labels = np.array([action_to_int[label] for label in video_labels])
    print(video_labels)
    
# 
    # Define the output file path for the .npy file
    output_file_path = os.path.join(save_root, f"{take_name}.npy")

    # Save the video labels as a .npy file
    np.save(output_file_path, video_labels)

    print(f"Video labels have been saved to {output_file_path}")


  video_labels = pd.Series(video_labels).fillna(method='ffill').values


['116' '116' '116' ... '397' '397' '397']
Video labels have been saved to ../../../home/juro4948/gravit/Bridge-Prompt/data/egoexo/action_descriptions_id/30fps/minnesota_cooking_050_2.npy
['116' '116' '116' ... '397' '397' '397']
Video labels have been saved to ../../../home/juro4948/gravit/Bridge-Prompt/data/egoexo/action_descriptions_id/30fps/indiana_cooking_03_2.npy
['116' '116' '116' ... '397' '397' '397']
Video labels have been saved to ../../../home/juro4948/gravit/Bridge-Prompt/data/egoexo/action_descriptions_id/30fps/georgiatech_cooking_02_02_6.npy
['116' '116' '116' ... '397' '397' '397']
Video labels have been saved to ../../../home/juro4948/gravit/Bridge-Prompt/data/egoexo/action_descriptions_id/30fps/iiith_cooking_57_2.npy
['116' '116' '116' ... '335' '397' '397']
Video labels have been saved to ../../../home/juro4948/gravit/Bridge-Prompt/data/egoexo/action_descriptions_id/30fps/iiith_cooking_120_4.npy
['116' '116' '116' ... '175' '175' '175']
Video labels have been saved to