In [1]:
import json, os, pickle
from tqdm import tqdm

In [2]:
# Reference snapshot of the category names; the live list is read from obj_categories.json below.
# NOTE(review): this snapshot has no 'person' entry, yet later cells look up obj_to_idx['person'] --
# presumably the JSON/pickle files differ from this snapshot; verify before relying on it.
# obj_categories = ['adult', 'car', 'guitar', 'chair', 'handbag', 'toy', 'baby_seat', 'cat', 'bottle', 'backpack', 'motorcycle', 'ball/sports_ball', 'laptop', 'table', 'surfboard', 'camera', 'sofa', 'screen/monitor', 'bicycle', 'vegetables', 'dog', 'fruits', 'cake', 'cellphone', 'cup', 'bench', 'snowboard', 'skateboard', 'bread', 'bus/truck', 'ski', 'suitcase', 'stool', 'bat', 'elephant', 'fish', 'baby_walker', 'dish', 'watercraft', 'scooter', 'pig', 'refrigerator', 'horse', 'crab', 'bird', 'piano', 'cattle/cow', 'lion', 'chicken', 'camel', 'electric_fan', 'toilet', 'sheep/goat', 'rabbit', 'train', 'penguin', 'hamster/rat', 'snake', 'frisbee', 'aircraft', 'oven', 'racket', 'faucet', 'antelope', 'duck', 'stop_sign', 'sink', 'kangaroo', 'stingray', 'turtle', 'tiger', 'crocodile', 'bear', 'microwave', 'traffic_light', 'panda', 'leopard', 'squirrel']
with open('obj_categories.json') as categories_file:
    obj_categories = json.load(categories_file)

# Commented-out reference: building the two lookup tables in memory from
# obj_categories instead of loading them from the pickle files.
# obj_to_idx = {}
# for i, obj in enumerate(obj_categories):
#     obj_to_idx[obj] = i
# idx_to_obj = {v:k for k, v in obj_to_idx.items()}
# print(len(obj_categories))

# Category name -> class index, and the inverse index -> name table.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open('obj_to_idx.pkl', 'rb') as name_to_index_file:
    obj_to_idx = pickle.load(name_to_index_file)
with open('idx_to_obj.pkl', 'rb') as index_to_name_file:
    idx_to_obj = pickle.load(index_to_name_file)

In [6]:
# Categories that are all collapsed into the single 'person' class index.
human_categories = ['adult', 'child', 'baby']

def convert_vidor_to_detectron2_label(annot_dir, small_dataset=False, val=False,
                                      keyframes_path='keyframes_set.pkl',
                                      max_frames_per_video=11):
    """Flatten per-video annotation JSON files into a list of per-frame records.

    ``annot_dir`` is expected to contain sub-directories, each holding one JSON
    file per video. Every JSON file must provide the keys ``video_id``, ``fps``,
    ``height``, ``width``, ``subject/objects`` and ``trajectories`` (one list of
    object dicts per frame, each object carrying ``tid`` and ``generated``).

    Frame selection:
      * ``val=False``: a frame is kept only if at least one of its objects has
        ``generated == 0`` (i.e. a manually labelled box).
      * ``val=True``: a frame is kept only if ``(folder + '/' + video_id, i)``
        is a member of the set unpickled from ``keyframes_path``; the
        ``generated`` flag is ignored.

    Args:
        annot_dir: Root directory of the annotation split.
        small_dataset: When true and ``val`` is false, stop after
            ``max_frames_per_video`` kept frames for each video. Has no effect
            when ``val`` is true.
        val: Select keyframes instead of manually labelled frames (see above).
        keyframes_path: Pickle file holding the keyframe set; only read when
            ``val`` is true.
        max_frames_per_video: Per-video cap used by ``small_dataset``. The
            default of 11 is the value that was previously hard-coded.
            NOTE(review): output files elsewhere in this notebook are named
            "10imgs" -- confirm whether 10 was intended.

    Returns:
        A list of dicts with keys ``video_folder``, ``video_id``, ``frame_id``
        (1-based, zero-padded to 6 digits), ``video_fps``, ``height``,
        ``width``, ``middle_frame_timestamp`` (0-based frame index) and
        ``objs`` (copies of the frame's object dicts with an added
        ``object_class`` index). Folders and files are visited in sorted order
        so the result is reproducible across file systems.

    Raises:
        ValueError: If ``max_frames_per_video`` is smaller than 1.
        KeyError: If an object's category is missing from ``obj_to_idx``.
    """
    if max_frames_per_video < 1:
        raise ValueError('max_frames_per_video must be >= 1')

    keyframes_set = None
    if val:
        # SECURITY: pickle.load can execute arbitrary code; only use trusted files.
        with open(keyframes_path, 'rb') as f:
            keyframes_set = pickle.load(f)

    frame_annots = []
    nb_of_videos = nb_of_frames = 0
    # os.listdir returns entries in arbitrary order; sort for a deterministic output.
    for folder in tqdm(sorted(os.listdir(annot_dir))):
        folder_path = os.path.join(annot_dir, folder)
        if not os.path.isdir(folder_path):
            # Stray files (e.g. OS metadata) are not video folders.
            continue
        for video_json in sorted(os.listdir(folder_path)):
            nb_of_videos += 1
            with open(os.path.join(folder_path, video_json), 'r') as f:
                annot = json.load(f)

            # Indexed by an object's 'tid' below.
            # NOTE(review): assumes position in this sequence equals the tid -- confirm.
            subjects_objects = annot['subject/objects']
            video_key = folder + '/' + annot['video_id']
            frames_kept = 0
            for i, frame_label in enumerate(annot['trajectories']):
                # Decide whether the frame is wanted before doing any per-object work.
                if val:
                    if (video_key, i) not in keyframes_set:
                        continue
                elif not any(obj['generated'] == 0 for obj in frame_label):
                    # Training: only frames with at least one manually labelled box.
                    continue

                objs = []
                for obj in frame_label:
                    category = subjects_objects[obj['tid']]['category']
                    # Copy so the loaded annotation is not mutated through an alias.
                    label = dict(obj)
                    label['object_class'] = obj_to_idx['person' if category in human_categories else category]
                    objs.append(label)

                frame_annots.append({
                    'video_folder': folder,
                    'video_id': annot['video_id'],
                    'frame_id': f'{i+1:06d}', # 1-based index (ava-style)
                    'video_fps': annot['fps'],
                    'height': annot['height'],
                    'width': annot['width'],
                    'middle_frame_timestamp': i,
                    'objs': objs,
                })
                nb_of_frames += 1

                if small_dataset and not val:
                    frames_kept += 1
                    if frames_kept >= max_frames_per_video:
                        break

    print('nb_of_videos:', nb_of_videos)
    print('nb_of_frames:', nb_of_frames)
    return frame_annots
        
# Root directories of the training and validation annotation splits.
train_annot_dir, val_annot_dir = 'annotation/training', 'annotation/validation'

# Alternative runs, kept for reference (full splits):
# train_frame_annots = convert_vidor_to_detectron2_label(train_annot_dir)
# val_frame_annots = convert_vidor_to_detectron2_label(val_annot_dir)

# Reduced training split:
# train_frame_annots = convert_vidor_to_detectron2_label(train_annot_dir, small_dataset=True)

# Active run: keyframe-based validation records.
val_frame_annots = convert_vidor_to_detectron2_label(
    val_annot_dir,
    small_dataset=True,  # inert here: the per-video cap is only applied when val is False
    val=True,
)

100%|██████████| 131/131 [00:08<00:00, 16.31it/s]

nb_of_videos: 835
nb_of_frames: 22976





In [9]:
# Peek at the first ten records to sanity-check the structure produced by the converter.
val_frame_annots[:10]

[{'video_folder': '0085',
  'video_id': '5018581116',
  'frame_id': '000011',
  'video_fps': 15,
  'height': 240,
  'width': 320,
  'middle_frame_timestamp': 10,
  'objs': [{'tid': 0,
    'bbox': {'xmin': 87, 'ymin': 2, 'xmax': 206, 'ymax': 98},
    'generated': 1,
    'tracker': 'mosse',
    'object_class': 17},
   {'tid': 1,
    'bbox': {'xmin': 205, 'ymin': 12, 'xmax': 319, 'ymax': 171},
    'generated': 1,
    'tracker': 'mosse',
    'object_class': 17}]},
 {'video_folder': '0085',
  'video_id': '5018581116',
  'frame_id': '000012',
  'video_fps': 15,
  'height': 240,
  'width': 320,
  'middle_frame_timestamp': 11,
  'objs': [{'tid': 0,
    'bbox': {'xmin': 86, 'ymin': 2, 'xmax': 206, 'ymax': 100},
    'generated': 1,
    'tracker': 'mosse',
    'object_class': 17},
   {'tid': 1,
    'bbox': {'xmin': 206, 'ymin': 13, 'xmax': 319, 'ymax': 172},
    'generated': 1,
    'tracker': 'mosse',
    'object_class': 17}]},
 {'video_folder': '0085',
  'video_id': '5018581116',
  'frame_id': '

In [7]:
# Training export, disabled for this run:
# with open('train_frame_annots_detectron2.json', 'w') as f:
#     json.dump(train_frame_annots, f)

# Write the validation records to disk as a single JSON document.
with open('val_frame_annots_detectron2.json', 'w') as out_file:
    json.dump(val_frame_annots, out_file)

# Exports for the reduced ("small dataset") variants, disabled for this run:
# with open('train_frame_annots_detectron2_small_dataset_10imgs.json', 'w') as f:
#     json.dump(train_frame_annots, f)
# with open('val_frame_annots_detectron2_small_dataset_10imgs.json', 'w') as f:
#     json.dump(val_frame_annots, f)

In [8]:
# Report how many frame records were produced (training count disabled for this run).
# print(len(train_frame_annots))
print(len(val_frame_annots))

22976
