In [1]:
# Run settings. Every value starts as None; the defaults cell further down replaces
# any setting that is still falsy with a location under <cwd>/data_extraction.
# NOTE(review): presumably these are overridden by an external runner
# (e.g. a parameters cell) before the notebook executes - confirm.

# Keyframe metadata: name of the merged output JSON, and the directory scanned
# for the per-video keyframe JSON files.
combined_keyframes_metadata_filename = None
keyframes_metadata_dir = None

# Object extraction metadata: merged output JSON name and the input directory.
combined_object_extraction_filename = None
object_extraction_dir = None

# OCR metadata: merged output JSON name and the input directory.
combined_ocr_metadata_filename = None
ocr_metadata_dir = None

# Tag metadata: merged output JSON name and the input directory.
combined_tag_metadata_filename = None
tag_metadata_dir = None

# Audio metadata: only referenced by the commented-out audio cell at present.
combined_audio_metadata_filename = None
audio_metadata_dir = None

# Base directory prepended to each keyframe's 'frame_path' when image sizes are read.
image_dir = None
# # Final metadata
# final_metadata_filename = None

In [2]:
import os
import json
import cv2
import glob
from tqdm import tqdm

# Base directory for the default paths built below. It is the kernel's current
# working directory, so the defaults depend on where the notebook is launched from.
dir_path = os.getcwd()

In [3]:
# Root folders that the default locations below are derived from.
data_extraction_path = f'{dir_path}/data_extraction'
dataset_path = f'{data_extraction_path}/dataset/AIC_Video'

# Each setting keeps the value it already has when that value is truthy;
# otherwise (None, empty string, ...) it falls back to the default on the right.

# Keyframe metadata
combined_keyframes_metadata_filename = (
    combined_keyframes_metadata_filename or 'keyframes_metadata.json'
)
keyframes_metadata_dir = (
    keyframes_metadata_dir or f'{data_extraction_path}/transnet/keyframes_metadata'
)

# Object extraction metadata
combined_object_extraction_filename = (
    combined_object_extraction_filename or 'object_extraction_metadata.json'
)
object_extraction_dir = (
    object_extraction_dir
    or f'{data_extraction_path}/metadata/object_extraction/object_detection'
)

# OCR metadata
combined_ocr_metadata_filename = combined_ocr_metadata_filename or 'ocr_metadata.json'
ocr_metadata_dir = ocr_metadata_dir or f'{data_extraction_path}/metadata/ocr'

# Tag metadata
combined_tag_metadata_filename = combined_tag_metadata_filename or 'tag_metadata.json'
tag_metadata_dir = tag_metadata_dir or f'{data_extraction_path}/metadata/tag'

# Audio metadata
combined_audio_metadata_filename = (
    combined_audio_metadata_filename or 'audio_metadata.json'
)
audio_metadata_dir = (
    audio_metadata_dir or f'{data_extraction_path}/audio/audio_recognition'
)

# Base directory that keyframe 'frame_path' values are resolved against.
image_dir = image_dir or f'{data_extraction_path}/transnet'
       
# # final_metadata
# if not final_metadata_filename:
#     final_metadata_filename = 'final_metadata.json'

In [4]:
def get_image_size(image_path):
    """
    Return the pixel dimensions of an image file, read with OpenCV.

    Args:
        image_path (str): The path to the image file.

    Returns:
        tuple: (width, height) of the image.

    Raises:
        ValueError: If ``cv2.imread`` returns ``None`` for ``image_path``.
            The message names the offending path so a failure in the middle
            of a long batch can be traced to a specific file.
    """
    img = cv2.imread(image_path)
    if img is None:
        # The loader reports failure by returning None rather than raising,
        # so turn that into an explicit error that says which file failed.
        raise ValueError(
            f"Could not load image from the provided path: {image_path}")

    # The first two entries of shape are (rows, columns), i.e. (height, width).
    height, width = img.shape[:2]
    return width, height

In [5]:
def parse_file_path_from_dir(dir, ext='.json'):
    """
    Index the metadata files found directly under ``dir``.

    Two layouts are recognised and may be mixed in the same directory:

    * nested - one sub-folder per video, e.g. ``od/L01_V001/*.json``.
      The key is the folder name and the value is the sorted list of files
      in that folder whose name ends with ``ext`` (possibly empty).
    * flat - one file per video, e.g. ``ocr/L01_V001.json``.
      The key is the file name without its extension and the value is a
      one-element list holding the file path.

    Flat files whose name does not end with ``ext`` are ignored. They used to
    be registered with an empty list, which made callers that read the first
    path of every entry fail and could overwrite the entry of a folder with
    the same stem.

    Args:
        dir (str): Directory to scan (only its immediate children are listed).
        ext (str): File-name suffix to keep. Defaults to ``'.json'``.

    Returns:
        dict: Mapping of entry name to a list of file paths, with keys in
        sorted directory-listing order.
    """
    all_paths = {}

    for part in sorted(os.listdir(dir)):
        data_part_path = os.path.join(dir, part)

        if os.path.isdir(data_part_path):
            # Nested layout. Escape the folder part so characters such as
            # '[' or '*' in a path are matched literally, not as a pattern.
            pattern = os.path.join(glob.escape(data_part_path), f'*{ext}')
            all_paths[part] = sorted(glob.glob(pattern))
        elif part.endswith(ext):
            # Flat layout: the file itself is the only path of its entry.
            all_paths[os.path.splitext(part)[0]] = [data_part_path]

    return all_paths

# Combine keyframe metadata

In [6]:
def combine_keyframe_metadata_json_files(directory, image_dir, output_file):
    """
    Merge per-video keyframe metadata into one JSON file, adding image sizes.

    Every JSON file indexed by ``parse_file_path_from_dir(directory)`` is
    loaded as a mapping of keyframe key to frame record. Each record gets
    ``'width'`` and ``'height'`` fields read from the image at
    ``f"{image_dir}/{record['frame_path']}"``. The records of all files are
    then merged (later files win on duplicate keys), sorted by key and
    written to ``output_file``.

    Args:
        directory (str): Directory holding the keyframe metadata JSON files.
        image_dir (str): Base directory the records' ``'frame_path'`` values
            are relative to.
        output_file (str): Path of the merged JSON file to write.

    Raises:
        ValueError: Propagated from ``get_image_size`` when an image cannot
            be loaded.
        KeyError: If a frame record has no ``'frame_path'`` field.
    """
    combined_data = {}
    all_file_path = parse_file_path_from_dir(directory)

    for file_paths in tqdm(all_file_path.values(), desc="processing data folder"):
        # An entry can list no file at all or several files. Walk all of them
        # instead of indexing [0], which failed on an empty entry and
        # silently dropped every file after the first.
        for file_path in file_paths:
            with open(file_path, 'r', encoding='utf-8') as file:
                data = json.load(file)

            frame_desc = f"processing frame in {os.path.basename(file_path)}"
            for frame_data in tqdm(data.values(), desc=frame_desc):
                image_path = f"{image_dir}/{frame_data['frame_path']}"
                img_width, img_height = get_image_size(image_path)
                frame_data['width'] = img_width
                frame_data['height'] = img_height

            combined_data.update(data)

    sorted_combined_data = {
        key: combined_data[key] for key in sorted(combined_data)}
    with open(output_file, 'w', encoding='utf-8') as outfile:
        json.dump(sorted_combined_data, outfile)

    print(
        f'Combined keyframe metadata successful: {output_file} - total: {len(sorted_combined_data)}')
    

# Merge the keyframe metadata found in keyframes_metadata_dir into a single JSON
# file, resolving each record's frame image under image_dir to read its size.
combine_keyframe_metadata_json_files(keyframes_metadata_dir, image_dir, combined_keyframes_metadata_filename)

processing frame in L01_V001.json: 100%|██████████| 272/272 [00:03<00:00, 80.69it/s]
processing frame in L01_V001_extra.json: 100%|██████████| 933/933 [00:04<00:00, 212.12it/s]
processing frame in L01_V002.json: 100%|██████████| 216/216 [00:02<00:00, 92.64it/s]
processing frame in L01_V002_extra.json: 100%|██████████| 711/711 [00:03<00:00, 227.90it/s]
processing data folder: 100%|██████████| 4/4 [00:13<00:00,  3.31s/it]


Combined keyframe metadata successful: keyframes_metadata.json - total: 2132


# Combine object extraction metadata

In [7]:
def preprocess_object_detection(data):
    """
    Group a list of detections by label.

    Args:
        data: Iterable of detection records, each providing the keys
            ``'label'``, ``'score'`` and ``'box'``.

    Returns:
        dict: ``{'objects': {label: [{'score': ..., 'box': ...}, ...]},
        'counts': {label: number_of_detections}}``, with labels in
        first-seen order.
    """
    objects_by_label = {}
    count_by_label = {}

    for detection in data:
        name = detection['label']
        entry = {"score": detection['score'], "box": detection['box']}
        # First sighting of a label creates its list / counter on the fly.
        objects_by_label.setdefault(name, []).append(entry)
        count_by_label[name] = count_by_label.get(name, 0) + 1

    return {'objects': objects_by_label, 'counts': count_by_label}

def combine_object_extraction_metadata_json_files(directory, output_file):
    """
    Merge per-frame object detection files into one JSON file.

    ``parse_file_path_from_dir(directory)`` provides, per video folder, the
    list of detection files. A file named ``<frame>_<model>.json`` inside
    folder ``<video>`` is stored under the key
    ``f'{video}_{int(frame):06d}_{model}'`` after its content has been
    regrouped by ``preprocess_object_detection``. Files whose JSON content is
    empty are skipped. The result is sorted by key and written to
    ``output_file``.

    Args:
        directory (str): Directory holding one sub-folder per video.
        output_file (str): Path of the merged JSON file to write.

    Raises:
        ValueError: If the part of a file name before the first ``_`` is not
            an integer.
        IndexError: If a file name contains no ``_`` separator.
    """
    combined_data = {}
    all_file_path = parse_file_path_from_dir(directory)

    for video_folder, file_paths in tqdm(all_file_path.items(), desc="processing video folder"):
        for file_path in tqdm(file_paths, desc=f"processing file in {video_folder}"):
            # Use os.path.basename rather than splitting on '/', so the file
            # name is extracted correctly whatever separator os.path.join
            # used when the path was built.
            file = os.path.basename(file_path)  # 000139_detection.json, etc.
            file_name = file.split('.')[0].split('_')  # [000139, detection]
            file_id, file_model = file_name[0], file_name[1]
            key = f'{video_folder}_{int(file_id):06d}_{file_model}'
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
            if not data:
                # Nothing detected in this frame: leave it out of the output.
                continue
            combined_data[key] = preprocess_object_detection(data)

    sorted_combined_data = {
        key: combined_data[key] for key in sorted(combined_data)}
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(sorted_combined_data, f)

    print(f'Combined object detection metadata successful: {output_file} - total: {len(sorted_combined_data)}')
    

# Merge the per-frame detection files found under object_extraction_dir into a
# single JSON file.
combine_object_extraction_metadata_json_files(object_extraction_dir, combined_object_extraction_filename)

processing file in L01_V001: 100%|██████████| 203/203 [00:00<00:00, 16243.35it/s]
processing file in L01_V001_extra: 100%|██████████| 595/595 [00:00<00:00, 22822.02it/s]
processing file in L01_V002: 100%|██████████| 159/159 [00:00<00:00, 21231.91it/s]
processing file in L01_V002_extra: 100%|██████████| 420/420 [00:00<00:00, 8129.25it/s]
processing video folder: 100%|██████████| 4/4 [00:00<00:00, 35.39it/s]


Combined object detection metadata successful: object_extraction_metadata.json - total: 1377


# Combine OCR

In [8]:
# Location of keyframes_metadata.json under the working directory.
# NOTE(review): no active code visible in this notebook reads this variable
# (only the commented-out audio cell does) - confirm before removing.
keyframe_metadata_file = f"{dir_path}/keyframes_metadata.json"

def combine_ocr_metadata_json_file(directory, output_file):
    """
    Merge per-video OCR results into one JSON file.

    Every JSON file indexed by ``parse_file_path_from_dir(directory)`` is
    loaded as a mapping of frame image name to OCR data. The entry for frame
    ``<frame>.<ext>`` in file ``<video>.json`` is stored under the key
    ``f"{video}_{int(frame):06d}_ocr"``. Files whose JSON content is empty
    are skipped. The result is sorted by key and written to ``output_file``.

    Args:
        directory (str): Directory holding the OCR JSON files.
        output_file (str): Path of the merged JSON file to write.

    Raises:
        ValueError: If the part of a frame name before the first ``.`` is not
            an integer.
    """
    combined_data = {}
    all_file_path = parse_file_path_from_dir(directory)

    for file_paths in tqdm(all_file_path.values(), desc="processing data folder"):
        # An entry can list no file at all or several files. Walk all of them
        # instead of indexing [0], which failed on an empty entry and
        # silently dropped every file after the first.
        for file_path in file_paths:
            with open(file_path, 'r', encoding='utf-8') as file:
                data = json.load(file)
            if not data:
                continue

            # e.g ocr/L01_V001_extra.json => L01_V001_extra
            file_name = os.path.basename(file_path)
            video_name = file_name.split('.')[0]

            # Label the progress bar with the file name only, so saved
            # notebook output does not embed the absolute local path.
            for frame_img, ocr_data in tqdm(data.items(), desc=f"processing data in {file_name}"):
                frame_id = frame_img.split('.')[0]  # 001.jpg => 001
                key = f"{video_name}_{int(frame_id):06d}_ocr"
                combined_data[key] = ocr_data

    sorted_combined_data = {
        key: combined_data[key] for key in sorted(combined_data)}
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(sorted_combined_data, f)

    print(
        f'Combined ocr metadata successful: {output_file} - total: {len(sorted_combined_data)}')
    
# Merge the OCR files found under ocr_metadata_dir into a single JSON file.
combine_ocr_metadata_json_file(ocr_metadata_dir, combined_ocr_metadata_filename)

processing data in /home/jiggle/personal/competition/hcm-ai/pipeline-hcm-ai/notebooks/data_extraction/metadata/ocr/L01_V001.json: 100%|██████████| 266/266 [00:00<00:00, 440459.88it/s]
processing data in /home/jiggle/personal/competition/hcm-ai/pipeline-hcm-ai/notebooks/data_extraction/metadata/ocr/L01_V001_extra.json: 100%|██████████| 829/829 [00:00<00:00, 985287.05it/s]
processing data in /home/jiggle/personal/competition/hcm-ai/pipeline-hcm-ai/notebooks/data_extraction/metadata/ocr/L01_V002.json: 100%|██████████| 213/213 [00:00<00:00, 283344.99it/s]
processing data in /home/jiggle/personal/competition/hcm-ai/pipeline-hcm-ai/notebooks/data_extraction/metadata/ocr/L01_V002_extra.json: 100%|██████████| 614/614 [00:00<00:00, 783242.90it/s]
processing data folder: 100%|██████████| 4/4 [00:00<00:00, 209.39it/s]


Combined ocr metadata successful: ocr_metadata.json - total: 1922


# Combine tag

In [9]:
# Location of keyframes_metadata.json under the working directory.
# NOTE(review): the tag-combining function defined in this cell does not read
# this variable, and the same assignment already appears in the OCR cell -
# confirm whether it is still needed.
keyframe_metadata_file = f"{dir_path}/keyframes_metadata.json"

def combine_multi_tag_metadata_json_file(directory, output_file):
    """
    Merge per-video tag results into one JSON file.

    Every JSON file indexed by ``parse_file_path_from_dir(directory)`` is
    loaded as a mapping of frame image name to tag data. The entry for frame
    ``<frame>.<ext>`` in file ``<video>.json`` is stored under the key
    ``f"{video}_{int(frame):06d}_tag"``. Files whose JSON content is empty
    are skipped. The result is sorted by key and written to ``output_file``.

    Args:
        directory (str): Directory holding the tag JSON files.
        output_file (str): Path of the merged JSON file to write.

    Raises:
        ValueError: If the part of a frame name before the first ``.`` is not
            an integer.
    """
    combined_data = {}
    all_file_path = parse_file_path_from_dir(directory)

    for file_paths in tqdm(all_file_path.values(), desc="processing data folder"):
        # An entry can list no file at all or several files. Walk all of them
        # instead of indexing [0], which failed on an empty entry and
        # silently dropped every file after the first.
        for file_path in file_paths:
            with open(file_path, 'r', encoding='utf-8') as file:
                data = json.load(file)
            if not data:
                continue

            # e.g tag/L01_V001_extra.json => L01_V001_extra
            file_name = os.path.basename(file_path)
            video_name = file_name.split('.')[0]

            # Label the progress bar with the file name only, so saved
            # notebook output does not embed the absolute local path.
            for frame_img, tag_data in tqdm(data.items(), desc=f"processing data in {file_name}"):
                frame_id = frame_img.split('.')[0]  # 001.jpg => 001
                key = f"{video_name}_{int(frame_id):06d}_tag"
                combined_data[key] = tag_data

    sorted_combined_data = {
        key: combined_data[key] for key in sorted(combined_data)}
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(sorted_combined_data, f)

    print(
        f'Combined multi tag metadata successful: {output_file} - total: {len(sorted_combined_data)}')
    
# Merge the tag files found under tag_metadata_dir into a single JSON file.
combine_multi_tag_metadata_json_file(tag_metadata_dir, combined_tag_metadata_filename)

processing data in /home/jiggle/personal/competition/hcm-ai/pipeline-hcm-ai/notebooks/data_extraction/metadata/tag/L01_V001.json: 100%|██████████| 272/272 [00:00<00:00, 917082.55it/s]
processing data in /home/jiggle/personal/competition/hcm-ai/pipeline-hcm-ai/notebooks/data_extraction/metadata/tag/L01_V001_extra.json: 100%|██████████| 933/933 [00:00<00:00, 461146.08it/s]
processing data in /home/jiggle/personal/competition/hcm-ai/pipeline-hcm-ai/notebooks/data_extraction/metadata/tag/L01_V002.json: 100%|██████████| 216/216 [00:00<00:00, 589824.00it/s]
processing data in /home/jiggle/personal/competition/hcm-ai/pipeline-hcm-ai/notebooks/data_extraction/metadata/tag/L01_V002_extra.json: 100%|██████████| 711/711 [00:00<00:00, 1136490.15it/s]
processing data folder: 100%|██████████| 4/4 [00:00<00:00, 210.16it/s]


Combined multi tag metadata successful: tag_metadata.json - total: 2132


# Combine Audio

In [10]:
# keyframe_metadata_file = f"{dir_path}/keyframes_metadata.json"

# def combine_audio_tag_metadata_json_file(audio_directory, output_file, keyframe_metadata_file):
#     combined_data = {}

#     if os.path.exists(output_file):
#         with open(output_file, 'r') as existing_file:
#             combined_data = json.load(existing_file)

#     with open(keyframe_metadata_file, "r") as keyframe_file:
#         keyframe_metadata_dict = json.load(keyframe_file)
        
#     for sub_dir in (os.listdir(audio_directory)):
#         sub_dir_path = f'{audio_directory}/{sub_dir}'
#         for audio_json in (os.listdir(sub_dir_path)):
#             video_id = f"{sub_dir}_{audio_json.split('.')[0]}"
#             audio_json_path = f"{sub_dir_path}/{audio_json}"
#             with open(audio_json_path, "r") as audio_file:
#                 audio_list = json.load(audio_file)
#             for item in audio_list:
#                 for keyframe_key, keyframe_value in keyframe_metadata_dict.items():
#                     frame_idx = keyframe_key.split("_")[-1]
#                     start, end = item['segment_id']
#                     video_id_keyframe = keyframe_value['video_path'].split(".")[0].split('/')[-1]
#                     if (video_id == video_id_keyframe) and (int(frame_idx) >= start) and (int(frame_idx) <= end):
#                         combined_data[f"{keyframe_key}_audio"] = item['transcription']
                        
                        
#     fill_empty_audio_combined_data = {}
#     keys_combined_data = combined_data.keys() # Taking all keys in combined data dict
#     for key_keyframe in keyframe_metadata_dict.keys():
#         if f"{key_keyframe}_audio" not in keys_combined_data:
#             fill_empty_audio_combined_data[f"{key_keyframe}_audio"] = {"transcription" : ""}
#     combined_data.update(fill_empty_audio_combined_data)
    
    
#     keys = list(combined_data.keys())
#     keys.sort()
#     sorted_combined_data = {i : combined_data[i] for i in keys}
#     with open(output_file, 'w') as outfile:
#         json.dump(sorted_combined_data, outfile)
    
# combine_audio_tag_metadata_json_file(audio_metadata_dir, combined_audio_metadata_filename, keyframe_metadata_file)

# Combine final metadata file

In [11]:
# def combined_json_file(json_files, output_file):
#     combined_data = {}
    
#     for file_name, data in json_files.items():
#         path = data['path']
#         key_ext = data.get('key_extension', '')
#         with open(path, 'r') as f:
#             json_data = json.load(f)
#         print(f"Preprocessing metadata file: {file_name}")
#         for key, data in json_data.items():
#             if key_ext:
#                 key = key.replace(f'_{key_ext}', '')
#             if key not in combined_data:
#                 combined_data[key] = {}
                
#             combined_data[key][key_ext] = data
            
#     with open(output_file, 'w') as f:
#         json.dump(combined_data, f)

#     print(f'Combined final metadata successful: {output_file}')

In [12]:
# json_files = {
#     'keyframe_metadata': {
#         'path': combined_keyframes_metadata_filename,
#         'key_extension': 'keyframe',
#     },
#     'object_extraction': {
#         'path': combined_object_extraction_filename,
#         'key_extension': 'detection'
#     }
# }

# # combined_json_file(json_files, final_metadata_filename)