In [18]:
import os
import shutil
# Dataset locations: each one is read from an environment variable and falls
# back to the local Windows path below when that variable is not set.
aokvqa_dir = os.environ.get('AOKVQA_DIR', r"C:\workspace\misc\5980\aokvqa")
coco_dir = os.environ.get('COCO_DIR', r"C:\workspace\misc\5980\coco")
coco_filtered_dir = os.environ.get(
    'COCO_FILTERED_DIR',
    r"C:\workspace\misc\5980\coco_filtered",
)

In [None]:
from load_aokvqa import load_aokvqa, get_coco_path
# Load the three splits in the order val, train, test from aokvqa_dir.
val_aokvqa_dataset, train_aokvqa_dataset, test_aokvqa_dataset = (
    load_aokvqa(aokvqa_dir, split_name)
    for split_name in ('val', 'train', 'test')
)

In [15]:
# Report how many entries each loaded split contains, one line per split.
print(
    f"Train dataset size: {len(train_aokvqa_dataset)}",
    f"Validation dataset size: {len(val_aokvqa_dataset)}",
    f"Test dataset size: {len(test_aokvqa_dataset)}",
    sep="\n",
)

Train dataset size: 17056
Validation dataset size: 1145
Test dataset size: 6702


In [16]:
# Take the first record of the test split and display it as the cell output
# to see which fields a single entry carries.
dataset_example = test_aokvqa_dataset[0]
dataset_example

{'split': 'test',
 'image_id': 487715,
 'question_id': '22dfoxvWKwTS6myafdKHfc',
 'question': 'Who married a woman that has a similar with the sign next to the chocolate avec sign?',
 'choices': ['dom perignon', 'cesare borgia', 'mick jagger', 'ice-t'],
 'difficult_direct_answer': False}

In [None]:
def copy_relevant_images_from_coco_dataset(
        dataset,
        dest_folder,
        get_coco_path_func,
        coco_dir,
        split='val'
    ):
    """
    Copies every image referenced by the dataset into dest_folder/<split>2017.

    The source path of each entry is resolved with
    get_coco_path_func(split, entry['image_id'], coco_dir) and the file is
    copied with shutil.copy. An image_id that was already copied during this
    call is skipped, so entries that share an image trigger only one copy.
    Progress is printed every 100 entries, followed by a final summary line.

    Args:
        dataset: sequence of dicts, each dict contains 'image_id'
        dest_folder: destination root; images land in dest_folder/<split>2017
        get_coco_path_func: function to get image path (e.g., get_coco_path)
        coco_dir: COCO directory path
        split: dataset split ('val', 'train', 'test')

    Returns:
        None.

    Raises:
        Errors raised by get_coco_path_func or shutil.copy (for example a
        missing source file) are not caught and propagate to the caller.
    """
    # Resolve and create the destination once, before the loop. This also keeps
    # the final summary line valid when the dataset is empty.
    split_folder = os.path.join(dest_folder, f"{split}2017")
    os.makedirs(split_folder, exist_ok=True)

    total = len(dataset)
    copied_image_ids = set()
    for idx, entry in enumerate(dataset):
        image_id = entry['image_id']
        # Re-copying the same file would only overwrite it with itself.
        if image_id not in copied_image_ids:
            image_path = get_coco_path_func(split, image_id, coco_dir)
            shutil.copy(image_path, split_folder)
            copied_image_ids.add(image_id)
        if idx % 100 == 0:
            print(f"Progress: {idx}/{total}, {idx/total*100:.2f}%")
    print(f"Copied {coco_dir} to {split_folder}")

In [19]:
# Copy the images referenced by the validation split into coco_filtered_dir.
copy_relevant_images_from_coco_dataset(
    dataset=val_aokvqa_dataset,
    dest_folder=coco_filtered_dir,
    get_coco_path_func=get_coco_path,
    coco_dir=coco_dir,
    split='val',
)

Progress: 0/1145, 0.00%
Progress: 100/1145, 8.73%
Progress: 200/1145, 17.47%
Progress: 300/1145, 26.20%
Progress: 400/1145, 34.93%
Progress: 500/1145, 43.67%
Progress: 600/1145, 52.40%
Progress: 700/1145, 61.14%
Progress: 800/1145, 69.87%
Progress: 900/1145, 78.60%
Progress: 1000/1145, 87.34%
Progress: 1100/1145, 96.07%
Copied C:\workspace\misc\5980\coco to C:\workspace\misc\5980\coco_filtered\val


In [20]:
# Copy the images referenced by the test split into coco_filtered_dir.
copy_relevant_images_from_coco_dataset(
    dataset=test_aokvqa_dataset,
    dest_folder=coco_filtered_dir,
    get_coco_path_func=get_coco_path,
    coco_dir=coco_dir,
    split='test',
)

Progress: 0/6702, 0.00%
Progress: 100/6702, 1.49%
Progress: 200/6702, 2.98%
Progress: 300/6702, 4.48%
Progress: 400/6702, 5.97%
Progress: 500/6702, 7.46%
Progress: 600/6702, 8.95%
Progress: 700/6702, 10.44%
Progress: 800/6702, 11.94%
Progress: 900/6702, 13.43%
Progress: 1000/6702, 14.92%
Progress: 1100/6702, 16.41%
Progress: 1200/6702, 17.91%
Progress: 1300/6702, 19.40%
Progress: 1400/6702, 20.89%
Progress: 1500/6702, 22.38%
Progress: 1600/6702, 23.87%
Progress: 1700/6702, 25.37%
Progress: 1800/6702, 26.86%
Progress: 1900/6702, 28.35%
Progress: 2000/6702, 29.84%
Progress: 2100/6702, 31.33%
Progress: 2200/6702, 32.83%
Progress: 2300/6702, 34.32%
Progress: 2400/6702, 35.81%
Progress: 2500/6702, 37.30%
Progress: 2600/6702, 38.79%
Progress: 2700/6702, 40.29%
Progress: 2800/6702, 41.78%
Progress: 2900/6702, 43.27%
Progress: 3000/6702, 44.76%
Progress: 3100/6702, 46.25%
Progress: 3200/6702, 47.75%
Progress: 3300/6702, 49.24%
Progress: 3400/6702, 50.73%
Progress: 3500/6702, 52.22%
Progress: 3

In [None]:
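# NOTE(review): the train-split copy below is disabled; the reason is not
# recorded in this notebook. Uncomment the call to copy the train images.
# copy_relevant_images_from_coco_dataset(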
#     train_aokvqa_dataset, 
#     dest_folder=coco_filtered_dir,
#     get_coco_path_func=get_coco_path,
#     coco_dir=coco_dir,
#     split='train'
# )

In [None]:
def copy_image_from_dataset(dataset, dest_folder, get_coco_path_func, coco_dir, split='val', idx=0):
    """
    Copy the single image referenced by dataset[idx] into dest_folder/split.

    Args:
        dataset: list of dicts, each dict contains 'image_id'
        dest_folder: destination root; the image is placed in dest_folder/split
        get_coco_path_func: function to get image path (e.g., get_coco_path)
        coco_dir: COCO directory path
        split: dataset split ('val', 'train', 'test')
        idx: index of the dataset entry to copy

    Returns:
        None. A confirmation line is printed after the copy.
    """
    # NOTE(review): the target sub-folder here is the bare split name, whereas
    # copy_relevant_images_from_coco_dataset writes to "<split>2017" - confirm
    # which layout is intended.
    source_image = get_coco_path_func(split, dataset[idx]['image_id'], coco_dir)
    target_dir = os.path.join(dest_folder, split)
    os.makedirs(target_dir, exist_ok=True)
    shutil.copy(source_image, target_dir)
    print(f"Copied {source_image} to {target_dir}")