# id to name dict for generated data

In [1]:
from pathlib import Path
import json
from PIL import Image

import shutil

In [3]:
# Dataset root and the metadata folder inside it.
dataset_dir = Path("/mnt/ssd2/xin/repo/DART/Liebherr_Product")
meta_dir = dataset_dir.joinpath("metadata")

# Working folders, all relative to the current working directory:
#   generated_data_orig -> source PNG images
#   generated_data_jpg  -> JPEG conversions of the source images
#   generated_data      -> restructured and renamed copies
repo_dir = Path(".")
orig_image_dir, jpg_image_dir, new_image_dir = (
    repo_dir.joinpath(folder)
    for folder in ("generated_data_orig", "generated_data_jpg", "generated_data")
)

## png to jpg


In [None]:
# Convert every PNG below orig_image_dir into an RGB JPEG below jpg_image_dir,
# keeping the same relative directory layout.
for path in orig_image_dir.rglob("*.png"):
    new_path = (jpg_image_dir / path.relative_to(orig_image_dir)).with_suffix(".jpg")
    # Check before opening: images that were already converted do not need to
    # be decoded again, which keeps re-runs of this cell cheap.
    if new_path.exists():
        continue
    # The context manager releases the source file even if conversion fails.
    with Image.open(path) as img:
        new_path.parent.mkdir(parents=True, exist_ok=True)
        # JPEG has no alpha channel, so the image is converted to RGB first.
        img.convert("RGB").save(new_path)

## file count

In [5]:
def count_files_in_directory(directory):
    """Return how many files exist anywhere below `directory`.

    The tree is walked recursively with ``rglob("*")``; only entries for
    which ``is_file()`` is true are counted, so directories are ignored.
    """
    return sum(1 for entry in directory.rglob("*") if entry.is_file())


# Report how many files (of any type) exist below the converted-JPEG tree.
file_count = count_files_in_directory(jpg_image_dir)
print(f"There are {file_count} files in '{jpg_image_dir}'")

There are 68220 files in 'generated_data_jpg'


In [6]:
# Same report for the original image tree; note that file_count is rebound here.
file_count = count_files_in_directory(orig_image_dir)
print(f"There are {file_count} files in '{orig_image_dir}'")

There are 3520 files in 'generated_data_orig'


## rename

In [11]:
# function
def restructure_and_rename(src_dir, dst_dir, old_map=None, prefix="d", copy=False):
    """Assign sequential names to the JPEGs under `src_dir` and map them back.

    1. Restructure the directory to a structure similar to the original
       dataset: each image is placed at ``dst_dir / <object> / <new name>``.
    2. Rename files to a zero-padded running number with `prefix` in front.

    The expected source layout is
    ``src_dir/MODEL_NAME/OBJ_/INSTANCE/XXXX.jpg``; the object folder name is
    taken from two levels above the file, with underscores replaced by spaces.

    Args:
        src_dir: Root ``Path`` that is searched recursively for ``*.jpg``.
        dst_dir: Root ``Path`` of the restructured tree. The per-object
            folders are created even when `copy` is false.
        old_map: Optional existing mapping ``{new file name: source path}``.
            Sources already present in it are skipped and its file names are
            never reused. The dict passed in is not modified.
        prefix: String placed in front of the zero-padded number.
        copy: When true, each newly mapped file is copied to its new location.

    Returns:
        A dict mapping every new file name to the source path relative to
        `src_dir` (POSIX separators), including the entries of `old_map`.
    """
    total_files = count_files_in_directory(src_dir)
    # Number of digits in the largest zero-based ID.
    # NOTE: the width follows the current file count, so names added in an
    # append run may be padded wider than the names already in `old_map`.
    total_digits = len(str(total_files - 1))

    # Work on a copy so the caller's mapping is left untouched.
    name_map = {} if old_map is None else dict(old_map)
    current_id = len(name_map)
    # Set of already-mapped sources: constant-time membership test instead of
    # scanning name_map.values() for every file.
    known_sources = set(name_map.values())

    for path in sorted(src_dir.rglob("*.jpg")):
        source_name = path.relative_to(src_dir).as_posix()
        if source_name in known_sources:
            continue

        # Advance to the first ID whose complete file name is unused. The
        # comparison uses the full name (prefix + digits + suffix) because
        # that is what the mapping keys contain.
        while True:
            new_filename = f"{prefix}{str(current_id).zfill(total_digits)}{path.suffix}"
            if new_filename not in name_map:
                break
            current_id += 1

        # old path structure: image_dir/MODEL_NAME/OBJ_/INSTANCE/XXXX.jpg
        obj = path.parent.parent.name.replace("_", " ")
        new_path = dst_dir / obj / new_filename
        new_path.parent.mkdir(parents=True, exist_ok=True)

        name_map[new_filename] = source_name
        known_sources.add(source_name)
        if copy:
            shutil.copy(path, new_path)
        current_id += 1
    return name_map

### Initial rename: run this ONCE only; do NOT run it again after the first rename is done

In [None]:
# Initial rename: build the mapping from scratch and copy the files.
# IDs are assigned in sorted order of the source paths starting at 0, so
# running this again after images were added would renumber the files and
# overwrite the saved mapping. An existing mapping file is therefore treated
# as "already done" and the cell is skipped.
mapping_path = repo_dir / "id_to_name_gen.json"
if mapping_path.exists():
    print(f"'{mapping_path}' already exists; skipping the initial rename")
else:
    id_to_name = restructure_and_rename(jpg_image_dir, new_image_dir, copy=True)
    print(len(id_to_name))

    # save
    with open(mapping_path, "w") as f:
        json.dump(id_to_name, f, indent=4)

## move files around

In [None]:
# # move files back to old image according to the old_id_to_name mapping
# for id in old_id_to_name:
#     obj_ = old_id_to_name[id].split('/')[1]
#     obj = obj_.replace('_', ' ')
#     cur_path = new_image_dir / obj / id
#     if not cur_path.exists():
#         continue
#     else:
#         # move back to old path
#         old_path = jpg_image_dir / old_id_to_name[id]
#         shutil.move(cur_path, old_path)

In [None]:
# # copy files from jpg_image_dir to new_image_dir according to the old_id_to_name mapping
# for id in old_id_to_name:
#     obj_ = old_id_to_name[id].split('/')[1]
#     obj = obj_.replace('_', ' ')
#     new_path = new_image_dir / obj / id
#     if new_path.exists():
#         continue
#     else:
#         old_path = jpg_image_dir / old_id_to_name[id]
#         new_path.parent.mkdir(parents=True, exist_ok=True)
#         shutil.copy(old_path, new_path)

## append new data


In [None]:
# Reload the mapping saved earlier so existing names are kept.
with (repo_dir / "id_to_name_gen.json").open("r") as f:
    old_id_to_name = json.loads(f.read())

# Extend the mapping with JPEGs that are not in it yet. copy=False means no
# files are copied by this call; only the mapping is computed.
id_to_name = restructure_and_rename(
    src_dir=jpg_image_dir,
    dst_dir=new_image_dir,
    old_map=old_id_to_name,
    copy=False,
)
print(len(id_to_name))

In [None]:
# Copy every mapped image that is still missing from new_image_dir.
# The loop variables are deliberately not called `id`, which would shadow the
# builtin for the rest of the notebook session.
count = 0
for new_name, rel_source in id_to_name.items():
    # rel_source has the form MODEL_NAME/OBJ_/INSTANCE/XXXX.jpg, so the second
    # component is the object folder.
    obj_ = rel_source.split("/")[1]
    obj = obj_.replace("_", " ")
    new_path = new_image_dir / obj / new_name
    if new_path.exists():
        # Already copied in an earlier run.
        continue
    count += 1
    old_path = jpg_image_dir / rel_source
    new_path.parent.mkdir(parents=True, exist_ok=True)
    shutil.copy(old_path, new_path)
print(f"Copied {count} files")

In [None]:
# Persist the updated id_to_name mapping, replacing the previous JSON file.
with (repo_dir / "id_to_name_gen.json").open("w") as f:
    f.write(json.dumps(id_to_name, indent=4))

## copy the saved id_to_name_gen.json to meta_dir

In [None]:
import shutil

# Publish the saved mapping next to the dataset metadata
# (meta_dir is dataset_dir / "metadata").
shutil.copy(repo_dir / "id_to_name_gen.json", meta_dir / "id_to_name_gen.json")