In [1]:
%%html
<style>
/* Render ipywidget output (e.g. the tqdm progress bars used below) on a
   transparent background; !important lets this rule win over other
   declarations for the same property. */
.cell-output-ipywidget-background {
    background-color: transparent !important;
}
/* Point the Jupyter widget text colour and font size at the corresponding
   VS Code editor CSS variables. */
:root {
    --jp-widgets-color: var(--vscode-editor-foreground);
    --jp-widgets-font-size: var(--vscode-editor-font-size);
}
</style>

In [2]:
from PIL import Image
from pathlib import Path
import json
from concurrent.futures import ThreadPoolExecutor
from itertools import repeat
from tqdm.notebook import tqdm

# The data directory sits next to "src": when the notebook runs from inside
# "src" step up one level first, otherwise use the working directory itself.
DATA_PATH = (Path.cwd().parent if Path.cwd().name == "src" else Path.cwd()) / "data"

# *_SOURCE folders are read by the cells below; *_INPUT folders are what they write.
BASE_INPUT = DATA_PATH.joinpath("base_input")
BASE_SOURCE = DATA_PATH.joinpath("base_jpg")
NOBG_INPUT = DATA_PATH.joinpath("nobg_input")
NOBG_SOURCE = DATA_PATH.joinpath("nobg_png")

In [3]:
# Pre-create <dataset>/<split>/class_<n> for n = 0..100 in both output trees,
# so later cells can write into the class folders without creating them.
for dataset_root in (BASE_INPUT, NOBG_INPUT):
    for split in ("train", "valid"):
        for class_id in range(101):
            class_dir = dataset_root / split / f"class_{class_id}"
            class_dir.mkdir(parents=True, exist_ok=True)

In [5]:
# Parse the two mapping files; they are used further down as lookups from an
# image file name to its class label (one for train, one for validation).
train_maps = json.loads((DATA_PATH / 'food101-train-mappings.json').read_text())

val_maps = json.loads((DATA_PATH / 'food101-validation-mappings.json').read_text())


In [4]:
def move_image(image_path, output_path, mapping):
    """Save a copy of one source image into its class folder.

    The image is opened with PIL and written to
    ``output_path / "class_<label>" / <original file name>``. Nothing is done
    when that destination file already exists. The source file itself is left
    in place.

    Parameters
    ----------
    image_path : pathlib.Path
        Source image file (``.jpg`` or ``.png``).
    output_path : pathlib.Path
        Directory that holds the ``class_<label>`` folders. The class folder
        is not created here and must already exist.
    mapping : dict
        Lookup from the image's ``.jpg`` file name to its class label.

    Raises
    ------
    KeyError
        If ``mapping`` has no entry for the image.
    """
    # The mapping is keyed by the .jpg name, so a .png source is looked up
    # under its .jpg twin. Only the extension is swapped: a plain
    # str.replace("png", "jpg") would also rewrite "png" inside the stem.
    if image_path.suffix == ".png":
        lookup_name = image_path.with_suffix(".jpg").name
    else:
        lookup_name = image_path.name

    destination = output_path / f"class_{mapping[lookup_name]}" / image_path.name
    if destination.exists():
        return

    # Context manager so the underlying file handle is released promptly,
    # which matters when many worker threads process files concurrently.
    with Image.open(image_path) as image:
        image.save(destination)

In [6]:
# Copy every source image into its class folder, for both datasets and both splits.
# The glob pattern is stated next to each source folder rather than guessed from
# whether "jpg" appears somewhere in the absolute path, which would pick the
# wrong pattern if any parent directory name happened to contain "jpg".
for output_path, source_path, pattern in [(BASE_INPUT, BASE_SOURCE, "*.jpg"), (NOBG_INPUT, NOBG_SOURCE, "*.png")]:
    for output_point, source_point, mappings in [('train', 'food101-train', train_maps), ('valid', 'food101-validation', val_maps)]:
        source_dir = source_path / source_point
        source_files = list(source_dir.glob(pattern))

        with tqdm(total=len(source_files), desc=str(source_dir)) as pbar, ThreadPoolExecutor(max_workers=32) as executor:
            # executor.map yields one result per finished file, in submission order;
            # consuming it drives the progress bar and re-raises any worker exception.
            for _ in executor.map(move_image, source_files, repeat(output_path / output_point), repeat(mappings)):
                pbar.update(1)

/workspaces/ds340-project/data/base_jpg/food101-train:   0%|          | 0/75750 [00:00<?, ?it/s]

/workspaces/ds340-project/data/base_jpg/food101-validation:   0%|          | 0/25250 [00:00<?, ?it/s]

/workspaces/ds340-project/data/nobg_png/food101-train:   0%|          | 0/75750 [00:00<?, ?it/s]

/workspaces/ds340-project/data/nobg_png/food101-validation:   0%|          | 0/25250 [00:00<?, ?it/s]

In [7]:
def rename_file(target_name, target_path):
    """Rename a file in place to ``target_name``, keeping its suffix.

    Parameters
    ----------
    target_name : str or int
        New base name (without extension) for the file.
    target_path : pathlib.Path
        File to rename; it stays in the same directory.

    Raises
    ------
    FileExistsError
        If a different file already has the destination name. Without this
        check ``Path.rename`` silently replaces the existing file on POSIX,
        which would destroy an image whenever a new number collides with an
        existing file name.
    """
    destination = target_path.parent / f"{target_name}{target_path.suffix}"
    if destination == target_path:
        # Already carries the requested name; nothing to do.
        return
    if destination.exists():
        raise FileExistsError(f"refusing to overwrite {destination} with {target_path}")
    target_path.rename(destination)

In [8]:
# Renumber the images inside every class folder as 1..N (suffix preserved).
for outer in [BASE_INPUT, NOBG_INPUT]:
    for inner in ['train', 'valid']:
        split_dir = outer / inner
        # glob() returns entries in arbitrary order, so sort to make the numbering
        # reproducible. When the base and no-background folders hold the same
        # stems, sorting also gives both copies of an image the same number.
        class_files = [sorted((split_dir / f"class_{label}").glob("*.*")) for label in range(0, 100+1)]

        # One thread pool per split instead of one per class folder.
        with tqdm(total=sum(len(files) for files in class_files), desc=f"{outer}/{inner}") as pbar, ThreadPoolExecutor(max_workers=32) as executor:
            for files in class_files:
                targets = range(1, len(files) + 1)
                # A folder already named exactly 1..N has been processed; skip it so
                # re-running this cell does not reshuffle (or clobber) its files.
                if {path.stem for path in files} == {str(number) for number in targets}:
                    pbar.update(len(files))
                    continue
                for _ in executor.map(rename_file, targets, files):
                    pbar.update(1)

/workspaces/ds340-project/data/base_input/train:   0%|          | 0/75750 [00:00<?, ?it/s]

/workspaces/ds340-project/data/base_input/valid:   0%|          | 0/25250 [00:00<?, ?it/s]

/workspaces/ds340-project/data/nobg_input/train:   0%|          | 0/75750 [00:00<?, ?it/s]

/workspaces/ds340-project/data/nobg_input/valid:   0%|          | 0/25250 [00:00<?, ?it/s]