In [1]:
%%html
<style>
/* Make the background behind ipywidget outputs transparent instead of the default fill. */
.cell-output-ipywidget-background {
    background-color: transparent !important;
}
/* Point the Jupyter widget colour and font size at the VS Code editor theme variables
   (presumably so widget output such as progress bars matches the editor theme). */
:root {
    --jp-widgets-color: var(--vscode-editor-foreground);
    --jp-widgets-font-size: var(--vscode-editor-font-size);
}
</style>

In [2]:
from PIL import Image
from pathlib import Path
import json
from concurrent.futures import ThreadPoolExecutor
from itertools import repeat
from tqdm.notebook import tqdm, trange
from shutil import rmtree 
import os

# Locate the data directory: when the notebook runs from "src" the data folder is a
# sibling of the working directory, otherwise it sits directly inside it.
DATA_PATH = (Path.cwd().parent if Path.cwd().name == "src" else Path.cwd()) / "data"

# *_SOURCE folders hold the original JPEGs; *_INPUT folders receive the
# per-split, per-class layout built by the cells below.
BASE_INPUT, BASE_SOURCE = DATA_PATH / "base_input", DATA_PATH / "base_jpg"
NOBG_INPUT, NOBG_SOURCE = DATA_PATH / "nobg_input", DATA_PATH / "nobg_jpg"

DATA_PATH

PosixPath('/workspaces/ds340-project/data')

## Wiping and Creating Folders

In [3]:
def remake(dir):
    """Leave ``dir`` as a freshly created, empty directory.

    Any existing directory at that path is deleted together with its contents,
    then the directory is created again (missing parents included).
    """
    already_present = dir.exists()
    if already_present:
        # Drop the whole tree so no stale files survive the rebuild.
        rmtree(dir)
    dir.mkdir(parents=True)

In [4]:
# Rebuild an empty class_0 ... class_100 folder tree for every split of both datasets.
for outer in [BASE_INPUT, NOBG_INPUT]:
    for inner in ['train', 'valid', 'test']:
        split_dir = outer / inner
        class_dirs = [split_dir / f"class_{label}" for label in range(101)]
        with tqdm(total=len(class_dirs), desc=str(split_dir)) as pbar:
            with ThreadPoolExecutor() as executor:
                # Each remake() call wipes and recreates one class folder.
                for _ in executor.map(remake, class_dirs):
                    pbar.update()

/workspaces/ds340-project/data/base_input/train:   0%|          | 0/101 [00:00<?, ?it/s]

/workspaces/ds340-project/data/base_input/valid:   0%|          | 0/101 [00:00<?, ?it/s]

/workspaces/ds340-project/data/base_input/test:   0%|          | 0/101 [00:00<?, ?it/s]

/workspaces/ds340-project/data/nobg_input/train:   0%|          | 0/101 [00:00<?, ?it/s]

/workspaces/ds340-project/data/nobg_input/valid:   0%|          | 0/101 [00:00<?, ?it/s]

/workspaces/ds340-project/data/nobg_input/test:   0%|          | 0/101 [00:00<?, ?it/s]

In [5]:
# Load the JSON mapping files for the train and validation sets
# (looked up by image file name when images are copied into class folders).
train_maps = json.loads((DATA_PATH / 'food101-train-mappings.json').read_text())
val_maps = json.loads((DATA_PATH / 'food101-validation-mappings.json').read_text())


## Copying Images into Class Folders

In [6]:
def move_image(image_path, output_path, mapping):
    """Save one source image into its class folder under ``output_path``.

    The image is opened with Pillow and written to
    ``output_path / "class_<label>" / <file name>``; the source file is left
    in place. Nothing is written when the destination already exists, so
    re-running the copy skips images that were handled earlier.

    Args:
        image_path: Path of the source image.
        output_path: Split directory that contains the ``class_<label>`` folders.
        mapping: Dict from image file name to class label.

    Raises:
        KeyError: If ``image_path.name`` has no entry in ``mapping``.
    """
    class_folder = mapping[image_path.name]
    destination = output_path / f"class_{class_folder}" / image_path.name
    if destination.exists():
        return
    # Image.open is lazy and keeps the file open; the context manager makes
    # sure the handle is released even when save() fails.
    with Image.open(image_path) as image:
        image.save(destination)

In [7]:
# Copy every source JPEG of both datasets into the class folder given by its mapping.
for output_path, source_path in [(BASE_INPUT, BASE_SOURCE), (NOBG_INPUT, NOBG_SOURCE)]:
    for output_point, source_point, mappings in [('train', 'food101-train', train_maps), ('valid', 'food101-validation', val_maps)]:
        destination_dir = output_path / output_point
        source_files = list((source_path / source_point).glob("*.jpg"))
        progress_label = f"{source_path.name}/{source_point}"

        with tqdm(total=len(source_files), desc=progress_label) as pbar:
            with ThreadPoolExecutor() as executor:
                # All tasks finish before the executor closes, so the loop
                # variables captured by the lambda cannot change under them.
                copy_one = lambda src: move_image(src, destination_dir, mappings)
                for _ in executor.map(copy_one, source_files):
                    pbar.update()

base_jpg/food101-train:   0%|          | 0/75750 [00:00<?, ?it/s]

base_jpg/food101-validation:   0%|          | 0/25250 [00:00<?, ?it/s]

nobg_jpg/food101-train:   0%|          | 0/75750 [00:00<?, ?it/s]

nobg_jpg/food101-validation:   0%|          | 0/25250 [00:00<?, ?it/s]

In [8]:
def rename_file(target_name, target_path):
    """Rename ``target_path`` to ``<target_name><suffix>`` inside the same directory.

    Args:
        target_name: New base name (anything usable in an f-string, e.g. an int).
        target_path: Path of the file to rename; its suffix is kept.

    Raises:
        FileExistsError: If a different file already has the new name. On POSIX
            ``Path.rename`` would silently replace that file, which would
            destroy an image when names collide.
    """
    destination = target_path.parent / f"{target_name}{target_path.suffix}"
    if destination == target_path:
        # Already carries the requested name; nothing to do.
        return
    if destination.exists():
        raise FileExistsError(f"refusing to overwrite existing file: {destination}")
    target_path.rename(destination)

In [9]:
# Renumber the images of every class folder to 1.jpg, 2.jpg, ... within each split.
#
# The files are sorted by name first: directory listing order is arbitrary, and the
# numbering decides which validation images later become test images, so it has to
# be reproducible and identical for base_input and nobg_input.
#
# Renaming happens in two passes through temporary "__renaming_<n>" names so that a
# new name can never clash with a sibling that still carries its old name.
for outer in [BASE_INPUT, NOBG_INPUT]:
    for inner in ['train', 'valid']:
        split_dir = outer / inner
        total_files = len(list(split_dir.rglob("*.*")))
        # One thread pool per split instead of one per class folder.
        with tqdm(total=total_files, desc=f"{outer}/{inner}") as pbar, ThreadPoolExecutor() as executor:
            for label in range(0, 100+1):
                originals = sorted((split_dir / f"class_{label}").glob("*.*"), key=lambda path: path.name)
                staging_names = [f"__renaming_{idx}" for idx in range(1, len(originals) + 1)]

                # Pass 1: move every file out of the way; list() waits for all
                # renames and surfaces any exception raised in a worker.
                list(executor.map(rename_file, staging_names, originals))
                staged = [
                    path.parent / f"{name}{path.suffix}"
                    for name, path in zip(staging_names, originals)
                ]

                # Pass 2: assign the final 1-based numbers.
                for _ in executor.map(rename_file, range(1, len(staged) + 1), staged):
                    pbar.update(1)

/workspaces/ds340-project/data/base_input/train:   0%|          | 0/75750 [00:00<?, ?it/s]

/workspaces/ds340-project/data/base_input/valid:   0%|          | 0/25250 [00:00<?, ?it/s]

/workspaces/ds340-project/data/nobg_input/train:   0%|          | 0/75750 [00:00<?, ?it/s]

/workspaces/ds340-project/data/nobg_input/valid:   0%|          | 0/25250 [00:00<?, ?it/s]

## Moving Images to Test Folders

In [10]:
def move_test_image(image_path):
    """Move an image from a ``valid`` split folder to the matching ``test`` folder.

    Only the rightmost directory component that is exactly ``"valid"`` is
    swapped for ``"test"``; the rest of the path, including the file name and
    any parent directory that merely contains the text "valid", is untouched.
    The file is moved as-is with ``os.replace`` (no decode/re-encode), and an
    existing file at the destination is overwritten.

    Args:
        image_path: Path (or string) of an image located below a ``valid`` directory.

    Raises:
        ValueError: If no directory component of ``image_path`` is named
            ``"valid"``. Without this check the destination would equal the
            source and the file would be lost.
        OSError: If the move fails, e.g. the destination folder is missing.
    """
    image_path = Path(image_path)
    directory_parts = list(image_path.parent.parts)
    # Walk from the file towards the root so the split folder closest to the
    # image is the one that gets replaced.
    for position in range(len(directory_parts) - 1, -1, -1):
        if directory_parts[position] == "valid":
            directory_parts[position] = "test"
            break
    else:
        raise ValueError(f"no 'valid' directory in path: {image_path}")
    destination = Path(*directory_parts) / image_path.name
    os.replace(image_path, destination)

In [11]:
# Move every validation image numbered above 150 into the matching test folder.
for outer in [BASE_INPUT, NOBG_INPUT]:
    valid_dir = outer / 'valid'
    # Collect the files first so the progress bar total is the real count and does
    # not assume a fixed number of images per class. The numeric part of the name
    # is read via .stem, so the suffix length does not matter.
    files = [
        file
        for label in range(0, 100+1)
        for file in (valid_dir / f"class_{label}").glob("*.*")
        if int(file.stem) > 150
    ]
    # The split name is spelled out here rather than read from a loop variable
    # left behind by an earlier cell.
    with tqdm(total=len(files), desc=f"{outer}/valid") as pbar, ThreadPoolExecutor() as executor:
        for _ in executor.map(move_test_image, files):
            pbar.update(1)

/workspaces/ds340-project/data/base_input/valid:   0%|          | 0/10100 [00:00<?, ?it/s]

/workspaces/ds340-project/data/nobg_input/valid:   0%|          | 0/10100 [00:00<?, ?it/s]