In [1]:
from pathlib import Path
import polars as pl
from PIL import Image
import json
from concurrent.futures import ThreadPoolExecutor
from io import BytesIO
from itertools import repeat

# Root directory for inputs and outputs: the "data" folder that sits next to
# the current working directory. Depends on where the kernel was started.
DATA_PATH = Path.cwd().parent / "data"

In [2]:
def process_base(row, target_dataset: str):
    """Decode one image record and write it under ``base_jpg/<target_dataset>/``.

    Args:
        row: A 3-item sequence ``(image, filename, label)``. ``image`` is the
            encoded image as a bytes-like object (it is wrapped in ``BytesIO``
            before being handed to PIL); ``filename`` is the name of the file
            to write, and its extension selects the output format.
        target_dataset: Name of the sub-directory of ``DATA_PATH / "base_jpg"``
            that receives the file.

    Returns:
        The ``(filename, label)`` pair, so the caller can build a
        filename -> label mapping from the results.
    """
    image, filename, label = row
    out_dir = DATA_PATH / "base_jpg" / target_dataset
    # Image.save does not create missing directories. exist_ok=True keeps this
    # safe when several worker threads reach it at the same time.
    out_dir.mkdir(parents=True, exist_ok=True)
    # The context manager closes the image once it has been written.
    with Image.open(BytesIO(image)) as img:
        img.save(out_dir / filename)
    return filename, label

In [5]:
# Export every image of each split to disk and record a filename -> label map.
for dataset in ["food101-train", "food101-validation"]:
    print(f"Processing {dataset}")

    # Keep only the image struct and the label, then flatten the struct so that
    # each row can be unpacked positionally by process_base.
    data = (
        pl.scan_parquet(DATA_PATH / f"{dataset}.parquet")
        .select(pl.col("image"), pl.col("label"))
        .unnest("image")
        .collect()
    )

    print(f"Read {data.height} images")

    # Write the images concurrently; each call returns (filename, label).
    with ThreadPoolExecutor() as pool:
        labelled = pool.map(process_base, data.iter_rows(), repeat(dataset))
        results = {name: label for name, label in labelled}

    print(f"Processed {len(results)} images")

    mappings_path = DATA_PATH / f"{dataset}-mappings.json"
    with mappings_path.open("w") as mappings_file:
        json.dump(results, mappings_file, indent=4)

    print(f"Saved mappings to {dataset}-mappings.json")

Processing food101-train
Read 75750 images
Processed 75750 images
Saved mappings to food101-train-mappings.json
Processing food101-validation
Read 25250 images
Processed 25250 images
Saved mappings to food101-validation-mappings.json
