In [None]:
#| default_exp prepare_images

# "Process images" stage of the bedmap pipeline

bedmap does three things:
- processes images through to embeddings
- creates layouts from embeddings
- builds web assets

This notebook covers the "process images" part (the first bullet above):
1. Validates images
2. Creates thumbnails
3. Generates embeddings

Design goals:
- terse (<1000 LOC)
- performant (10k images in 10 minutes)
- readable

In [5]:
#| export

from pathlib import Path
import shutil

import daft
from humanize import naturalsize
from tqdm import tqdm

## 0. Create dataframe with images

In [6]:
# dataframe from folder of images

In [7]:
# dataframe from glob of images

In [8]:
#| export
# download a Hugging Face dataset and save its images to a folder

def dl_hf_images(dataset_name: str = "kvriza8/microscopy_images",
                 dir: Path = None,
                 max_images: int = 64,
                 overwrite: bool = True,
                 format: str = "png") -> None:
    """Stream images from a Hugging Face dataset and save them into `dir`.

    The "train" split of `dataset_name` is opened in streaming mode and the
    first `max_images` rows are written to disk as `<index>.<format>`
    (`0.png`, `1.png`, ...). The total size of the written files is printed
    at the end.

    Args:
        dataset_name: Dataset identifier passed to `datasets.load_dataset`.
        dir: Folder to save the images into. Required despite the `None`
            default; a `str` path is accepted and converted to `Path`.
        max_images: Maximum number of images to save. Values <= 0 save nothing.
        overwrite: If True, delete `dir` and everything in it before saving.
            If False, existing files are kept (files with the same name are
            still overwritten by the save).
        format: File extension used for the saved images.

    Raises:
        ValueError: If `dir` is None.
    """
    # Local imports: resolved only when the function is actually called.
    from itertools import islice

    from datasets import load_dataset

    if dir is None:
        raise ValueError("dir must be provided: the folder to save images into")
    out_dir = Path(dir)

    dataset = load_dataset(dataset_name, split="train", streaming=True)

    if overwrite:
        shutil.rmtree(out_dir, ignore_errors=True)
    # Create the folder regardless of `overwrite`; otherwise saving into a
    # folder that does not exist yet fails when overwrite=False.
    out_dir.mkdir(parents=True, exist_ok=True)

    # islice stops after exactly n_images rows, so no extra row is pulled
    # from the stream just to discover that the limit was reached.
    n_images = max(max_images, 0)
    image_paths = []
    for i, img_row in enumerate(tqdm(islice(dataset, n_images), total=n_images)):
        image_path = out_dir / f"{i}.{format}"
        # Each row's "image" entry is expected to expose .save(path)
        # (presumably a PIL image -- confirm against the dataset schema).
        img_row["image"].save(image_path)
        image_paths.append(image_path)

    print(f"Size of images on disk: {naturalsize(sum([p.stat().st_size for p in image_paths]))}")

## 1. Validate images