In [None]:
#| default_exp prepare_images

# "Process images" bit of bedmap pipeline

bedmap does three things:
- processes images through to embeddings
- creates layouts from embeddings
- builds web assets

This notebook covers the "process images" part (the first bullet above):
1. Validates images
2. Creates thumbnails
3. Generates embeddings

Design goals:
- terse (<1000 LOC)
- performant (10k images in 10 minutes)
- readable

In [None]:
#| export

from pathlib import Path
import shutil

import daft
from humanize import naturalsize
from tqdm import tqdm

## 0. Create dataframe with images

In [None]:
#| export

def _image_glob_pattern(directory: str | Path) -> str:
    """Convert a directory path into a glob pattern that matches common image formats.
    
    Args:
        directory: Directory path to search for images
        
    Returns:
        Glob pattern string that matches common image formats
    """
    exts = ['jpg', 'jpeg', 'png', 'gif', 'bmp', 'webp', 'tiff', 'tif']
    dir_str = str(directory).rstrip('/')
    return f"{dir_str}/**/*.{{{','.join(exts)}}}"
    

In [None]:
#| hide

# Smoke check: a plain directory is expanded to a recursive glob over the image extensions.
_image_glob_pattern("my/dir")

'my/dir/**/*.{jpg,jpeg,png,gif,bmp,webp,tiff,tif}'

In [None]:
#| export

def _df_images_from_pattern(pattern: str | Path) -> daft.DataFrame:
    """Build a daft dataframe of images matched by a directory or glob pattern.

    Args:
        pattern: Glob pattern to match image files. A value without a ``*`` is
            treated as a directory and expanded with ``_image_glob_pattern``

    Returns:
        DataFrame with columns:
        - img_path: Path to the file (the ``path`` column of the glob listing, renamed)
        - img_name: Last "/"-separated component of ``img_path``
        - img: Image downloaded from ``img_path`` and decoded in RGB mode;
          download and decode both use ``on_error="null"``
        Other columns produced by ``daft.from_glob_path`` (e.g. ``size``) are
        passed through unchanged.
    """
    glob = str(pattern)
    if '*' not in glob:
        # No wildcard present: treat the argument as a directory to search.
        glob = _image_glob_pattern(glob)

    files = daft.from_glob_path(glob).with_column_renamed("path", "img_path")

    # File name is the final "/"-separated piece of the path.
    name_expr = files["img_path"].str.split("/").list.get(-1).cast(str)
    named = files.with_column("img_name", name_expr)

    image_expr = (
        daft.col("img_path")
        .url.download(on_error="null")
        .image.decode(on_error="null", mode="RGB")
    )
    return named.with_column("img", image_expr)

In [None]:
#| hide

# Build the image dataframe from a test-data directory and preview its first row.
test_dir = "../tests/test-data/smithsonian_butterflies_10/jpgs"
# The path contains no "*", so it is expanded to an image glob before listing.
df_imgpaths = _df_images_from_pattern(test_dir)
df_imgpaths.show(1)

img_path Utf8,size Int64,num_rows Int64,img_name Utf8,img Image[RGB]
file://../tests/test-data/smithsonian_butterflies_10/jpgs/30aeb051d-ee0d-4c5b-8a85-a8da7baef5fd.jpg,149480,,30aeb051d-ee0d-4c5b-8a85-a8da7baef5fd.jpg,


In [None]:
#| export

def dl_hf_images(dataset_name: str = "kvriza8/microscopy_images",
                 dir: str | Path | None = None,
                 max_images: int = 64,
                 overwrite: bool = True,
                 format: str = "png") -> None:
    """Download images from a Hugging Face dataset.

    Streams the ``train`` split and saves the ``image`` field of the first
    ``max_images`` rows as ``<dir>/<index>.<format>``, then prints the total
    size of the saved files.

    Args:
        dataset_name: Name of the Hugging Face dataset to download from
        dir: Directory to save images to (str or Path). If None, a directory
            named after the last "/"-separated component of ``dataset_name``
            is used, relative to the current working directory
        max_images: Maximum number of images to download
        overwrite: If True, delete the target directory (if any) before
            downloading. If False, existing contents are kept and images are
            written alongside them
        format: Image format to save as (png, jpg, etc)

    Returns:
        None

    Raises:
        ValueError: If ``dir`` is None and no usable directory name can be
            derived from ``dataset_name``
    """
    from itertools import islice

    from datasets import load_dataset

    if dir is None:
        default_name = dataset_name.rstrip("/").split("/")[-1]
        # Refuse names that would resolve to the current/parent directory:
        # with overwrite=True the target directory is deleted below.
        if default_name in ("", ".", ".."):
            raise ValueError(
                f"Cannot derive an output directory from dataset name {dataset_name!r}; pass `dir` explicitly"
            )
        out_dir = Path(default_name)
    else:
        out_dir = Path(dir)

    # Load first so that a bad dataset name fails before anything is deleted.
    dataset = load_dataset(dataset_name, split="train", streaming=True)

    if overwrite:
        shutil.rmtree(out_dir, ignore_errors=True)
    # Always make sure the directory exists, including when overwrite=False.
    out_dir.mkdir(parents=True, exist_ok=True)

    # islice stops after n_images rows without pulling an extra one from the stream.
    n_images = max(max_images, 0)
    image_paths = []
    for i, img_row in enumerate(tqdm(islice(dataset, n_images), total=n_images)):
        img_path = out_dir / f"{i}.{format}"
        img_row["image"].save(img_path)
        image_paths.append(img_path)

    print(f"Size of images on disk: {naturalsize(sum(p.stat().st_size for p in image_paths))}")

    return None

## 1. Validate images

In [None]:
# | hide
import nbdev

# Export this notebook's `#| export` cells to the module named by `#| default_exp` (nbdev).
nbdev.nbdev_export()