In [None]:
#| default_exp prepare_images

In [None]:
#| hide

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# "Process images" bit of bedmap pipeline

## bedmap overview

bedmap does three things:
- processes images through to embeddings
- creates layouts from embeddings
- builds web assets

Design goals:
- terse (<500 LOC)
- performant (10k images in 10 minutes)
- readable

## process_images

This notebook implements the "process images" part (the first bullet above):
1. Validates images
2. Creates thumbnails
3. Generates embeddings


In [None]:
#| export

from pathlib import Path
import shutil
import sys

import daft
from humanize import naturalsize
from tqdm import tqdm
from loguru import logger

from bedmap.validate_images import validate_images
from bedmap.create_thumbnails import create_thumbnails
from bedmap.embed_images import embed_images
from bedmap.config import ImageConfig

from fastcore.test import test_eq

In [None]:
#| export

# easy timestamps: drop the handlers registered so far and log to stdout at INFO level

logger.remove()
logger.add(sys.stdout, level="INFO")

2

In [None]:
#| export

# Image-processing settings; this notebook's example run reads `thumbnail_size` and `model_name` from it.
cfg = ImageConfig()

# Batch size handed to the embedding step in this notebook's example run.
BATCH_SIZE = 4

## 0. Create dataframe with images

In [None]:
#| export

def _image_glob_pattern(directory: str | Path) -> str:
    """Convert a directory path into a glob pattern that matches common image formats.
    
    Args:
        directory: Directory path to search for images
        
    Returns:
        Glob pattern string that matches common image formats
    """
    exts = ['jpg', 'jpeg', 'png', 'gif', 'bmp', 'webp', 'tiff', 'tif']
    dir_str = str(directory).rstrip('/')
    return f"{dir_str}/**/*.{{{','.join(exts)}}}"
    

In [None]:
#| hide

# Plain directory: the brace-expanded extension list is appended.
test_eq(_image_glob_pattern("my/dir"),
        "my/dir/**/*.{jpg,jpeg,png,gif,bmp,webp,tiff,tif}")
# Trailing slash must not produce a doubled separator.
test_eq(_image_glob_pattern("my/dir/"),
        "my/dir/**/*.{jpg,jpeg,png,gif,bmp,webp,tiff,tif}")

In [None]:
#| export

def _df_images_from_pattern(pattern: str | Path) -> daft.DataFrame:
    """Build a dataframe of images from a directory or glob pattern.

    A value without a ``*`` is treated as a directory and expanded with
    ``_image_glob_pattern``; anything else is used as the glob as-is.

    Args:
        pattern: Directory or glob pattern to match image files

    Returns:
        DataFrame with the glob listing (minus ``num_rows``) plus:
        - img_path: the listing's ``path`` column, renamed
        - img_name: last ``/``-separated component of ``img_path``
        - img: image downloaded from ``img_path`` and decoded as RGB, with
          ``on_error="null"`` set for both the download and the decode
    """
    glob = str(pattern)
    if '*' not in glob:
        glob = _image_glob_pattern(glob)

    files = daft.from_glob_path(glob).with_column_renamed("path", "img_path")

    # File name = last path component.
    basename = files["img_path"].str.split("/").list.get(-1).cast(str)
    named = files.with_column("img_name", basename)

    decoded = (
        daft.col("img_path")
        .url.download(on_error="null")
        .image.decode(on_error="null", mode="RGB")
    )
    return named.with_column("img", decoded).exclude("num_rows")

In [None]:
#| hide

# Small local fixture directory used by this notebook's example cells.
test_dir = "../tests/test-data/smithsonian_butterflies_10/jpgs"
images_df = _df_images_from_pattern(test_dir)
# Preview a single row.
images_df.show(1)

img_path Utf8,size Int64,img_name Utf8,img Image[RGB]
file://../tests/test-data/smithsonian_butterflies_10/jpgs/30aeb051d-ee0d-4c5b-8a85-a8da7baef5fd.jpg,149480,30aeb051d-ee0d-4c5b-8a85-a8da7baef5fd.jpg,


In [None]:
#| export

def dl_hf_images(dataset_name: str = "kvriza8/microscopy_images",
                 dir: Path = None,
                 max_images: int = 64,
                 overwrite: bool = True,
                 format: str = "png") -> None:
    """Download images from a Hugging Face dataset.

    Streams the ``train`` split of ``dataset_name`` and saves the ``image`` field of
    the first ``max_images`` rows as ``<dir>/<index>.<format>``, then prints the
    total size of the saved files.

    Args:
        dataset_name: Name of the Hugging Face dataset to download from
        dir: Directory to save images to (``str`` or ``Path``). If None, a directory
            named after the dataset (``/`` replaced by ``_``) is used, relative to
            the current working directory
        max_images: Maximum number of images to download
        overwrite: If True, delete the target directory (if any) before downloading.
            If False, keep existing contents; files with the same name are still
            overwritten by the save
        format: Image format to save as (png, jpg, etc)

    Returns:
        None
    """
    from itertools import islice

    from datasets import load_dataset

    # Honour the documented default and accept plain strings as well as Paths.
    out_dir = Path(dataset_name.replace("/", "_")) if dir is None else Path(dir)

    dataset = load_dataset(dataset_name, split="train", streaming=True)
    if overwrite:
        shutil.rmtree(out_dir, ignore_errors=True)
    # Always make sure the target exists; otherwise overwrite=False with a missing
    # directory would fail on the first save.
    out_dir.mkdir(parents=True, exist_ok=True)

    image_paths = []
    # islice stops after exactly `max_images` rows, so no extra row is pulled from the
    # stream just to discover that the limit was reached.
    rows = islice(dataset, max(max_images, 0))
    for i, img_row in enumerate(tqdm(rows, total=max_images)):
        img_path = out_dir / f"{i}.{format}"
        img_row["image"].save(img_path)
        image_paths.append(img_path)

    print(f"Size of images on disk: {naturalsize(sum(p.stat().st_size for p in image_paths))}")

    return None

## 2. Create embeddings column on dataframe

In [None]:
#| export

def _embed_images_for_df(df: daft.DataFrame, model_name: str, batch_size: int) -> daft.DataFrame:
    """Compute embeddings for the images in ``df`` and attach them as an ``embeds`` column.

    Args:
        df: DataFrame with an ``img_path`` column of image locations, optionally
            prefixed with the ``file://`` scheme.
        model_name: Model name forwarded to ``embed_images``.
        batch_size: Batch size forwarded to ``embed_images``.

    Returns:
        ``df`` with an additional ``embeds`` column.
    """
    ## Surely there's a cleaner way to get the paths out

    # Remove only the literal "file://" scheme. `str.lstrip("file:/")` strips a
    # *character set*, which also eats the leading "/" of absolute paths and any
    # leading path characters that happen to be in "file:/".
    paths = [
        Path(row["img_path"].removeprefix("file://"))
        for row in df.select("img_path").to_pylist()
    ]
    embeds = embed_images(paths, model_name=model_name, batch_size=batch_size)
    # The embedding width is taken from the last axis of the returned array.
    embeds_type = daft.DataType.embedding(daft.DataType.float32(), embeds.shape[-1])
    embeds_series = daft.Series.from_numpy(embeds).cast(embeds_type)

    # Attach to the dataframe that was passed in, not to a notebook-level global.
    # NOTE(review): the recorded notebook output shows `embeds` typed as
    # List[Embedding[...]] with every row holding the full list of embeddings, so
    # `daft.lit(series)` appears to broadcast the whole series to each row rather
    # than assigning one embedding per row -- confirm whether that is intended.
    return df.with_column("embeds", daft.lit(embeds_series))

## 3. Options for "prepare images" pipelines

`prepare_embeddings`: creates the dataframe, validates the images, creates thumbnails, and generates embeddings.

In [None]:
#| export

def prepare_embeddings(pattern: str | Path, thumbnail_height: int,
                        model_name: str, batch_size: int) -> daft.DataFrame:
    """Run the image pipeline end to end: load, validate, thumbnail, embed.

    Args:
        pattern: Directory or glob pattern handed to ``_df_images_from_pattern``.
        thumbnail_height: Forwarded to ``create_thumbnails`` as ``height``.
        model_name: Forwarded to the embedding step.
        batch_size: Forwarded to the embedding step.

    Returns:
        The dataframe produced by the embedding step.
    """
    loaded = _df_images_from_pattern(pattern)
    # validate_images returns a pair; only the first element is carried forward.
    validated, _unused = validate_images(loaded)
    with_thumbs = create_thumbnails(validated, height=thumbnail_height)
    return _embed_images_for_df(with_thumbs, model_name=model_name, batch_size=batch_size)

In [None]:
#| hide

# End-to-end run on the local fixture directory using the notebook-level config.
df_with_embeddings = prepare_embeddings(pattern=test_dir, thumbnail_height=cfg.thumbnail_size,
                   model_name=cfg.model_name, batch_size=BATCH_SIZE)

# Preview the first two rows, including the new `embeds` column.
df_with_embeddings.show(2)

img_path Utf8,size Int64,img_name Utf8,img Image[RGB],embeds List[Embedding[Float32; 384]]
file://../tests/test-data/smithsonian_butterflies_10/jpgs/30aeb051d-ee0d-4c5b-8a85-a8da7baef5fd.jpg,149480,30aeb051d-ee0d-4c5b-8a85-a8da7baef5fd.jpg,,"[<Embedding>, <Embedding>, <Embedding>, <Embedding>, <Embedding>, <Embedding>, <Embedding>, <Embedding>, <Embedding>, <Embedding>, <Embedding>, <Embedding>]"
file://../tests/test-data/smithsonian_butterflies_10/jpgs/9fea3150-a3d4-11ed-aeea-e36f1256f233.jpg,82911,9fea3150-a3d4-11ed-aeea-e36f1256f233.jpg,,"[<Embedding>, <Embedding>, <Embedding>, <Embedding>, <Embedding>, <Embedding>, <Embedding>, <Embedding>, <Embedding>, <Embedding>, <Embedding>, <Embedding>]"


In [None]:
# | hide

# Export this notebook's `#| export` cells to the library module via nbdev.
import nbdev
nbdev.nbdev_export()

  import pkg_resources,importlib
