In [42]:
#| default_exp clip_plot

In [43]:
#| hide

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Imports and setup

### Unconditional imports

In [44]:
#|export

from __future__ import division
import warnings

warnings.filterwarnings("ignore")

In [45]:
#| export

# print separately that we're loading dependencies, as this can take a while
# and we want to give immediate feedback the program is starting
from clip_plot.utils import timestamp
print(timestamp(), "Beginning to load dependencies")


2025-10-13 14:52:29.205299: Beginning to load dependencies


In [46]:
#|export

from fastcore.all import call_parse, in_ipython, Param, store_true
from tqdm.auto import tqdm

from clip_plot.from_tables import glob_to_tables, table_to_meta
from clip_plot.web_config import get_clip_plot_root, copy_web_assets
from clip_plot.embeddings import get_embeddings, write_embeddings
from clip_plot.metadata import get_manifest, write_metadata
from clip_plot.images import create_atlases_and_thumbs, ImageFactory

In [47]:
#| export

from shutil import rmtree
from pathlib import Path
import uuid
import sys
import os
import pandas as pd
import numpy as np

### Image processing imports

Note that I have removed the "copy-web-only" conditional import path for now

`nbdev` does not like cells to have cells to have code and imports in the same cell:

https://nbdev.fast.ai/getting_started.html#q-what-is-the-warning-found-a-cell-containing-mix-of-imports-and-computations.-please-use-separate-cells

I think this may mean we don't get to do conditional imports. If we find a code path that really should have conditional imports, we can see if there is a workaround. For now, I don't feel "copy web only" is a very important functionality to keep.

### Optional install imports

In [48]:
#| export

_DEFAULTS = {
    "images": None,
    "tables": None,
    "meta_dir": None,
    "output_dir": "output",
    "max_images": None,
    "use_cache": True,
    "cluster_preproc_dims": -1,
    "min_cluster_size": 20,
    "max_clusters": 10,
    "atlas_size": 2048,
    "cell_size": 32,
    "lod_cell_height": 128, # Why is not in parser?
    "embed_model": "timm/convnext_tiny.dinov3_lvd1689m",
    "n_neighbors": [15],
    "min_dist": [0.01],
    "umap_on_full_dims": False,
    "n_components": 2,
    "metric": "correlation",
    "pointgrid_fill": 0.05,
    "min_size": 100,
    "min_score": 0.3,
    "min_vertices": 18,
    "plot_id": str(uuid.uuid1()),
    "seed": 24,
    "n_clusters": 12,
    "geojson": None,
}

# handle truncated images in PIL (managed by Pillow)
PILLoadTruncated  = True

"""
NB: Keras Image class objects return image.size as w,h
    Numpy array representations of images return image.shape as h,w,c
"""

'\nNB: Keras Image class objects return image.size as w,h\n    Numpy array representations of images return image.shape as h,w,c\n'


## Entry

`project_images` will kick off all the main functions for the module

In [49]:
#| export

def _project_images(imageEngine, embeds: np.ndarray | None = None, **kwargs):
    """
    Main method for embedding user images, projecting to 2D, and creating visualization
    It would be nice to list out the image processing steps before getting started
    """
    if not isinstance(kwargs["n_neighbors"], list):
        kwargs.update({"n_neighbors": list(kwargs["n_neighbors"])})
    if not isinstance(kwargs["min_dist"], list):
        kwargs.update({"n_neighbors": list(kwargs["n_neighbors"])})

    print(timestamp(), "Starting image processing pipeline.")

    copy_web_assets(output_dir=kwargs['output_dir'],
                    tagline=kwargs['tagline'], logo=kwargs["logo"])
    if kwargs["copy_web_only"]:
        print(timestamp(), "Done!")
        sys.exit()

    np.random.seed(kwargs["seed"])
    write_metadata(imageEngine)

    kwargs["atlas_dir"], atlas_data = create_atlases_and_thumbs(imageEngine, kwargs["plot_id"], kwargs["use_cache"])

    if embeds is None:
        kwargs["vecs"] = get_embeddings(imageEngine, model_name=kwargs["embed_model"])
    else:
        kwargs["vecs"] = embeds

    get_manifest(imageEngine, atlas_data, **kwargs)
    # write_images(imageEngine)
    print(timestamp(), "Done!")

## Carlo's Test Functions

> Need to remove later

In [50]:
#| export

copy_root_dir = get_clip_plot_root()

def test_butterfly_duplicate(config):
    test_images = copy_root_dir/"tests/smithsonian_butterflies_10/jpgs_duplicates/**/*.jpg"
    test_output_dir = copy_root_dir/"tests/smithsonian_butterflies_10/output_test_temp"
    meta_dir = copy_root_dir/"tests/smithsonian_butterflies_10/meta_data/good_meta.csv"
    if Path(test_output_dir).exists():
        rmtree(test_output_dir)

    config["images"] = test_images.as_posix()
    config["output_dir"] = test_output_dir.as_posix()
    config["meta_dir"] = meta_dir.as_posix()
    config["plot_id"] = "test_diff"

    return config

def test_butterfly_missing_meta(config):
    test_images = copy_root_dir/"tests/smithsonian_butterflies_10/jpgs/*.jpg"
    test_output_dir = copy_root_dir/"tests/smithsonian_butterflies_10/output_test_temp"
    meta_dir = copy_root_dir/"tests/smithsonian_butterflies_10/meta_data/meta_missing_entry.csv"
    if Path(test_output_dir).exists():
        rmtree(test_output_dir)

    config["images"] = test_images.as_posix()
    config["output_dir"] = test_output_dir.as_posix()
    config["meta_dir"] = meta_dir.as_posix()
    config["plot_id"] = "test_diff"

    return config

def test_no_meta_dir(config):
    test_images = copy_root_dir/"tests/smithsonian_butterflies_10/jpgs/*.jpg"
    test_output_dir = copy_root_dir/"tests/smithsonian_butterflies_10/output_test_temp"
    if Path(test_output_dir).exists():
        rmtree(test_output_dir)

    config["images"] = test_images.as_posix()
    config["output_dir"] = test_output_dir.as_posix()
    config["plot_id"] = "test_diff"

    return config


# Project images

Command-line function thanks to @call_parse decorator

In [51]:
#| export

@call_parse
def project_images_cli(images:Param(type=str,
                        help="path or glob of images to process"
                        )=_DEFAULTS["images"],
                tables:Param(type=str,
                        help="path or glob of tables with image_path and embed_path columns (and optionally metadata)"
                        )=None,
                metadata:Param(type=str,
                        help="path to a csv or glob of JSON files with image metadata (see readme for format)"
                        )=_DEFAULTS["meta_dir"],
                tagline:Param(type=str,
                        help="tagline for image web page"
                        )="Images arranged by visual similarity",
                logo:Param(type=str,
                        help="path to a small, squarish logo -- SVG is best"
                        )=None,
                max_images:Param(type=int,
                        help="maximum number of images to process"
                        )=_DEFAULTS["max_images"],
                use_cache:Param(type=store_true,
                        help="given inputs identical to prior inputs, load outputs from cache"
                        )=_DEFAULTS["use_cache"],
                cluster_preproc_dims:Param(type=int,
                        help="number of dims to reduce to prior to clustering. -1 means don't reduce",
                        required=False
                        )=_DEFAULTS["cluster_preproc_dims"],
                min_cluster_size:Param(type=int,
                        help="the minimum number of images in a cluster",
                        required=False
                        )=_DEFAULTS["min_cluster_size"],
                max_clusters:Param(type=int,
                        help="the maximum number of clusters to return",
                        required=False
                        )=_DEFAULTS["max_clusters"],
                output_dir:Param(type=str,
                        help="the directory to which outputs will be saved",
                        required=False
                        )=_DEFAULTS["output_dir"],
                cell_size:Param(type=int,
                        help="the size of atlas cells in px",
                        required=False
                        )=_DEFAULTS["cell_size"],
                embed_model:Param(type=str,
                        help="pre-trained model from timm library to use to create embedding",
                        required=False
                        )=_DEFAULTS["embed_model"],
                n_neighbors:Param(type=int,
                        nargs="+",
                        help="the n_neighbors arguments for UMAP"
                        )=_DEFAULTS["n_neighbors"],
                min_dist:Param(type=float,
                        nargs="+",
                        help="the min_dist arguments for UMAP"
                        )=_DEFAULTS["min_dist"],
                umap_on_full_dims:Param(type=store_true,
                        help="skip PCA (faster dimensionality reduction) prior to UMAP"
                        )=_DEFAULTS["umap_on_full_dims"],
                n_components:Param(type=int,
                        help="the n_components argument for UMAP"
                        )=_DEFAULTS["n_components"],
                metric:Param(type=str,
                        help="the metric argument for umap"
                        )=_DEFAULTS["metric"],
                pointgrid_fill:Param(type=float,
                        help="float 0:1 that determines sparsity of jittered distributions (lower means more sparse)"
                        )=_DEFAULTS["pointgrid_fill"],
                copy_web_only:Param(type=store_true,
                        help="update ./output/assets without reprocessing data"
                        )=False,
                min_size:Param(type=float,
                        help="min size of cropped images"
                        )=_DEFAULTS["min_size"],
                shuffle:Param(type=store_true,
                        help="shuffle the input images before data processing begins"
                        )=False,
                plot_id:Param(type=str,
                        help="unique id for a plot; useful for resuming processing on a started plot"
                        )=_DEFAULTS["plot_id"],
                seed:Param(type=int, help="seed for random processes"
                           )=_DEFAULTS["seed"],
                n_clusters:Param(type=int,
                        help="number of clusters if using kmeans"
                        )=_DEFAULTS["n_clusters"],
                geojson:Param(type=str,
                        help="path to a GeoJSON file with shapes to be rendered on a map"
                        )=_DEFAULTS["geojson"]
                ):
                "Convert a folder of images into a clip-plot visualization"

                # grab local variables as configuration dict
                config = dict(locals())


                # some parameters exist in _DEFAULTS but not in the function signature
                default_only_keys = set(_DEFAULTS.keys() - config.keys())
                default_only = {k:_DEFAULTS[k] for k in default_only_keys}
                config.update(default_only)

                options = {
                        'shuffle': config['shuffle'], 
                        'seed': config['seed'], 
                        'max_images': config['max_images'], 
                        'atlas_size': config['atlas_size'], 
                        'cell_size': config['cell_size'], 
                        'lod_cell_height': config['lod_cell_height'], 
                        'validate': True, 
                }

                if not tables:
                        embeds = None
                        table = None
                else:
                        if images is not None: raise ValueError("Provide either tables or images parameter, not both.")
                        print(timestamp(), "Loading tables")
                        table = glob_to_tables(tables)
                        config["images"] = list(table.image_path.values)
                        images = config["images"]
                        print(timestamp(), "Loading embeddings from disk")
                        embeds = np.array([np.load(e) for e in tqdm(table.embed_path)])

                data_dir = os.path.join(config["output_dir"], "data")
                imageEngine = ImageFactory(config['images'], data_dir, config['meta_dir'], options)

                # grab metadata from table if provided
                if table is not None:
                        imageEngine.meta_headers, imageEngine.metadata = table_to_meta(table)

                print(f"Config to project images: {str(config)}")

                _project_images(imageEngine, embeds, **config)

In [52]:
#| export

# awful workaround because I think call_parse only works with sys.argv (cli)
project_images = project_images_cli.__wrapped__

### Embed images without building visualization

Outputs a table linking the embedding files to the images

Test with bash command:
```
clipplot_embed_images --images "tests/smithsonian_butterflies_10/jpgs/*.jpg" --output_dir "tests/smithsonian_butterflies_10/DELETEME_embed_only" --metadata "tests/smithsonian_butterflies_10/meta_data/good_meta.csv"
```

In [53]:
#| export

@call_parse
def embed_images_cli(images:Param(type=str,
                        help="path or glob of images to process"
                        )=_DEFAULTS["images"],
                embed_model:Param(type=str,
                        help="pre-trained model from timm library to use to create embedding",
                        required=False
                        )=_DEFAULTS["embed_model"],
                output_dir:Param(type=str,
                        help="the directory to which outputs will be saved",
                        required=False
                        )=_DEFAULTS["output_dir"],
                metadata:Param(type=str,
                        help="path to a csv or glob of JSON files with image metadata (see readme for format)"
                        )=_DEFAULTS["meta_dir"],
                table_id:Param(type=str,
                        help="identifier for table that links embeddings to images and (optionally) metadata",
                        required=False
                        )=str(uuid.uuid1()),
                table_format:Param(type=str,
                        choices=["parquet", "csv"],
                        help="format for table linking embeddings, images, and metadata",
                        required=False
                        )="parquet"
                ):
                "Embed a folder of images and save embeddings as .npy file to disk"

                # using Path.cwd() to handle ../ names -- not sure if this is superstitious
                data_dir = Path.cwd() / Path(output_dir).resolve() / "data"

                imageEngine = ImageFactory(img_path=images, data_dir=data_dir, meta_dir=metadata)

                embeddings = get_embeddings(imageEngine, model_name=embed_model)

                def _model_shortname(n: str) -> str:
                        return "__".join(n.split("/")[-2:])

                embs_dir = data_dir/f"embeddings_{_model_shortname(embed_model)}"
                embs_dir.mkdir(parents=True, exist_ok=True)
                emb_paths = write_embeddings(embeddings, imageEngine.filenames, embs_dir)

                df = pd.DataFrame({"image_path": imageEngine.image_paths,
                                   "image_filename": imageEngine.filenames,
                                   "embed_path": [str(e) for e in emb_paths]})

                if len(imageEngine.metadata) > 0:
                        df_meta = pd.DataFrame(imageEngine.metadata)
                        df_meta = df_meta.rename(columns={"filename": "image_filename"})
                        # drop "image_path" column if df_meta has it
                        if "image_path" in df_meta.columns:
                                df_meta = df_meta.drop(columns=["image_path"])

                        df = df.merge(df_meta.drop_duplicates(["image_filename"]), on="image_filename")

                ## standardize sort order of table
                # put standard columns first if they exist in df
                standard_cols = pd.Index(["image_path", "image_filename", "embed_path", "category", "tags", "x", "y"])
                cols_sorted = standard_cols.intersection(df.columns)
                # append non-standard columns, sorted alphabetically
                cols_sorted = cols_sorted.append(df.columns.difference(standard_cols).sort_values())
                df = df[cols_sorted]

                if table_format == "csv":
                        df.to_csv(data_dir / f"EmbedImages__{table_id}.csv", index=False)
                else: df.to_parquet(data_dir / f"EmbedImages__{table_id}.parquet", index=False)

In [54]:
#| export

# awful workaround because I think call_parse only works with sys.argv (cli)
embed_images = embed_images_cli.__wrapped__

In [55]:
#| hide

def test_butterfly():
    test_images = copy_root_dir/"tests/smithsonian_butterflies_10/jpgs/*.jpg"
    test_output_dir = copy_root_dir/"tests/smithsonian_butterflies_10/output_test_temp"
    meta = copy_root_dir/"tests/smithsonian_butterflies_10/meta_data/good_meta.csv"
    if Path(test_output_dir).exists():
        rmtree(test_output_dir)
    
    project_images(images=test_images.as_posix(),
                   output_dir=test_output_dir.as_posix(),
                   metadata=meta.as_posix(),
                   plot_id="test_diff",
                   )

    return

In [56]:
#| hide

if __name__ == "__main__" and in_ipython():
    test_butterfly()


2025-10-13 14:52:30.733645: First three paths: /home/wsanger/git/clip-plot/tests/smithsonian_butterflies_10/jpgs/30aeb051d-ee0d-4c5b-8a85-a8da7baef5fd.jpg
/home/wsanger/git/clip-plot/tests/smithsonian_butterflies_10/jpgs/9fea3150-a3d4-11ed-aeea-e36f1256f233.jpg
/home/wsanger/git/clip-plot/tests/smithsonian_butterflies_10/jpgs/329a4c094-8536-4396-be70-3d9b5d0744d9.jpg
2025-10-13 14:52:30.733870: Validating input images


  0%|          | 0/12 [00:00<?, ?it/s]

2025-10-13 14:52:31.007714: No metadata directory was provided.
Config to project images: {'images': '/home/wsanger/git/clip-plot/tests/smithsonian_butterflies_10/jpgs/*.jpg', 'tables': None, 'metadata': '/home/wsanger/git/clip-plot/tests/smithsonian_butterflies_10/meta_data/good_meta.csv', 'tagline': 'Images arranged by visual similarity', 'logo': None, 'max_images': None, 'use_cache': True, 'cluster_preproc_dims': -1, 'min_cluster_size': 20, 'max_clusters': 10, 'output_dir': '/home/wsanger/git/clip-plot/tests/smithsonian_butterflies_10/output_test_temp', 'cell_size': 32, 'embed_model': 'timm/convnext_tiny.dinov3_lvd1689m', 'n_neighbors': [15], 'min_dist': [0.01], 'umap_on_full_dims': False, 'n_components': 2, 'metric': 'correlation', 'pointgrid_fill': 0.05, 'copy_web_only': False, 'min_size': 100, 'shuffle': False, 'plot_id': 'test_diff', 'seed': 24, 'n_clusters': 12, 'geojson': None, 'meta_dir': None, 'min_score': 0.3, 'lod_cell_height': 128, 'atlas_size': 2048, 'min_vertices': 18}


TypeError: copy_web_assets() got an unexpected keyword argument 'output_dir'

In [None]:
#| hide
import nbdev; nbdev.nbdev_export()