In [None]:
#| default_exp hotspots

In [None]:
#| hide
# Development convenience: with autoreload mode 2, IPython re-imports modules
# before running each cell, so edits to the package are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
#| export

import multiprocessing
from collections import defaultdict
from pathlib import Path

import numpy as np
# from hdbscan import HDBSCAN
from sklearn.preprocessing import normalize
from sklearn.cluster import HDBSCAN

from clip_plot.configuration import ClusterSpec
from clip_plot.images import ImageFactory
from clip_plot.utils import get_json_path, timestamp, write_json


In [None]:
#| export

def get_cluster_model(min_cluster_size: int = 15):
    """Build an unfitted HDBSCAN estimator for clustering input vectors.

    `min_cluster_size` is forwarded to the estimator unchanged; every other
    setting is fixed here. The returned object exposes `.fit()`.
    """
    settings = {
        "n_jobs": -1,
        "min_cluster_size": min_cluster_size,
        "min_samples": 1,
        "metric": "cosine",
        "cluster_selection_method": "eom",
        "alpha": 1.0,
        "allow_single_cluster": False,
        # distances are computed by brute force rather than via a tree index
        "algorithm": "brute",
    }
    return HDBSCAN(**settings)

In [None]:
#| export

def get_hotspots(imageEngine: ImageFactory,
                 vecs: np.ndarray,
                 data_dir: Path, plot_id: str,
                 cluster_spec: ClusterSpec,
                 layout_name: str = "umap_base_layout",
                 max_cluster_fraction: float = 0.5,
                ):
    """Cluster the image vectors with HDBSCAN and save the resulting hotspots as JSON.

    Each hotspot is a dict with the keys `images` (indices of the member
    vectors), `img` (the `unique_name` of the cluster's last member, used as
    its representative image), `layout` and `label` ("Cluster 1" is the
    largest cluster).

    Args:
        imageEngine: indexable by position; each item must expose `unique_name`.
            Assumed to be aligned row-for-row with `vecs` -- TODO confirm at call site.
        vecs: one vector per image; rows are L2-normalised before clustering.
        data_dir: forwarded to `get_json_path` and `write_json`.
        plot_id: forwarded to `get_json_path`.
        cluster_spec: supplies `min_cluster_size` (for the model) and
            `max_clusters` (how many of the largest clusters to keep).
        layout_name: stored verbatim in every hotspot's `layout` field.
        max_cluster_fraction: clusters holding more than this share of all
            images are discarded. Defaults to 0.5.

    Returns:
        The path produced by `get_json_path`, where the hotspots were written.
    """
    print(timestamp(), "Clustering data with HDBSCAN")
    model = get_cluster_model(cluster_spec.min_cluster_size)
    X_norm = normalize(vecs, norm="l2")
    labels = model.fit(X_norm).labels_

    # Map each cluster label to the indices of its member images.
    # scikit-learn's HDBSCAN marks noise as -1 and reserves -2 / -3 for rows with
    # infinite / missing values, so every negative label is treated as "no cluster".
    members = defaultdict(list)
    for idx, label in enumerate(labels):
        if label >= 0:
            members[label].append(idx)

    # Drop clusters that hold too large a share of all images.
    n_images = len(vecs)
    kept = [
        indices for indices in members.values()
        if len(indices) / n_images <= max_cluster_fraction
    ]

    # Largest clusters first; the sort is stable, so ties keep first-seen order.
    kept.sort(key=len, reverse=True)

    # Keep the first `max_clusters` and number them by rank. The representative
    # image is looked up once per emitted cluster rather than once per member.
    clusters = [
        {
            "images": indices,
            "img": imageEngine[indices[-1]].unique_name,
            "layout": layout_name,
            "label": f"Cluster {rank}",
        }
        for rank, indices in enumerate(kept[: cluster_spec.max_clusters], start=1)
    ]

    # save the hotspots to disk and return the path to the saved json
    print(timestamp(), "Found", len(clusters), "hotspots")
    json_path = get_json_path(data_dir, "hotspots", plot_id, "hotspot",)
    write_json(json_path, data_dir=data_dir, obj=clusters)
    return json_path

In [None]:
#|hide
# Run nbdev's exporter so that cells marked `#| export` are written out to the
# library's Python modules (this notebook targets the module named by `default_exp`).
import nbdev

nbdev.nbdev_export()