In [None]:
# importing modules 
%matplotlib inline
import matplotlib.pyplot as plt
import os
import random
import sys
import astropy.table
import numpy as np
import matplotlib.pyplot as plt
sys.path.insert(0,os.path.dirname(os.getcwd()))
import btk
import btk.config, btk.plot_utils
import hdbscan

In [None]:
# Simulate one batch of blended galaxy images with the BlendingToolKit (btk).
# Adapted from:
# https://github.com/LSSTDESC/BlendingToolKit/blob/master/notebooks/custom_sampling_function.ipynb

# path of the input galaxy catalog
catalog_name = os.path.join('data', 'sample_input_catalog.fits')

# simulation parameters:
# max_number = maximum number of galaxies in image (10), batch_size = number of images (100)
param = btk.config.Simulation_params(catalog_name, max_number=10, batch_size=100)
np.random.seed(param.seed)

# input catalog -> blend catalogs -> observing conditions -> rendered images
catalog = btk.get_input_catalog.load_catalog(param)
blend_generator = btk.create_blend_generator.generate(param, catalog)
observing_generator = btk.create_observing_generator.generate(param)
draw_blend_generator = btk.draw_blends.generate(param, blend_generator, observing_generator)

# draw a new batch of batch_size blends and unpack the pieces used below
blend_results = next(draw_blend_generator)
output = blend_results
blend_images, isolated_images, blend_list, obs_cond = (
    output[key] for key in ('blend_images', 'isolated_images', 'blend_list', 'obs_condition')
)

# optional quick look at the first ten blends (off by default)
plot = False
if plot:
    btk.plot_utils.plot_blends(blend_images[:10], blend_list[:10], limits=(30,90))

In [None]:
# plot the first 20 blends in multi-color, together with their entries in blend_list
# NOTE(review): the meaning of limits=(30,90) is defined by btk.plot_utils.plot_blends — presumably a pixel range to display; confirm
btk.plot_utils.plot_blends(blend_images[:20], blend_list[:20], limits=(30,90))

The reported noise level of the images agrees well with the actual pixel histogram:

In [None]:
# Background and noise of the images: collect the mean sky level of every band
# from the observing conditions of the first blend.
sky_level = [oc.mean_sky_level for oc in obs_cond[0]]  # same values of each obs_cond?
background = np.array(sky_level)
# per-band noise std is taken to be the square root of the sky level
std_background = np.sqrt(background)
# noise std of the band-summed image: per-band variances added in quadrature
std_sum = np.sqrt(np.sum(std_background**2))

# histogram of the pixel values of a single band (index 0) over +-3 sigma,
# compared with a zero-mean Gaussian of the reported width
bins = np.linspace(-3*std_background[0], 3*std_background[0], 50)
plt.hist(blend_images[:,:,:,0].flatten(), bins=bins, density=True);
from scipy.stats import norm
plt.plot(bins, norm.pdf(bins, scale=std_background[0]))

Let's try the same intensity normalization we've been using before:

In [None]:
def normalize_channels(img, std_sum):
    """Divide every pixel's band values by the pixel's band-summed intensity.

    The divisor is floored at ``std_sum`` so that pixels whose summed
    intensity is at or below the noise level are not blown up by a division
    by (nearly) zero.

    Parameters
    ----------
    img : array indexed as (row, column, band)
    std_sum : lower bound applied to the per-pixel band sum

    Returns
    -------
    Array of the same shape as ``img``.
    """
    band_total = img.sum(axis=-1)
    divisor = np.maximum(std_sum, band_total)
    return img / divisor[:, :, None]

def prepare_data(img, threshold, alpha=1):
    """Build the per-pixel feature matrix used for clustering.

    Each row describes one pixel: its x and y index, both multiplied by
    ``alpha``, followed by its channel-normalized band values (see
    ``normalize_channels``, called with ``threshold`` as the noise floor).
    Rows are produced for *all* pixels; the returned mask only flags which
    pixels have a band sum above ``threshold``.

    Parameters
    ----------
    img : array of shape (Ny, Nx, C)
    threshold : detection threshold on the band-summed intensity
    alpha : scale factor applied to the pixel coordinates

    Returns
    -------
    data : array of shape (Ny * Nx, 2 + C)
    mask : boolean array of shape (Ny, Nx)
    """
    Ny, Nx, C = img.shape
    mask = img.sum(axis=-1) > threshold
    img_ = normalize_channels(img, threshold)

    x, y = np.meshgrid(np.arange(Nx), np.arange(Ny))
    columns = [alpha * x.flatten(), alpha * y.flatten()]
    columns.extend(img_[:, :, c].flatten() for c in range(C))
    data = np.stack(columns, axis=1)
    return data, mask


# cluster the pixels of one blend with the default (Euclidean) metric
i = 0
Ny, Nx, C = blend_images[i].shape
mcs = 10
data, mask = prepare_data(blend_images[i], std_sum * 3, alpha=0.01)
clusterer = hdbscan.HDBSCAN(
    min_cluster_size=mcs,
    min_samples=5,
    cluster_selection_method='leaf',
    allow_single_cluster=True,
)
labels = clusterer.fit_predict(data)
uniq_labels = np.unique(labels)
print(uniq_labels)
# data holds one row per pixel, so the labels map back onto the image grid
label_img = labels.reshape(mask.shape)

# plot result: band-summed image with catalog positions (left), label image (right)
fig, axes = plt.subplots(1, 2, figsize=(12,6))
summed_ax, label_ax = axes
summed_ax.imshow(blend_images[i].sum(axis=-1))
summed_ax.scatter(blend_list[i]['dx'], blend_list[i]['dy'], color='r', marker='x')
label_ax.imshow(label_img, cmap='viridis')

The problem with this approach is that the images are pretty noisy, which suggests using high values of $\alpha$ so that the spatial terms dominate the mutual pixel distances for clustering. However, no setting of $\alpha$ I've tried works well for both bright and faint sources. Fundamentally, we need to replace the metric (which is Euclidean by default) with something more appropriate for this problem.

I will now use a custom distance metric and precompute it, so that clustering directly sees the pairwise distance matrix.

In particular, I compute distance as

$$
1 - r(v'_i,v'_j) + \alpha^2 d(x_i, x_j)^2
$$

that is, the combination of a color term and a spatial distance term. The first is based on the Pearson correlation coefficient $r$ of the values (that is, the band intensities) in pixels $i$ and $j$; the second is the squared distance of their xy positions in the image, scaled by $\alpha$ so that the relative importance of the spatial features can be adjusted.

To standardize the results of the correlation coefficient, we need to account for the different variance in each channel, so

$$
v'_i = v_i / \sigma_{bg}
$$

where the denominator is the vector of standard deviations of the background noise in each channel. This gives an $N\times N$ distance matrix, which is passed to HDBSCAN.

As success metric, we compute the Intersection-over-Union (a [standard measure](https://www.pyimagesearch.com/2016/11/07/intersection-over-union-iou-for-object-detection/) for detection algorithms) of the labeled pixels (ignoring the -1 outlier pixels) against the true footprints of detectable objects (pixels where the true image of the single source is above the detection threshold). The result is a matrix $Y$ with shape $N_{clustered} \times N_{detectable}$, which is ideally close to the identity matrix.

Because the clustering labels can come in any order and their number can differ from the number of detectable objects, we compute the cross-correlation matrix $C = Y^\top Y$ of the labels with respect to the true indices (the code below uses its element-wise square root). This is a common clustering metric (e.g. arXiv:1508.04306). The squared deviation from the identity $\mathbb{1}_{N_{detectable}}$ is thus our optimization loss function.

In [None]:
def sim_matrix(img, threshold, alpha, normalization=None):
    """Pairwise color + spatial distance matrix of the pixels above threshold.

    For every pair (i, j) of selected pixels the distance is

        (1 - r_ij) + alpha**2 * |xy_i - xy_j|**2 / 2

    where ``r_ij`` is the Pearson correlation coefficient of the two pixels'
    band values and ``xy`` are integer pixel indices. Note the factor 1/2 on
    the squared spatial distance.

    Parameters
    ----------
    img : array of shape (Ny, Nx, C)
    threshold : pixels whose band sum exceeds this value are selected
    alpha : scale of the spatial term relative to the color term
    normalization : optional array that ``img`` is divided by before the
        correlation is computed (must broadcast against ``img``); ``None``
        uses ``img`` unchanged

    Returns
    -------
    dist : array of shape (N, N), N being the number of selected pixels, in
        the row-major order of ``mask``
    mask : boolean array of shape (Ny, Nx) marking the selected pixels
    """
    Ny, Nx, C = img.shape

    # work on pixels above threshold
    mask = img.sum(axis=-1) > threshold

    _img = img if normalization is None else img / normalization

    # Pearson r between the band vectors of every pair of selected pixels
    r = np.corrcoef(_img[mask, :].reshape(-1, C))

    # pairwise squared Euclidean distance of the pixel indices (halved)
    x, y = np.meshgrid(np.arange(Nx), np.arange(Ny))
    xy = np.stack((x[mask], y[mask]), axis=1)
    R2 = ((xy[:, None] - xy[None, :])**2).sum(axis=-1) / 2

    # combine color and spatial distance, with alpha scaling
    dist = (1 - r) + alpha**2 * R2
    return dist, mask

def iou_matrix(footprints, uniq_labels, label_img):
    """Intersection-over-Union between every cluster and every true footprint.

    Parameters
    ----------
    footprints : boolean array indexed as (object, row, column); objects
        whose footprint is empty are skipped
    uniq_labels : sorted unique cluster labels (as returned by ``np.unique``);
        negative labels (-1 marks outliers) are ignored
    label_img : array of shape (row, column) holding the cluster label of
        each pixel

    Returns
    -------
    iou : array of shape (num_clustered, num_objects). Rows follow the order
        of the non-negative entries of ``uniq_labels``, columns the order of
        the non-empty footprints.
    """
    has_object = footprints.any(axis=(1, 2))
    true_footprints = footprints[has_object]
    cluster_labels = [ll for ll in uniq_labels if ll > -1]

    iou = np.zeros((len(cluster_labels), len(true_footprints)))
    # index rows by position rather than by label value, so that label sets
    # with gaps (or no labels at all) do not raise an IndexError
    for row, ll in enumerate(cluster_labels):
        fp_label = label_img == ll
        for col, fp_true in enumerate(true_footprints):
            # union cannot be zero: fp_true has at least one pixel set
            union = (fp_true | fp_label).sum()
            intersection = (fp_true & fp_label).sum()
            iou[row, col] = intersection / union
    return iou

In [None]:
# cluster one blend with the precomputed color/spatial metric and compare to the truth
i = 0
mcs = 5

clusterer = hdbscan.HDBSCAN(metric='precomputed', 
                            min_cluster_size=mcs, 
                            min_samples=1, 
                            cluster_selection_method='eom',
                            allow_single_cluster=True,
                           )

# alpha and threshold values from optimization below
# can pick any positive number for alpha, and something of order std_sum for threshold
# NOTE: for any other mcs, alpha and threshold keep whatever value they had before this cell
if mcs == 10:
    alpha, threshold = 1.65473326e-01, 2.80763024e+03 # 1.07263079e-01, 2.80763543e+03
elif mcs == 5:
    alpha, threshold = 1.53615598e-01, 2.80763018e+03
    
# all channels unit variance
normalization =  std_background[None,None,:]
    
# compute detection mask and color/spatial distances
X, mask = sim_matrix(blend_images[i], threshold, alpha, normalization=normalization)

# cluster distance matrix
labels = clusterer.fit_predict(X)
uniq_labels = np.unique(labels)
print(uniq_labels)
# pixels below the detection threshold are marked -2 (distinct from the -1 outliers)
label_img = np.ones(mask.shape) * -2
label_img[mask] = labels

# check overlap with true footprints
footprints = isolated_images[i].sum(axis=-1) > std_sum
has_object = footprints.any(axis=(1,2))
num_objects = has_object.sum()
Y = iou_matrix(footprints, uniq_labels, label_img)

# plot result
fig, axes = plt.subplots(1, 2, figsize=(12,6))
import scarlet.display
# named display_norm so that it does not shadow scipy.stats.norm imported in an earlier cell
display_norm = scarlet.display.LinearPercentileNorm(blend_images[i].sum(axis=-1), percentiles=[10,99])
axes[0].imshow(scarlet.display.img_to_rgb(blend_images[i].sum(axis=-1), norm=display_norm))
#axes[0].scatter(blend_list[i]['dx'], blend_list[i]['dy'], color='r', marker='x')
# use a separate loop variable: reusing `i` here would silently change the
# blend index that later cells read
for obj_idx, obj in enumerate(blend_list[i]):
    axes[0].text(obj['dx'], obj['dy'], '{}'.format(obj_idx), color='r')
axes[0].imshow(label_img, cmap='jet', alpha=0.5)


# iou is ideally a one-hot encoding of the index of the matching source
# the cluster labels are randomly permuted, so compute cross-correlation matrix of true indices
D = np.sqrt(Y.T @ Y)
# use the squared deviation from identity as loss function
_loss = ((D - np.eye(num_objects))**2).sum()
axes[1].imshow(D, vmin=0, vmax=1)
# label both axes with the original indices of the detectable objects
axes[1].set_xticks(np.arange(num_objects))
axes[1].set_xticklabels(np.flatnonzero(has_object))
axes[1].set_yticks(np.arange(num_objects))
axes[1].set_yticklabels(np.flatnonzero(has_object))
print(_loss)

In [None]:
# combine the abive in single function
def cl_loss(alpha, threshold, clusterer, img, fp_threshold=None, normalization=None):
    # cluster data
    X, mask = sim_matrix(img, threshold, alpha, normalization=normalization)
    labels = clusterer.fit_predict(X)
    uniq_labels = np.unique(labels)
    label_img = np.ones(mask.shape) * -2
    label_img[mask] = labels

    # compare clustering label image to footprints
    if fp_threshold is None:
        fp_threshold = threshold

    footprints = isolated_images[i].sum(axis=-1) > fp_threshold
    Y = iou_matrix(footprints, uniq_labels, label_img)
    D = np.sqrt(Y.T @ Y)
    _loss = ((D - np.eye(D.shape[0]))**2).sum()
    return _loss


mcs = 10
clusterer = hdbscan.HDBSCAN(metric='precomputed', 
                            min_cluster_size=mcs, 
                            min_samples=1, 
                            cluster_selection_method='eom',
                            allow_single_cluster=True,
                           )

# compute detection mask and color/spatial distances
threshold = std_sum * 5
normalization = std_background[None,None,:]
alpha = 5e-2
img = blend_images[0]
cl_loss(alpha, threshold, clusterer, img, fp_threshold=std_sum, normalization=normalization)

In [None]:
# perform optimization of alpha and threshold
from scipy.optimize import minimize

# running this will cost your computer ~1 hour of its life...
mcs = 10
clusterer = hdbscan.HDBSCAN(metric='precomputed', 
                            min_cluster_size=mcs, 
                            min_samples=1, 
                            cluster_selection_method='eom',
                            allow_single_cluster=True,
                           )

loss = lambda p: np.sum([ cl_loss(p[0], p[1], clusterer, img, fp_threshold=std_sum, normalization=normalization) for img in blend_images ])
minimize(loss, (1e-1, std_sum * 3), bounds=((0, 1), (std_sum, std_sum * 10)), options={'maxiter': 50, 'eps': (1e-2, std_sum)})

In [None]:
# now with mcs = 5
mcs = 5
clusterer = hdbscan.HDBSCAN(metric='precomputed', 
                            min_cluster_size=mcs, 
                            min_samples=1, 
                            cluster_selection_method='eom',
                            allow_single_cluster=True,
                           )

from scipy.optimize import minimize
loss = lambda p: np.sum([ cl_loss(p[0], p[1], clusterer, img, fp_threshold=std_sum, normalization=normalization) for img in blend_images ])
minimize(loss, (1e-1, std_sum * 3), bounds=((0, 1), (std_sum, std_sum * 10)), options={'maxiter': 50, 'eps': (1e-2, std_sum)})

The results with these settings are quite good for the fainter sources, but the algorithm is struggling with the overlap between brighter sources. In particular, detecting a moderately bright source in the footprint of a very bright source (like in #4) remains challenging. This problem is caused by having only one scaling term $\alpha$ for the entire population. 

Brighter objects could probably work with a smaller setting of $\alpha$ because their color information is very accurate, while small and faint sources depend more heavily on the spatial distance information.

As a result, one needs to either set $\alpha$ adaptively or work with some multi-scale approach (evaluating at a grid of $\alpha$ and then picking the best).

---

from earlier ...

In [None]:
# find the detection efficiency of the HDBSCAN for a set of images with known numbers of clusters
# as a function of the minimum cluster size parameter

# using parameter values from 5-50, in increments of 5
min_cluster_sizes = [5, 10, 15, 20, 25, 30, 35, 40, 45, 50]
# mean detection efficiency per parameter value, for each selection method ('eom' / 'leaf')
# on raw and on normalised data
eom_detection_efficiency = []
eom_norm_detection_efficiency = []
leaf_detection_efficiency = []
leaf_norm_detection_efficiency = []

# find the detection efficiency for each value used for the minimum cluster size parameter
for mcs in min_cluster_sizes:
    eom_efficiency = []
    eom_norm_efficiency = []
    leaf_efficiency = []
    leaf_norm_efficiency = []

    # get results using both 'eom' (the HDBSCAN default) and 'leaf' cluster selection methods
    eom_clusterer = hdbscan.HDBSCAN(min_cluster_size=mcs)
    leaf_clusterer = hdbscan.HDBSCAN(min_cluster_size=mcs, cluster_selection_method='leaf')

    for i in range(len(blend_list)):
        # get the actual number of clusters in this image
        true_centers = np.stack([blend_list[i]['dx'], blend_list[i]['dy']]).T
        actual_k = len(true_centers)

        # one row per pixel: scaled x and y position followed by the band values
        # (image shape is read from the data instead of hard-coding 120 x 120 x 6)
        Ny, Nx, C = blend_images[i].shape
        img = blend_images[i].reshape(-1, C)
        x, y = np.meshgrid(0.1*np.arange(Nx), 0.1*np.arange(Ny))
        data = np.column_stack([x.flatten(), y.flatten(), img])

        # normalise only the band columns by the pixel's band sum, floored at the
        # per-band noise level to reduce the number of radical outliers; the
        # position columns are left untouched
        norm_data = data.astype('float')
        norm_data[:, 2:] /= np.maximum(std_background, img.sum(axis=1)[:, None])

        runs = (
            (eom_clusterer, data, eom_efficiency),
            (eom_clusterer, norm_data, eom_norm_efficiency),
            (leaf_clusterer, data, leaf_efficiency),
            (leaf_clusterer, norm_data, leaf_norm_efficiency),
        )
        for fitted, features, efficiencies in runs:
            fitted.fit(features)
            # HDBSCAN labels clusters 0..k-1 and noise -1, so the number of
            # clusters is max + 1 (0 when everything is noise)
            detected_k = fitted.labels_.max() + 1
            efficiencies.append(detected_k/actual_k)

    # take the efficiency to be the mean of the number of clusters detected divided by the actual number of clusters for each image
    eom_detection_efficiency.append(np.mean(eom_efficiency))
    eom_norm_detection_efficiency.append(np.mean(eom_norm_efficiency))
    leaf_detection_efficiency.append(np.mean(leaf_efficiency))
    leaf_norm_detection_efficiency.append(np.mean(leaf_norm_efficiency))

In [None]:
# Detection efficiency as a function of the minimum cluster size parameter:
# one figure per selection method, raw data on the left, normalised on the right.

# 'eom' selection method
eom_fig = plt.figure()
eom_ax = eom_fig.add_subplot(121)
eom_norm_ax = eom_fig.add_subplot(122)
eom_sc = eom_ax.scatter(min_cluster_sizes, eom_detection_efficiency)
eom_norm_sc = eom_norm_ax.scatter(min_cluster_sizes, eom_norm_detection_efficiency)

# 'leaf' selection method
leaf_fig = plt.figure()
leaf_ax = leaf_fig.add_subplot(121)
leaf_norm_ax = leaf_fig.add_subplot(122)
leaf_sc = leaf_ax.scatter(min_cluster_sizes, leaf_detection_efficiency)
leaf_norm_sc = leaf_norm_ax.scatter(min_cluster_sizes, leaf_norm_detection_efficiency)

# all four panels share their axis labels and differ only in the title
panel_titles = (
    (eom_ax, "EOM Selection on Raw Data"),
    (eom_norm_ax, "EOM Selection on Normalised Data"),
    (leaf_ax, "Leaf Selection on Raw Data"),
    (leaf_norm_ax, "Leaf Selection on Normalised Data"),
)
for panel, panel_title in panel_titles:
    panel.set_title(panel_title)
    panel.set_ylabel("Ratio of Detected Clusters to Actual Clusters")
    panel.set_xlabel("Minimum Cluster Size Parameter Value")

eom_fig.tight_layout()
leaf_fig.tight_layout()

In [None]:
# find the detection efficiency of the HDBSCAN for a set of images with known numbers of clusters
# as a function of the number of clusters in the image

# using parameter values from 5-50, in increments of 5
min_cluster_sizes = [5, 10, 15, 20, 25, 30, 35, 40, 45, 50]
# the number of clusters in each image (clusters: one entry per image,
# all_clusters: one entry per image and parameter value, image-major order)
clusters = [] 
all_clusters = []
# detection efficiencies for 'eom' / 'leaf' selection on raw / normalised data;
# the plain lists hold one value per image (averaged over parameter values),
# the total_* lists one value per image and parameter value (image-major order)
eom_detection_efficiency = []
total_eom_detection_efficiency = []
eom_norm_detection_efficiency = []
total_eom_norm_detection_efficiency = []
leaf_detection_efficiency = []
total_leaf_detection_efficiency = []
leaf_norm_detection_efficiency = []
total_leaf_norm_detection_efficiency = []

# find the average number of clusters detected using HDBSCAN for each image
for i in range(len(blend_list)):
    eom_efficiency = []
    eom_norm_efficiency = []
    leaf_efficiency = []
    leaf_norm_efficiency = []

    # one row per pixel: scaled x and y position followed by the band values
    # (image shape is read from the data instead of hard-coding 120 x 120 x 6)
    Ny, Nx, C = blend_images[i].shape
    img = blend_images[i].reshape(-1, C)
    x, y = np.meshgrid(0.1*np.arange(Nx), 0.1*np.arange(Ny))
    data = np.column_stack([x.flatten(), y.flatten(), img])

    # normalise only the band columns by the pixel's band sum, floored at the
    # per-band noise level to reduce the number of radical outliers
    norm_data = data.astype('float')
    norm_data[:, 2:] /= np.maximum(std_background, img.sum(axis=1)[:, None])

    # get the actual number of clusters from the image
    true_centers = np.stack([blend_list[i]['dx'], blend_list[i]['dy']]).T
    actual_k = len(true_centers)
    clusters.append(actual_k)

    # (selection method, per-image counts raw/normalised, running totals raw/normalised)
    methods = (
        ('eom', eom_efficiency, eom_norm_efficiency,
         total_eom_detection_efficiency, total_eom_norm_detection_efficiency),
        ('leaf', leaf_efficiency, leaf_norm_efficiency,
         total_leaf_detection_efficiency, total_leaf_norm_detection_efficiency),
    )

    # use HDBSCAN to estimate the number of clusters using a series of values for the minimum cluster size parameter
    for mcs in min_cluster_sizes:
        all_clusters.append(actual_k)

        for method, raw_counts, norm_counts, raw_total, norm_total in methods:
            clusterer = hdbscan.HDBSCAN(min_cluster_size=mcs, cluster_selection_method=method)
            for features, counts, total in ((data, raw_counts, raw_total),
                                            (norm_data, norm_counts, norm_total)):
                clusterer.fit(features)
                # HDBSCAN labels clusters 0..k-1 and noise -1, so the number
                # of clusters is max + 1 (0 when everything is noise)
                detected_k = clusterer.labels_.max() + 1
                counts.append(detected_k)
                total.append(detected_k/actual_k)

    # take the efficiency to be the mean of the number of clusters detected divided by the actual number of clusters for every image
    eom_detection_efficiency.append(np.mean(eom_efficiency)/actual_k)
    eom_norm_detection_efficiency.append(np.mean(eom_norm_efficiency)/actual_k)
    leaf_detection_efficiency.append(np.mean(leaf_efficiency)/actual_k)
    leaf_norm_detection_efficiency.append(np.mean(leaf_norm_efficiency)/actual_k)

In [None]:
# plotting the detection efficiency as a function of actual number of clusters
# using a colour map to show the average magnitude of the galaxies in an image
# (plt.cm.get_cmap was removed in Matplotlib 3.9; plt.get_cmap is the supported spelling)
cm = plt.get_cmap('RdYlBu')

# NOTE(review): `magnitudes` is not defined anywhere in the visible notebook;
# it must hold one value per image (same length as `clusters`) before this
# cell can run — presumably the mean catalog magnitude of each blend; confirm.

# plotting raw and normalised data using 'eom' selection method
eom_fig = plt.figure()
eom_ax = eom_fig.add_subplot(121)
eom_norm_ax = eom_fig.add_subplot(122)
eom_sc = eom_ax.scatter(clusters, eom_detection_efficiency, c=magnitudes, cmap=cm)
eom_norm_sc = eom_norm_ax.scatter(clusters, eom_norm_detection_efficiency, c=magnitudes, cmap=cm)
# add average ratio for number of clusters?
plt.colorbar(eom_sc, ax=eom_ax)
eom_cb = plt.colorbar(eom_norm_sc, ax=eom_norm_ax)

# plotting raw and normalised data using 'leaf' selection method
leaf_fig = plt.figure()
leaf_ax = leaf_fig.add_subplot(121)
leaf_norm_ax = leaf_fig.add_subplot(122)
leaf_sc = leaf_ax.scatter(clusters, leaf_detection_efficiency, c=magnitudes, cmap=cm)  # magnitudes vs bulge magnitudes
leaf_norm_sc = leaf_norm_ax.scatter(clusters, leaf_norm_detection_efficiency, c=magnitudes, cmap=cm)
# add average ratio for number of clusters?
plt.colorbar(leaf_sc, ax=leaf_ax)
leaf_cb = plt.colorbar(leaf_norm_sc, ax=leaf_norm_ax)

# labelling and organising 'eom' axes 
eom_ax.set_title("EOM Selection on Raw Data")
eom_ax.set_ylabel("Ratio of Detected Clusters to Actual Clusters")
eom_ax.set_xlabel("Actual Number of Clusters")
eom_cb.set_label("Average Magnitude of Clusters")
eom_norm_ax.set_title("EOM Selection on Normalised Data")
eom_norm_ax.set_ylabel("Ratio of Detected Clusters to Actual Clusters")
eom_norm_ax.set_xlabel("Actual Number of Clusters")
eom_fig.tight_layout()

# labelling and organising 'leaf' axes
leaf_ax.set_title("Leaf Selection on Raw Data")
leaf_ax.set_ylabel("Ratio of Detected Clusters to Actual Clusters")
leaf_ax.set_xlabel("Actual Number of Clusters")
leaf_cb.set_label("Average Magnitude of Clusters")
leaf_norm_ax.set_title("Leaf Selection on Normalised Data")
leaf_norm_ax.set_ylabel("Ratio of Detected Clusters to Actual Clusters")
leaf_norm_ax.set_xlabel("Actual Number of Clusters")
leaf_fig.tight_layout()

In [None]:
# plotting the detection efficiency as a function of actual number of clusters for every parameter value
# using a colour map to show the minimum cluster size parameter
# (plt.cm.get_cmap was removed in Matplotlib 3.9; plt.get_cmap is the supported spelling)
cm = plt.get_cmap('Spectral')
# all_clusters and the total_* lists were filled image by image, with the
# parameter values cycling fastest; the colour values must use that same order
min_cluster_vals = [mc for _ in range(len(blend_list)) for mc in min_cluster_sizes]

# plotting raw and normalised data using 'eom' selection method
eom_fig = plt.figure()
eom_ax = eom_fig.add_subplot(121)
eom_norm_ax = eom_fig.add_subplot(122)
eom_sc = eom_ax.scatter(all_clusters, total_eom_detection_efficiency, c=min_cluster_vals, cmap=cm)
eom_norm_sc = eom_norm_ax.scatter(all_clusters, total_eom_norm_detection_efficiency, c=min_cluster_vals, cmap=cm)
# add average ratio for number of clusters?
plt.colorbar(eom_sc, ax=eom_ax)
eom_cb = plt.colorbar(eom_norm_sc, ax=eom_norm_ax)

# plotting raw and normalised data using 'leaf' selection method
leaf_fig = plt.figure()
leaf_ax = leaf_fig.add_subplot(121)
leaf_norm_ax = leaf_fig.add_subplot(122)
leaf_sc = leaf_ax.scatter(all_clusters, total_leaf_detection_efficiency, c=min_cluster_vals, cmap=cm)
leaf_norm_sc = leaf_norm_ax.scatter(all_clusters, total_leaf_norm_detection_efficiency, c=min_cluster_vals, cmap=cm)
# add average ratio for number of clusters?
plt.colorbar(leaf_sc, ax=leaf_ax)
leaf_cb = plt.colorbar(leaf_norm_sc, ax=leaf_norm_ax)

# labelling and organising 'eom' axes 
eom_ax.set_title("EOM Selection on Raw Data")
eom_ax.set_ylabel("Ratio of Detected Clusters to Actual Clusters")
eom_ax.set_xlabel("Actual Number of Clusters")
eom_cb.set_label("Minimum Cluster Size Parameter Value")
eom_norm_ax.set_title("EOM Selection on Normalised Data")
eom_norm_ax.set_ylabel("Ratio of Detected Clusters to Actual Clusters")
eom_norm_ax.set_xlabel("Actual Number of Clusters")
eom_fig.tight_layout()

# labelling and organising 'leaf' axes
leaf_ax.set_title("Leaf Selection on Raw Data")
leaf_ax.set_ylabel("Ratio of Detected Clusters to Actual Clusters")
leaf_ax.set_xlabel("Actual Number of Clusters")
leaf_cb.set_label("Minimum Cluster Size Parameter Value")
leaf_norm_ax.set_title("Leaf Selection on Normalised Data")
leaf_norm_ax.set_ylabel("Ratio of Detected Clusters to Actual Clusters")
leaf_norm_ax.set_xlabel("Actual Number of Clusters")
leaf_fig.tight_layout()

In [None]:
# display the per-band sky background and the noise std derived from it
background, std_background

In [None]:
# Feature array of the first blend (scaled pixel positions + 6 band values per
# pixel) and a quick look at its first band; assumes 120 x 120 pixel images.
n = 120
img = blend_images[0].reshape(-1, 6)
x, y = np.meshgrid(0.1*np.arange(n), 0.1*np.arange(n))
arrays = [x.flatten(), y.flatten()] + [img[:, j] for j in range(6)]
data = np.stack(arrays, axis=1)
plt.imshow(img.reshape(n, n, 6)[:, :, 0])
plt.colorbar()

In [None]:
# showing the clustering results for 5 different images

# code for generating galaxies: https://github.com/LSSTDESC/BlendingToolKit/blob/master/notebooks/custom_sampling_function.ipynb
# input catalog name
catalog_name = os.path.join(os.path.dirname(os.getcwd()), 'data', 'sample_input_catalog.fits')

# load parameters
param = btk.config.Simulation_params(catalog_name, max_number=10, batch_size=5)
np.random.seed(param.seed)

# load input catalog
catalog = btk.get_input_catalog.load_catalog(param)

# generate catalogs of blended objects 
blend_generator = btk.create_blend_generator.generate(param, catalog)

# generates observing conditions for the selected survey_name and all input bands
observing_generator = btk.create_observing_generator.generate(param)

# generate images of blends in all the observing bands
draw_blend_generator = btk.draw_blends.generate(param, blend_generator, observing_generator)

# generates new batch_size number of blends
blend_results = next(draw_blend_generator)
output = blend_results
blend_images = output['blend_images']
isolated_images = output['isolated_images']
blend_list = output['blend_list']
obs_cond = output['obs_condition']

# get clustering results using 'leaf' selection method and a minimum cluster size of 10 for normalised data
# NOTE: std_background is the value computed in an earlier cell; it is not
# recomputed from the obs_cond of this new batch
for i in range(len(blend_list)):
    # one row per pixel: scaled x and y position followed by the band values
    # (image shape is read from the data instead of hard-coding 120 x 120 x 6)
    Ny, Nx, C = blend_images[i].shape
    img = blend_images[i].reshape(-1, C)
    x, y = np.meshgrid(0.1*np.arange(Nx), 0.1*np.arange(Ny))
    data = np.column_stack([x.flatten(), y.flatten(), img])

    # normalise the band columns by the pixel's band sum, floored at the
    # per-band noise level, and zero every band value whose pixel sum lies
    # below that band's noise level
    norm_data = data.astype('float')
    flux_sum = norm_data[:, 2:].sum(axis=1)[:, None]
    # shape (pixels, bands): the column vector broadcasts against std_background
    mask = flux_sum < std_background
    norm_data[:, 2:] /= np.maximum(std_background, flux_sum)
    norm_data[:, 2:][mask] = 0

    # use HDBSCAN to get a clustering result
    clusterer = hdbscan.HDBSCAN(min_cluster_size=10, cluster_selection_method='leaf')
    clusterer.fit(norm_data)
    labels = clusterer.labels_.reshape(Ny, Nx)

    # plot band-summed image (left) and cluster labels (right)
    fig = plt.figure()
    ax_data = fig.add_subplot(121)
    ax_hdb = fig.add_subplot(122)
    ax_data.imshow(blend_images[i].sum(axis=-1), origin='lower')
    ax_hdb.imshow(labels, origin='lower')
plt.show()


In [None]:
# plot every blend of the current batch together with its entries in blend_list
btk.plot_utils.plot_blends(blend_images, blend_list, limits=(30,90))