In [None]:
# importing modules 
%matplotlib inline
import matplotlib.pyplot as plt
import os
import random
import sys
import astropy.table
import numpy as np
import matplotlib.pyplot as plt
sys.path.insert(0,os.path.dirname(os.getcwd()))
import btk
import btk.config, btk.plot_utils
import hdbscan

In [None]:
# Simulate one batch of blended-galaxy images with the BlendingToolKit (btk).
# Adapted from: https://github.com/LSSTDESC/BlendingToolKit/blob/master/notebooks/custom_sampling_function.ipynb

# the input catalog lives in the `data` folder next to the working directory
parent_dir = os.path.dirname(os.getcwd())
catalog_name = os.path.join(parent_dir, 'data', 'sample_input_catalog.fits')

# simulation parameters:
#   max_number = maximum number of galaxies in an image (10)
#   batch_size = number of images generated per batch (5)
param = btk.config.Simulation_params(catalog_name, max_number=10, batch_size=5)
np.random.seed(param.seed)

# input catalog -> catalogs of blended objects
catalog = btk.get_input_catalog.load_catalog(param)
blend_generator = btk.create_blend_generator.generate(param, catalog)

# observing conditions for the selected survey_name and all input bands
observing_generator = btk.create_observing_generator.generate(param)

# images of the blends in all the observing bands
draw_blend_generator = btk.draw_blends.generate(param, blend_generator, observing_generator)

# draw one new batch (batch_size blends) and unpack its contents
blend_results = next(draw_blend_generator)
output = blend_results
blend_images, isolated_images, blend_list, obs_cond = (
    output[key]
    for key in ('blend_images', 'isolated_images', 'blend_list', 'obs_condition')
)

# optionally plot the blended images
plot = False
if plot:
    btk.plot_utils.plot_blends(blend_images[:10], blend_list[:10], limits=(30, 90))

In [None]:
# Estimate the noise of the first blend image in two ways and compare them:
#   1. from the sky level stored in the observing conditions, and
#   2. from the spread of the band-summed pixel values.

# sky level of every band of the first blend, summed into one background value
# NOTE(review): assumes all blends in the batch share the observing conditions
# of the first one - TODO confirm
sky_level = [oc.mean_sky_level for oc in obs_cond[0]]
background = np.array(sky_level).sum()
# square root of the summed sky level, used as the noise estimate
# (presumably Poisson statistics - confirm)
std_background = np.sqrt(background)

# band-summed value of every pixel in the first blend image; vectorised so it
# works for any image size instead of looping over a hard-coded 120 x 120 grid
total = blend_images[0].sum(axis=-1).ravel()

# weighted standard deviation of the histogram, using the bin mid-points
counts, bins = np.histogram(total)
mids = 0.5*(bins[1:] + bins[:-1])
mean = np.average(mids, weights=counts)
var = np.average((mids - mean)**2, weights=counts)
std = np.sqrt(var)

print("Histogram: " + str(std))
print("Observing conditions: " + str(std_background))

plt.hist(total)
plt.xlabel("Band-summed pixel value")
plt.ylabel("Number of pixels");

In [None]:
# Mean i-band magnitude ('i_ab') of the galaxies in each blend: one value per image.
# A colour such as gal['g_ab'] - gal['i_ab'] could be collected the same way.
bulge_magnitudes = []  # declared but never filled in this cell
magnitudes = [
    np.mean([gal['i_ab'] for gal in img])
    for img in blend_list
]

In [None]:
# Detection efficiency of HDBSCAN as a function of its minimum cluster size
# parameter. For every parameter value the efficiency is the ratio of detected
# to true cluster counts, averaged over all images in the batch. Four variants
# are tracked: 'eom' / 'leaf' cluster selection on raw / normalised data.

# using parameter values from 5-50, in increments of 5
min_cluster_sizes = [5, 10, 15, 20, 25, 30, 35, 40, 45, 50]

eom_detection_efficiency = []        # 'eom' on raw data
eom_norm_detection_efficiency = []   # 'eom' on normalised data
leaf_detection_efficiency = []       # 'leaf' on raw data
leaf_norm_detection_efficiency = []  # 'leaf' on normalised data
efficiency_series = {
    ('eom', 'raw'): eom_detection_efficiency,
    ('eom', 'norm'): eom_norm_detection_efficiency,
    ('leaf', 'raw'): leaf_detection_efficiency,
    ('leaf', 'norm'): leaf_norm_detection_efficiency,
}

# The per-pixel feature arrays do not depend on the clustering parameter, so
# they are built once per image instead of once per (parameter, image) pair.
prepared = []
for i in range(len(blend_list)):
    # true number of clusters = number of catalog entries of this blend
    actual_k = len(blend_list[i]['dx'])

    # one row per pixel: [x, y, band_0, ..., band_(n_bands-1)];
    # the image is indexed as (row, column, band)
    image = blend_images[i]
    n_rows, n_cols, n_bands = image.shape
    x, y = np.meshgrid(0.1*np.arange(n_cols), 0.1*np.arange(n_rows))
    data = np.column_stack([x.ravel(), y.ravel(), image.reshape(-1, n_bands)])

    # normalise each row by its sum, floored at the background noise to reduce
    # the number of radical outliers
    # NOTE(review): the row sum includes the x/y columns and the division is
    # applied to them as well - confirm this is intended
    norm_data = data.astype('float')
    norm_data /= np.maximum(std_background, data.sum(axis=1)[:, None])

    prepared.append((actual_k, {'raw': data, 'norm': norm_data}))

# find the detection efficiency for each value of the minimum cluster size
for mcs in min_cluster_sizes:
    ratios = {key: [] for key in efficiency_series}

    for method in ('eom', 'leaf'):
        clusterer = hdbscan.HDBSCAN(min_cluster_size=mcs, cluster_selection_method=method)

        for actual_k, features in prepared:
            for kind in ('raw', 'norm'):
                clusterer.fit(features[kind])
                # HDBSCAN numbers clusters from 0 and marks noise as -1, so the
                # cluster count is the largest label plus one (0 if all noise)
                detected_k = int(clusterer.labels_.max()) + 1
                ratios[(method, kind)].append(detected_k/actual_k)

    # the efficiency is the mean over images of detected / actual clusters
    for key, series in efficiency_series.items():
        series.append(np.mean(ratios[key]))

In [None]:
# Scatter plots of the detection efficiency against the minimum cluster size
# parameter: one figure per selection method, raw data on the left panel and
# normalised data on the right panel.
y_label = "Ratio of Detected Clusters to Actual Clusters"
x_label = "Minimum Cluster Size Parameter Value"

# 'eom' selection method
eom_fig = plt.figure()
eom_ax, eom_norm_ax = (eom_fig.add_subplot(position) for position in (121, 122))
eom_sc = eom_ax.scatter(min_cluster_sizes, eom_detection_efficiency)
eom_norm_sc = eom_norm_ax.scatter(min_cluster_sizes, eom_norm_detection_efficiency)

# 'leaf' selection method
leaf_fig = plt.figure()
leaf_ax, leaf_norm_ax = (leaf_fig.add_subplot(position) for position in (121, 122))
leaf_sc = leaf_ax.scatter(min_cluster_sizes, leaf_detection_efficiency)
leaf_norm_sc = leaf_norm_ax.scatter(min_cluster_sizes, leaf_norm_detection_efficiency)

# titles and axis labels for all four panels
panel_titles = (
    (eom_ax, "EOM Selection on Raw Data"),
    (eom_norm_ax, "EOM Selection on Normalised Data"),
    (leaf_ax, "Leaf Selection on Raw Data"),
    (leaf_norm_ax, "Leaf Selection on Normalised Data"),
)
for axis, title in panel_titles:
    axis.set_title(title)
    axis.set_ylabel(y_label)
    axis.set_xlabel(x_label)

for figure in (eom_fig, leaf_fig):
    figure.tight_layout()

In [None]:
# Detection efficiency of HDBSCAN as a function of the number of clusters in
# the image. For every image the clustering is repeated for a series of
# minimum cluster size values; both the per-fit ratios and their per-image
# average are recorded for 'eom' / 'leaf' selection on raw / normalised data.

# using parameter values from 5-50, in increments of 5
min_cluster_sizes = [5, 10, 15, 20, 25, 30, 35, 40, 45, 50]

clusters = []      # true number of clusters, one entry per image
all_clusters = []  # true number of clusters, one entry per (image, parameter) pair

# per-image efficiency (averaged over parameter values) and per-fit efficiency
eom_detection_efficiency = []              # 'eom' on raw data
total_eom_detection_efficiency = []
eom_norm_detection_efficiency = []         # 'eom' on normalised data
total_eom_norm_detection_efficiency = []
leaf_detection_efficiency = []             # 'leaf' on raw data
total_leaf_detection_efficiency = []
leaf_norm_detection_efficiency = []        # 'leaf' on normalised data
total_leaf_norm_detection_efficiency = []

per_image_series = {
    ('eom', 'raw'): eom_detection_efficiency,
    ('eom', 'norm'): eom_norm_detection_efficiency,
    ('leaf', 'raw'): leaf_detection_efficiency,
    ('leaf', 'norm'): leaf_norm_detection_efficiency,
}
per_fit_series = {
    ('eom', 'raw'): total_eom_detection_efficiency,
    ('eom', 'norm'): total_eom_norm_detection_efficiency,
    ('leaf', 'raw'): total_leaf_detection_efficiency,
    ('leaf', 'norm'): total_leaf_norm_detection_efficiency,
}

# find the average number of clusters detected using HDBSCAN for each image
for i in range(len(blend_list)):
    # one row per pixel: [x, y, band_0, ..., band_(n_bands-1)];
    # the image is indexed as (row, column, band)
    image = blend_images[i]
    n_rows, n_cols, n_bands = image.shape
    x, y = np.meshgrid(0.1*np.arange(n_cols), 0.1*np.arange(n_rows))
    data = np.column_stack([x.ravel(), y.ravel(), image.reshape(-1, n_bands)])

    # normalise each row by its sum, floored at the background noise to reduce
    # the number of radical outliers (the background is not subtracted)
    # NOTE(review): the row sum includes the x/y columns and the division is
    # applied to them as well - confirm this is intended
    norm_data = data.astype('float')
    norm_data /= np.maximum(std_background, data.sum(axis=1)[:, None])
    features = {'raw': data, 'norm': norm_data}

    # true number of clusters = number of catalog entries of this blend
    actual_k = len(blend_list[i]['dx'])
    clusters.append(actual_k)

    # cluster counts found for this image, one entry per parameter value
    detected = {key: [] for key in per_image_series}

    # the per-fit lists are filled image by image, and within an image in the
    # order of min_cluster_sizes
    for mcs in min_cluster_sizes:
        all_clusters.append(actual_k)

        for method in ('eom', 'leaf'):
            clusterer = hdbscan.HDBSCAN(min_cluster_size=mcs, cluster_selection_method=method)

            for kind in ('raw', 'norm'):
                clusterer.fit(features[kind])
                # HDBSCAN numbers clusters from 0 and marks noise as -1, so the
                # cluster count is the largest label plus one (0 if all noise)
                detected_k = int(clusterer.labels_.max()) + 1
                detected[(method, kind)].append(detected_k)
                per_fit_series[(method, kind)].append(detected_k/actual_k)

    # per-image efficiency: mean detected cluster count over all parameter
    # values, divided by the true cluster count
    for key, series in per_image_series.items():
        series.append(np.mean(detected[key])/actual_k)

In [None]:
# Scatter plots of the per-image detection efficiency against the actual
# number of clusters, coloured by the average magnitude of the galaxies in the
# image. One figure per selection method: raw data left, normalised data right.

# plt.get_cmap is used because plt.cm.get_cmap was removed in Matplotlib 3.9
cm = plt.get_cmap('RdYlBu')
y_label = "Ratio of Detected Clusters to Actual Clusters"
x_label = "Actual Number of Clusters"
colour_label = "Average Magnitude of Clusters"

# plotting raw and normalised data using 'eom' selection method
eom_fig = plt.figure()
eom_ax = eom_fig.add_subplot(121)
eom_norm_ax = eom_fig.add_subplot(122)
eom_sc = eom_ax.scatter(clusters, eom_detection_efficiency, c=magnitudes, cmap=cm)
eom_norm_sc = eom_norm_ax.scatter(clusters, eom_norm_detection_efficiency, c=magnitudes, cmap=cm)
# colorbars are attached through their own figure so they do not depend on
# which figure happens to be current
eom_raw_cb = eom_fig.colorbar(eom_sc, ax=eom_ax)
eom_cb = eom_fig.colorbar(eom_norm_sc, ax=eom_norm_ax)

# plotting raw and normalised data using 'leaf' selection method
leaf_fig = plt.figure()
leaf_ax = leaf_fig.add_subplot(121)
leaf_norm_ax = leaf_fig.add_subplot(122)
leaf_sc = leaf_ax.scatter(clusters, leaf_detection_efficiency, c=magnitudes, cmap=cm)
leaf_norm_sc = leaf_norm_ax.scatter(clusters, leaf_norm_detection_efficiency, c=magnitudes, cmap=cm)
leaf_raw_cb = leaf_fig.colorbar(leaf_sc, ax=leaf_ax)
leaf_cb = leaf_fig.colorbar(leaf_norm_sc, ax=leaf_norm_ax)

# titles and axis labels for all four panels
panel_titles = (
    (eom_ax, "EOM Selection on Raw Data"),
    (eom_norm_ax, "EOM Selection on Normalised Data"),
    (leaf_ax, "Leaf Selection on Raw Data"),
    (leaf_norm_ax, "Leaf Selection on Normalised Data"),
)
for axis, title in panel_titles:
    axis.set_title(title)
    axis.set_ylabel(y_label)
    axis.set_xlabel(x_label)

# every colorbar is labelled so that each panel can be read on its own
for colorbar in (eom_raw_cb, eom_cb, leaf_raw_cb, leaf_cb):
    colorbar.set_label(colour_label)

eom_fig.tight_layout()
leaf_fig.tight_layout()

In [None]:
# Scatter plots of the detection efficiency of every individual fit against
# the actual number of clusters, coloured by the minimum cluster size used for
# that fit. One figure per selection method: raw data left, normalised right.

# plt.get_cmap is used because plt.cm.get_cmap was removed in Matplotlib 3.9
cm = plt.get_cmap('Spectral')

# all_clusters and the total_* efficiency lists hold one entry per
# (image, parameter) pair, filled image by image with the parameter values in
# the order of min_cluster_sizes; the colour values must follow the same
# order, i.e. the full parameter list repeated once per image
min_cluster_vals = [mc for _ in range(len(blend_list)) for mc in min_cluster_sizes]

y_label = "Ratio of Detected Clusters to Actual Clusters"
x_label = "Actual Number of Clusters"
colour_label = "Minimum Cluster Size Parameter Value"

# plotting raw and normalised data using 'eom' selection method
eom_fig = plt.figure()
eom_ax = eom_fig.add_subplot(121)
eom_norm_ax = eom_fig.add_subplot(122)
eom_sc = eom_ax.scatter(all_clusters, total_eom_detection_efficiency, c=min_cluster_vals, cmap=cm)
eom_norm_sc = eom_norm_ax.scatter(all_clusters, total_eom_norm_detection_efficiency, c=min_cluster_vals, cmap=cm)
# colorbars are attached through their own figure so they do not depend on
# which figure happens to be current
eom_fig.colorbar(eom_sc, ax=eom_ax)
eom_cb = eom_fig.colorbar(eom_norm_sc, ax=eom_norm_ax)

# plotting raw and normalised data using 'leaf' selection method
leaf_fig = plt.figure()
leaf_ax = leaf_fig.add_subplot(121)
leaf_norm_ax = leaf_fig.add_subplot(122)
leaf_sc = leaf_ax.scatter(all_clusters, total_leaf_detection_efficiency, c=min_cluster_vals, cmap=cm)
leaf_norm_sc = leaf_norm_ax.scatter(all_clusters, total_leaf_norm_detection_efficiency, c=min_cluster_vals, cmap=cm)
leaf_fig.colorbar(leaf_sc, ax=leaf_ax)
leaf_cb = leaf_fig.colorbar(leaf_norm_sc, ax=leaf_norm_ax)

# titles and axis labels for all four panels
panel_titles = (
    (eom_ax, "EOM Selection on Raw Data"),
    (eom_norm_ax, "EOM Selection on Normalised Data"),
    (leaf_ax, "Leaf Selection on Raw Data"),
    (leaf_norm_ax, "Leaf Selection on Normalised Data"),
)
for axis, title in panel_titles:
    axis.set_title(title)
    axis.set_ylabel(y_label)
    axis.set_xlabel(x_label)

eom_cb.set_label(colour_label)
leaf_cb.set_label(colour_label)

eom_fig.tight_layout()
leaf_fig.tight_layout()

In [None]:
# display the summed sky level of the first blend and its square root, as
# computed from the observing conditions
background, std_background

In [None]:
# Build the per-pixel feature array of the first blend image (one row per
# pixel: x, y and the six band values) and preview its first band.
n = 120
img = blend_images[0].reshape(-1, 6)

# pixel coordinates scaled by 0.1, flattened in the same order as the pixels
x, y = np.meshgrid(0.1*np.arange(n), 0.1*np.arange(n))
arrays = [x.flatten(), y.flatten()] + [img[:, j] for j in range(6)]
data = np.stack(arrays, axis=1)

# first band, restored to its two-dimensional layout
first_band = img.reshape(n, n, 6)[:, :, 0]
plt.imshow(first_band)
plt.colorbar()

In [None]:
# Show the HDBSCAN clustering result next to the band-summed image for every
# blend of a freshly simulated batch.
# Simulation code adapted from: https://github.com/LSSTDESC/BlendingToolKit/blob/master/notebooks/custom_sampling_function.ipynb

# input catalog name
catalog_name = os.path.join(os.path.dirname(os.getcwd()), 'data', 'sample_input_catalog.fits')

# load parameters: at most 10 galaxies per image, 5 images per batch
param = btk.config.Simulation_params(catalog_name, max_number=10, batch_size=5)
np.random.seed(param.seed)

# load input catalog
catalog = btk.get_input_catalog.load_catalog(param)

# generate catalogs of blended objects
blend_generator = btk.create_blend_generator.generate(param, catalog)

# generates observing conditions for the selected survey_name and all input bands
observing_generator = btk.create_observing_generator.generate(param)

# generate images of blends in all the observing bands
draw_blend_generator = btk.draw_blends.generate(param, blend_generator, observing_generator)

# generates new batch_size number of blends
blend_results = next(draw_blend_generator)
output = blend_results
blend_images = output['blend_images']
isolated_images = output['isolated_images']
blend_list = output['blend_list']
obs_cond = output['obs_condition']

# noise threshold recomputed from the observing conditions of THIS batch, so
# the cell does not rely on a value left over from a previously drawn batch
# NOTE(review): assumes all blends share the observing conditions of the
# first one - TODO confirm
background = np.array([oc.mean_sky_level for oc in obs_cond[0]]).sum()
std_background = np.sqrt(background)

# clustering with the 'leaf' selection method and a minimum cluster size of 10
# on normalised data
for i in range(len(blend_list)):
    # one row per pixel: [x, y, band_0, ..., band_(n_bands-1)];
    # the image is indexed as (row, column, band)
    image = blend_images[i]
    n_rows, n_cols, n_bands = image.shape
    x, y = np.meshgrid(0.1*np.arange(n_cols), 0.1*np.arange(n_rows))
    data = np.column_stack([x.ravel(), y.ravel(), image.reshape(-1, n_bands)])

    # normalise the band values of each pixel by their sum (floored at the
    # noise level) and zero out pixels whose total is below the noise level;
    # the coordinate columns are left untouched and the background is not
    # subtracted
    norm_data = data.astype('float')
    total_flux = norm_data[:, 2:].sum(axis=1)
    mask = total_flux < std_background
    norm_data[:, 2:] /= np.maximum(std_background, total_flux)[:, None]
    norm_data[mask, 2:] = 0

    # use HDBSCAN to get a clustering result; labels are reshaped back onto
    # the pixel grid (-1 marks noise)
    clusterer = hdbscan.HDBSCAN(min_cluster_size=10, cluster_selection_method='leaf')
    clusterer.fit(norm_data)
    labels = clusterer.labels_.reshape(n_rows, n_cols)

    # plot the band-summed image next to the cluster labels
    fig = plt.figure()
    ax_data = fig.add_subplot(121)
    ax_hdb = fig.add_subplot(122)
    ax_data.imshow(image.sum(axis=-1), origin='lower')
    ax_data.set_title("Blend %d: sum over bands" % i)
    ax_hdb.imshow(labels, origin='lower')
    ax_hdb.set_title("HDBSCAN labels (-1 = noise)")
    fig.tight_layout()
plt.show()


In [None]:
# plot every blend of the current batch with btk's plotting helper
# NOTE(review): `limits` presumably restricts the displayed pixel range -
# verify against btk.plot_utils.plot_blends
btk.plot_utils.plot_blends(blend_images, blend_list, limits=(30,90))