In [None]:
# load the modules
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import scipy
from scipy.sparse.csgraph import minimum_spanning_tree

def adjacency(X, alpha):
    """Pairwise distance matrix between the rows of ``X``.

    The first two columns enter as an ordinary Euclidean term, while the
    squared difference in the third column is weighted by ``alpha``:

        D[i, j] = sqrt(dx**2 + dy**2 + alpha * dz**2)

    Parameters
    ----------
    X : ndarray, shape (n, k) with k >= 3
        One point per row; only columns 0, 1 and 2 are used.
    alpha : float
        Weight applied to the squared difference of column 2.

    Returns
    -------
    ndarray, shape (n, n)
        Symmetric matrix of weighted distances with a zero diagonal.
    """
    # Broadcast rows against each other: sq_diff[i, j, k] = (X[j, k] - X[i, k])**2
    sq_diff = (X[None, :, :] - X[:, None, :]) ** 2
    planar_term = sq_diff[:, :, 0:2].sum(axis=-1)
    weighted_third = alpha * sq_diff[:, :, 2]
    return np.sqrt(planar_term + weighted_third)

def cost(X, alpha, show_plot=False, perc=95):
    """Ratio of mean intra-cluster distance to mean overall distance.

    Clusters are found by building the minimum spanning tree of the
    ``alpha``-weighted distance matrix, removing every tree edge longer than
    the ``perc``-th percentile of tree edge lengths, and taking the connected
    components of what remains.

    Parameters
    ----------
    X : ndarray, shape (n, k) with k >= 3
        One point per row, as expected by ``adjacency``.
    alpha : float
        Weight of the third coordinate in the distance (see ``adjacency``).
    show_plot : bool, optional
        If True, draw the points (columns 0 and 1 of ``X``) coloured by
        component label together with the retained tree edges.
    perc : float, optional
        Percentile of MST edge lengths above which edges are cut. Depends on
        the expected rate of outliers. Defaults to 95.

    Returns
    -------
    float
        Mean over multi-point components of their mean non-zero pairwise
        distance, divided by the mean non-zero pairwise distance of the whole
        sample. NaN if no component has more than one point.
    """
    A = adjacency(X, alpha)
    T = minimum_spanning_tree(A).toarray()

    # Cut the longest tree edges so that the tree falls apart into clusters.
    cut = np.percentile(T[T > 0], perc)
    T[T > cut] = 0

    graph = scipy.sparse.csr_matrix(T)
    n_components, labels = scipy.sparse.csgraph.connected_components(graph)

    intra_dist = []
    # Labels run from 0 to n_components - 1, so component 0 must be included.
    for c in range(n_components):
        sel = labels == c
        # Singletons have no internal distances to average.
        if sel.sum() > 1:
            dist_c = A[sel, :][:, sel]
            intra_dist.append(dist_c[dist_c > 0].mean())
    all_mean = A[A > 0].mean()

    if show_plot:
        fig = plt.figure()
        ax = fig.add_subplot(111, aspect='equal')
        ax.scatter(X[:,0], X[:,1], c=labels, alpha=0.1, cmap='prism')
        i, j = np.where(T > 0)
        # Headless arrows are used to draw the surviving tree edges as segments.
        ax.quiver(X[i,0], X[i,1], X[j,0]-X[i,0], X[j,1]-X[i,1], angles='xy', scale_units='xy', scale=1, headwidth=0, headaxislength=0, headlength=0, minlength=0)
        fig.tight_layout()

    if not intra_dist:
        # No multi-point component: the ratio is undefined.
        return np.nan
    return np.mean(intra_dist) / all_mean

In [None]:
# Load the catalogue and build the (m, 3) coordinate array X used downstream.
data = np.load("clusters.npy")
ra0 = data['RA'].mean()
dec0 = data['DEC'].mean()

# Offsets from the sample mean position; the RA offset is scaled by cos(DEC).
# DEC is passed through np.radians, so it is presumably stored in degrees.
ra_offset = (ra0 - data['RA']) * np.cos(np.radians(data['DEC']))
dec_offset = data['DEC'] - dec0
X = np.column_stack((ra_offset, dec_offset, data['Z']))

m = len(data)   # number of data points
print(m, data.dtype.names)

In [None]:
# Build the weighted distance matrix for one trial value of alpha and show it
alpha = 1000
A = adjacency(X, alpha)
distance_image = plt.imshow(A)
plt.colorbar(distance_image)

In [None]:
# Minimum spanning tree of the distance matrix, as a dense array of edge lengths
T = minimum_spanning_tree(A).toarray()

# Drop every tree edge longer than the chosen percentile of edge lengths
perc = 95 # depends on expected rate of outliers
cut = np.percentile(T[T > 0], perc)
T = np.where(T > cut, 0, T)
i, j = np.nonzero(T > 0)

# Headless arrows from point i to point j draw the remaining edges as segments
segment_style = dict(angles='xy', scale_units='xy', scale=1,
                     headwidth=0, headaxislength=0, headlength=0, minlength=0)

fig, ax = plt.subplots(subplot_kw={'aspect': 'equal'})
ax.scatter(X[:, 0], X[:, 1], alpha=0.1)
ax.quiver(X[i, 0], X[i, 1], X[j, 0] - X[i, 0], X[j, 1] - X[i, 1], **segment_style)
fig.tight_layout()

In [None]:
# Connected components of the pruned tree give one integer label per point
graph = scipy.sparse.csr_matrix(T)
n_components, labels = scipy.sparse.csgraph.connected_components(graph)

# Same picture as before, now with the points coloured by component label
edge_dx = X[j, 0] - X[i, 0]
edge_dy = X[j, 1] - X[i, 1]

fig, ax = plt.subplots(subplot_kw={'aspect': 'equal'})
ax.scatter(X[:, 0], X[:, 1], c=labels, alpha=0.1, cmap='prism')
ax.quiver(X[i, 0], X[i, 1], edge_dx, edge_dy,
          angles='xy', scale_units='xy', scale=1,
          headwidth=0, headaxislength=0, headlength=0, minlength=0)
fig.tight_layout()

In [None]:
# Evaluate the clustering cost on a logarithmic grid of alpha, 1e-6 through 1e5
alphas = np.power(10., np.arange(-6, 6, 1))
costs = [cost(X, a) for a in alphas]
plt.semilogx(alphas, costs)