In [29]:
import dask
import numpy as np
import sklearn
from matplotlib import pyplot as plt
from matplotlib import cm
import copy
import astropy
import hdbscan
import pandas as pd
from astropy.coordinates import Distance
from astropy import units as u
from astropy.cosmology import WMAP7
from astropy.io import fits
from collections import Counter

In [2]:
y = pd.read_csv('rcsed_iGrID.csv')

In [3]:
sdss_indx = list(y[~y.iGrID.isna()].index)
fltr = pd.notna(y.iGrID)

#y = y.loc[sdss_indx,:].to_numpy().flatten()

#sdss_labels = pd.Series(y)

In [4]:
hdul = fits.open('rcsed_v2_clean.fits', memmap=astropy.io.fits.Conf.use_memmap.defaultvalue, lazy_load_hdus=True)

In [5]:
cols = hdul[1].columns

In [6]:
data = hdul[1].data

In [7]:
hdul.close()
del hdul

In [8]:
DATA = pd.DataFrame(np.array(data).byteswap().newbyteorder())
del data

In [9]:
ra_dec_z = DATA[['ra','dec','z']]
del DATA

In [10]:
ra_dec_z['z']=Distance(unit=u.Mpc, z = ra_dec_z['z'], cosmology = WMAP7)

In [11]:
from sklearn.preprocessing import StandardScaler

stdscaler = StandardScaler().fit(ra_dec_z) #[sdss_indx])

ra_dec_z_scaled = stdscaler.transform(ra_dec_z)

In [53]:
pred = -np.ones(ra_dec_z.shape[0])

In [54]:
area_flt = ra_dec_z.ra < 40
area_x = ra_dec_z_scaled[area_flt]

hdbScan = hdbscan.hdbscan_.HDBSCAN( min_cluster_size=3, #2,
                                    min_samples=15, #3,
                                    algorithm='boruvka_kdtree',   #'boruvka_kdtree', 
                                    leaf_size=50, #30,
                                    approx_min_span_tree=True,
                                    gen_min_span_tree=True,
                                    core_dist_n_jobs=-1,
                                    cluster_selection_method='eom',
                                    allow_single_cluster=False,
                                    prediction_data=False,
                                    match_reference_implementation=False).fit(area_x)
b1 = 0
b2 = 5000000
clusters = {i: set() for i in range(max(hdbScan.labels_) + 1)}
rcsed_labels = hdbScan.labels_
b3 = max(hdbScan.labels_) + 1
ind = ra_dec_z[area_flt].index.values
for i in range(len(rcsed_labels)):
    if rcsed_labels[i]==-1:
        rcsed_labels[i]=b2
        b2 += 1
        clusters[rcsed_labels[i]] = set([ind[i]])
    else:
        rcsed_labels[i] = rcsed_labels[i] + b1
        clusters[rcsed_labels[i]].add(ind[i])
    pred[ind[i]] = rcsed_labels[i]
    
b1 = b3

In [55]:
l_border = 20
r_border = 60
while r_border <= 360:
    area_flt = (ra_dec_z.ra > l_border) & (ra_dec_z.ra < r_border)
    area_x = ra_dec_z_scaled[area_flt]

    hdbScan = hdbscan.hdbscan_.HDBSCAN( min_cluster_size=3, #2,
                                        min_samples=15, #3,
                                        algorithm='boruvka_kdtree',   #'boruvka_kdtree', 
                                        leaf_size=50, #30,
                                        approx_min_span_tree=True,
                                        gen_min_span_tree=True,
                                        core_dist_n_jobs=-1,
                                        cluster_selection_method='eom',
                                        allow_single_cluster=False,
                                        prediction_data=False,
                                        match_reference_implementation=False).fit(area_x)
    rcsed_labels_next = hdbScan.labels_
    b3 = max(hdbScan.labels_) + 1 + b1
    ind = ra_dec_z[area_flt].index.values
    new_clusters = {b1 + i: set() for i in range(max(hdbScan.labels_) + 1)}
    for i in range(len(rcsed_labels_next)):
        if rcsed_labels_next[i] == -1:
            rcsed_labels_next[i] = b2
            b2 += 1
            new_clusters[rcsed_labels_next[i]] = set([ind[i]])
        else:
            rcsed_labels_next[i] = rcsed_labels_next[i] + b1
            new_clusters[rcsed_labels_next[i]].add(ind[i])

    for cl in new_clusters:
        unique, counts = np.unique(pred[list(new_clusters[cl])], return_counts=True)
        counts = counts/ len(list(new_clusters[cl]))
        if unique[0] != -1 and unique[0] < 5000000 and counts[0] > 0.35:
            pred[list(new_clusters[cl])] = unique[0]
            clusters[unique[0]].update(new_clusters[cl])
        else:
            pred[list(new_clusters[cl])] = cl
            clusters[cl] = new_clusters[cl]
    b1 = b3
    qual_flt = (ra_dec_z.ra < r_border) & fltr
    true = y[qual_flt].to_numpy().flatten()

    fms = round(sklearn.metrics.fowlkes_mallows_score(true, pred[qual_flt]),5)
    ars = round(sklearn.metrics.adjusted_rand_score(true, pred[qual_flt]),5)
    nmi = round(sklearn.metrics.normalized_mutual_info_score(true, pred[qual_flt]),5)

    print('(%d, %d) is finished. Quality: %f  %f  %f ' % (l_border, r_border, fms, ars, nmi))
    
    l_border += 20
    r_border += 20

(20, 60) is finished. Quality: 0.601430  0.587610  0.950020 
(40, 80) is finished. Quality: 0.601640  0.587810  0.950010 
(60, 100) is finished. Quality: 0.595050  0.579960  0.949230 
(80, 120) is finished. Quality: 0.564320  0.540810  0.949020 
(100, 140) is finished. Quality: 0.497060  0.461700  0.952330 
(120, 160) is finished. Quality: 0.468050  0.424690  0.953700 
(140, 180) is finished. Quality: 0.459150  0.434200  0.954520 
(160, 200) is finished. Quality: 0.440460  0.429120  0.954660 
(180, 220) is finished. Quality: 0.446700  0.431640  0.955240 
(200, 240) is finished. Quality: 0.457040  0.441710  0.955420 
(220, 260) is finished. Quality: 0.424350  0.418330  0.954900 


KeyboardInterrupt: 