# For more details on HDBSCAN, see
https://towardsdatascience.com/lightning-talk-clustering-with-hdbscan-d47b83d1b03a and https://hdbscan.readthedocs.io/en/latest/basic_hdbscan.html

In [1]:
from sklearn.datasets import make_blobs
import pandas as pd

In [2]:
# Generate 2000 synthetic samples with 10 features each.
# A fixed random_state makes the data (and every clustering result derived
# from it) reproducible under Restart Kernel -> Run All.
# NOTE: the outputs saved in this notebook predate the seed; re-run to refresh them.
blobs, labels = make_blobs(n_samples=2000, n_features=10, random_state=42)

In [3]:
# Wrap the raw array in a DataFrame and display its first rows as a quick sanity check.
pd.DataFrame(blobs).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,-2.880988,-8.703589,-4.807355,4.902799,-4.005807,8.59488,8.354137,1.382939,1.376176,-3.981377
1,-8.469241,4.450537,-1.693337,-3.643738,9.449458,5.643644,-8.561103,4.040472,5.403714,0.588033
2,5.888124,-8.417432,6.628366,9.195927,-6.851009,-3.503006,-4.771303,10.939476,8.7549,9.351643
3,-7.410875,2.457496,-1.897547,-1.421795,11.754936,7.260875,-9.641232,5.097369,4.079352,0.24506
4,-1.675448,-9.189448,-2.373614,6.731458,-6.934062,8.082055,9.35476,0.9641,1.960519,-3.857369


In [4]:
!pip install hdbscan



In [5]:
import hdbscan

In [6]:
# Build an HDBSCAN estimator with every parameter left at its default.
clusterer = hdbscan.HDBSCAN()

In [7]:
# Cluster the blobs; fit() hands back the estimator, whose repr is what this cell displays.
clusterer.fit(blobs)

HDBSCAN(algorithm='best', allow_single_cluster=False, alpha=1.0,
        approx_min_span_tree=True, cluster_selection_epsilon=0.0,
        cluster_selection_method='eom', core_dist_n_jobs=4,
        gen_min_span_tree=False, leaf_size=40,
        match_reference_implementation=False, memory=Memory(location=None),
        metric='euclidean', min_cluster_size=5, min_samples=None, p=None,
        prediction_data=False)

In [8]:
# Cluster labels produced by the fit (integer array, shown abbreviated).
clusterer.labels_

array([2, 0, 1, ..., 2, 0, 1])

In [9]:
# Largest label value; the labels displayed for this fit start at 0,
# so max + 1 gives the number of clusters found here.
clusterer.labels_.max()

2

In [10]:
# Scores the fitted clusterer reports alongside the labels (the values shown
# lie between 0 and 1) -- see the HDBSCAN docs for their exact definition.
clusterer.probabilities_

array([0.79025391, 0.72424175, 0.71754808, ..., 0.7692224 , 0.69274035,
       0.79638977])

# What about different metrics?

In [11]:
# Same clustering, but measuring distance with the Manhattan metric.
# fit() returns the estimator itself, so construction and fitting can be chained.
clusterer = hdbscan.HDBSCAN(metric='manhattan').fit(blobs)
clusterer.labels_

array([2, 0, 1, ..., 2, 0, 1])

In [12]:
# Display hdbscan's metric-name -> distance-class table; 'manhattan' (used above)
# is one of its keys. Presumably these are the names accepted by `metric=` -- confirm in the docs.
hdbscan.dist_metrics.METRIC_MAPPING

{'arccos': hdbscan.dist_metrics.ArccosDistance,
 'braycurtis': hdbscan.dist_metrics.BrayCurtisDistance,
 'canberra': hdbscan.dist_metrics.CanberraDistance,
 'chebyshev': hdbscan.dist_metrics.ChebyshevDistance,
 'cityblock': hdbscan.dist_metrics.ManhattanDistance,
 'cosine': hdbscan.dist_metrics.ArccosDistance,
 'dice': hdbscan.dist_metrics.DiceDistance,
 'euclidean': hdbscan.dist_metrics.EuclideanDistance,
 'hamming': hdbscan.dist_metrics.HammingDistance,
 'haversine': hdbscan.dist_metrics.HaversineDistance,
 'infinity': hdbscan.dist_metrics.ChebyshevDistance,
 'jaccard': hdbscan.dist_metrics.JaccardDistance,
 'kulsinski': hdbscan.dist_metrics.KulsinskiDistance,
 'l1': hdbscan.dist_metrics.ManhattanDistance,
 'l2': hdbscan.dist_metrics.EuclideanDistance,
 'mahalanobis': hdbscan.dist_metrics.MahalanobisDistance,
 'manhattan': hdbscan.dist_metrics.ManhattanDistance,
 'matching': hdbscan.dist_metrics.MatchingDistance,
 'minkowski': hdbscan.dist_metrics.MinkowskiDistance,
 'p': hdbscan.dist_metrics.MinkowskiDistance,
 ...  (output truncated)

# Distance matrices

In [13]:
from sklearn.metrics.pairwise import pairwise_distances

In [14]:
# Hand HDBSCAN a ready-made pairwise distance matrix instead of raw features:
# metric='precomputed' tells it the input already holds distances.
distance_matrix = pairwise_distances(blobs)
clusterer = hdbscan.HDBSCAN(metric='precomputed').fit(distance_matrix)
clusterer.labels_

array([1, 0, 2, ..., 1, 0, 2])