## This notebook compares the predictions from the Python hdbscan module with those from the Tribuo HDBSCAN* implementation

In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import adjusted_mutual_info_score
from sklearn.metrics import normalized_mutual_info_score
import hdbscan

In [2]:
# Read the training and prediction CSV files into DataFrames. The paths are
# relative, so they resolve against the working directory of the kernel.
df_train = pd.read_csv('../../../data/basic-gaussians-train.csv')
df_predict = pd.read_csv('../../../data/basic-gaussians-predict.csv')

In [3]:
# Show the (rows, columns) shape of the training frame, then of the prediction
# frame, one per line.
print(df_train.shape, df_predict.shape, sep='\n')

(1980, 3)
(20, 3)


In [4]:
# Fit HDBSCAN on the full training frame. prediction_data=True asks hdbscan to
# build the cached data it uses when predicting labels for new points, which
# the approximate_predict call later in this notebook relies on.
clusterer = hdbscan.HDBSCAN(
    min_cluster_size=7,
    prediction_data=True,
).fit(df_train)

In [5]:
# print(clusterer.labels_)

In [6]:
# Predict cluster labels for the rows of df_predict with the fitted clusterer.
# The call returns a pair, unpacked here as (labels, strengths); only the
# labels are used in the cells shown in this notebook.
hdbscan_predict_labels, strengths = hdbscan.approximate_predict(clusterer, df_predict)

In [7]:
# Convert the predicted labels to a plain list via .tolist() and print it, so
# it can be compared with the Tribuo label list further down.
hdbscan_predict_label_list = hdbscan_predict_labels.tolist()
print(hdbscan_predict_label_list)

[2, 0, 2, 2, 0, 2, 1, 1, 2, 0, 0, 0, 0, 1, 1, 2, 1, 2, 2, 1]


In [8]:
# Report the agreement between two label assignments via mutual information.
def evaluate(result1, result2):
    """Print the normalized and adjusted mutual information of two labelings.

    Args:
        result1: First sequence of cluster labels.
        result2: Second sequence of cluster labels; passed to the scorers
            together with ``result1``, in that order.

    Returns:
        None. A header line and the two scores (formatted to two decimal
        places) are written to stdout.
    """
    print('Result Evaluation')

    normalized_mi = normalized_mutual_info_score(result1, result2)
    print('Normalized MI = %.2f' % normalized_mi)

    adjusted_mi = adjusted_mutual_info_score(result1, result2)
    print('Adjusted MI = %.2f' % adjusted_mi)

In [9]:
# These are the cluster assignments of the gaussians, as declared in the notebook: data setup scikit-learn gaussians Tribuo Unit Tests
actual_cluster_labels = [2, 3, 1, 3, 0, 2, 1, 1, 0, 2, 1, 0, 3, 0, 2, 0, 1, 3, 3, 3, 3, 1, 0, 3, 3, 3, 3, 3, 1, 0, 2, 2, 1, 3, 2, 3, 0, 2, 0, 1, 2, 0, 0, 3, 2, 2, 2, 2, 2, 2, 3, 0, 1, 3, 1, 3, 2, 3, 1, 3, 1, 1, 1, 1, 3, 3, 1, 3, 1, 1, 0, 3, 2, 2, 1, 0, 2, 0, 3, 2, 0, 0, 3, 1, 2, 1, 0, 0, 2, 2, 1, 3, 1, 1, 0, 3, 2, 2, 3, 3, 1, 1, 0, 1, 3, 3, 1, 3, 2, 2, 1, 2, 0, 2, 2, 1, 1, 2, 3, 1, 3, 2, 2, 0, 0, 2, 1, 1, 2, 0, 0, 0, 2, 0, 1, 2, 0, 0, 2, 1, 1, 3, 1, 1, 0, 2, 0, 2, 1, 3, 2, 0, 3, 2, 3, 0, 2, 1, 1, 1, 2, 2, 1, 1, 3, 0, 0, 3, 2, 0, 2, 3, 0, 2, 0, 2, 0, 3, 0, 2, 0, 0, 0, 0, 1, 1, 1, 0, 3, 1, 0, 3, 3, 2, 3, 0, 2, 1, 1, 1, 0, 1, 2, 3, 0, 0, 1, 0, 1, 3, 0, 2, 0, 1, 3, 3, 0, 1, 2, 2, 2, 0, 0, 3, 3, 2, 0, 1, 3, 1, 2, 0, 1, 0, 3, 0, 2, 2, 3, 2, 2, 1, 2, 0, 3, 1, 1, 3, 3, 0, 3, 3, 3, 2, 2, 0, 1, 2, 0, 2, 3, 1, 1, 3, 3, 3, 1, 3, 1, 3, 3, 3, 2, 3, 0, 1, 1, 1, 1, 1, 1, 3, 3, 2, 3, 0, 2, 3, 1, 3, 3, 0, 1, 1, 1, 2, 1, 2, 3, 2, 3, 2, 1, 1, 1, 2, 1, 0, 1, 3, 3, 1, 2, 1, 1, 1, 2, 0, 2, 0, 1, 3, 3, 2, 0, 3, 2, 2, 2, 1, 1, 2, 1, 3, 1, 3, 2, 3, 1, 2, 0, 2, 0, 2, 0, 1, 1, 3, 0, 1, 2, 0, 1, 1, 3, 1, 1, 2, 1, 2, 2, 3, 3, 3, 0, 2, 0, 2, 1, 1, 1, 1, 2, 1, 2, 0, 2, 0, 2, 2, 3, 2, 3, 0, 3, 2, 1, 3, 2, 3, 1, 3, 3, 3, 3, 0, 1, 3, 2, 1, 2, 2, 3, 0, 3, 1, 0, 1, 0, 3, 0, 0, 1, 0, 0, 0, 1, 0, 2, 0, 2, 3, 1, 2, 2, 3, 2, 1, 3, 0, 3, 3, 2, 1, 2, 2, 1, 3, 3, 0, 2, 0, 0, 3, 3, 2, 3, 3, 2, 0, 0, 0, 0, 3, 1, 1, 0, 1, 3, 0, 3, 3, 3, 2, 2, 2, 3, 2, 0, 0, 2, 3, 3, 1, 3, 3, 0, 3, 3, 3, 0, 2, 3, 3, 0, 1, 2, 3, 1, 1, 1, 3, 3, 0, 1, 0, 0, 1, 0, 2, 0, 1, 1, 1, 2, 2, 3, 3, 2, 1, 1, 3, 3, 2, 0, 2, 0, 1, 0, 1, 2, 2, 3, 2, 0, 2, 2, 2, 1, 2, 1, 0, 3, 3, 1, 1, 1, 0, 0, 1, 2, 1, 3, 1, 0, 2, 0, 0, 0, 1, 0, 1, 0, 3, 0, 1, 1, 3, 2, 1, 2, 0, 0, 0, 2, 0, 3, 2, 3, 2, 3, 1, 1, 1, 0, 0, 3, 1, 2, 0, 0, 2, 2, 3, 2, 3, 1, 2, 1, 3, 3, 3, 0, 0, 3, 2, 1, 1, 1, 0, 0, 3, 2, 0, 0, 3, 2, 2, 2, 3, 2, 0, 0, 1, 1, 1, 1, 3, 0, 3, 3, 3, 1, 2, 3, 2, 0, 0, 2, 3, 0, 0, 2, 3, 3, 1, 1, 1, 3, 2, 3, 3, 2, 2, 3, 3, 2, 2, 3, 0, 1, 2, 2, 3, 3, 0, 0, 2, 
3, 3, 3, 1, 0, 1, 3, 3, 2, 2, 1, 3, 2, 1, 0, 3, 0, 0, 0, 3, 3, 3, 3, 2, 0, 1, 0, 1, 3, 0, 0, 0, 1, 3, 1, 1, 2, 3, 2, 1, 2, 2, 3, 2, 2, 0, 1, 0, 0, 0, 0, 1, 3, 2, 2, 1, 1, 2, 3, 1, 2, 2, 1, 0, 0, 1, 2, 1, 3, 1, 0, 3, 3, 1, 3, 2, 2, 2, 1, 1, 1, 2, 1, 0, 3, 0, 2, 0, 2, 0, 0, 1, 2, 2, 0, 1, 0, 2, 2, 0, 1, 2, 2, 0, 1, 3, 0, 1, 0, 1, 0, 2, 2, 3, 3, 0, 1, 0, 2, 1, 2, 0, 1, 3, 0, 0, 0, 0, 0, 0, 1, 1, 3, 2, 0, 3, 1, 1, 1, 3, 0, 0, 2, 0, 3, 3, 1, 3, 1, 3, 0, 0, 0, 0, 3, 3, 0, 2, 2, 3, 2, 1, 3, 2, 1, 1, 0, 2, 3, 3, 2, 0, 2, 3, 3, 0, 0, 2, 1, 2, 2, 3, 2, 1, 1, 2, 3, 1, 3, 0, 1, 1, 2, 1, 3, 3, 1, 0, 2, 0, 2, 1, 2, 0, 0, 1, 1, 1, 0, 1, 3, 1, 2, 0, 2, 1, 1, 1, 0, 3, 0, 2, 1, 2, 0, 3, 2, 2, 2, 1, 0, 3, 0, 2, 1, 0, 0, 1, 3, 3, 2, 3, 2, 3, 2, 0, 1, 3, 3, 1, 0, 3, 3, 0, 3, 1, 1, 1, 2, 1, 0, 1, 0, 1, 3, 1, 1, 1, 2, 1, 3, 1, 0, 2, 0, 3, 2, 1, 0, 0, 3, 0, 1, 0, 2, 2, 2, 2, 0, 1, 0, 0, 0, 2, 0, 2, 0, 0, 3, 2, 1, 0, 3, 3, 0, 0, 1, 0, 3, 0, 2, 1, 0, 2, 2, 3, 3, 2, 0, 0, 1, 2, 2, 2, 2, 1, 2, 1, 3, 2, 1, 0, 0, 1, 0, 2, 2, 2, 0, 3, 2, 1, 2, 0, 1, 0, 3, 3, 2, 3, 2, 0, 3, 3, 3, 0, 3, 1, 2, 0, 2, 2, 1, 3, 3, 1, 1, 2, 3, 1, 0, 1, 3, 0, 0, 3, 3, 2, 2, 0, 0, 2, 0, 1, 3, 0, 3, 3, 2, 0, 0, 2, 0, 1, 1, 0, 2, 3, 1, 2, 1, 0, 2, 3, 0, 1, 2, 3, 2, 2, 0, 3, 0, 2, 2, 2, 0, 3, 0, 2, 1, 0, 0, 0, 0, 3, 0, 3, 1, 3, 2, 3, 1, 2, 2, 2, 0, 3, 3, 3, 1, 3, 1, 3, 1, 0, 1, 2, 0, 3, 3, 3, 0, 1, 0, 0, 1, 2, 0, 2, 1, 3, 2, 3, 3, 3, 2, 3, 3, 0, 1, 0, 3, 2, 2, 0, 3, 3, 0, 1, 3, 1, 3, 2, 1, 3, 1, 0, 0, 0, 3, 1, 3, 2, 2, 1, 0, 2, 3, 0, 0, 2, 1, 2, 1, 2, 1, 1, 0, 2, 3, 0, 3, 2, 1, 2, 0, 0, 1, 0, 0, 1, 3, 2, 0, 2, 0, 3, 2, 0, 2, 1, 0, 3, 2, 0, 1, 0, 0, 0, 1, 1, 0, 3, 3, 2, 1, 2, 3, 1, 1, 3, 0, 3, 2, 2, 3, 1, 3, 0, 2, 2, 3, 1, 0, 1, 1, 1, 2, 2, 2, 3, 2, 2, 2, 3, 2, 0, 3, 0, 1, 3, 3, 2, 1, 2, 2, 2, 3, 2, 0, 1, 1, 3, 2, 0, 3, 2, 2, 3, 0, 1, 3, 2, 1, 1, 3, 3, 1, 1, 0, 2, 1, 1, 3, 0, 0, 3, 2, 3, 2, 2, 3, 3, 2, 3, 3, 2, 2, 2, 3, 3, 1, 2, 3, 1, 1, 2, 0, 3, 2, 3, 1, 1, 1, 2, 1, 3, 0, 3, 3, 2, 3, 2, 0, 2, 3, 1, 3, 2, 1, 2, 3, 0, 0, 0, 
1, 2, 0, 3, 2, 0, 3, 1, 1, 3, 2, 0, 2, 0, 1, 2, 0, 0, 3, 3, 1, 1, 2, 0, 1, 3, 0, 3, 3, 0, 3, 2, 3, 1, 0, 0, 1, 0, 2, 1, 1, 3, 2, 1, 0, 0, 0, 2, 1, 1, 1, 0, 0, 1, 0, 2, 2, 1, 0, 2, 3, 3, 1, 0, 3, 3, 1, 3, 3, 2, 1, 0, 3, 1, 1, 1, 2, 0, 0, 0, 1, 1, 0, 3, 1, 3, 0, 1, 2, 1, 2, 3, 0, 2, 3, 1, 3, 2, 2, 0, 1, 2, 0, 0, 0, 1, 3, 2, 3, 3, 0, 1, 0, 0, 0, 0, 0, 0, 1, 2, 1, 3, 1, 0, 3, 2, 0, 3, 3, 3, 3, 0, 3, 0, 1, 2, 0, 2, 1, 3, 3, 0, 0, 0, 3, 2, 3, 3, 2, 3, 0, 3, 0, 1, 2, 2, 3, 1, 2, 2, 2, 3, 0, 3, 3, 1, 2, 2, 0, 3, 3, 2, 0, 2, 2, 1, 0, 3, 0, 1, 3, 0, 0, 3, 0, 0, 0, 3, 1, 0, 2, 0, 3, 1, 3, 0, 1, 2, 3, 0, 0, 0, 3, 1, 3, 0, 3, 2, 0, 2, 2, 0, 3, 1, 0, 0, 0, 0, 1, 3, 1, 3, 3, 1, 3, 0, 0, 1, 0, 3, 1, 1, 0, 3, 3, 3, 1, 0, 0, 0, 2, 0, 0, 3, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 2, 2, 2, 1, 2, 3, 1, 2, 0, 3, 2, 2, 0, 2, 1, 2, 0, 0, 0, 0, 2, 3, 3, 2, 3, 1, 2, 1, 0, 1, 1, 2, 3, 1, 1, 2, 0, 1, 0, 3, 3, 1, 2, 1, 3, 3, 1, 2, 2, 3, 0, 3, 0, 3, 0, 2, 0, 2, 0, 3, 1, 1, 3, 3, 2, 0, 3, 3, 0, 0, 3, 2, 0, 2, 0, 3, 2, 2, 0, 3, 3, 3, 2, 2, 1, 2, 1, 3, 0, 2, 3, 1, 0, 1, 2, 2, 1, 0, 0, 2, 0, 1, 2, 0, 2, 2, 1, 0, 0, 0, 3, 1, 1, 0, 2, 2, 3, 3, 3, 3, 3, 3, 2, 3, 3, 0, 1, 1, 0, 0, 0, 2, 3, 0, 2, 0, 1, 1, 2, 1, 1, 1, 0, 1, 3, 2, 0, 3, 1, 3, 3, 2, 0, 1, 2, 0, 0, 0, 1, 1, 3, 0, 3, 3, 2, 3, 1, 1, 0, 1, 3, 0, 0, 2, 1, 2, 3, 3, 1, 0, 1, 1, 3, 1, 1, 1, 1, 1, 2, 0, 2, 3, 1, 3, 1, 1, 2, 2, 2, 0, 3, 3, 0, 3, 2, 3, 2, 0, 1, 3, 1, 1, 1, 2, 2, 2, 2, 3, 3, 0, 1, 2, 3, 2, 3, 2, 1, 0, 3, 0, 1, 3, 2, 2, 2, 1, 2, 2, 0, 1, 3, 2, 3, 2, 3, 3, 2, 0, 2, 3, 2, 1, 1, 1, 2, 3, 3, 0, 2, 1, 1, 2, 0, 2, 1, 1, 1, 1, 0, 2, 3, 3, 1, 3, 3, 0, 3, 2, 0, 0, 0, 1, 2, 1, 1, 0, 1, 1, 2, 3, 3, 0, 3, 2, 3, 3, 0, 1, 1, 2, 2, 2, 3, 1, 2, 1, 1, 1, 0, 0, 3, 0, 2, 2, 1, 0, 1, 3, 2, 2, 0, 1, 3, 0, 2, 0, 1, 0, 0, 0, 3, 1, 1, 0, 0, 3, 0, 0, 3, 2, 1, 3, 1, 1, 3, 2, 1, 0, 0, 0, 2, 2, 2, 1, 3, 2, 2, 3, 2, 2, 0, 3, 2, 3, 0, 0, 1, 3, 1, 2, 1, 2, 3, 2, 0, 1, 1, 2, 0, 1, 1, 1, 2, 3, 3, 2, 1, 1, 2, 2, 1, 0, 1, 2, 2, 2, 1, 0, 2, 2, 1, 1, 3, 2, 0, 0, 2, 3, 1, 1, 0, 2, 
2, 2, 2, 1, 1, 3, 1, 0, 3, 1]


In [10]:
# Evaluate the hdbscan prediction labels against the actual labels.
# The slice [1980:] keeps the entries from index 1980 onward; 1980 matches the
# training row count printed above, so these are presumably the labels of the
# 20 prediction rows - TODO confirm against the data-setup notebook.
evaluate(hdbscan_predict_label_list, actual_cluster_labels[1980:])

Result Evaluation
Normalized MI = 0.89
Adjusted MI = 0.87


In [11]:
# These are the labels from the notebook: Tribuo Predictions Tribuo 4.3 Unit Tests
# Note that the cluster ids used here (3, 4, 5) differ from the ids printed for
# hdbscan above (0, 1, 2); the lists are compared with mutual-information scores
# rather than element by element.
tribuo_predict_label_list = [5, 3, 5, 5, 3, 5, 4, 4, 5, 3, 3, 3, 3, 4, 4, 5, 4, 5, 5, 4]

In [12]:
# Evaluate the Tribuo prediction labels against the actual labels, using the
# same [1980:] slice of the actual labels as the hdbscan evaluation.
evaluate(tribuo_predict_label_list, actual_cluster_labels[1980:])

Result Evaluation
Normalized MI = 0.89
Adjusted MI = 0.87


In [13]:
# Check the mutual information between the prediction labels from the hdbscan
# model and those from the Tribuo model, i.e. how closely the two
# implementations agree with each other.
evaluate(hdbscan_predict_label_list, tribuo_predict_label_list)

Result Evaluation
Normalized MI = 1.00
Adjusted MI = 1.00
