## This notebook compares the predictions of the Python hdbscan module with those of the Tribuo HDBSCAN* Java implementation, using a dataset of 5000 records, 3 centers, and 4 features

In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import adjusted_mutual_info_score
from sklearn.metrics import normalized_mutual_info_score
import hdbscan

In [2]:
# Load the training and prediction CSV files of the 3-center Gaussian dataset.
df_train = pd.read_csv('../../../data/big-gaussians-3centers-train.csv')
df_predict = pd.read_csv('../../../data/big-gaussians-3centers-predict.csv')

In [3]:
print(df_train.shape)
print(df_predict.shape)

(4000, 4)
(1000, 4)


In [4]:
# Fit HDBSCAN on the training data. prediction_data=True makes the model cache
# the extra data that hdbscan.approximate_predict needs to label new points.
clusterer = hdbscan.HDBSCAN(min_cluster_size=8, prediction_data=True).fit(df_train)

In [5]:
# print(clusterer.labels_)

In [7]:
# Predict cluster labels for the held-out points using the fitted clusterer.
# The second return value (bound to `strengths`) is not used in the cells shown here.
hdbscan_predict_labels, strengths = hdbscan.approximate_predict(clusterer, df_predict)

In [10]:
# Convert the predicted labels to a plain Python list so they can be compared
# with the hard-coded label lists defined later in this notebook.
hdbscan_predict_label_list = hdbscan_predict_labels.tolist()
# print(hdbscan_predict_label_list)

In [11]:
# check the mutual information between results
def evaluate(result1, result2):
    """Print the mutual-information agreement between two label assignments.

    Both scores depend only on how the samples are grouped, not on the label
    values themselves, so two clusterings that use different cluster ids for
    the same partition still score 1.00.

    Args:
        result1: Sequence of cluster labels, one per sample.
        result2: Sequence of cluster labels for the same samples.
    """
    print('Result Evaluation')

    nmi = normalized_mutual_info_score(result1, result2)
    print('Normalized MI = %.2f' % nmi)

    ami = adjusted_mutual_info_score(result1, result2)
    print('Adjusted MI = %.2f' % ami)

In [12]:
# These are the actual cluster assignments of the Gaussians, as declared in the notebook: scikit-learn Gaussians5000C3F4 Data Setup
actual_labels_predict = [1, 2, 2, 2, 0, 0, 0, 2, 0, 0, 0, 2, 1, 1, 0, 2, 2, 1, 1, 1, 0, 0, 1, 2, 0, 0, 0, 2, 1, 0, 1, 1, 2, 1, 2, 0, 0, 2, 1, 1, 0, 0, 0, 0, 2, 2, 1, 1, 0, 0, 2, 2, 2, 1, 2, 2, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 2, 0, 1, 1, 0, 1, 2, 0, 2, 1, 1, 0, 0, 1, 0, 1, 0, 2, 2, 0, 2, 0, 1, 2, 0, 0, 0, 1, 2, 2, 0, 0, 0, 2, 1, 2, 2, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 2, 0, 1, 1, 1, 1, 1, 2, 2, 2, 1, 2, 0, 2, 1, 1, 0, 1, 0, 0, 0, 2, 0, 0, 1, 1, 2, 1, 0, 2, 0, 2, 2, 1, 1, 1, 2, 2, 2, 2, 0, 0, 0, 1, 2, 2, 0, 2, 0, 0, 1, 2, 2, 0, 1, 0, 2, 1, 1, 0, 1, 1, 0, 2, 0, 1, 0, 1, 2, 0, 1, 1, 1, 2, 0, 2, 0, 2, 2, 0, 0, 0, 1, 2, 1, 1, 0, 1, 1, 1, 0, 1, 0, 2, 0, 2, 0, 0, 0, 0, 0, 2, 2, 0, 0, 1, 0, 0, 2, 2, 1, 0, 0, 1, 2, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 2, 0, 2, 0, 2, 0, 2, 1, 0, 0, 2, 0, 2, 1, 1, 1, 1, 0, 1, 1, 0, 2, 0, 2, 1, 1, 1, 1, 0, 1, 2, 2, 1, 0, 0, 1, 0, 0, 1, 2, 1, 0, 2, 0, 0, 2, 0, 0, 0, 2, 0, 2, 2, 0, 0, 1, 0, 1, 1, 0, 0, 2, 0, 1, 2, 0, 2, 0, 0, 0, 2, 0, 0, 0, 1, 1, 2, 2, 0, 0, 0, 2, 2, 1, 0, 1, 0, 1, 0, 0, 2, 1, 0, 0, 2, 1, 2, 2, 0, 0, 1, 2, 2, 2, 1, 0, 0, 2, 1, 0, 2, 0, 0, 0, 0, 0, 1, 1, 0, 2, 2, 1, 2, 0, 1, 2, 1, 0, 0, 0, 2, 1, 2, 1, 2, 1, 2, 2, 1, 0, 1, 1, 1, 2, 1, 1, 2, 1, 0, 1, 1, 0, 2, 0, 0, 0, 2, 1, 2, 2, 2, 0, 0, 2, 0, 0, 1, 1, 1, 1, 2, 1, 2, 1, 2, 2, 2, 2, 0, 0, 2, 2, 0, 0, 1, 1, 1, 2, 0, 0, 2, 0, 0, 2, 0, 2, 0, 1, 2, 1, 0, 2, 2, 2, 1, 1, 0, 0, 1, 1, 0, 0, 2, 1, 2, 2, 2, 2, 1, 0, 0, 0, 1, 2, 1, 2, 0, 1, 0, 2, 0, 1, 1, 1, 2, 1, 0, 2, 2, 2, 0, 2, 1, 0, 1, 2, 0, 1, 0, 2, 1, 2, 2, 0, 0, 2, 0, 1, 1, 0, 2, 2, 1, 2, 1, 0, 1, 0, 1, 0, 2, 1, 1, 1, 0, 2, 2, 2, 1, 2, 1, 0, 0, 0, 0, 1, 2, 0, 1, 2, 1, 1, 0, 0, 2, 0, 2, 0, 0, 2, 2, 2, 2, 2, 1, 0, 0, 1, 0, 2, 1, 0, 0, 1, 1, 1, 0, 0, 2, 1, 2, 1, 1, 1, 0, 0, 1, 0, 2, 2, 2, 0, 0, 2, 2, 1, 2, 1, 2, 0, 2, 2, 2, 2, 0, 0, 0, 2, 0, 0, 2, 0, 2, 0, 2, 1, 1, 0, 1, 2, 1, 0, 2, 0, 0, 2, 2, 1, 1, 0, 1, 0, 1, 2, 2, 2, 0, 0, 0, 2, 2, 2, 2, 0, 2, 1, 1, 2, 0, 1, 2, 0, 2, 0, 2, 2, 1, 2, 2, 0, 2, 1, 1, 1, 0, 1, 0, 0, 2, 2, 0, 1, 0, 0, 0, 0, 2, 
1, 1, 0, 2, 0, 2, 2, 1, 0, 0, 2, 0, 2, 2, 0, 1, 2, 2, 0, 1, 1, 2, 1, 2, 2, 1, 1, 0, 2, 2, 1, 2, 1, 2, 1, 1, 2, 2, 2, 1, 2, 0, 1, 0, 2, 1, 1, 2, 1, 2, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 1, 2, 2, 2, 2, 2, 2, 2, 1, 2, 1, 1, 2, 0, 1, 1, 0, 0, 2, 1, 2, 0, 2, 0, 2, 0, 2, 2, 2, 1, 2, 0, 0, 2, 1, 2, 1, 0, 2, 0, 2, 1, 0, 1, 2, 1, 1, 0, 0, 2, 2, 0, 1, 0, 2, 2, 2, 1, 0, 1, 2, 1, 0, 1, 2, 0, 0, 1, 1, 2, 2, 1, 0, 0, 0, 2, 0, 1, 1, 0, 0, 0, 1, 1, 2, 0, 2, 1, 0, 0, 2, 1, 0, 1, 0, 2, 0, 0, 2, 1, 1, 0, 1, 2, 2, 1, 1, 1, 0, 2, 2, 2, 2, 1, 0, 0, 2, 2, 0, 1, 1, 2, 0, 2, 0, 2, 2, 0, 0, 1, 2, 1, 0, 2, 2, 1, 2, 1, 1, 1, 0, 1, 1, 1, 0, 2, 0, 0, 2, 0, 1, 2, 2, 1, 0, 2, 1, 2, 1, 0, 0, 0, 2, 0, 1, 2, 2, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 2, 1, 2, 0, 2, 2, 2, 0, 2, 2, 2, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 2, 2, 2, 0, 2, 0, 0, 1, 2, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 1, 0, 1, 2, 2, 0, 2, 0, 2, 1, 2, 1, 2, 0, 0, 1, 1, 1, 0, 2, 0, 2, 2, 2, 2, 2, 1, 2, 0, 0, 0, 2, 1, 2, 1, 1, 0, 1, 2, 2, 1, 0, 1, 2, 1, 0, 1, 1, 2, 2, 1, 1, 0, 0, 2, 2, 1, 2, 0, 2, 1, 1]


In [14]:
# evaluate the hdbscan prediction labels
evaluate(hdbscan_predict_label_list, actual_labels_predict)

Result Evaluation
Normalized MI = 1.00
Adjusted MI = 1.00


In [18]:
# These are the prediction labels taken from the notebook: Tribuo Predictions 5000C3F4
tribuo_predict_label_list = [4, 2, 2, 2, 5, 5, 5, 2, 5, 5, 5, 2, 4, 4, 5, 2, 2, 4, 4, 4, 5, 5, 4, 2, 5, 5, 5, 2, 4, 5, 4, 4, 2, 4, 2, 5, 5, 2, 4, 4, 5, 5, 5, 5, 2, 2, 4, 4, 5, 5, 2, 2, 2, 4, 2, 2, 5, 5, 4, 4, 4, 4, 5, 5, 5, 4, 2, 5, 4, 4, 5, 4, 2, 5, 2, 4, 4, 5, 5, 4, 5, 4, 5, 2, 2, 5, 2, 5, 4, 2, 5, 5, 5, 4, 2, 2, 5, 5, 5, 2, 4, 2, 2, 5, 5, 5, 4, 5, 4, 4, 5, 5, 5, 5, 2, 5, 4, 4, 4, 4, 4, 2, 2, 2, 4, 2, 5, 2, 4, 4, 5, 4, 5, 5, 5, 2, 5, 5, 4, 4, 2, 4, 5, 2, 5, 2, 2, 4, 4, 4, 2, 2, 2, 2, 5, 5, 5, 4, 2, 2, 5, 2, 5, 5, 4, 2, 2, 5, 4, 5, 2, 4, 4, 5, 4, 4, 5, 2, 5, 4, 5, 4, 2, 5, 4, 4, 4, 2, 5, 2, 5, 2, 2, 5, 5, 5, 4, 2, 4, 4, 5, 4, 4, 4, 5, 4, 5, 2, 5, 2, 5, 5, 5, 5, 5, 2, 2, 5, 5, 4, 5, 5, 2, 2, 4, 5, 5, 4, 2, 4, 4, 4, 4, 4, 4, 4, 5, 5, 4, 5, 5, 2, 5, 2, 5, 2, 5, 2, 4, 5, 5, 2, 5, 2, 4, 4, 4, 4, 5, 4, 4, 5, 2, 5, 2, 4, 4, 4, 4, 5, 4, 2, 2, 4, 5, 5, 4, 5, 5, 4, 2, 4, 5, 2, 5, 5, 2, 5, 5, 5, 2, 5, 2, 2, 5, 5, 4, 5, 4, 4, 5, 5, 2, 5, 4, 2, 5, 2, 5, 5, 5, 2, 5, 5, 5, 4, 4, 2, 2, 5, 5, 5, 2, 2, 4, 5, 4, 5, 4, 5, 5, 2, 4, 5, 5, 2, 4, 2, 2, 5, 5, 4, 2, 2, 2, 4, 5, 5, 2, 4, 5, 2, 5, 5, 5, 5, 5, 4, 4, 5, 2, 2, 4, 2, 5, 4, 2, 4, 5, 5, 5, 2, 4, 2, 4, 2, 4, 2, 2, 4, 5, 4, 4, 4, 2, 4, 4, 2, 4, 5, 4, 4, 5, 2, 5, 5, 5, 2, 4, 2, 2, 2, 5, 5, 2, 5, 5, 4, 4, 4, 4, 2, 4, 2, 4, 2, 2, 2, 2, 5, 5, 2, 2, 5, 5, 4, 4, 4, 2, 5, 5, 2, 5, 5, 2, 5, 2, 5, 4, 2, 4, 5, 2, 2, 2, 4, 4, 5, 5, 4, 4, 5, 5, 2, 4, 2, 2, 2, 2, 4, 5, 5, 5, 4, 2, 4, 2, 5, 4, 5, 2, 5, 4, 4, 4, 2, 4, 5, 2, 2, 2, 5, 2, 4, 5, 4, 2, 5, 4, 5, 2, 4, 2, 2, 5, 5, 2, 5, 4, 4, 5, 2, 2, 4, 2, 4, 5, 4, 5, 4, 5, 2, 4, 4, 4, 5, 2, 2, 2, 4, 2, 4, 5, 5, 5, 5, 4, 2, 5, 4, 2, 4, 4, 5, 5, 2, 5, 2, 5, 5, 2, 2, 2, 2, 2, 4, 5, 5, 4, 5, 2, 4, 5, 5, 4, 4, 4, 5, 5, 2, 4, 2, 4, 4, 4, 5, 5, 4, 5, 2, 2, 2, 5, 5, 2, 2, 4, 2, 4, 2, 5, 2, 2, 2, 2, 5, 5, 5, 2, 5, 5, 2, 5, 2, 5, 2, 4, 4, 5, 4, 2, 4, 5, 2, 5, 5, 2, 2, 4, 4, 5, 4, 5, 4, 2, 2, 2, 5, 5, 5, 2, 2, 2, 2, 5, 2, 4, 4, 2, 5, 4, 2, 5, 2, 5, 2, 2, 4, 2, 2, 5, 2, 4, 4, 4, 5, 4, 5, 5, 2, 2, 5, 4, 5, 5, 5, 5, 
2, 4, 4, 5, 2, 5, 2, 2, 4, 5, 5, 2, 5, 2, 2, 5, 4, 2, 2, 5, 4, 4, 2, 4, 2, 2, 4, 4, 5, 2, 2, 4, 2, 4, 2, 4, 4, 2, 2, 2, 4, 2, 5, 4, 5, 2, 4, 4, 2, 4, 2, 4, 5, 2, 5, 5, 5, 5, 5, 5, 5, 2, 5, 5, 4, 2, 2, 2, 2, 2, 2, 2, 4, 2, 4, 4, 2, 5, 4, 4, 5, 5, 2, 4, 2, 5, 2, 5, 2, 5, 2, 2, 2, 4, 2, 5, 5, 2, 4, 2, 4, 5, 2, 5, 2, 4, 5, 4, 2, 4, 4, 5, 5, 2, 2, 5, 4, 5, 2, 2, 2, 4, 5, 4, 2, 4, 5, 4, 2, 5, 5, 4, 4, 2, 2, 4, 5, 5, 5, 2, 5, 4, 4, 5, 5, 5, 4, 4, 2, 5, 2, 4, 5, 5, 2, 4, 5, 4, 5, 2, 5, 5, 2, 4, 4, 5, 4, 2, 2, 4, 4, 4, 5, 2, 2, 2, 2, 4, 5, 5, 2, 2, 5, 4, 4, 2, 5, 2, 5, 2, 2, 5, 5, 4, 2, 4, 5, 2, 2, 4, 2, 4, 4, 4, 5, 4, 4, 4, 5, 2, 5, 5, 2, 5, 4, 2, 2, 4, 5, 2, 4, 2, 4, 5, 5, 5, 2, 5, 4, 2, 2, 5, 4, 5, 4, 5, 4, 4, 5, 5, 4, 2, 4, 2, 5, 2, 2, 2, 5, 2, 2, 2, 5, 5, 4, 5, 4, 4, 4, 4, 5, 5, 2, 2, 2, 5, 2, 5, 5, 4, 2, 5, 5, 5, 2, 2, 2, 2, 2, 2, 2, 4, 5, 4, 2, 2, 5, 2, 5, 2, 4, 2, 4, 2, 5, 5, 4, 4, 4, 5, 2, 5, 2, 2, 2, 2, 2, 4, 2, 5, 5, 5, 2, 4, 2, 4, 4, 5, 4, 2, 2, 4, 5, 4, 2, 4, 5, 4, 4, 2, 2, 4, 4, 5, 5, 2, 2, 4, 2, 5, 2, 4, 4]


In [19]:
# evaluate the Tribuo prediction labels
evaluate(tribuo_predict_label_list, actual_labels_predict)

Result Evaluation
Normalized MI = 1.00
Adjusted MI = 1.00


In [20]:
# check the MI between the prediction labels from the hdbscan model and the Tribuo model
evaluate(hdbscan_predict_label_list, tribuo_predict_label_list)

Result Evaluation
Normalized MI = 1.00
Adjusted MI = 1.00
