## This notebook compares the predictions from the Python hdbscan module with those of the Tribuo HDBSCAN* Java implementation, using the dataset: 5000 records, 5 centers, 4 features

In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import adjusted_mutual_info_score
from sklearn.metrics import normalized_mutual_info_score
import hdbscan

In [2]:
# Load the training and prediction splits of the 5-center Gaussian dataset.
df_train = pd.read_csv('../../../data/big-gaussians-5centers-train.csv')
df_predict = pd.read_csv('../../../data/big-gaussians-5centers-predict.csv')

In [3]:
# Sanity-check the (rows, columns) of each split before clustering.
print(df_train.shape)
print(df_predict.shape)

(4000, 4)
(1000, 4)


In [4]:
# Fit HDBSCAN on the training split with a minimum cluster size of 8.
# prediction_data=True is requested so the fitted model can be handed to
# hdbscan.approximate_predict further down in this notebook.
clusterer = hdbscan.HDBSCAN(min_cluster_size=8, prediction_data=True).fit(df_train)

In [5]:
# print(clusterer.labels_)

In [6]:
# Predict a cluster label for every row of the prediction split using the
# fitted model. The second return value (strengths) is not used by any of
# the cells shown in this notebook.
hdbscan_predict_labels, strengths = hdbscan.approximate_predict(clusterer, df_predict)

In [7]:
# Convert the predicted labels to a plain list for the comparisons below.
hdbscan_predict_label_list = hdbscan_predict_labels.tolist()
# print(hdbscan_predict_label_list)

In [8]:
# check the mutual information between results
def evaluate(result1, result2):
    """Print how closely two label assignments agree.

    Writes a header line followed by the normalized and the adjusted
    mutual information scores of the two labelings, each formatted to
    two decimal places. Nothing is returned.

    Args:
        result1: first sequence of cluster labels.
        result2: second sequence of cluster labels, passed to the scorers
            together with ``result1``.
    """
    print('Result Evaluation')
    # Each entry pairs an output template with the scorer that fills it in;
    # the scores are computed and printed one at a time, in this order.
    report_lines = (
        ('Normalized MI = %.2f', normalized_mutual_info_score),
        ('Adjusted MI = %.2f', adjusted_mutual_info_score),
    )
    for template, scorer in report_lines:
        score = scorer(result1, result2)
        print(template % score)

In [9]:
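# these are the actual (ground-truth) cluster assignments for the prediction set, taken from the generated gaussians - declared in notebook: scikit-learn Gaussians5000C3F4 Data Setup
# NOTE(review): that notebook name says "C3", but this dataset has 5 centers (labels 0-4) - verify the referenced name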
actual_labels_predict = [0, 2, 2, 2, 3, 1, 4, 2, 2, 3, 3, 0, 4, 1, 4, 3, 3, 4, 4, 0, 0, 0, 3, 0, 0, 0, 3, 2, 2, 1, 3, 4, 2, 2, 2, 0, 0, 1, 4, 1, 1, 0, 4, 2, 0, 3, 2, 3, 3, 3, 1, 1, 4, 2, 3, 1, 1, 0, 0, 3, 3, 1, 2, 1, 1, 4, 3, 3, 2, 4, 3, 1, 0, 3, 2, 2, 1, 0, 0, 1, 2, 3, 0, 2, 1, 0, 2, 4, 0, 3, 3, 1, 0, 1, 2, 1, 2, 4, 4, 1, 4, 0, 0, 0, 0, 3, 2, 4, 3, 0, 1, 0, 2, 1, 2, 2, 1, 0, 1, 1, 4, 1, 2, 2, 2, 3, 2, 3, 4, 4, 2, 3, 1, 4, 2, 3, 0, 1, 1, 0, 1, 4, 0, 0, 3, 2, 3, 3, 0, 3, 0, 3, 4, 2, 2, 3, 4, 4, 4, 3, 0, 0, 0, 3, 3, 4, 1, 4, 1, 1, 1, 4, 4, 0, 3, 0, 3, 3, 1, 1, 2, 3, 0, 4, 1, 2, 1, 3, 4, 1, 2, 2, 2, 3, 0, 3, 1, 4, 4, 0, 0, 0, 1, 3, 2, 2, 1, 3, 1, 2, 1, 2, 1, 4, 0, 3, 0, 0, 0, 0, 0, 4, 4, 0, 1, 2, 0, 0, 3, 4, 2, 1, 0, 3, 3, 2, 2, 2, 2, 2, 2, 2, 0, 1, 2, 0, 1, 4, 1, 4, 0, 4, 0, 4, 2, 1, 1, 4, 1, 4, 2, 1, 1, 2, 0, 2, 2, 0, 3, 1, 3, 2, 2, 1, 2, 0, 2, 4, 4, 2, 0, 0, 2, 0, 1, 2, 4, 2, 0, 4, 1, 0, 4, 0, 1, 1, 4, 0, 3, 3, 1, 0, 2, 0, 2, 3, 0, 1, 4, 1, 1, 3, 0, 4, 0, 0, 1, 4, 1, 0, 0, 1, 2, 4, 4, 1, 1, 1, 4, 3, 2, 1, 3, 0, 2, 1, 0, 3, 2, 0, 0, 3, 2, 3, 4, 1, 0, 1, 4, 4, 3, 2, 1, 0, 4, 2, 1, 3, 1, 0, 0, 1, 0, 2, 2, 0, 4, 4, 2, 4, 0, 2, 3, 3, 1, 0, 0, 4, 2, 4, 2, 4, 2, 3, 4, 1, 1, 2, 2, 3, 4, 2, 1, 4, 2, 0, 2, 2, 1, 4, 0, 0, 1, 3, 3, 4, 3, 3, 1, 0, 4, 0, 1, 2, 3, 2, 1, 4, 2, 4, 1, 3, 3, 4, 4, 0, 0, 4, 4, 1, 0, 1, 2, 3, 3, 0, 0, 3, 1, 0, 3, 0, 3, 0, 3, 3, 2, 1, 4, 4, 3, 1, 2, 1, 0, 1, 2, 0, 0, 4, 2, 4, 4, 3, 4, 2, 0, 0, 1, 1, 3, 2, 3, 1, 2, 0, 4, 0, 3, 2, 3, 4, 3, 1, 3, 3, 4, 4, 1, 4, 2, 1, 1, 3, 0, 1, 0, 4, 2, 4, 3, 1, 0, 3, 1, 3, 2, 0, 4, 4, 1, 4, 2, 0, 1, 0, 2, 1, 4, 2, 2, 1, 0, 3, 4, 4, 3, 4, 2, 1, 1, 1, 1, 2, 4, 0, 1, 4, 2, 2, 1, 1, 3, 0, 4, 0, 1, 3, 4, 4, 3, 3, 3, 0, 0, 2, 1, 4, 2, 0, 1, 3, 3, 2, 0, 0, 4, 2, 3, 1, 2, 3, 1, 1, 1, 1, 3, 3, 4, 0, 1, 4, 4, 2, 3, 2, 4, 1, 3, 4, 4, 4, 0, 1, 0, 4, 0, 0, 3, 0, 4, 1, 3, 3, 3, 0, 1, 3, 2, 1, 4, 0, 1, 4, 4, 2, 2, 0, 3, 1, 2, 4, 4, 4, 1, 0, 0, 4, 3, 3, 4, 1, 3, 2, 2, 3, 1, 2, 2, 1, 3, 1, 4, 4, 2, 4, 3, 0, 3, 2, 1, 2, 1, 2, 0, 0, 4, 4, 
0, 2, 1, 1, 1, 0, 3, 2, 2, 1, 4, 0, 4, 4, 1, 1, 0, 4, 0, 4, 4, 0, 2, 3, 4, 0, 2, 2, 3, 1, 3, 4, 2, 2, 1, 4, 4, 2, 4, 3, 4, 1, 2, 3, 4, 4, 2, 4, 0, 2, 0, 4, 3, 3, 4, 2, 4, 2, 0, 3, 0, 1, 1, 1, 0, 1, 1, 4, 0, 0, 2, 3, 3, 3, 4, 1, 4, 3, 2, 4, 2, 1, 4, 1, 2, 2, 1, 1, 3, 3, 3, 0, 4, 0, 3, 0, 4, 4, 4, 2, 3, 1, 1, 3, 1, 3, 2, 1, 3, 0, 3, 2, 0, 3, 4, 2, 3, 0, 1, 3, 4, 1, 2, 0, 4, 4, 3, 1, 0, 2, 3, 1, 1, 2, 4, 0, 0, 1, 2, 3, 4, 2, 0, 1, 0, 3, 1, 2, 3, 1, 0, 1, 2, 1, 4, 0, 3, 1, 0, 0, 3, 1, 0, 2, 0, 4, 1, 0, 4, 3, 2, 1, 2, 4, 3, 2, 2, 2, 1, 3, 4, 4, 3, 3, 0, 0, 3, 4, 0, 1, 2, 3, 0, 3, 0, 4, 3, 0, 0, 3, 4, 2, 0, 3, 4, 2, 3, 2, 2, 2, 0, 3, 2, 2, 0, 3, 0, 0, 3, 0, 3, 3, 4, 2, 1, 4, 1, 3, 3, 1, 1, 0, 3, 0, 3, 4, 4, 1, 2, 0, 2, 0, 3, 3, 0, 1, 2, 3, 3, 4, 0, 4, 3, 4, 0, 4, 3, 3, 0, 0, 3, 0, 2, 1, 1, 2, 1, 0, 3, 4, 4, 0, 3, 0, 1, 2, 3, 1, 1, 1, 3, 3, 4, 4, 4, 3, 3, 2, 0, 3, 3, 4, 0, 3, 0, 4, 2, 4, 2, 4, 1, 1, 2, 3, 2, 0, 3, 0, 4, 3, 3, 3, 3, 1, 3, 0, 0, 0, 4, 2, 4, 3, 2, 1, 2, 4, 4, 2, 1, 2, 3, 3, 0, 1, 1, 3, 3, 2, 2, 1, 0, 4]


In [10]:
# evaluate the hdbscan prediction labels against the actual cluster
# assignments of the prediction set
evaluate(hdbscan_predict_label_list, actual_labels_predict)

Result Evaluation
Normalized MI = 1.00
Adjusted MI = 1.00


In [11]:
# these are the labels from the notebook: Tribuo Predictions 5000C5F4
tribuo_predict_label_list = [8, 6, 6, 6, 3, 9, 5, 6, 6, 3, 3, 8, 5, 9, 5, 3, 3, 5, 5, 8, 8, 8, 3, 8, 8, 8, 3, 6, 6, 9, 3, 5, 6, 6, 6, 8, 8, 9, 5, 9, 9, 8, 5, 6, 8, 3, 6, 3, 3, 3, 9, 9, 5, 6, 3, 9, 9, 8, 8, 3, 3, 9, 6, 9, 9, 5, 3, 3, 6, 5, 3, 9, 8, 3, 6, 6, 9, 8, 8, 9, 6, 3, 8, 6, 9, 8, 6, 5, 8, 3, 3, 9, 8, 9, 6, 9, 6, 5, 5, 9, 5, 8, 8, 8, 8, 3, 6, 5, 3, 8, 9, 8, 6, 9, 6, 6, 9, 8, 9, 9, 5, 9, 6, 6, 6, 3, 6, 3, 5, 5, 6, 3, 9, 5, 6, 3, 8, 9, 9, 8, 9, 5, 8, 8, 3, 6, 3, 3, 8, 3, 8, 3, 5, 6, 6, 3, 5, 5, 5, 3, 8, 8, 8, 3, 3, 5, 9, 5, 9, 9, 9, 5, 5, 8, 3, 8, 3, 3, 9, 9, 6, 3, 8, 5, 9, 6, 9, 3, 5, 9, 6, 6, 6, 3, 8, 3, 9, 5, 5, 8, 8, 8, 9, 3, 6, 6, 9, 3, 9, 6, 9, 6, 9, 5, 8, 3, 8, 8, 8, 8, 8, 5, 5, 8, 9, 6, 8, 8, 3, 5, 6, 9, 8, 3, 3, 6, 6, 6, 6, 6, 6, 6, 8, 9, 6, 8, 9, 5, 9, 5, 8, 5, 8, 5, 6, 9, 9, 5, 9, 5, 6, 9, 9, 6, 8, 6, 6, 8, 3, 9, 3, 6, 6, 9, 6, 8, 6, 5, 5, 6, 8, 8, 6, 8, 9, 6, 5, 6, 8, 5, 9, 8, 5, 8, 9, 9, 5, 8, 3, 3, 9, 8, 6, 8, 6, 3, 8, 9, 5, 9, 9, 3, 8, 5, 8, 8, 9, 5, 9, 8, 8, 9, 6, 5, 5, 9, 9, 9, 5, 3, 6, 9, 3, 8, 6, 9, 8, 3, 6, 8, 8, 3, 6, 3, 5, 9, 8, 9, 5, 5, 3, 6, 9, 8, 5, 6, 9, 3, 9, 8, 8, 9, 8, 6, 6, 8, 5, 5, 6, 5, 8, 6, 3, 3, 9, 8, 8, 5, 6, 5, 6, 5, 6, 3, 5, 9, 9, 6, 6, 3, 5, 6, 9, 5, 6, 8, 6, 6, 9, 5, 8, 8, 9, 3, 3, 5, 3, 3, 9, 8, 5, 8, 9, 6, 3, 6, 9, 5, 6, 5, 9, 3, 3, 5, 5, 8, 8, 5, 5, 9, 8, 9, 6, 3, 3, 8, 8, 3, 9, 8, 3, 8, 3, 8, 3, 3, 6, 9, 5, 5, 3, 9, 6, 9, 8, 9, 6, 8, 8, 5, 6, 5, 5, 3, 5, 6, 8, 8, 9, 9, 3, 6, 3, 9, 6, 8, 5, 8, 3, 6, 3, 5, 3, 9, 3, 3, 5, 5, 9, 5, 6, 9, 9, 3, 8, 9, 8, 5, 6, 5, 3, 9, 8, 3, 9, 3, 6, 8, 5, 5, 9, 5, 6, 8, 9, 8, 6, 9, 5, 6, 6, 9, 8, 3, 5, 5, 3, 5, 6, 9, 9, 9, 9, 6, 5, 8, 9, 5, 6, 6, 9, 9, 3, 8, 5, 8, 9, 3, 5, 5, 3, 3, 3, 8, 8, 6, 9, 5, 6, 8, 9, 3, 3, 6, 8, 8, 5, 6, 3, 9, 6, 3, 9, 9, 9, 9, 3, 3, 5, 8, 9, 5, 5, 6, 3, 6, 5, 9, 3, 5, 5, 5, 8, 9, 8, 5, 8, 8, 3, 8, 5, 9, 3, 3, 3, 8, 9, 3, 6, 9, 5, 8, 9, 5, 5, 6, 6, 8, 3, 9, 6, 5, 5, 5, 9, 8, 8, 5, 3, 3, 5, 9, 3, 6, 6, 3, 9, 6, 6, 9, 3, 9, 5, 5, 6, 5, 3, 8, 3, 6, 9, 6, 9, 6, 8, 8, 5, 
5, 8, 6, 9, 9, 9, 8, 3, 6, 6, 9, 5, 8, 5, 5, 9, 9, 8, 5, 8, 5, 5, 8, 6, 3, 5, 8, 6, 6, 3, 9, 3, 5, 6, 6, 9, 5, 5, 6, 5, 3, 5, 9, 6, 3, 5, 5, 6, 5, 8, 6, 8, 5, 3, 3, 5, 6, 5, 6, 8, 3, 8, 9, 9, 9, 8, 9, 9, 5, 8, 8, 6, 3, 3, 3, 5, 9, 5, 3, 6, 5, 6, 9, 5, 9, 6, 6, 9, 9, 3, 3, 3, 8, 5, 8, 3, 8, 5, 5, 5, 6, 3, 9, 9, 3, 9, 3, 6, 9, 3, 8, 3, 6, 8, 3, 5, 6, 3, 8, 9, 3, 5, 9, 6, 8, 5, 5, 3, 9, 8, 6, 3, 9, 9, 6, 5, 8, 8, 9, 6, 3, 5, 6, 8, 9, 8, 3, 9, 6, 3, 9, 8, 9, 6, 9, 5, 8, 3, 9, 8, 8, 3, 9, 8, 6, 8, 5, 9, 8, 5, 3, 6, 9, 6, 5, 3, 6, 6, 6, 9, 3, 5, 5, 3, 3, 8, 8, 3, 5, 8, 9, 6, 3, 8, 3, 8, 5, 3, 8, 8, 3, 5, 6, 8, 3, 5, 6, 3, 6, 6, 6, 8, 3, 6, 6, 8, 3, 8, 8, 3, 8, 3, 3, 5, 6, 9, 5, 9, 3, 3, 9, 9, 8, 3, 8, 3, 5, 5, 9, 6, 8, 6, 8, 3, 3, 8, 9, 6, 3, 3, 5, 8, 5, 3, 5, 8, 5, 3, 3, 8, 8, 3, 8, 6, 9, 9, 6, 9, 8, 3, 5, 5, 8, 3, 8, 9, 6, 3, 9, 9, 9, 3, 3, 5, 5, 5, 3, 3, 6, 8, 3, 3, 5, 8, 3, 8, 5, 6, 5, 6, 5, 9, 9, 6, 3, 6, 8, 3, 8, 5, 3, 3, 3, 3, 9, 3, 8, 8, 8, 5, 6, 5, 3, 6, 9, 6, 5, 5, 6, 9, 6, 3, 3, 8, 9, 9, 3, 3, 6, 6, 9, 8, 5]


In [12]:
# evaluate the Tribuo prediction labels against the actual cluster
# assignments of the prediction set
evaluate(tribuo_predict_label_list, actual_labels_predict)

Result Evaluation
Normalized MI = 1.00
Adjusted MI = 1.00


In [13]:
# check the MI between the prediction labels from the hdbscan model and the
# Tribuo model, i.e. how closely the two implementations agree with each other
evaluate(hdbscan_predict_label_list, tribuo_predict_label_list)

Result Evaluation
Normalized MI = 1.00
Adjusted MI = 1.00
