## This notebook prepares a dataset for unit testing the Tribuo HDBSCAN* implementation. It is also used to compare HDBSCAN* implementations.

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.datasets import make_blobs

In [2]:
# Generate 2000 three-feature samples around 4 centers. The fixed random_state
# keeps the dataset identical on every run, which the unit tests rely on.
X, y = make_blobs(
    n_samples=2000,
    n_features=3,
    centers=4,
    random_state=0,
)

In [3]:
# Sanity check: show the (rows, columns) dimensions of the feature matrix.
print(np.shape(X))

(2000, 3)


In [4]:
# Peek at the first sample (all feature values of row 0).
print(X[0, :])

[-1.23310899  8.97710796 10.54741024]


In [5]:
# Wrap the feature matrix in a DataFrame, one named column per feature,
# so it can be written out as CSV with a header row.
feature_names = ['Feature1', 'Feature2', 'Feature3']
df = pd.DataFrame({name: X[:, idx] for idx, name in enumerate(feature_names)})


In [6]:
# Preview the first 10 rows of the frame.
df.iloc[:10]

Unnamed: 0,Feature1,Feature2,Feature3
0,-1.233109,8.977108,10.54741
1,-3.10388,4.669625,1.726785
2,0.614266,-0.710369,2.920267
3,-2.644812,7.052198,-0.509567
4,0.468752,3.251907,4.552468
5,0.312843,6.703921,8.977024
6,-0.508796,-1.701727,3.597826
7,-0.057279,-2.997306,3.92831
8,0.382493,2.291907,2.644971
9,-1.426183,7.62384,9.540739


In [7]:
# The same data is saved to two different files: a headerless copy and a copy
# with a header row, which the Tribuo and Python loaders use.
#
# The headerless copy targets a machine-specific absolute directory (presumably a
# local checkout of a reference HDBSCAN* implementation, judging by the path).
# It is only written when that directory exists, so the notebook still runs
# top to bottom on machines that do not have it.
ref_impl_dir = Path('/Users/gstewart/temp/development/mscs/cpsc69700/RefImpl/HDBSCAN_Star')
if ref_impl_dir.is_dir():
    df.to_csv(ref_impl_dir / 'basic-gaussians.csv', index=False, header=False)
else:
    print(f'Skipping headerless copy; directory not found: {ref_impl_dir}')
df.to_csv('../../data/basic-gaussians.csv', index=False, header=True)

# The data is also used for comparing prediction results: the first 1980 rows
# go to the train file and all remaining rows go to the predict file.
df.iloc[:1980].to_csv('../../data/basic-gaussians-train.csv', index=False, header=True)
df.iloc[1980:].to_csv('../../data/basic-gaussians-predict.csv', index=False, header=True)


In [8]:
# Print every generated cluster label as a plain Python list
# (presumably so the values can be copied into the unit tests — confirm).
print([int(label) for label in y])

[2, 3, 1, 3, 0, 2, 1, 1, 0, 2, 1, 0, 3, 0, 2, 0, 1, 3, 3, 3, 3, 1, 0, 3, 3, 3, 3, 3, 1, 0, 2, 2, 1, 3, 2, 3, 0, 2, 0, 1, 2, 0, 0, 3, 2, 2, 2, 2, 2, 2, 3, 0, 1, 3, 1, 3, 2, 3, 1, 3, 1, 1, 1, 1, 3, 3, 1, 3, 1, 1, 0, 3, 2, 2, 1, 0, 2, 0, 3, 2, 0, 0, 3, 1, 2, 1, 0, 0, 2, 2, 1, 3, 1, 1, 0, 3, 2, 2, 3, 3, 1, 1, 0, 1, 3, 3, 1, 3, 2, 2, 1, 2, 0, 2, 2, 1, 1, 2, 3, 1, 3, 2, 2, 0, 0, 2, 1, 1, 2, 0, 0, 0, 2, 0, 1, 2, 0, 0, 2, 1, 1, 3, 1, 1, 0, 2, 0, 2, 1, 3, 2, 0, 3, 2, 3, 0, 2, 1, 1, 1, 2, 2, 1, 1, 3, 0, 0, 3, 2, 0, 2, 3, 0, 2, 0, 2, 0, 3, 0, 2, 0, 0, 0, 0, 1, 1, 1, 0, 3, 1, 0, 3, 3, 2, 3, 0, 2, 1, 1, 1, 0, 1, 2, 3, 0, 0, 1, 0, 1, 3, 0, 2, 0, 1, 3, 3, 0, 1, 2, 2, 2, 0, 0, 3, 3, 2, 0, 1, 3, 1, 2, 0, 1, 0, 3, 0, 2, 2, 3, 2, 2, 1, 2, 0, 3, 1, 1, 3, 3, 0, 3, 3, 3, 2, 2, 0, 1, 2, 0, 2, 3, 1, 1, 3, 3, 3, 1, 3, 1, 3, 3, 3, 2, 3, 0, 1, 1, 1, 1, 1, 1, 3, 3, 2, 3, 0, 2, 3, 1, 3, 3, 0, 1, 1, 1, 2, 1, 2, 3, 2, 3, 2, 1, 1, 1, 2, 1, 0, 1, 3, 3, 1, 2, 1, 1, 1, 2, 0, 2, 0, 1, 3, 3, 2, 0, 3, 2, 2, 2, 1, 1, 2, 1, 