## This notebook prepares an initial dataset for testing HDBSCAN* implementations

In [1]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.datasets import make_blobs

In [2]:
# Generate the synthetic dataset: 4000 samples with 7 features each, drawn around 3 centers.
# random_state is pinned so the same points (X) and labels (y) come out on every run.
blob_params = dict(n_samples=4000, centers=3, n_features=7, random_state=0)
X, y = make_blobs(**blob_params)

In [3]:
# Sanity check: show the (rows, columns) shape of the generated feature matrix.
print(np.shape(X))

(4000, 7)


In [4]:
# Peek at the first sample: every feature value of row 0.
print(X[0, :])

[ 0.67230582  4.18948271  1.10578504 -0.74528368 -0.71367015  3.79597305
 -1.94661927]


In [5]:
# Wrap the feature matrix in a DataFrame whose columns are named Feature1..FeatureN (1-based).
# The names are derived from X's column count, so every column of X is always included;
# a hand-written list would have to be kept in sync with n_features manually.
# copy=True keeps df independent of X, so later edits to df cannot write through to X.
feature_names = ['Feature{}'.format(i) for i in range(1, X.shape[1] + 1)]
df = pd.DataFrame(X, columns=feature_names, copy=True)


In [6]:
# Preview the first ten rows; left as the bare last expression so the notebook renders the table.
df.iloc[:10]

Unnamed: 0,Feature1,Feature2,Feature3,Feature4,Feature5,Feature6,Feature7
0,0.672306,4.189483,1.105785,-0.745284,-0.71367,3.795973,-1.946619
1,7.514049,10.0636,-1.519702,5.337507,-0.625071,1.81239,9.316743
2,-8.302205,-8.408587,-9.810006,7.940123,6.038438,7.001291,9.368331
3,8.641311,9.251208,-3.345092,6.961084,-0.602621,0.944358,9.842754
4,1.655776,3.849035,1.659614,1.177785,-1.038246,2.975681,-1.130993
5,-0.600762,3.026254,2.544056,0.455811,-1.221484,1.403239,-3.800997
6,8.160245,8.761986,-3.131785,6.509558,1.412981,2.846367,10.031371
7,0.346628,3.831529,0.541906,2.005288,-1.350665,1.977529,-0.318661
8,1.51,5.47948,3.04521,3.374911,-1.824327,4.662318,-3.105788
9,-7.787603,-7.992853,-9.734749,5.824639,5.926706,6.158528,10.264364


In [7]:
# Write the same feature table to two CSV files that differ only in the header row:
#   * a headerless copy, written into the RefImpl/HDBSCAN_Star directory
#   * a copy with the column-name header row, which the Tribuo and Python loaders use
#
# The headerless copy's target directory is machine-specific, so it can be overridden
# through the HDBSCAN_REF_IMPL_DIR environment variable. The original location is the
# fallback (also used when the variable is set but empty).
CSV_NAME = 'first-gaussians.csv'
DEFAULT_REF_IMPL_DIR = '/Users/gstewart/temp/development/mscs/cpsc69700/RefImpl/HDBSCAN_Star'

ref_impl_dir = Path(os.environ.get('HDBSCAN_REF_IMPL_DIR') or DEFAULT_REF_IMPL_DIR)
if not ref_impl_dir.is_dir():
    # Fail before writing anything, with a hint on how to point at a valid directory.
    raise FileNotFoundError(
        'Reference implementation directory not found: {}. '
        'Set the HDBSCAN_REF_IMPL_DIR environment variable to an existing directory.'.format(ref_impl_dir)
    )

# index=False in both calls: the row index is not part of the dataset.
df.to_csv(ref_impl_dir / CSV_NAME, index=False, header=False)
df.to_csv(Path('../../data') / CSV_NAME, index=False, header=True)


In [8]:
# Print the label assigned to each sample by make_blobs, as a plain Python list
# (one entry per row of X, in row order).
labels = y.tolist()
print(labels)

[0, 1, 2, 1, 0, 0, 1, 0, 0, 2, 2, 1, 2, 2, 0, 2, 1, 1, 0, 0, 0, 1, 2, 2, 1, 0, 1, 0, 1, 0, 1, 1, 2, 0, 1, 2, 2, 2, 2, 1, 1, 2, 2, 2, 0, 0, 1, 1, 1, 1, 0, 1, 0, 2, 1, 1, 2, 0, 0, 0, 2, 1, 2, 2, 2, 2, 2, 1, 0, 2, 0, 0, 0, 2, 0, 0, 1, 0, 1, 0, 2, 0, 2, 1, 2, 0, 1, 2, 1, 0, 2, 2, 1, 0, 1, 2, 2, 1, 0, 0, 2, 0, 1, 0, 1, 2, 1, 0, 1, 1, 0, 1, 2, 2, 1, 0, 0, 1, 0, 0, 1, 2, 1, 2, 1, 0, 2, 0, 1, 1, 1, 0, 2, 2, 0, 1, 1, 2, 2, 1, 1, 2, 0, 2, 1, 1, 2, 1, 0, 0, 2, 0, 2, 1, 1, 1, 0, 0, 1, 1, 0, 0, 2, 1, 2, 2, 1, 0, 2, 1, 0, 0, 1, 2, 1, 0, 2, 1, 1, 0, 1, 1, 2, 0, 1, 2, 1, 0, 2, 1, 1, 1, 0, 0, 0, 2, 1, 1, 1, 2, 1, 1, 0, 0, 1, 2, 2, 2, 1, 0, 1, 0, 0, 2, 0, 1, 1, 1, 1, 2, 2, 2, 2, 1, 2, 1, 2, 0, 0, 1, 2, 1, 1, 1, 1, 0, 1, 0, 1, 2, 0, 0, 0, 1, 0, 2, 0, 0, 0, 0, 0, 0, 2, 2, 0, 1, 0, 0, 1, 2, 2, 1, 1, 0, 2, 2, 2, 1, 0, 1, 2, 1, 2, 0, 1, 2, 0, 0, 1, 0, 0, 1, 0, 1, 0, 2, 0, 2, 2, 1, 1, 1, 0, 1, 2, 2, 2, 0, 1, 1, 0, 2, 1, 0, 2, 1, 0, 2, 2, 2, 0, 2, 1, 1, 2, 1, 0, 2, 0, 0, 2, 2, 0, 0, 2, 2, 1, 1, 0, 0, 1, 2, 0, 