## This notebook prepares a dataset for testing HDBSCAN Predictions: 5000 records, 3 centers, 4 features

In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import make_blobs

In [2]:
# Generate 5000 samples with 4 features spread around 3 centers.
# The fixed random_state makes every run produce the same points and labels.
X, y = make_blobs(
    n_samples=5000,
    n_features=4,
    centers=3,
    random_state=0,
)

In [3]:
# Sanity check: the generated matrix should have 5000 rows (samples) and 4 columns (features).
print(X.shape)

(5000, 4)


In [4]:
# Show the first generated sample to see what a single 4-feature record looks like.
print(X[0])

[ 8.25211094 -2.18435032  6.82927351  0.4625509 ]


In [5]:
# Put the feature matrix into a DataFrame, one named column per feature
# (column i of X becomes 'Feature<i+1>').
feature_names = ['Feature1', 'Feature2', 'Feature3', 'Feature4']
df = pd.DataFrame({name: X[:, col] for col, name in enumerate(feature_names)})


In [6]:
# Preview the first 10 rows to confirm the column names and values look as expected.
df.head(10)

Unnamed: 0,Feature1,Feature2,Feature3,Feature4
0,8.252111,-2.18435,6.829274,0.462551
1,0.008188,4.780855,1.699316,3.437895
2,10.203929,-2.089471,5.469007,2.435436
3,1.595425,2.429474,1.155381,1.379744
4,1.847534,4.964358,3.229329,1.428795
5,1.707929,4.238299,2.403437,1.560922
6,8.682364,-1.882109,6.300037,1.954943
7,-2.136359,3.980295,-1.288406,7.559039
8,9.59648,-2.428622,6.325567,0.178875
9,9.258085,-3.213289,6.429689,0.492628


In [7]:
# Positional split: the first 4000 rows are used for training,
# the remaining rows are held out for prediction.
n_train = 4000
df_train = df.iloc[:n_train]
df_predict = df.iloc[n_train:]

In [8]:
# Write both splits to CSV without the index column.
# The header row is kept because the tribuo loader uses it.
for frame, csv_path in (
    (df_train, '../../data/big-gaussians-3centers-train.csv'),
    (df_predict, '../../data/big-gaussians-3centers-predict.csv'),
):
    frame.to_csv(csv_path, index=False, header=True)


In [9]:
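# y_train: the make_blobs labels for the first 4000 (training) rows.
# Printing is disabled here; uncomment the next line to inspect them.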
# print(y[:4000].tolist())

In [10]:
# y_predict: the make_blobs center labels for the held-out prediction rows (index 4000 onward).
# .tolist() converts the NumPy array to a plain Python list so the full sequence is printed.
# NOTE(review): presumably printed so the labels can be copied elsewhere as expected values — confirm.
print(y[4000:].tolist())

[1, 2, 2, 2, 0, 0, 0, 2, 0, 0, 0, 2, 1, 1, 0, 2, 2, 1, 1, 1, 0, 0, 1, 2, 0, 0, 0, 2, 1, 0, 1, 1, 2, 1, 2, 0, 0, 2, 1, 1, 0, 0, 0, 0, 2, 2, 1, 1, 0, 0, 2, 2, 2, 1, 2, 2, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 2, 0, 1, 1, 0, 1, 2, 0, 2, 1, 1, 0, 0, 1, 0, 1, 0, 2, 2, 0, 2, 0, 1, 2, 0, 0, 0, 1, 2, 2, 0, 0, 0, 2, 1, 2, 2, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 2, 0, 1, 1, 1, 1, 1, 2, 2, 2, 1, 2, 0, 2, 1, 1, 0, 1, 0, 0, 0, 2, 0, 0, 1, 1, 2, 1, 0, 2, 0, 2, 2, 1, 1, 1, 2, 2, 2, 2, 0, 0, 0, 1, 2, 2, 0, 2, 0, 0, 1, 2, 2, 0, 1, 0, 2, 1, 1, 0, 1, 1, 0, 2, 0, 1, 0, 1, 2, 0, 1, 1, 1, 2, 0, 2, 0, 2, 2, 0, 0, 0, 1, 2, 1, 1, 0, 1, 1, 1, 0, 1, 0, 2, 0, 2, 0, 0, 0, 0, 0, 2, 2, 0, 0, 1, 0, 0, 2, 2, 1, 0, 0, 1, 2, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 2, 0, 2, 0, 2, 0, 2, 1, 0, 0, 2, 0, 2, 1, 1, 1, 1, 0, 1, 1, 0, 2, 0, 2, 1, 1, 1, 1, 0, 1, 2, 2, 1, 0, 0, 1, 0, 0, 1, 2, 1, 0, 2, 0, 0, 2, 0, 0, 0, 2, 0, 2, 2, 0, 0, 1, 0, 1, 1, 0, 0, 2, 0, 1, 2, 0, 2, 0, 0, 0, 2, 0, 0, 0, 1, 1, 2, 2, 0, 0, 0, 2, 2, 1, 0, 1, 0, 1, 0, 0, 2, 1, 