## This notebook prepares a dataset for testing HDBSCAN predictions: 5000 records, 5 centers, 4 features, split into 4000 training records and 1000 prediction records

In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import make_blobs

In [2]:
# Generate 5000 four-dimensional points around 5 blob centers.
# random_state is fixed so that re-running the notebook yields the same data.
X, y = make_blobs(
    n_samples=5000,
    n_features=4,
    centers=5,
    random_state=0,
)

In [3]:
# Sanity check: (number of records, number of features).
print(np.shape(X))

(5000, 4)


In [4]:
# Peek at the first generated record (all of its feature values).
print(X[0, :])

[ 7.70771342 -1.26691781  5.38792335  1.20370119]


In [5]:
# One named column per feature: column i of X becomes 'Feature<i+1>'.
df = pd.DataFrame(
    {f'Feature{position}': values for position, values in enumerate(X.T, start=1)}
)


In [6]:
# Display the first 10 rows of the feature table.
df.iloc[:10]

Unnamed: 0,Feature1,Feature2,Feature3,Feature4
0,7.707713,-1.266918,5.387923,1.203701
1,3.150831,9.309946,-8.825983,-8.143661
2,1.484398,7.555982,-9.36243,-9.390599
3,-1.109179,3.176266,-2.398684,8.164574
4,1.583382,3.255617,1.195005,1.225965
5,9.845388,-2.373233,6.667881,0.448418
6,2.206561,5.506167,1.667941,0.595361
7,8.504903,-2.090548,4.96981,-1.057357
8,-1.500025,1.832179,-1.875144,9.234417
9,1.464961,4.195825,0.633364,-0.038346


In [7]:
# Positional split: the first 4000 rows form the training set,
# the remaining rows (4000 onwards) are kept for prediction.
df_train, df_predict = df.iloc[:4000], df.iloc[4000:]

In [8]:
# Write out the data; the Tribuo loader uses the header row, so it is kept,
# while the pandas row index is omitted from the files.
csv_outputs = {
    '../../data/big-gaussians-5centers-train.csv': df_train,
    '../../data/big-gaussians-5centers-predict.csv': df_predict,
}
for csv_path, frame in csv_outputs.items():
    frame.to_csv(csv_path, index=False, header=True)


In [9]:
# y_train: labels of the first 4000 records.
# The print below is left disabled; uncomment it to list them.
# print(y[:4000].tolist())

In [10]:
# y_predict: labels of the records from position 4000 onwards,
# printed as a plain Python list of ints.
print([int(label) for label in y[4000:]])

[0, 2, 2, 2, 3, 1, 4, 2, 2, 3, 3, 0, 4, 1, 4, 3, 3, 4, 4, 0, 0, 0, 3, 0, 0, 0, 3, 2, 2, 1, 3, 4, 2, 2, 2, 0, 0, 1, 4, 1, 1, 0, 4, 2, 0, 3, 2, 3, 3, 3, 1, 1, 4, 2, 3, 1, 1, 0, 0, 3, 3, 1, 2, 1, 1, 4, 3, 3, 2, 4, 3, 1, 0, 3, 2, 2, 1, 0, 0, 1, 2, 3, 0, 2, 1, 0, 2, 4, 0, 3, 3, 1, 0, 1, 2, 1, 2, 4, 4, 1, 4, 0, 0, 0, 0, 3, 2, 4, 3, 0, 1, 0, 2, 1, 2, 2, 1, 0, 1, 1, 4, 1, 2, 2, 2, 3, 2, 3, 4, 4, 2, 3, 1, 4, 2, 3, 0, 1, 1, 0, 1, 4, 0, 0, 3, 2, 3, 3, 0, 3, 0, 3, 4, 2, 2, 3, 4, 4, 4, 3, 0, 0, 0, 3, 3, 4, 1, 4, 1, 1, 1, 4, 4, 0, 3, 0, 3, 3, 1, 1, 2, 3, 0, 4, 1, 2, 1, 3, 4, 1, 2, 2, 2, 3, 0, 3, 1, 4, 4, 0, 0, 0, 1, 3, 2, 2, 1, 3, 1, 2, 1, 2, 1, 4, 0, 3, 0, 0, 0, 0, 0, 4, 4, 0, 1, 2, 0, 0, 3, 4, 2, 1, 0, 3, 3, 2, 2, 2, 2, 2, 2, 2, 0, 1, 2, 0, 1, 4, 1, 4, 0, 4, 0, 4, 2, 1, 1, 4, 1, 4, 2, 1, 1, 2, 0, 2, 2, 0, 3, 1, 3, 2, 2, 1, 2, 0, 2, 4, 4, 2, 0, 0, 2, 0, 1, 2, 4, 2, 0, 4, 1, 0, 4, 0, 1, 1, 4, 0, 3, 3, 1, 0, 2, 0, 2, 3, 0, 1, 4, 1, 1, 3, 0, 4, 0, 0, 1, 4, 1, 0, 0, 1, 2, 4, 4, 1, 1, 1, 4, 3, 2, 1, 3, 