## This notebook prepares a dataset for testing HDBSCAN performance: 100,000 records, 6 centers, 7 features

In [1]:
import os

import numpy as np
import pandas as pd
from sklearn.datasets import make_blobs

In [2]:
# Draw the synthetic dataset: 100000 points with 7 features, generated around 6 blob centers.
# random_state is pinned so that re-running the notebook regenerates the same data.
# X is the feature matrix; y is the second value returned by make_blobs (the per-sample labels).
X, y = make_blobs(
    n_samples=100000,
    n_features=7,
    centers=6,
    random_state=0,
)

In [3]:
# Sanity check: show the (rows, columns) dimensions of the generated feature matrix.
print(np.shape(X))

(100000, 7)


In [4]:
# Peek at the first sample (all feature values of row 0).
print(X[0, :])

[ -7.98052064  -8.96285772 -10.57567576   6.90806476   5.35194402
   7.34244939   8.89208925]


In [5]:
# Wrap the feature matrix in a DataFrame with one column per feature, named
# Feature1..FeatureN in column order. Deriving the names from X.shape[1] keeps this cell
# in step with the n_features used to generate X instead of hard-coding seven columns.
feature_names = [f'Feature{i}' for i in range(1, X.shape[1] + 1)]
# copy=True keeps df independent of X, so later changes to X cannot leak into df.
df = pd.DataFrame(X, columns=feature_names, copy=True)


In [6]:
# Display the first ten rows to eyeball the column names and values.
df.head(n=10)

Unnamed: 0,Feature1,Feature2,Feature3,Feature4,Feature5,Feature6,Feature7
0,-7.980521,-8.962858,-10.575676,6.908065,5.351944,7.342449,8.892089
1,2.248289,-0.585661,-4.625192,5.826962,-1.090488,2.808712,-8.49956
2,3.166599,1.030258,3.80751,7.712602,4.579464,-2.682617,-1.342537
3,3.07942,2.853454,3.748174,8.650505,2.897442,-3.213296,-0.440185
4,7.725893,7.428309,-3.479445,5.67647,-0.375668,3.769964,7.806266
5,-10.1053,-8.04791,-10.989488,6.737778,4.83061,7.553869,9.119083
6,-0.927766,-0.505007,-6.936486,4.778307,-2.071822,3.938992,-10.941437
7,-7.528819,-7.648303,-8.741672,5.991236,4.059281,6.540605,10.179735
8,-8.844708,-7.857196,-8.691144,6.029673,4.712196,5.932339,8.324834
9,-8.293835,-8.933276,-10.078186,4.230638,7.056091,6.945011,11.090134


In [7]:
# Write out the full dataset twice.
#
# 1. A headerless copy into a separate checkout (judging by the path, an HDBSCAN* reference
#    implementation -- TODO confirm). That location is machine-specific, so it can be
#    overridden with the HDBSCAN_REF_IMPL_DIR environment variable; the default is the
#    original author's path. If the directory does not exist the copy is skipped with a
#    message, so the notebook still runs end to end on other machines.
ref_impl_dir = os.environ.get(
    'HDBSCAN_REF_IMPL_DIR',
    '/Users/gstewart/temp/development/mscs/cpsc69700/RefImpl/HDBSCAN_Star')
if os.path.isdir(ref_impl_dir):
    df.to_csv(os.path.join(ref_impl_dir, 'xtrabig-gaussians-6centers.csv'), index=False, header=False)
else:
    print(f'Skipping headerless copy: directory not found: {ref_impl_dir}')

# 2. A copy with a header row -- the tribuo loader uses the header.
df.to_csv('../../data/xtrabig-gaussians-6centers.csv', index=False, header=True)


In [8]:
# A 60:40 train/test ratio isn't a recommended way to split, but it is used here because it
# gives a larger predict set for measuring performance.
# The split is positional: the first 60000 rows train, the remaining rows predict.
df_train = df.iloc[:60000]
df_predict = df.iloc[60000:]

In [9]:
# Save both partitions as CSV files, each with a header row and without the index column.
df_train.to_csv(
    '../../data/xtrabig-gaussians-6centers-train.csv',
    header=True,
    index=False,
)
df_predict.to_csv(
    '../../data/xtrabig-gaussians-6centers-predict.csv',
    header=True,
    index=False,
)