## This notebook measures Tribuo HDBSCAN prediction performance on the 50,000-point Gaussian dataset (30,000 training points, 20,000 prediction points)

In [1]:
%jars ../../../jars/junit-jupiter-api-5.7.0.jar
%jars ../../../jars/opentest4j-1.2.0.jar
%jars ../../../jars/junit-platform-commons-1.7.1.jar
%jars ../../../jars/tribuo-clustering-hdbscan-4.2.0-SNAPSHOT-jar-with-dependencies.jar

In [2]:
import java.nio.file.Paths;
import java.util.*;
import java.util.stream.Collectors;
import static org.junit.jupiter.api.Assertions.*;

In [3]:
import org.tribuo.Dataset;
import org.tribuo.Feature;
import org.tribuo.MutableDataset;
import org.tribuo.Prediction;
import org.tribuo.clustering.ClusterID;
import org.tribuo.clustering.ClusteringFactory;
import org.tribuo.clustering.hdbscan.HdbscanModel;
import org.tribuo.clustering.hdbscan.HdbscanTrainer;
import org.tribuo.data.columnar.FieldProcessor;
import org.tribuo.data.columnar.ResponseProcessor;
import org.tribuo.data.columnar.RowProcessor;
import org.tribuo.data.columnar.processors.field.DoubleFieldProcessor;
import org.tribuo.data.columnar.processors.response.EmptyResponseProcessor;
import org.tribuo.data.csv.CSVDataSource;
import org.tribuo.data.csv.CSVLoader;
import org.tribuo.util.Util;

In [4]:
// Build a row processor that reads four numeric feature columns and uses an
// EmptyResponseProcessor for the output, then load the training and prediction CSVs.
ClusteringFactory clusteringFactory = new ClusteringFactory();
ResponseProcessor<ClusterID> emptyResponseProcessor = new EmptyResponseProcessor<>(clusteringFactory);

// One DoubleFieldProcessor per feature, keyed by the same name that is passed to its constructor.
Map<String, FieldProcessor> regexMappingProcessors = new HashMap<>();
for (String featureName : List.of("Feature1", "Feature2", "Feature3", "Feature4")) {
    regexMappingProcessors.put(featureName, new DoubleFieldProcessor(featureName));
}
RowProcessor<ClusterID> rowProcessor = new RowProcessor<>(emptyResponseProcessor, regexMappingProcessors);

// Training data. NOTE(review): the trailing `false` is presumably the
// outputRequired flag of the Tribuo CSVDataSource constructor — confirm.
var trainPath = Paths.get("../../../data/verybig-gaussians-6centers-train.csv");
CSVDataSource<ClusterID> csvDataSource = new CSVDataSource<>(trainPath, rowProcessor, false);
Dataset<ClusterID> dataset = new MutableDataset<>(csvDataSource);

// Prediction data, read with the same row processor as the training data.
var predictPath = Paths.get("../../../data/verybig-gaussians-6centers-predict.csv");
CSVDataSource<ClusterID> csvTestSource = new CSVDataSource<>(predictPath, rowProcessor, false);
Dataset<ClusterID> predictSet = new MutableDataset<>(csvTestSource);


In [5]:
System.out.println(String.format("Data size = %d, number of features = %d",dataset.size(),dataset.getFeatureMap().size()));
System.out.println(String.format("Predict Data size = %d, number of features = %d",predictSet.size(),predictSet.getFeatureMap().size()));


Data size = 30000, number of features = 4
Predict Data size = 20000, number of features = 4


In [6]:
// Configure an HDBSCAN trainer with Euclidean distance and fit it to the training dataset.
// NOTE(review): the numeric arguments (10, 10, 8) are presumably minClusterSize, k and
// numThreads per the Tribuo HdbscanTrainer constructor — confirm against the snapshot jar.
HdbscanTrainer trainer = new HdbscanTrainer(10, HdbscanTrainer.Distance.EUCLIDEAN, 10, 8);
HdbscanModel model = trainer.train(dataset);

In [7]:
System.out.println(model.getClusterLabels().size());

30000


In [8]:
// Time one batch prediction over the whole prediction set.
// System.nanoTime() is used instead of System.currentTimeMillis() because it is the JDK's
// elapsed-time source and is not affected by wall-clock adjustments during the measurement.
long startNanos = System.nanoTime();
List<Prediction<ClusterID>> predictions = model.predict(predictSet);
long elapsedMillis = (System.nanoTime() - startNanos) / 1_000_000L;
// Util.formatDuration is given a start and a stop in milliseconds, so the elapsed
// time is passed as the stop with a start of zero.
System.out.println("Predicting took " + Util.formatDuration(0L, elapsedMillis));

// run 1
// time 0.26s

// run 2
// time 0.33s

// run 3
// time 0.24s

Predicting took (00:00:00:236)
