## This notebook measures Tribuo HDBSCAN training performance on the Gaussian dataset with 100,000 data points

In [1]:
%jars ../../../jars/junit-jupiter-api-5.7.0.jar
%jars ../../../jars/opentest4j-1.2.0.jar
%jars ../../../jars/junit-platform-commons-1.7.1.jar
%jars ../../../jars/tribuo-clustering-hdbscan-4.3.0-jar-with-dependencies.jar

In [2]:
import java.nio.file.Paths;
import java.util.*;
import java.util.stream.Collectors;
import static org.junit.jupiter.api.Assertions.*;

In [3]:
import org.tribuo.Dataset;
import org.tribuo.Feature;
import org.tribuo.MutableDataset;
import org.tribuo.Prediction;
import org.tribuo.clustering.ClusterID;
import org.tribuo.clustering.ClusteringFactory;
import org.tribuo.clustering.hdbscan.HdbscanModel;
import org.tribuo.clustering.hdbscan.HdbscanTrainer;
import org.tribuo.data.columnar.FieldProcessor;
import org.tribuo.data.columnar.ResponseProcessor;
import org.tribuo.data.columnar.RowProcessor;
import org.tribuo.data.columnar.processors.field.DoubleFieldProcessor;
import org.tribuo.data.columnar.processors.response.EmptyResponseProcessor;
import org.tribuo.data.csv.CSVDataSource;
import org.tribuo.data.csv.CSVLoader;
import org.tribuo.math.distance.DistanceType;
import org.tribuo.math.neighbour.NeighboursQueryFactoryType;
import org.tribuo.util.Util;

In [4]:
// Build the loading pipeline for the Gaussian CSV file: an empty response
// processor (the data is loaded without cluster labels), one numeric field
// processor per feature column, and a mutable dataset on top of the CSV source.
ClusteringFactory clusteringFactory = new ClusteringFactory();
ResponseProcessor<ClusterID> emptyResponseProcessor = new EmptyResponseProcessor<>(clusteringFactory);

// Register a DoubleFieldProcessor for each of the columns Feature1..Feature7,
// keyed by the column name.
Map<String, FieldProcessor> regexMappingProcessors = new HashMap<>();
for (int featureIndex = 1; featureIndex <= 7; featureIndex++) {
    String columnName = "Feature" + featureIndex;
    regexMappingProcessors.put(columnName, new DoubleFieldProcessor(columnName));
}

RowProcessor<ClusterID> rowProcessor = new RowProcessor<>(emptyResponseProcessor, regexMappingProcessors);
// NOTE(review): the trailing `false` is presumably the "output required" flag,
// i.e. rows without a response are accepted -- confirm against the CSVDataSource docs.
CSVDataSource<ClusterID> csvDataSource = new CSVDataSource<>(Paths.get("../../../data/xtrabig-gaussians-6centers.csv"), rowProcessor, false);
Dataset<ClusterID> dataset = new MutableDataset<>(csvDataSource);

In [5]:
System.out.println(String.format("Data size = %d, number of features = %d",dataset.size(),dataset.getFeatureMap().size()));


Data size = 100000, number of features = 7


In [6]:
// Train an HDBSCAN model on the full dataset and report the elapsed training time.
// NOTE(review): the constructor arguments are presumably (minClusterSize, distance,
// k, numThreads, neighbour query implementation) -- confirm against the
// HdbscanTrainer API docs for the Tribuo version on the classpath.
var trainer = new HdbscanTrainer(10, DistanceType.L2.getDistance(), 10, 8, NeighboursQueryFactoryType.BRUTE_FORCE);
// Use the monotonic System.nanoTime() clock for the measurement: unlike
// System.currentTimeMillis() it is not affected by wall-clock adjustments during
// the multi-minute training run. The readings are converted to milliseconds
// because Util.formatDuration is given millisecond values.
var startTime = System.nanoTime() / 1_000_000L;
var model = trainer.train(dataset);
var endTime = System.nanoTime() / 1_000_000L;
System.out.println("Training took " + Util.formatDuration(startTime,endTime));

// run 1
// time 315.6s

// run 2
// time 307.2s

// run 3
// time 301.5s

Training took (00:05:46:366)


In [7]:
System.out.println(model.getClusterLabels().size());

100000
