In [1]:
%jars ../../../jars/junit-jupiter-api-5.7.0.jar
%jars ../../../jars/opentest4j-1.2.0.jar
%jars ../../../jars/junit-platform-commons-1.7.1.jar
%jars ../../../jars/tribuo-clustering-hdbscan-4.3.0-SNAPSHOT-jar-with-dependencies.jar
%jars ../../../jars/opencsv-5.4.jar

In [2]:
import java.nio.file.Paths;
import java.util.*;
import java.util.stream.Collectors;
import static org.junit.jupiter.api.Assertions.*;
import com.opencsv.CSVReaderHeaderAware;

In [3]:
import org.tribuo.Dataset;
import org.tribuo.Feature;
import org.tribuo.MutableDataset;
import org.tribuo.Prediction;
import org.tribuo.clustering.ClusterID;
import org.tribuo.clustering.ClusteringFactory;
import org.tribuo.clustering.hdbscan.HdbscanModel;
import org.tribuo.clustering.hdbscan.HdbscanTrainer;
import org.tribuo.data.columnar.FieldProcessor;
import org.tribuo.data.columnar.ResponseProcessor;
import org.tribuo.data.columnar.RowProcessor;
import org.tribuo.data.columnar.processors.field.DoubleFieldProcessor;
import org.tribuo.data.columnar.processors.response.EmptyResponseProcessor;
import org.tribuo.data.csv.CSVDataSource;
import org.tribuo.data.csv.CSVLoader;
import org.tribuo.math.distance.DistanceType;
import org.tribuo.math.neighbour.NeighboursQueryFactoryType;
import org.tribuo.util.Util;

In [4]:
// Build a Tribuo clustering dataset from the scaled synthetic training CSV.
// Each column name found in the CSV header is mapped to a DoubleFieldProcessor,
// and the response side uses an EmptyResponseProcessor backed by a ClusteringFactory.
String train_csv_path = "../../../data/synthetic_data_training_scaled.csv";
Map<String, FieldProcessor> regexMappingProcessors = new HashMap<>();

// Read just the first data row (keyed by header name) to discover the column names.
// try-with-resources guarantees the file handle is released even if parsing fails.
Map<String, String> record;
try (var reader = new CSVReaderHeaderAware(new FileReader(train_csv_path))) {
    record = reader.readMap();
}
// readMap() yields null when there is no data row to read; fail with a clear
// message rather than a NullPointerException in the loop below.
if (record == null) {
    throw new IllegalStateException("No data rows found in " + train_csv_path);
}
for (String column : record.keySet()) {
    regexMappingProcessors.put(column, new DoubleFieldProcessor(column));
}

RowProcessor<ClusterID> rowProcessor = new RowProcessor<>(new EmptyResponseProcessor<>(new ClusteringFactory()), regexMappingProcessors);
// NOTE(review): the trailing `false` is presumably the "output required" flag — confirm against the CSVDataSource Javadoc.
CSVDataSource<ClusterID> csvSource = new CSVDataSource<>(Paths.get(train_csv_path), rowProcessor, false);
// Dataset creation from CSV
Dataset<ClusterID> dataset = new MutableDataset<>(csvSource);

In [5]:
System.out.println(String.format("Data size = %d, number of features = %d",dataset.size(),dataset.getFeatureMap().size()));


Data size = 2500, number of features = 1013


In [6]:
// Configure an HDBSCAN trainer using L2 distance and brute-force neighbour queries.
// NOTE(review): the positional ints (5, 5, 2) are presumably minimum cluster size,
// k for the neighbour query, and thread count — confirm against the HdbscanTrainer Javadoc.
HdbscanTrainer trainer = new HdbscanTrainer(
        5,
        DistanceType.L2,
        5,
        2,
        NeighboursQueryFactoryType.BRUTE_FORCE);

// Fit the clustering model on the dataset loaded from the CSV.
HdbscanModel model = trainer.train(dataset);

In [7]:
// Print how many cluster labels the trained model holds.
{
    var labelCount = model.getClusterLabels().size();
    System.out.println(labelCount);
}

2500


In [8]:
// Dump the full list of cluster labels held by the trained model.
// On this dataset every printed label is 0 (see the cell output). To surface only
// the non-zero labels, loop over model.getClusterLabels() and print entries != 0.
{
    var clusterLabels = model.getClusterLabels();
    System.out.println(clusterLabels);
}

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [9]:
// Print the outlier scores exposed by the trained model.
{
    var outlierScores = model.getOutlierScores();
    System.out.println(outlierScores);
}

[0.03737376972861728, 0.05548965673182693, 0.051447890833471455, 0.05738675076085542, 0.059939044863606794, 0.022051044332634384, 0.030724362417322237, 0.04819293585762707, 0.04890650815199604, 0.0703343933388455, 0.04403765646950997, 0.04983660953077862, 0.013239109112373804, 0.06661140679064392, 0.10423833198807697, 0.05607841049460649, 0.050063398474184284, 0.03731068145401073, 0.03706902331516604, 0.03531027185311242, 0.034392712991972885, 0.04137922155838614, 0.04895556067802276, 0.03869965807010611, 0.05735573394159155, 0.03896536901355596, 0.03323588914987552, 0.023278909855701913, 0.0461104116121509, 0.05654361066984015, 0.047541359039049214, 0.03258474973255554, 0.05824904189710467, 0.048207407666840285, 0.07358096705223438, 0.016630583958956224, 0.04594377969806995, 0.0518613758953832, 0.032186170518681556, 0.041420608625220345, 0.06966401188949334, 0.0264003919460144, 0.029963730190943005, 0.03514218809661385, 0.043768814694660985, 0.05897444066203583, 0.020257775780099463, 

5, 0.030421076601334085, 0.06839355531590907, 0.032790907799950086, 0.045584914127888565, 0.08894359145928576, 0.01826354731387958, 0.021074969307321445, 0.02926780513548255, 0.04619750657932764, 0.03787382207884449, 0.055719257326674154, 0.044493045269651366, 0.030044194039949623, 0.03463139419599248, 0.04414113494901928, 0.025701802603228074, 0.09061752639723797, 0.03346254461930065, 0.07291741736606938, 0.03804441822892635, 0.03583417477170037, 0.05616989525351235, 0.08589412168263344, 0.05187751181157951, 0.049361816613712306, 0.05071751673415181, 0.026703686337750887, 0.05051900237457563, 0.026291264609313303, 0.03728366117213444, 0.03837010637933336, 0.024809929554991395, 0.04425385679484961, 0.05253843751522824, 0.007414678037118483, 0.05650841654848593, 0.024903096902323396, 0.04346310022832123, 0.02037125261970607, 0.03953866507015402, 0.049398728468655895, 0.0452610304485197, 0.011506014034587508, 0.05643327969456302, 0.053789342401814766, 0.017859082906065415, 0.038327491189

907755, 0.06471451077811907, 0.024903096902323396, 0.05838119624811211, 0.03301475362236139, 0.03212568511232794, 0.03793004173105674, 0.01966258940066523, 0.055138193338814645, 0.04650864138420663, 0.03887220745665654, 0.05316669737282731, 0.04127525045120983, 0.04789454297385165, 0.05671994279398529, 0.05299557643192765, 0.061509741892955994, 0.0396519960355316, 0.046719169651186165, 0.015385363213749503, 0.039694105495348686, 0.044845434917598626, 0.047109619523483914, 0.03172685669111974, 0.03522306337436332, 0.03849176846310065, 0.006706402542195455, 0.031016149213660138, 0.048907825676372974, 0.06477479170598333, 0.05867635459022724, 0.03410895300227457, 0.0390562528109899, 0.018446342026106954, 0.02583707721979378, 0.024931724587312076, 0.040310436401508554, 0.031398091765673986, 0.04157435081276861, 0.04725285072580898, 0.06891334931977378, 0.06204848130147833, 0.003612908528817793, 0.05302946616842741, 0.02468897896006439, 0.025515049575022375, 0.05305286951199151, 0.016961299

61826, 0.04442471523868996, 0.03959081883477, 0.033302388451465714, 0.05752653021502685, 0.03980515682390917, 0.05137573537446283, 0.04290776702088439, 0.04521682389097992, 0.013377072612268504, 0.050731816349026904, 0.04188264254818197, 0.05122980689224488, 0.020189466719363836, 0.02861977098500057, 0.030624468406285632, 0.020928544288861928, 0.059665299147311, 0.06329927232148391, 0.012550584187568936, 0.05045164186962714, 0.019183770608493078, 0.0438570432037001, 0.08149592784585113, 0.04517946496637104, 0.048631300013749934, 0.06670750708984652, 0.0631157771324351, 0.05289564065036578, 0.028382320058960353, 0.05840313769277605, 0.02572402702114629, 0.06703747677120997, 0.030396854103421633, 0.0476339647170112, 0.03790269737735397, 0.0422327995405849, 0.04256172045401929, 1.6792037636159662E-5, 0.04997716711357203, 0.025408003336028218, 0.044906250793428804, 0.07426036379292633, 0.05668236593958398, 0.0642252798045444, 0.04266598885088657, 0.060850696156953155, 0.06163784316011356, 