## This notebook measures Tribuo HDBSCAN training performance on the credit card data

In [1]:
%jars ../../../jars/junit-jupiter-api-5.7.0.jar
%jars ../../../jars/opentest4j-1.2.0.jar
%jars ../../../jars/junit-platform-commons-1.7.1.jar
%jars ../../../jars/tribuo-clustering-hdbscan-4.2.0-SNAPSHOT-jar-with-dependencies.jar

In [2]:
import java.nio.file.Paths;
import java.util.*;
import java.util.stream.Collectors;
import static org.junit.jupiter.api.Assertions.*;

In [3]:
import org.tribuo.Dataset;
import org.tribuo.Feature;
import org.tribuo.MutableDataset;
import org.tribuo.Prediction;
import org.tribuo.clustering.ClusterID;
import org.tribuo.clustering.ClusteringFactory;
import org.tribuo.clustering.hdbscan.HdbscanModel;
import org.tribuo.clustering.hdbscan.HdbscanTrainer;
import org.tribuo.data.columnar.FieldProcessor;
import org.tribuo.data.columnar.ResponseProcessor;
import org.tribuo.data.columnar.RowProcessor;
import org.tribuo.data.columnar.processors.field.DoubleFieldProcessor;
import org.tribuo.data.columnar.processors.response.EmptyResponseProcessor;
import org.tribuo.data.csv.CSVDataSource;
import org.tribuo.data.csv.CSVLoader;
import org.tribuo.util.Util;

In [4]:
// Response side: an EmptyResponseProcessor built from the clustering output factory.
ClusteringFactory clusteringFactory = new ClusteringFactory();
ResponseProcessor<ClusterID> emptyResponseProcessor = new EmptyResponseProcessor<>(clusteringFactory);

// Feature side: one DoubleFieldProcessor per CSV column, keyed by the column name.
// NOTE(review): despite its name, this map holds plain column names (not regexes) and is
// passed as the second argument of the RowProcessor constructor below.
Map<String, FieldProcessor> regexMappingProcessors = new HashMap<>();
for (String column : List.of(
        "BALANCE",
        "BALANCE_FREQUENCY",
        "PURCHASES",
        "ONEOFF_PURCHASES",
        "INSTALLMENTS_PURCHASES",
        "CASH_ADVANCE",
        "PURCHASES_FREQUENCY",
        "ONEOFF_PURCHASES_FREQUENCY",
        "PURCHASES_INSTALLMENTS_FREQUENCY",
        "CASH_ADVANCE_FREQUENCY",
        "CASH_ADVANCE_TRX",
        "PURCHASES_TRX",
        "CREDIT_LIMIT",
        "PAYMENTS",
        "MINIMUM_PAYMENTS",
        "PRC_FULL_PAYMENT",
        "TENURE")) {
    regexMappingProcessors.put(column, new DoubleFieldProcessor(column));
}

// Wire the processors together, point the CSV source at the cleaned data file,
// and load everything into a dataset.
// The trailing `false` is presumably CSVDataSource's outputRequired flag - confirm against the Tribuo docs.
RowProcessor<ClusterID> rowProcessor = new RowProcessor<>(emptyResponseProcessor, regexMappingProcessors);
CSVDataSource<ClusterID> csvDataSource = new CSVDataSource<>(Paths.get("../../../data/cleanedCC.csv"), rowProcessor, false);
Dataset<ClusterID> dataset = new MutableDataset<>(csvDataSource);

In [5]:
System.out.println(String.format("Data size = %d, number of features = %d",dataset.size(),dataset.getFeatureMap().size()));


Data size = 8949, number of features = 17


In [6]:
// Build the HDBSCAN trainer. Per the Tribuo HdbscanTrainer constructor the arguments are
// (minClusterSize, distanceType, k, numThreads) - confirm against the jar version in use.
HdbscanTrainer trainer = new HdbscanTrainer(5, HdbscanTrainer.Distance.EUCLIDEAN, 5, 4);

// Wall-clock timing around the training call only; data loading happened in an earlier cell.
long startTime = System.currentTimeMillis();
HdbscanModel model = trainer.train(dataset);
long endTime = System.currentTimeMillis();

String elapsed = Util.formatDuration(startTime, endTime);
System.out.println("Training took " + elapsed);

// run 1
// time 4.01s

// run 2
// time 

// run 3
// time 

Training took (00:00:04:085)


In [7]:
System.out.println(model.getClusterLabels().size());

8949
