## This demonstrates Tribuo clustering for comparison with scikit-learn clustering

In [1]:
%jars ../../jars/tribuo-clustering-kmeans-4.1.0-jar-with-dependencies.jar
%jars ../../jars/tribuo-json-4.1.0-jar-with-dependencies.jar

In [2]:
import java.nio.file.Paths;
import java.nio.file.Files;
import java.util.logging.Level;
import java.util.logging.Logger;

In [3]:
import org.tribuo.*;
import org.tribuo.evaluation.TrainTestSplitter;
import org.tribuo.data.csv.CSVLoader;
import org.tribuo.util.Util;
import org.tribuo.clustering.*;
import org.tribuo.clustering.evaluation.*;
import org.tribuo.clustering.kmeans.*;
import org.tribuo.clustering.kmeans.KMeansTrainer.Distance;
import org.tribuo.clustering.kmeans.KMeansTrainer.Initialisation;

In [4]:
// Output factory for clustering labels, and a CSV loader parameterised by it.
ClusteringFactory clusteringFactory = new ClusteringFactory();
CSVLoader<ClusterID> csvLoader = new CSVLoader<>(clusteringFactory);

In [5]:
// Load the blobs dataset, using the "Cluster" column as the response, and time the load.
// This dataset is generated in the notebook: scikit-learn Clustering - Data Setup
long startTime = System.currentTimeMillis();
var gaussianSource = csvLoader.loadDataSource(Paths.get("../../data/gaussianBlobs.csv"), "Cluster");
long endTime = System.currentTimeMillis();
System.out.println("Loading took " + Util.formatDuration(startTime,endTime));

// Time the train/test split. The timing variables are reassigned rather than
// redeclared, so the cell does not depend on JShell's snippet-by-snippet
// redefinition of top-level variables.
// 0.8f is the training proportion; 0L is presumably the RNG seed (confirm
// against the TrainTestSplitter constructor documentation).
startTime = System.currentTimeMillis();
var splitter = new TrainTestSplitter<>(gaussianSource, 0.8f, 0L);
endTime = System.currentTimeMillis();
System.out.println("Splitting took " + Util.formatDuration(startTime,endTime));

// Materialise both halves of the split as datasets.
Dataset<ClusterID> trainData = new MutableDataset<>(splitter.getTrain());
Dataset<ClusterID> evalData = new MutableDataset<>(splitter.getTest());

// Report the size and feature count of each dataset.
System.out.printf("Training data size = %d, number of features = %d%n",trainData.size(),trainData.getFeatureMap().size());
System.out.printf("Testing data size = %d, number of features = %d%n",evalData.size(),evalData.getFeatureMap().size());


Loading took (00:01:04:321)
Splitting took (00:00:06:125)
Training data size = 4800000, number of features = 5
Testing data size = 1200000, number of features = 5


In [6]:
/**
 * Trains a clustering model and prints how long the training took.
 *
 * The trainer and the returned model are parameterised with ClusterID so that
 * callers keep full generic type information instead of working with raw types.
 *
 * @param name      label inserted into the timing message
 * @param trainer   the clustering trainer to run
 * @param trainData the dataset to train on
 * @return the trained model
 */
public Model<ClusterID> train(String name, Trainer<ClusterID> trainer, Dataset<ClusterID> trainData) {
    // Time only the call to train().
    long startTime = System.currentTimeMillis();
    Model<ClusterID> model = trainer.train(trainData);
    long endTime = System.currentTimeMillis();
    System.out.println("Training " + name + " took " + Util.formatDuration(startTime,endTime));
    // Scores on the training data are deliberately not reported for now.
    return model;
}

In [7]:
/**
 * Evaluates a clustering model on the supplied dataset and prints the result.
 *
 * The model is parameterised with ClusterID so the evaluator call is type-checked
 * rather than going through a raw type.
 *
 * @param model    the trained clustering model
 * @param testData the dataset to evaluate the model on
 */
public void evaluate(Model<ClusterID> model, Dataset<ClusterID> testData) {
    ClusteringEvaluator evaluator = new ClusteringEvaluator();
    ClusteringEvaluation evaluation = evaluator.evaluate(model,testData);
    System.out.println(evaluation.toString());
}

In [8]:
// Plain k-means: 6 centroids, Euclidean distance, 4 threads, seed 1.
// The second argument (100) is presumably the iteration limit; confirm
// against the KMeansTrainer constructor documentation.
KMeansTrainer kmTrainer = new KMeansTrainer(6,100,Distance.EUCLIDEAN,4,1);
// Same settings, but with PLUSPLUS initialisation.
KMeansTrainer kmPlusPlusTrainer = new KMeansTrainer(6,100,Distance.EUCLIDEAN,Initialisation.PLUSPLUS,4,1);

In [9]:
// Print each trainer's configuration summary.
System.out.println(kmTrainer);
System.out.println(kmPlusPlusTrainer);

KMeansTrainer(centroids=6,distanceType=EUCLIDEAN,seed=1,numThreads=4)
KMeansTrainer(centroids=6,distanceType=EUCLIDEAN,seed=1,numThreads=4)


In [10]:
// Train plain k-means on the training split; train() prints the elapsed time.
var kmModel = train("KMeans", kmTrainer, trainData);
// Recorded training times:
// run 1
// time 2min 43s

// run 2
// time (not recorded)

// run 3
// time (not recorded)

Training KMeans took (00:02:42:972)


In [11]:
// Evaluate the plain k-means model on the held-out split; evaluate() prints the metrics.
evaluate(kmModel, evalData);
// Recorded results:
// run 1
// (not recorded)

// run 2
// (not recorded)

// run 3
// (not recorded)

Clustering Evaluation
Normalized MI = 0.8790240537433484
Adjusted MI = 0.8790234761754055


In [12]:
// Train k-means with PLUSPLUS initialisation on the training split; train() prints the elapsed time.
// NOTE(review): the label "KMeans" is the same one used for the plain k-means run, so the
// two timing messages cannot be told apart in the output; consider a distinct label such as "KMeans++".
var kmPlusPlusModel = train("KMeans", kmPlusPlusTrainer, trainData);
// Recorded training times:
// run 1
// time 11.93s

// run 2
// time (not recorded)

// run 3
// time (not recorded)

Training KMeans took (00:00:11:932)


In [13]:
// Evaluate the PLUSPLUS-initialised model on the held-out split; evaluate() prints the metrics.
evaluate(kmPlusPlusModel, evalData);
// Recorded results:
// run 1
// (not recorded)

// run 2
// (not recorded)

// run 3
// (not recorded)

Clustering Evaluation
Normalized MI = 0.9998518616970554
Adjusted MI = 0.9998518611000934
