## This notebook demonstrates Tribuo regression for comparison with scikit-learn regression, although the resulting regression models were not used in the final comparisons

In [1]:
%jars ../../jars/tribuo-json-4.3.0-SNAPSHOT-jar-with-dependencies.jar
%jars ../../jars/tribuo-regression-liblinear-4.3.0-SNAPSHOT-jar-with-dependencies.jar
%jars ../../jars/tribuo-regression-sgd-4.3.0-SNAPSHOT-jar-with-dependencies.jar
%jars ../../jars/tribuo-regression-xgboost-4.3.0-SNAPSHOT-jar-with-dependencies.jar
%jars ../../jars/tribuo-regression-tree-4.3.0-SNAPSHOT-jar-with-dependencies.jar

In [2]:
import java.nio.file.Paths;
import java.nio.file.Files;
import java.util.logging.Level;
import java.util.logging.Logger;

In [3]:
import org.tribuo.*;
import org.tribuo.data.csv.CSVLoader;
import org.tribuo.datasource.ListDataSource;
import org.tribuo.evaluation.TrainTestSplitter;
import org.tribuo.math.optimisers.*;
import org.tribuo.regression.*;
import org.tribuo.regression.evaluation.*;
import org.tribuo.regression.liblinear.LibLinearRegressionTrainer;
import org.tribuo.regression.sgd.RegressionObjective;
import org.tribuo.regression.sgd.linear.LinearSGDTrainer;
import org.tribuo.regression.sgd.objectives.SquaredLoss;
import org.tribuo.regression.rtree.CARTRegressionTrainer;
import org.tribuo.util.Util;

In [4]:
// Build the regression output factory and hand it to the CSV loader, so the
// data loaded below is typed with Regressor outputs.
var regressionFactory = new RegressionFactory();
var csvLoader = new CSVLoader<>(regressionFactory);

In [5]:
// This dataset is generated in the notebook: scikit-learn Regressor Example - Data Cleanup
// The "temp" column is passed to the loader as the response (regression target) name.
var oceanSource = csvLoader.loadDataSource(Paths.get("../../data/cleanedBottle.csv"), "temp");
// Split with a 0.8 training proportion and a fixed seed (0) so the split is repeatable.
var splitter = new TrainTestSplitter<>(oceanSource, 0.8f, 0L);
Dataset<Regressor> trainData = new MutableDataset<>(splitter.getTrain());
Dataset<Regressor> evalData = new MutableDataset<>(splitter.getTest());
// Report the example count and feature count of each split as a sanity check.
System.out.println(String.format("Training data size = %d, number of features = %d",trainData.size(),trainData.getFeatureMap().size()));
System.out.println(String.format("Testing data size = %d, number of features = %d",evalData.size(),evalData.getFeatureMap().size()));


Training data size = 651397, number of features = 2
Testing data size = 162850, number of features = 2


In [6]:
/**
 * Fits a regression model, prints how long the fit took, and prints the
 * model's RMSE, MAE and R^2 on the same data it was trained on.
 *
 * @param name      label inserted into the timing message
 * @param trainer   the trainer used to fit the model
 * @param trainData the dataset to fit on; also used for the metrics printed here
 * @return the trained model
 */
public Model<Regressor> train(String name, Trainer<Regressor> trainer, Dataset<Regressor> trainData) {
    // Only the call to the trainer is timed.
    final long begin = System.currentTimeMillis();
    final Model<Regressor> fitted = trainer.train(trainData);
    final long finish = System.currentTimeMillis();
    System.out.println("Training " + name + " took " + Util.formatDuration(begin,finish));

    // Scoring on the training data is a debugging aid: it shows whether the
    // model learned anything at all, not how well it generalises.
    // The statistics are looked up per output dimension, so a Regressor named
    // "DIM-0" is built as the lookup key (its NaN value appears to be a placeholder).
    // Calling toString() on the evaluation gives the full report instead.
    final var target = new Regressor("DIM-0",Double.NaN);
    final var trainEval = new RegressionEvaluator().evaluate(fitted,trainData);
    final double rmse = trainEval.rmse(target);
    final double mae = trainEval.mae(target);
    final double r2 = trainEval.r2(target);
    System.out.printf("Evaluation (train):%n  RMSE %f%n  MAE %f%n  R^2 %f%n", rmse, mae, r2);
    return fitted;
}


In [7]:
/**
 * Scores a trained regression model on the supplied dataset and prints its
 * RMSE, MAE and R^2 under the "Evaluation (test)" heading.
 *
 * @param model    the trained model to score
 * @param testData the dataset to score the model on
 */
public void evaluate(Model<Regressor> model, Dataset<Regressor> testData) {
    // The statistics are looked up per output dimension, so a Regressor named
    // "DIM-0" is built as the lookup key (its NaN value appears to be a placeholder).
    // Calling toString() on the evaluation gives the full report instead.
    final var target = new Regressor("DIM-0",Double.NaN);
    final var testEval = new RegressionEvaluator().evaluate(model,testData);
    final double rmse = testEval.rmse(target);
    final double mae = testEval.mae(target);
    final double r2 = testEval.r2(target);
    System.out.printf("Evaluation (test):%n  RMSE %f%n  MAE %f%n  R^2 %f%n", rmse, mae, r2);
}


In [8]:
// Linear regression trained by SGD with a linearly decaying learning rate.
LinearSGDTrainer lrsgd = new LinearSGDTrainer(
    new SquaredLoss(),           // loss function
    SGD.getLinearDecaySGD(0.01), // gradient descent algorithm
    5,                           // number of training epochs
    trainData.size() / 4,        // logging interval
    1,                           // minibatch size
    1L                           // RNG seed
);

// Same loss, epochs, logging interval, minibatch size and seed as above;
// only the optimiser differs (AdaGrad instead of linear-decay SGD).
LinearSGDTrainer lrada = new LinearSGDTrainer(
    new SquaredLoss(),           // loss function
    new AdaGrad(0.01),           // gradient descent algorithm
    5,                           // number of training epochs
    trainData.size() / 4,        // logging interval
    1,                           // minibatch size
    1L                           // RNG seed
);

// LibLinear-backed linear regression using the trainer's default settings.
LibLinearRegressionTrainer lr = new LibLinearRegressionTrainer();

// CART regression tree; the constructor argument is the maximum tree depth.
CARTRegressionTrainer cart = new CARTRegressionTrainer(6);

In [9]:
// Print each trainer's configuration; println invokes toString() on the argument.
System.out.println(lr);
System.out.println(lrsgd);
System.out.println(lrada);
System.out.println(cart);

LibLinearTrainer(solver=L2R_L2LOSS_SVR,cost=1.0,terminationCriterion=0.1,maxIterations=1000,regression-epsilon=0.1,seed=12345)
LinearSGDTrainer(objective=SquaredLoss,optimiser=SGD(type=LinearDecay,initialLearningRate=0.01),epochs=5,minibatchSize=1,seed=1)
LinearSGDTrainer(objective=SquaredLoss,optimiser=AdaGrad(initialLearningRate=0.01,epsilon=1.0E-6,initialValue=0.0),epochs=5,minibatchSize=1,seed=1)
CARTRegressionTrainer(maxDepth=6,minChildWeight=5.0,minImpurityDecrease=0.0,fractionFeaturesInSplit=1.0,useRandomSplitPoints=false,impurity=MeanSquaredError,seed=12345)


In [10]:
// Fit the LibLinear trainer; train() also prints the fit time and training-set metrics.
var lrModel = train("Linear Regression",lr,trainData);

Training Linear Regression took (00:00:00:610)
Evaluation (train):
  RMSE 3.068896
  MAE 2.263334
  R^2 0.472028


In [11]:
// Score the LibLinear model on the held-out evaluation split.
evaluate(lrModel,evalData);

Evaluation (test):
  RMSE 3.073997
  MAE 2.265460
  R^2 0.472003


In [12]:
// Fit the linear-decay SGD trainer. The recorded output shows a negative R^2 for
// this configuration, i.e. a much poorer fit than the LibLinear and AdaGrad runs.
var lrsgdModel = train("Linear Regression (SGD)",lrsgd,trainData);

Training Linear Regression (SGD) took (00:00:02:541)
Evaluation (train):
  RMSE 10.157049
  MAE 9.372090
  R^2 -4.783377


In [13]:
// Score the linear-decay SGD model on the held-out evaluation split.
evaluate(lrsgdModel,evalData);

Evaluation (test):
  RMSE 10.155645
  MAE 9.367787
  R^2 -4.762875


In [14]:
// Fit the AdaGrad-optimised linear model, then score it on the held-out evaluation
// split; both sets of metrics are printed by this one cell.
var lradaModel = train("Linear Regression (AdaGrad)",lrada,trainData);
evaluate(lradaModel,evalData);

Training Linear Regression (AdaGrad) took (00:00:02:890)
Evaluation (train):
  RMSE 3.052163
  MAE 2.284762
  R^2 0.477770
Evaluation (test):
  RMSE 3.056912
  MAE 2.286477
  R^2 0.477856


In [15]:
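// NOTE(review): CART training and evaluation are commented out, so this notebook
// records no CART results; the reason is not stated here. Uncomment both lines to run it.
// var cartModel = train("CART",cart,trainData);
// evaluate(cartModel,evalData);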