## This demonstrates Tribuo regression for comparison with scikit-learn regression

In [1]:
%jars ../../jars/tribuo-regression-liblinear-4.2.1-jar-with-dependencies.jar
%jars ../../jars/tribuo-regression-sgd-4.2.1-jar-with-dependencies.jar
%jars ../../jars/tribuo-regression-xgboost-4.2.1-jar-with-dependencies.jar
%jars ../../jars/tribuo-regression-tree-4.2.1-jar-with-dependencies.jar
%jars ../../jars/tribuo-regression-libsvm-4.2.1-jar-with-dependencies.jar
%jars ../../jars/tribuo-regression-xgboost-4.2.1-jar-with-dependencies.jar

In [2]:
import java.nio.file.Paths;
import java.nio.file.Files;
import java.util.logging.Level;
import java.util.logging.Logger;

In [3]:
import org.tribuo.*;
import org.tribuo.data.csv.CSVLoader;
import org.tribuo.datasource.ListDataSource;
import org.tribuo.evaluation.TrainTestSplitter;
import org.tribuo.math.optimisers.*;
import org.tribuo.regression.*;
import org.tribuo.regression.evaluation.*;
import org.tribuo.regression.liblinear.LibLinearRegressionTrainer;
import org.tribuo.regression.sgd.RegressionObjective;
import org.tribuo.regression.liblinear.LinearRegressionType;
import org.tribuo.regression.liblinear.LinearRegressionType.LinearType;
import org.tribuo.regression.sgd.linear.LinearSGDTrainer;
import org.tribuo.regression.sgd.objectives.SquaredLoss;
import org.tribuo.regression.rtree.CARTRegressionTrainer;
import org.tribuo.regression.libsvm.LibSVMRegressionTrainer;
import org.tribuo.regression.libsvm.SVMRegressionType.SVMMode;
import org.tribuo.regression.xgboost.XGBoostRegressionTrainer;
import org.tribuo.util.Util;

In [4]:
// Output factory for regression targets, and a CSV loader parameterised with that factory.
RegressionFactory regressionFactory = new RegressionFactory();
CSVLoader<Regressor> csvLoader = new CSVLoader<>(regressionFactory);

In [5]:
// This dataset is prepared in the notebook: scikit-learn Regressor - Data Cleanup
// (unzip cleanedCars.zip before running this cell).
// NOTE: an earlier version of this comment warned that loading was very slow and expected a
// fix in Tribuo 4.2. This notebook loads the 4.2.1 jars, and in the recorded output of this
// cell the loadDataSource call takes well under a second while building the splitter takes
// roughly 28 seconds.
var startTime = System.currentTimeMillis();
var carsSource = csvLoader.loadDataSource(Paths.get("../../data/cleanedCars.csv"), "price_usd");
var endTime = System.currentTimeMillis();
System.out.println("Loading took " + Util.formatDuration(startTime,endTime));

// Reassign the timing variables rather than redeclaring them with `var`: a second declaration
// of the same name only works because the notebook kernel allows top-level redefinition.
startTime = System.currentTimeMillis();
// Train proportion 0.8, RNG seed 0.
var splitter = new TrainTestSplitter<>(carsSource, 0.8f, 0L);
endTime = System.currentTimeMillis();
System.out.println("Splitting took " + Util.formatDuration(startTime,endTime));

Dataset<Regressor> trainData = new MutableDataset<>(splitter.getTrain());
Dataset<Regressor> evalData = new MutableDataset<>(splitter.getTest());

// Report the size and feature count of each split.
System.out.printf("Training data size = %d, number of features = %d%n",trainData.size(),trainData.getFeatureMap().size());
System.out.printf("Testing data size = %d, number of features = %d%n",evalData.size(),evalData.getFeatureMap().size());


Loading took (00:00:00:611)
Splitting took (00:00:28:025)
Training data size = 30816, number of features = 1186
Testing data size = 7705, number of features = 1186


In [6]:
/**
 * Trains a regression model and prints how long the training call took.
 *
 * @param name      label used in the "Training ... took ..." message
 * @param trainer   trainer used to fit the model
 * @param trainData dataset handed to the trainer
 * @return the trained model
 */
public Model<Regressor> train(String name, Trainer<Regressor> trainer, Dataset<Regressor> trainData) {
    // Only the trainer.train call is timed.
    var startTime = System.currentTimeMillis();
    Model<Regressor> model = trainer.train(trainData);
    var endTime = System.currentTimeMillis();
    System.out.println("Training " + name + " took " + Util.formatDuration(startTime,endTime));
    // Training-set scores are intentionally not reported, so the model is not evaluated on
    // trainData here. Previously a full RegressionEvaluator pass over the training set was
    // run on every call and its result discarded. To check that a model learned something,
    // run a RegressionEvaluator over trainData separately.
    return model;
}


In [7]:
/**
 * Evaluates a model on the given dataset and prints RMSE, MAE and R^2
 * for the "DIM-0" output dimension.
 *
 * @param model    the trained model to score
 * @param testData the dataset to score it on
 */
public void evaluate(Model<Regressor> model, Dataset<Regressor> testData) {
    // This Regressor only serves as a key for pulling out the per-dimension statistics;
    // evaluation.toString() would give the full report as a String instead.
    Regressor dim0 = new Regressor("DIM-0",Double.NaN);
    RegressionEvaluation results = new RegressionEvaluator().evaluate(model,testData);
    double rmse = results.rmse(dim0);
    double mae = results.mae(dim0);
    double r2 = results.r2(dim0);
    System.out.printf("Evaluation (test):%n  RMSE: %f%n  MAE:  %f%n  R^2:  %f%n", rmse, mae, r2);
}


In [8]:
// Linear regression fitted with stochastic gradient descent.
// NOTE(review): in the runs recorded further down this notebook, the model trained from this
// configuration scores NaN for RMSE, MAE and R^2. Presumably training diverges with this
// learning rate on the unscaled price target - confirm before relying on this trainer.
LinearSGDTrainer lrsgd = new LinearSGDTrainer(
    new SquaredLoss(),            // loss function
    SGD.getLinearDecaySGD(0.01),  // gradient descent algorithm
    50,                           // number of training epochs
    trainData.size()/4,           // logging interval
    1,                            // minibatch size
    1L                            // RNG seed
);

// Linear regression via LibLinear. The no-argument constructor,
// new LibLinearRegressionTrainer(), is the alternative that was tried and left disabled.
LibLinearRegressionTrainer lr = new LibLinearRegressionTrainer(
    new LinearRegressionType(LinearType.L2R_L2LOSS_SVR),
    1.0,    // cost penalty
    1000,   // max iterations
    0.1,    // termination criteria
    0.1     // epsilon
);

// Regression tree; the trainer's printed configuration reports the argument as maxDepth.
CARTRegressionTrainer cart = new CARTRegressionTrainer(10);

// Gradient-boosted trees; the trainer's printed configuration reports the argument as numTrees.
XGBoostRegressionTrainer xgb = new XGBoostRegressionTrainer(75);


In [9]:
// Print each trainer's configuration; println(Object) renders it through toString().
System.out.println(lrsgd);
System.out.println(lr);
System.out.println(cart);
System.out.println(xgb);

LinearSGDTrainer(objective=SquaredLoss,optimiser=SGD(type=LinearDecay,initialLearningRate=0.01),epochs=50,minibatchSize=1,seed=1)
LibLinearTrainer(solver=L2R_L2LOSS_SVR,cost=1.0,terminationCriterion=0.1,maxIterations=1000,regression-epsilon=0.1)
CARTRegressionTrainer(maxDepth=10,minChildWeight=5.0,minImpurityDecrease=0.0,fractionFeaturesInSplit=1.0,useRandomSplitPoints=false,impurity=MeanSquaredError,seed=12345)
XGBoostTrainer(numTrees=75,parameters{colsample_bytree=1.0, tree_method=auto, seed=12345, max_depth=6, booster=gbtree, objective=reg:squarederror, lambda=1.0, eta=0.3, nthread=4, alpha=0.0, subsample=1.0, gamma=0.0, min_child_weight=1.0, verbosity=0})


In [10]:
Model<Regressor> lrsgdModel = train("Linear Regression (SGD)", lrsgd, trainData);

// Recorded timings:
//   run 1: 10.59 s
//   run 2: 9.96 s
//   run 3: 9.11 s

Training Linear Regression (SGD) took (00:00:09:399)


In [11]:
evaluate(lrsgdModel,evalData);

// Recorded results for the SGD linear regression model on the held-out data.
// NOTE(review): every metric is NaN in all three runs, so these numbers cannot be used
// for comparison. Presumably the SGD training diverged - confirm and fix the trainer
// configuration before drawing conclusions from this model.

// run 1
//  RMSE: NaN
//  MAE:  NaN
//  R^2:  NaN

// run 2
//  RMSE: NaN
//  MAE:  NaN
//  R^2:  NaN

// run 3
//  RMSE: NaN
//  MAE:  NaN
//  R^2:  NaN

Evaluation (test):
  RMSE: NaN
  MAE:  NaN
  R^2:  NaN


In [12]:
Model<Regressor> lrModel = train("Linear Regression",lr,trainData);

// Recorded timings:
//   run 1: 6.60 s
//   run 2: 6.96 s
//   run 3: 6.92 s

Training Linear Regression took (00:00:06:845)


In [13]:
evaluate(lrModel,evalData);

// Recorded results for the LibLinear regression model on the held-out data
// (the three runs report the same values).

// run 1
//  RMSE: 4125.63
//  MAE:  2624.56
//  R^2:  0.59

// run 2
//  RMSE: 4125.63
//  MAE:  2624.56
//  R^2:  0.59

// run 3
//  RMSE: 4125.63
//  MAE:  2624.56
//  R^2:  0.59

Evaluation (test):
  RMSE: 4125.628552
  MAE:  2624.556902
  R^2:  0.594143


In [14]:
Model<Regressor> cartModel = train("CART",cart,trainData);

// Recorded timings:
//   run 1: 7.41 s
//   run 2: 7.59 s
//   run 3: 8.07 s

Training CART took (00:00:08:102)


In [15]:
evaluate(cartModel,evalData);

// Recorded results for the CART model on the held-out data
// (the three runs report the same values).

// run 1
//  RMSE: 2453.70
//  MAE:  1469.94
//  R^2:  0.86

// run 2
//  RMSE: 2453.70
//  MAE:  1469.94
//  R^2:  0.86

// run 3
//  RMSE: 2453.70
//  MAE:  1469.94
//  R^2:  0.86

Evaluation (test):
  RMSE: 2453.700424
  MAE:  1469.942039
  R^2:  0.856439


In [16]:
Model<Regressor> xgbModel = train("XGBoost", xgb, trainData);

// Recorded timings:
//   run 1: 2min 31s
//   run 2: 2min 29s
//   run 3: 2min 26s

Training XGBoost took (00:02:17:326)


In [17]:
evaluate(xgbModel, evalData);

// Recorded results for the XGBoost model on the held-out data
// (the three runs report the same values).

// run 1
//  RMSE: 1883.17
//  MAE:  1164.05
//  R^2:  0.92

// run 2
//  RMSE: 1883.17
//  MAE:  1164.05
//  R^2:  0.92

// run 3
//  RMSE: 1883.17
//  MAE:  1164.05
//  R^2:  0.92

Evaluation (test):
  RMSE: 1883.167266
  MAE:  1164.050417
  R^2:  0.915439


In [18]:
// Setup parameters for SVR

import com.oracle.labs.mlrg.olcut.config.Option;
import com.oracle.labs.mlrg.olcut.config.Options;
import org.tribuo.common.libsvm.KernelType;
import org.tribuo.common.libsvm.SVMParameters;
import org.tribuo.regression.libsvm.SVMRegressionType;

/**
 * OLCUT {@link Options} holder for the LibSVM regression settings used in this notebook.
 * Each public field carries an {@link Option} annotation and a default value.
 */
public class LibSVMOptions implements Options {

    /** Kernel intercept; defaults to 1.0. */
    @Option(longName = "coefficient", usage = "Intercept in kernel function.")
    public double coeff = 1.0;

    /** Polynomial kernel degree; defaults to 3. */
    @Option(charName = 'd', longName = "degree", usage = "Degree in polynomial kernel.")
    public int degree = 3;

    /** Kernel gamma; defaults to 0.0. */
    @Option(charName = 'g', longName = "gamma", usage = "Gamma value in kernel function.")
    public double gamma = 0.0;

    /** Kernel type; defaults to RBF. */
    @Option(charName = 'k', longName = "kernel", usage = "Type of SVM kernel.")
    public KernelType kernelType = KernelType.RBF;

    /** SVM regression mode; defaults to epsilon-SVR. */
    @Option(charName = 't', longName = "type", usage = "Type of SVM.")
    public SVMRegressionType.SVMMode svmType = SVMRegressionType.SVMMode.EPSILON_SVR;

    /** Whether the regression outputs are standardized inside the SVM; defaults to false. */
    @Option(longName = "standardize", usage = "Standardize the regression outputs internally to the SVM")
    public boolean standardize = false;

    /** Returns the description text for this options group. */
    @Override
    public String getOptionsDescription() {
        return "Trains and tests a LibSVM regression model on the specified datasets.";
    }
}

In [19]:
// Set up the SVR trainer.
// Kernel settings are read from the LibSVMOptions instance instead of being repeated as
// literals, so the options object is the single place where they are defined. With the
// defaults declared in LibSVMOptions (gamma 0.0, coeff 1.0, degree 3, standardize false)
// this produces the same configuration as the previous hard-coded values.
// The variable name svnOptions is kept as-is (presumably "svmOptions" was intended) in case
// other cells refer to it.
var svnOptions = new LibSVMOptions();
var parameters = new SVMParameters<>(new SVMRegressionType(svnOptions.svmType),
                                     svnOptions.kernelType);
parameters.setGamma(svnOptions.gamma);
parameters.setCoeff(svnOptions.coeff);
parameters.setDegree(svnOptions.degree);
var svr = new LibSVMRegressionTrainer(parameters, svnOptions.standardize);