Skip to content

Commit

Permalink
Merge pull request #708 from lucagiovagnoli/luca-xgboost-1.0.0
Browse files Browse the repository at this point in the history
Upgrade to xgboost 1.0.0 - Use h2oai Predictor
  • Loading branch information
ancasarb committed Oct 9, 2020
2 parents ad4ad3f + 55f550c commit fbd3c75
Show file tree
Hide file tree
Showing 14 changed files with 6,604 additions and 1,669 deletions.
16 changes: 8 additions & 8 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,14 @@
sudo: required
dist: trusty

addons:
apt:
sources:
- ubuntu-toolchain-r-test
packages:
- gcc-4.8
- g++-4.8

services:
- docker

Expand All @@ -22,14 +30,6 @@ jobs:
script:
- make test_benchmark

- name: "MLeap xgboost runtime tests"
script:
- make test_xgboost_runtime

- name: "MLeap xgboost spark tests"
script:
- make test_xgboost_spark

- name: "All other sbt tests"
script:
- make test_root_sbt_project
Expand Down
6 changes: 0 additions & 6 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,6 @@ test_executor:
test_benchmark:
sbt "+ mleap-benchmark/test"

test_xgboost_runtime:
sbt "mleap-xgboost-runtime/test"

test_xgboost_spark:
sbt "mleap-xgboost-spark/test"

test_root_sbt_project:
sbt "+ test"

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -61,11 +61,11 @@ case class XGBoostPredictorMultinomialClassificationModel(
}

def predictProbabilities(data: FVec): Vector = {
Vectors.dense(predictor.predict(data, false, treeLimit))
Vectors.dense(predictor.predict(data, false, treeLimit).map(_.toDouble))
}

def predictRaw(data: FVec): Vector = {
Vectors.dense(predictor.predict(data, true, treeLimit))
Vectors.dense(predictor.predict(data, true, treeLimit).map(_.toDouble))
}

override def rawToProbabilityInPlace(raw: Vector): Vector = {
Expand Down
6,513 changes: 6,513 additions & 0 deletions mleap-xgboost-runtime/src/test/resources/datasources/agaricus.csv

Large diffs are not rendered by default.

1,611 changes: 0 additions & 1,611 deletions mleap-xgboost-runtime/src/test/resources/datasources/agaricus.test

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ package ml.combust.mleap.xgboost.runtime

import ml.combust.mleap.core.types.{BasicType, NodeShape, ScalarType, StructField, TensorType}
import ml.combust.mleap.runtime.frame.{DefaultLeapFrame, Transformer}
import ml.combust.mleap.tensor.SparseTensor
import ml.combust.mleap.tensor.{SparseTensor, Tensor}
import ml.combust.mleap.xgboost.runtime.testing.{BoosterUtils, BundleSerializationUtils, CachedDatasetUtils, ClassifierUtils, FloatingPointApproximations}
import ml.dmlc.xgboost4j.scala.Booster
import org.apache.spark.ml.linalg.Vectors
Expand Down Expand Up @@ -32,7 +32,7 @@ class XGBoostClassificationModelParitySpec extends FunSpec
val mleapRawPredictionColIndex = mleapResult.schema.indexOf("raw_prediction").get
val mleapProbabilityColIndex = mleapResult.schema.indexOf("probability").get

val singleRowDMatrix = r(featuresColumnIndex).asInstanceOf[SparseTensor[Double]].asXGB
val singleRowDMatrix = r(featuresColumnIndex).asInstanceOf[Tensor[Double]].asXGB

val boosterResult = booster.predict(singleRowDMatrix, false, 0).head(0)

Expand Down Expand Up @@ -104,7 +104,7 @@ class XGBoostClassificationModelParitySpec extends FunSpec
val booster = trainBooster(binomialDataset)
val xgboostTransformer = trainXGBoost4jClassifier

equalityTestRowByRow(booster, xgboostTransformer, leapFrameLibSVMtrain)
equalityTestRowByRow(booster, xgboostTransformer, leapFrameBinomial)
}

it("has the correct inputs and outputs with columns: prediction, probability and raw_prediction") {
Expand All @@ -124,8 +124,8 @@ class XGBoostClassificationModelParitySpec extends FunSpec
val mleapBundle = serializeModelToMleapBundle(xgboostTransformer)
val deserializedTransformer: Transformer = loadMleapTransformerFromBundle(mleapBundle)

val preSerializationResult = xgboostTransformer.transform(leapFrameLibSVMtrain).get
val deserializedModelResult = deserializedTransformer.transform(leapFrameLibSVMtrain).get
val preSerializationResult = xgboostTransformer.transform(leapFrameBinomial).get
val deserializedModelResult = deserializedTransformer.transform(leapFrameBinomial).get

assert(preSerializationResult.dataset == deserializedModelResult.dataset)
}
Expand All @@ -134,7 +134,7 @@ class XGBoostClassificationModelParitySpec extends FunSpec
val multiBooster = trainMultinomialBooster(multinomialDataset)
val xgboostTransformer = trainMultinomialXGBoost4jClassifier

equalityTestRowByRowMultinomial(multiBooster, xgboostTransformer, leapFrameIrisTrain)
equalityTestRowByRowMultinomial(multiBooster, xgboostTransformer, leapFrameMultinomial)
}

it("XGBoostMultinomialClassificationModel results are the same pre and post serialization") {
Expand All @@ -143,8 +143,8 @@ class XGBoostClassificationModelParitySpec extends FunSpec
val mleapBundle = serializeModelToMleapBundle(xgboostTransformer)
val deserializedTransformer: Transformer = loadMleapTransformerFromBundle(mleapBundle)

val preSerializationResult = xgboostTransformer.transform(leapFrameIrisTrain).get
val deserializedResult = deserializedTransformer.transform(leapFrameIrisTrain).get
val preSerializationResult = xgboostTransformer.transform(leapFrameMultinomial).get
val deserializedResult = deserializedTransformer.transform(leapFrameMultinomial).get

assert(preSerializationResult.dataset == deserializedResult.dataset)
}
Expand All @@ -155,7 +155,7 @@ class XGBoostClassificationModelParitySpec extends FunSpec
val mleapBundle = serializeModelToMleapBundle(xgboostTransformer)
val deserializedTransformer: Transformer = loadMleapTransformerFromBundle(mleapBundle)

val denseLeapFrame = toDenseFeaturesLeapFrame(leapFrameLibSVMtrain)
val denseLeapFrame = toDenseFeaturesLeapFrame(leapFrameBinomial)

val preSerializationResult = xgboostTransformer.transform(denseLeapFrame).get
val deserializedResult = deserializedTransformer.transform(denseLeapFrame).get
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -79,8 +79,8 @@ class XGBoostPredictorClassificationModelParitySpec extends FunSpec
val mleapBundle = serializeModelToMleapBundle(xgboost4jTransformer)
val deserializedPredictor: Transformer = loadXGBoostPredictorFromBundle(mleapBundle)

val preSerializationXGBoost4jResult = xgboost4jTransformer.transform(leapFrameLibSVMtrain).get
val predictorModelResult = deserializedPredictor.transform(leapFrameLibSVMtrain).get
val preSerializationXGBoost4jResult = xgboost4jTransformer.transform(leapFrameBinomial).get
val predictorModelResult = deserializedPredictor.transform(leapFrameBinomial).get

probabilityColumnEqualityTest(preSerializationXGBoost4jResult, predictorModelResult)
}
Expand All @@ -91,10 +91,10 @@ class XGBoostPredictorClassificationModelParitySpec extends FunSpec
val mleapBundle = serializeModelToMleapBundle(xgboost4jTransformer)

val deserializedXGBoost4jTransformer: Transformer = loadMleapTransformerFromBundle(mleapBundle)
val deserializedXGBoost4jResult = deserializedXGBoost4jTransformer.transform(leapFrameLibSVMtrain).get
val deserializedXGBoost4jResult = deserializedXGBoost4jTransformer.transform(leapFrameBinomial).get

val deserializedPredictorTransformer: Transformer = loadXGBoostPredictorFromBundle(mleapBundle)
val deserializedPredictorResult = deserializedPredictorTransformer.transform(leapFrameLibSVMtrain).get
val deserializedPredictorResult = deserializedPredictorTransformer.transform(leapFrameBinomial).get

probabilityColumnEqualityTest(deserializedPredictorResult, deserializedXGBoost4jResult)
}
Expand All @@ -117,7 +117,7 @@ class XGBoostPredictorClassificationModelParitySpec extends FunSpec
val mleapBundle = serializeModelToMleapBundle(trainMultinomialXGBoost4jClassifier)
val deserializedPredictorTransformer: Transformer = loadXGBoostPredictorFromBundle(mleapBundle)

equalityTestRowByRowMultinomialProbability(multiBooster, deserializedPredictorTransformer, leapFrameIrisTrain)
equalityTestRowByRowMultinomialProbability(multiBooster, deserializedPredictorTransformer, leapFrameMultinomial)
}

it("[Multinomial] XGBoostPredictorMultinomialClassificationModel results are the same pre and post serialization") {
Expand All @@ -126,19 +126,19 @@ class XGBoostPredictorClassificationModelParitySpec extends FunSpec
val mleapBundle = serializeModelToMleapBundle(xgboost4jTransformer)
val predictorTransformer: Transformer = loadXGBoostPredictorFromBundle(mleapBundle)

val xgboost4jResult = xgboost4jTransformer.transform(leapFrameIrisTrain).get
val predictorResult = predictorTransformer.transform(leapFrameIrisTrain).get
val xgboost4jResult = xgboost4jTransformer.transform(leapFrameMultinomial).get
val predictorResult = predictorTransformer.transform(leapFrameMultinomial).get

probabilityColumnEqualityTest(xgboost4jResult, predictorResult)
}

it("XGBoost4j and Predictor results are the same when using a dense dataset") {
val xgboost4jTransformer = trainMultinomialXGBoost4jClassifier
val xgboost4jTransformer = trainXGBoost4jClassifier

val mleapBundle = serializeModelToMleapBundle(xgboost4jTransformer)
val predictorTransformer: Transformer = loadXGBoostPredictorFromBundle(mleapBundle)

val denseLeapFrame = toDenseFeaturesLeapFrame(leapFrameLibSVMtrain)
val denseLeapFrame = toDenseFeaturesLeapFrame(leapFrameBinomial)

val xgboost4jResult = xgboost4jTransformer.transform(denseLeapFrame).get
val predictorResult = predictorTransformer.transform(denseLeapFrame).get
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ package ml.combust.mleap.xgboost.runtime

import ml.combust.mleap.core.types._
import ml.combust.mleap.runtime.frame.{DefaultLeapFrame, Transformer}
import ml.combust.mleap.tensor.SparseTensor
import ml.combust.mleap.tensor.Tensor
import ml.combust.mleap.xgboost.runtime.testing.{BoosterUtils, BundleSerializationUtils, CachedDatasetUtils, FloatingPointApproximations}
import ml.dmlc.xgboost4j.scala.Booster
import org.scalatest.FunSpec
Expand All @@ -22,7 +22,7 @@ class XGBoostRegressionModelParitySpec extends FunSpec
XGBoostRegression(
"xgboostSingleThread",
NodeShape.regression(),
XGBoostRegressionModel(booster, numFeatures(leapFrameLibSVMtrain), 0)
XGBoostRegressionModel(booster, numFeatures(leapFrameBinomial), 0)
)
}

Expand All @@ -35,7 +35,7 @@ class XGBoostRegressionModelParitySpec extends FunSpec
val mleapResult = mleapTransformer.transform(DefaultLeapFrame(leapFrameDataset.schema, Seq(r))).get
val mleapPredictionColIndex = mleapResult.schema.indexOf("prediction").get

val singleRowDMatrix = r(featuresColumnIndex).asInstanceOf[SparseTensor[Double]].asXGB
val singleRowDMatrix = r(featuresColumnIndex).asInstanceOf[Tensor[Double]].asXGB
val boosterResult = booster.predict(singleRowDMatrix, false, 0).head(0)

assert (boosterResult == mleapResult.dataset.head.getDouble(mleapPredictionColIndex))
Expand All @@ -50,7 +50,7 @@ class XGBoostRegressionModelParitySpec extends FunSpec
val mleapBundle = serializeModelToMleapBundle(xgboostTransformer)
val deserializedTransformer: Transformer = loadMleapTransformerFromBundle(mleapBundle)

equalityTestRowByRow(booster, deserializedTransformer, leapFrameLibSVMtrain)
equalityTestRowByRow(booster, deserializedTransformer, leapFrameBinomial)
}

it("has the correct inputs and outputs with columns: prediction, probability and raw_prediction") {
Expand All @@ -69,8 +69,8 @@ class XGBoostRegressionModelParitySpec extends FunSpec
val mleapBundle = serializeModelToMleapBundle(xgboostTransformer)
val deserializedTransformer: Transformer = loadMleapTransformerFromBundle(mleapBundle)

val preSerializationResult = xgboostTransformer.transform(leapFrameLibSVMtrain).get
val deserializedModelResult = deserializedTransformer.transform(leapFrameLibSVMtrain).get
val preSerializationResult = xgboostTransformer.transform(leapFrameBinomial).get
val deserializedModelResult = deserializedTransformer.transform(leapFrameBinomial).get

assert(preSerializationResult.dataset == deserializedModelResult.dataset)
}
Expand All @@ -81,7 +81,7 @@ class XGBoostRegressionModelParitySpec extends FunSpec
val mleapBundle = serializeModelToMleapBundle(xgboostTransformer)
val deserializedTransformer: Transformer = loadMleapTransformerFromBundle(mleapBundle)

val denseLeapFrame = toDenseFeaturesLeapFrame(leapFrameLibSVMtrain)
val denseLeapFrame = toDenseFeaturesLeapFrame(leapFrameBinomial)

val preSerializationResult = xgboostTransformer.transform(denseLeapFrame).get
val deserializedResult = deserializedTransformer.transform(denseLeapFrame).get
Expand Down
Original file line number Diff line number Diff line change
@@ -1,30 +1,64 @@
package ml.combust.mleap.xgboost.runtime.testing

import ml.combust.mleap.core.types.TensorType
import ml.combust.mleap.core.types.{BasicType, DataType, ScalarShape, StructField, StructType, TensorType}
import ml.combust.mleap.core.util.VectorConverters
import ml.combust.mleap.runtime.frame.{ArrayRow, DefaultLeapFrame, Row}
import ml.combust.mleap.tensor.DenseTensor
import ml.dmlc.xgboost4j.scala.DMatrix
import org.apache.spark.ml.linalg.SparseVector
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.mleap.TypeConverters


trait CachedDatasetUtils {

private final val TrainDataFilePath = "datasources/agaricus.train"
private final val TrainDataFilePathCSV = "datasources/agaricus.csv"
private final val TrainDataMultinomialFilePath = "datasources/iris.scale.txt"

// indexing_mode is necessary to tell xgboost that features start from 1, not 0 (xgboost default is 0)
val binomialDataset: DMatrix =
new DMatrix(this.getClass.getClassLoader.getResource(TrainDataFilePath).getFile)
new DMatrix(this.getClass.getClassLoader.getResource(TrainDataFilePath).getFile + "?indexing_mode=1")

val multinomialDataset: DMatrix =
new DMatrix(this.getClass.getClassLoader.getResource(TrainDataMultinomialFilePath).getFile)
new DMatrix(this.getClass.getClassLoader.getResource(TrainDataMultinomialFilePath).getFile + "?indexing_mode=1")

lazy val leapFrameLibSVMtrain: DefaultLeapFrame = leapFrameFromLibSVMFile(TrainDataFilePath)
lazy val leapFrameIrisTrain: DefaultLeapFrame = leapFrameFromLibSVMFile(TrainDataMultinomialFilePath)
lazy val leapFrameBinomial: DefaultLeapFrame = leapFrameFromCSVFile(TrainDataFilePathCSV)
lazy val leapFrameMultinomial: DefaultLeapFrame = leapFrameFromLibSVMFile(TrainDataMultinomialFilePath)

def numFeatures(dataset: DefaultLeapFrame): Int =
dataset.schema.getField("features").get.dataType.asInstanceOf[TensorType].dimensions.get.head

private def leapFrameFromCSVFile(filePath: String): DefaultLeapFrame = {
// Use Spark utils to load csv from disk
val spark = SparkSession.builder()
.master("local[2]")
.appName(s"${this.getClass.getName}")
.getOrCreate()

val dataFrame = spark.read.format("csv")
.option("header", "false")
.option("inferSchema", "true")
.load(this.getClass.getClassLoader.getResource(filePath).getFile)

val nFeatures = dataFrame.head().length - 1

val mleapSchema = StructType(
StructField("label", DataType(BasicType.Double, ScalarShape())),
StructField("features", TensorType.Double(nFeatures))).get

val mleapMatrix: Array[ArrayRow] = dataFrame.collect().map {
r => {
val sequence = r.toSeq.toArray.map(_.asInstanceOf[Double])
ArrayRow(
Seq(sequence.head,
DenseTensor[Double](sequence.slice(1, sequence.length), Seq(nFeatures))))
}
}

DefaultLeapFrame(mleapSchema, mleapMatrix)
}

private def leapFrameFromLibSVMFile(filePath: String): DefaultLeapFrame = {

// Use Spark utils to load libsvm from disk
Expand All @@ -37,7 +71,7 @@ trait CachedDatasetUtils {
val dataFrame = spark.read.format("libsvm")
.load(this.getClass.getClassLoader.getResource(filePath).getFile)

val mleapSchema = Option(TypeConverters.sparkSchemaToMleapSchema(dataFrame))
val mleapSchema = TypeConverters.sparkSchemaToMleapSchema(dataFrame)

val mleapMatrix: Array[ArrayRow] = dataFrame.collect().map {
r => ArrayRow(
Expand All @@ -47,7 +81,7 @@ trait CachedDatasetUtils {
))
}

DefaultLeapFrame(mleapSchema.get, mleapMatrix)
DefaultLeapFrame(mleapSchema, mleapMatrix)
}

def toDenseFeaturesLeapFrame(sparseLeapFrame: DefaultLeapFrame): DefaultLeapFrame = {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ trait ClassifierUtils extends BoosterUtils with CachedDatasetUtils {
rawPredictionCol = Some("raw_prediction"),
probabilityCol = Some("probability")),
XGBoostClassificationModel(
XGBoostBinaryClassificationModel(booster, numFeatures(leapFrameLibSVMtrain), 0))
XGBoostBinaryClassificationModel(booster, numFeatures(leapFrameBinomial), 0))
)
}

Expand All @@ -33,7 +33,7 @@ trait ClassifierUtils extends BoosterUtils with CachedDatasetUtils {
probabilityCol = Some("probability")),
XGBoostClassificationModel(
XGBoostMultinomialClassificationModel(
booster, xgboostMultinomialParams("num_class").asInstanceOf[Int], numFeatures(leapFrameIrisTrain), 0))
booster, xgboostMultinomialParams("num_class").asInstanceOf[Int], numFeatures(leapFrameMultinomial), 0))
)
}
}
15 changes: 9 additions & 6 deletions project/Dependencies.scala
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,9 @@ object Dependencies {
lazy val slf4jVersion = "1.7.25"
lazy val awsSdkVersion = "1.11.349"
val tensorflowVersion = "1.11.0"
val xgboostVersion = "0.90"
val xgboostVersion = "1.0.0"
val hadoopVersion = "2.6.5" // matches spark version
val kryoVersion = "4.0.2" // Remove upon upgrading to xgboost 1.1.1

object Compile {
val sparkMllibLocal = "org.apache.spark" %% "spark-mllib-local" % sparkVersion excludeAll(ExclusionRule(organization = "org.scalatest"))
Expand Down Expand Up @@ -67,10 +68,12 @@ object Dependencies {
"ch.qos.logback" % "logback-classic" % logbackVersion,
"com.typesafe.scala-logging" %% "scala-logging" % loggingVersion
)
val xgboostDep = "ml.dmlc" % "xgboost4j" % xgboostVersion // scala 2.11 only
val xgboostPredictorDep = "biz.k11i" % "xgboost-predictor" % "0.3.1"

val xgboostSparkDep = "ml.dmlc" % "xgboost4j-spark" % xgboostVersion // scala 2.11 only
val kryo = "com.esotericsoftware" % "kryo" % kryoVersion
val xgboostDep = "ml.dmlc" %% "xgboost4j" % xgboostVersion exclude("com.esotericsoftware.kryo", "kryo")
val xgboostPredictorDep = "ai.h2o" % "xgboost-predictor" % "0.3.17" exclude("com.esotericsoftware.kryo", "kryo")

val xgboostSparkDep = "ml.dmlc" %% "xgboost4j-spark" % xgboostVersion exclude("com.esotericsoftware.kryo", "kryo")
val hadoop = "org.apache.hadoop" % "hadoop-client" % hadoopVersion
}

Expand Down Expand Up @@ -119,9 +122,9 @@ object Dependencies {

val tensorflow = l ++= tensorflowDeps ++ Seq(Test.scalaTest)

val xgboostRuntime = l ++= Seq(xgboostDep) ++ Seq(xgboostPredictorDep) ++ Test.spark ++ Test.sparkTest ++ Seq(Test.scalaTest)
val xgboostRuntime = l ++= Seq(xgboostDep) ++ Seq(xgboostPredictorDep) ++ Seq(kryo) ++ Test.spark ++ Test.sparkTest ++ Seq(Test.scalaTest)

val xgboostSpark = l ++= Seq(xgboostSparkDep) ++ Provided.spark ++ Test.spark ++ Test.sparkTest
val xgboostSpark = l ++= Seq(xgboostSparkDep) ++ Seq(kryo) ++ Provided.spark ++ Test.spark ++ Test.sparkTest

val serving = l ++= Seq(akkaHttp, akkaHttpSprayJson, config, Test.scalaTest, Test.akkaHttpTestkit)

Expand Down
2 changes: 2 additions & 0 deletions project/MleapProject.scala
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@ object MleapProject {
bundleHdfs,
core,
runtime,
xgboostRuntime,
xgboostSpark,
avro,
sparkBase,
sparkTestkit,
Expand Down
2 changes: 2 additions & 0 deletions project/plugins.sbt
Original file line number Diff line number Diff line change
Expand Up @@ -8,5 +8,7 @@ addSbtPlugin("com.github.gseitz" % "sbt-release" % "1.0.6")
addSbtPlugin("com.eed3si9n" % "sbt-buildinfo" % "0.6.1")
addSbtPlugin("com.typesafe.sbt" % "sbt-git" % "0.9.3")
addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.7")
addSbtPlugin("net.virtual-void" % "sbt-dependency-graph" % "0.9.2")


libraryDependencies += "com.thesamet.scalapb" %% "compilerplugin" % "0.7.1"
Loading

0 comments on commit fbd3c75

Please sign in to comment.