Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Upgrade to xgboost 1.0.0 - Use h2oai Predictor #708

Merged
merged 9 commits into from
Oct 9, 2020
16 changes: 8 additions & 8 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,14 @@
sudo: required
dist: trusty

addons:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you please remind me why these addons are needed?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Uh I wasn't able to build it without it. I can post the error message here by removing them and running tests again if you'd like. I also noticed that it was necessary in the spark-3.0.0 release branch here: https://github.com/combust/mleap/blob/spark-3.0.0/.travis.yml#L14

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👍

apt:
sources:
- ubuntu-toolchain-r-test
packages:
- gcc-4.8
- g++-4.8

services:
- docker

Expand All @@ -22,14 +30,6 @@ jobs:
script:
- make test_benchmark

- name: "MLeap xgboost runtime tests"
script:
- make test_xgboost_runtime

- name: "MLeap xgboost spark tests"
script:
- make test_xgboost_spark

- name: "All other sbt tests"
script:
- make test_root_sbt_project
Expand Down
6 changes: 0 additions & 6 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,6 @@ test_executor:
test_benchmark:
sbt "+ mleap-benchmark/test"

test_xgboost_runtime:
sbt "mleap-xgboost-runtime/test"

test_xgboost_spark:
sbt "mleap-xgboost-spark/test"

test_root_sbt_project:
sbt "+ test"

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -61,11 +61,11 @@ case class XGBoostPredictorMultinomialClassificationModel(
}

def predictProbabilities(data: FVec): Vector = {
Vectors.dense(predictor.predict(data, false, treeLimit))
Vectors.dense(predictor.predict(data, false, treeLimit).map(_.toDouble))
}

def predictRaw(data: FVec): Vector = {
Vectors.dense(predictor.predict(data, true, treeLimit))
Vectors.dense(predictor.predict(data, true, treeLimit).map(_.toDouble))
}

override def rawToProbabilityInPlace(raw: Vector): Vector = {
Expand Down
6,513 changes: 6,513 additions & 0 deletions mleap-xgboost-runtime/src/test/resources/datasources/agaricus.csv

Large diffs are not rendered by default.

1,611 changes: 0 additions & 1,611 deletions mleap-xgboost-runtime/src/test/resources/datasources/agaricus.test

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ package ml.combust.mleap.xgboost.runtime

import ml.combust.mleap.core.types.{BasicType, NodeShape, ScalarType, StructField, TensorType}
import ml.combust.mleap.runtime.frame.{DefaultLeapFrame, Transformer}
import ml.combust.mleap.tensor.SparseTensor
import ml.combust.mleap.tensor.{SparseTensor, Tensor}
import ml.combust.mleap.xgboost.runtime.testing.{BoosterUtils, BundleSerializationUtils, CachedDatasetUtils, ClassifierUtils, FloatingPointApproximations}
import ml.dmlc.xgboost4j.scala.Booster
import org.apache.spark.ml.linalg.Vectors
Expand Down Expand Up @@ -32,7 +32,7 @@ class XGBoostClassificationModelParitySpec extends FunSpec
val mleapRawPredictionColIndex = mleapResult.schema.indexOf("raw_prediction").get
val mleapProbabilityColIndex = mleapResult.schema.indexOf("probability").get

val singleRowDMatrix = r(featuresColumnIndex).asInstanceOf[SparseTensor[Double]].asXGB
val singleRowDMatrix = r(featuresColumnIndex).asInstanceOf[Tensor[Double]].asXGB

val boosterResult = booster.predict(singleRowDMatrix, false, 0).head(0)

Expand Down Expand Up @@ -104,7 +104,7 @@ class XGBoostClassificationModelParitySpec extends FunSpec
val booster = trainBooster(binomialDataset)
val xgboostTransformer = trainXGBoost4jClassifier

equalityTestRowByRow(booster, xgboostTransformer, leapFrameLibSVMtrain)
equalityTestRowByRow(booster, xgboostTransformer, leapFrameBinomial)
}

it("has the correct inputs and outputs with columns: prediction, probability and raw_prediction") {
Expand All @@ -124,8 +124,8 @@ class XGBoostClassificationModelParitySpec extends FunSpec
val mleapBundle = serializeModelToMleapBundle(xgboostTransformer)
val deserializedTransformer: Transformer = loadMleapTransformerFromBundle(mleapBundle)

val preSerializationResult = xgboostTransformer.transform(leapFrameLibSVMtrain).get
val deserializedModelResult = deserializedTransformer.transform(leapFrameLibSVMtrain).get
val preSerializationResult = xgboostTransformer.transform(leapFrameBinomial).get
val deserializedModelResult = deserializedTransformer.transform(leapFrameBinomial).get

assert(preSerializationResult.dataset == deserializedModelResult.dataset)
}
Expand All @@ -134,7 +134,7 @@ class XGBoostClassificationModelParitySpec extends FunSpec
val multiBooster = trainMultinomialBooster(multinomialDataset)
val xgboostTransformer = trainMultinomialXGBoost4jClassifier

equalityTestRowByRowMultinomial(multiBooster, xgboostTransformer, leapFrameIrisTrain)
equalityTestRowByRowMultinomial(multiBooster, xgboostTransformer, leapFrameMultinomial)
}

it("XGBoostMultinomialClassificationModel results are the same pre and post serialization") {
Expand All @@ -143,8 +143,8 @@ class XGBoostClassificationModelParitySpec extends FunSpec
val mleapBundle = serializeModelToMleapBundle(xgboostTransformer)
val deserializedTransformer: Transformer = loadMleapTransformerFromBundle(mleapBundle)

val preSerializationResult = xgboostTransformer.transform(leapFrameIrisTrain).get
val deserializedResult = deserializedTransformer.transform(leapFrameIrisTrain).get
val preSerializationResult = xgboostTransformer.transform(leapFrameMultinomial).get
val deserializedResult = deserializedTransformer.transform(leapFrameMultinomial).get

assert(preSerializationResult.dataset == deserializedResult.dataset)
}
Expand All @@ -155,7 +155,7 @@ class XGBoostClassificationModelParitySpec extends FunSpec
val mleapBundle = serializeModelToMleapBundle(xgboostTransformer)
val deserializedTransformer: Transformer = loadMleapTransformerFromBundle(mleapBundle)

val denseLeapFrame = toDenseFeaturesLeapFrame(leapFrameLibSVMtrain)
val denseLeapFrame = toDenseFeaturesLeapFrame(leapFrameBinomial)

val preSerializationResult = xgboostTransformer.transform(denseLeapFrame).get
val deserializedResult = deserializedTransformer.transform(denseLeapFrame).get
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -79,8 +79,8 @@ class XGBoostPredictorClassificationModelParitySpec extends FunSpec
val mleapBundle = serializeModelToMleapBundle(xgboost4jTransformer)
val deserializedPredictor: Transformer = loadXGBoostPredictorFromBundle(mleapBundle)

val preSerializationXGBoost4jResult = xgboost4jTransformer.transform(leapFrameLibSVMtrain).get
val predictorModelResult = deserializedPredictor.transform(leapFrameLibSVMtrain).get
val preSerializationXGBoost4jResult = xgboost4jTransformer.transform(leapFrameBinomial).get
val predictorModelResult = deserializedPredictor.transform(leapFrameBinomial).get

probabilityColumnEqualityTest(preSerializationXGBoost4jResult, predictorModelResult)
}
Expand All @@ -91,10 +91,10 @@ class XGBoostPredictorClassificationModelParitySpec extends FunSpec
val mleapBundle = serializeModelToMleapBundle(xgboost4jTransformer)

val deserializedXGBoost4jTransformer: Transformer = loadMleapTransformerFromBundle(mleapBundle)
val deserializedXGBoost4jResult = deserializedXGBoost4jTransformer.transform(leapFrameLibSVMtrain).get
val deserializedXGBoost4jResult = deserializedXGBoost4jTransformer.transform(leapFrameBinomial).get

val deserializedPredictorTransformer: Transformer = loadXGBoostPredictorFromBundle(mleapBundle)
val deserializedPredictorResult = deserializedPredictorTransformer.transform(leapFrameLibSVMtrain).get
val deserializedPredictorResult = deserializedPredictorTransformer.transform(leapFrameBinomial).get

probabilityColumnEqualityTest(deserializedPredictorResult, deserializedXGBoost4jResult)
}
Expand All @@ -117,7 +117,7 @@ class XGBoostPredictorClassificationModelParitySpec extends FunSpec
val mleapBundle = serializeModelToMleapBundle(trainMultinomialXGBoost4jClassifier)
val deserializedPredictorTransformer: Transformer = loadXGBoostPredictorFromBundle(mleapBundle)

equalityTestRowByRowMultinomialProbability(multiBooster, deserializedPredictorTransformer, leapFrameIrisTrain)
equalityTestRowByRowMultinomialProbability(multiBooster, deserializedPredictorTransformer, leapFrameMultinomial)
}

it("[Multinomial] XGBoostPredictorMultinomialClassificationModel results are the same pre and post serialization") {
Expand All @@ -126,19 +126,19 @@ class XGBoostPredictorClassificationModelParitySpec extends FunSpec
val mleapBundle = serializeModelToMleapBundle(xgboost4jTransformer)
val predictorTransformer: Transformer = loadXGBoostPredictorFromBundle(mleapBundle)

val xgboost4jResult = xgboost4jTransformer.transform(leapFrameIrisTrain).get
val predictorResult = predictorTransformer.transform(leapFrameIrisTrain).get
val xgboost4jResult = xgboost4jTransformer.transform(leapFrameMultinomial).get
val predictorResult = predictorTransformer.transform(leapFrameMultinomial).get

probabilityColumnEqualityTest(xgboost4jResult, predictorResult)
}

it("XGBoost4j and Predictor results are the same when using a dense dataset") {
val xgboost4jTransformer = trainMultinomialXGBoost4jClassifier
val xgboost4jTransformer = trainXGBoost4jClassifier

val mleapBundle = serializeModelToMleapBundle(xgboost4jTransformer)
val predictorTransformer: Transformer = loadXGBoostPredictorFromBundle(mleapBundle)

val denseLeapFrame = toDenseFeaturesLeapFrame(leapFrameLibSVMtrain)
val denseLeapFrame = toDenseFeaturesLeapFrame(leapFrameBinomial)

val xgboost4jResult = xgboost4jTransformer.transform(denseLeapFrame).get
val predictorResult = predictorTransformer.transform(denseLeapFrame).get
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ package ml.combust.mleap.xgboost.runtime

import ml.combust.mleap.core.types._
import ml.combust.mleap.runtime.frame.{DefaultLeapFrame, Transformer}
import ml.combust.mleap.tensor.SparseTensor
import ml.combust.mleap.tensor.Tensor
import ml.combust.mleap.xgboost.runtime.testing.{BoosterUtils, BundleSerializationUtils, CachedDatasetUtils, FloatingPointApproximations}
import ml.dmlc.xgboost4j.scala.Booster
import org.scalatest.FunSpec
Expand All @@ -22,7 +22,7 @@ class XGBoostRegressionModelParitySpec extends FunSpec
XGBoostRegression(
"xgboostSingleThread",
NodeShape.regression(),
XGBoostRegressionModel(booster, numFeatures(leapFrameLibSVMtrain), 0)
XGBoostRegressionModel(booster, numFeatures(leapFrameBinomial), 0)
)
}

Expand All @@ -35,7 +35,7 @@ class XGBoostRegressionModelParitySpec extends FunSpec
val mleapResult = mleapTransformer.transform(DefaultLeapFrame(leapFrameDataset.schema, Seq(r))).get
val mleapPredictionColIndex = mleapResult.schema.indexOf("prediction").get

val singleRowDMatrix = r(featuresColumnIndex).asInstanceOf[SparseTensor[Double]].asXGB
val singleRowDMatrix = r(featuresColumnIndex).asInstanceOf[Tensor[Double]].asXGB
val boosterResult = booster.predict(singleRowDMatrix, false, 0).head(0)

assert (boosterResult == mleapResult.dataset.head.getDouble(mleapPredictionColIndex))
Expand All @@ -50,7 +50,7 @@ class XGBoostRegressionModelParitySpec extends FunSpec
val mleapBundle = serializeModelToMleapBundle(xgboostTransformer)
val deserializedTransformer: Transformer = loadMleapTransformerFromBundle(mleapBundle)

equalityTestRowByRow(booster, deserializedTransformer, leapFrameLibSVMtrain)
equalityTestRowByRow(booster, deserializedTransformer, leapFrameBinomial)
}

it("has the correct inputs and outputs with columns: prediction, probability and raw_prediction") {
Expand All @@ -69,8 +69,8 @@ class XGBoostRegressionModelParitySpec extends FunSpec
val mleapBundle = serializeModelToMleapBundle(xgboostTransformer)
val deserializedTransformer: Transformer = loadMleapTransformerFromBundle(mleapBundle)

val preSerializationResult = xgboostTransformer.transform(leapFrameLibSVMtrain).get
val deserializedModelResult = deserializedTransformer.transform(leapFrameLibSVMtrain).get
val preSerializationResult = xgboostTransformer.transform(leapFrameBinomial).get
val deserializedModelResult = deserializedTransformer.transform(leapFrameBinomial).get

assert(preSerializationResult.dataset == deserializedModelResult.dataset)
}
Expand All @@ -81,7 +81,7 @@ class XGBoostRegressionModelParitySpec extends FunSpec
val mleapBundle = serializeModelToMleapBundle(xgboostTransformer)
val deserializedTransformer: Transformer = loadMleapTransformerFromBundle(mleapBundle)

val denseLeapFrame = toDenseFeaturesLeapFrame(leapFrameLibSVMtrain)
val denseLeapFrame = toDenseFeaturesLeapFrame(leapFrameBinomial)

val preSerializationResult = xgboostTransformer.transform(denseLeapFrame).get
val deserializedResult = deserializedTransformer.transform(denseLeapFrame).get
Expand Down
Original file line number Diff line number Diff line change
@@ -1,30 +1,64 @@
package ml.combust.mleap.xgboost.runtime.testing

import ml.combust.mleap.core.types.TensorType
import ml.combust.mleap.core.types.{BasicType, DataType, ScalarShape, StructField, StructType, TensorType}
import ml.combust.mleap.core.util.VectorConverters
import ml.combust.mleap.runtime.frame.{ArrayRow, DefaultLeapFrame, Row}
import ml.combust.mleap.tensor.DenseTensor
import ml.dmlc.xgboost4j.scala.DMatrix
import org.apache.spark.ml.linalg.SparseVector
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.mleap.TypeConverters


trait CachedDatasetUtils {

private final val TrainDataFilePath = "datasources/agaricus.train"
private final val TrainDataFilePathCSV = "datasources/agaricus.csv"
private final val TrainDataMultinomialFilePath = "datasources/iris.scale.txt"

// indexing_mode is necessary to tell xgboost that features start from 1, not 0 (xgboost default is 0)
val binomialDataset: DMatrix =
new DMatrix(this.getClass.getClassLoader.getResource(TrainDataFilePath).getFile)
new DMatrix(this.getClass.getClassLoader.getResource(TrainDataFilePath).getFile + "?indexing_mode=1")

val multinomialDataset: DMatrix =
new DMatrix(this.getClass.getClassLoader.getResource(TrainDataMultinomialFilePath).getFile)
new DMatrix(this.getClass.getClassLoader.getResource(TrainDataMultinomialFilePath).getFile + "?indexing_mode=1")

lazy val leapFrameLibSVMtrain: DefaultLeapFrame = leapFrameFromLibSVMFile(TrainDataFilePath)
lazy val leapFrameIrisTrain: DefaultLeapFrame = leapFrameFromLibSVMFile(TrainDataMultinomialFilePath)
lazy val leapFrameBinomial: DefaultLeapFrame = leapFrameFromCSVFile(TrainDataFilePathCSV)
lazy val leapFrameMultinomial: DefaultLeapFrame = leapFrameFromLibSVMFile(TrainDataMultinomialFilePath)

def numFeatures(dataset: DefaultLeapFrame): Int =
dataset.schema.getField("features").get.dataType.asInstanceOf[TensorType].dimensions.get.head

private def leapFrameFromCSVFile(filePath: String): DefaultLeapFrame = {
// Use Spark utils to load csv from disk
val spark = SparkSession.builder()
.master("local[2]")
.appName(s"${this.getClass.getName}")
.getOrCreate()

val dataFrame = spark.read.format("csv")
.option("header", "false")
.option("inferSchema", "true")
.load(this.getClass.getClassLoader.getResource(filePath).getFile)

val nFeatures = dataFrame.head().length - 1

val mleapSchema = StructType(
StructField("label", DataType(BasicType.Double, ScalarShape())),
StructField("features", TensorType.Double(nFeatures))).get

val mleapMatrix: Array[ArrayRow] = dataFrame.collect().map {
r => {
val sequence = r.toSeq.toArray.map(_.asInstanceOf[Double])
ArrayRow(
Seq(sequence.head,
DenseTensor[Double](sequence.slice(1, sequence.length), Seq(nFeatures))))
}
}

DefaultLeapFrame(mleapSchema, mleapMatrix)
}

private def leapFrameFromLibSVMFile(filePath: String): DefaultLeapFrame = {

// Use Spark utils to load libsvm from disk
Expand All @@ -37,7 +71,7 @@ trait CachedDatasetUtils {
val dataFrame = spark.read.format("libsvm")
.load(this.getClass.getClassLoader.getResource(filePath).getFile)

val mleapSchema = Option(TypeConverters.sparkSchemaToMleapSchema(dataFrame))
val mleapSchema = TypeConverters.sparkSchemaToMleapSchema(dataFrame)

val mleapMatrix: Array[ArrayRow] = dataFrame.collect().map {
r => ArrayRow(
Expand All @@ -47,7 +81,7 @@ trait CachedDatasetUtils {
))
}

DefaultLeapFrame(mleapSchema.get, mleapMatrix)
DefaultLeapFrame(mleapSchema, mleapMatrix)
}

def toDenseFeaturesLeapFrame(sparseLeapFrame: DefaultLeapFrame): DefaultLeapFrame = {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ trait ClassifierUtils extends BoosterUtils with CachedDatasetUtils {
rawPredictionCol = Some("raw_prediction"),
probabilityCol = Some("probability")),
XGBoostClassificationModel(
XGBoostBinaryClassificationModel(booster, numFeatures(leapFrameLibSVMtrain), 0))
XGBoostBinaryClassificationModel(booster, numFeatures(leapFrameBinomial), 0))
)
}

Expand All @@ -33,7 +33,7 @@ trait ClassifierUtils extends BoosterUtils with CachedDatasetUtils {
probabilityCol = Some("probability")),
XGBoostClassificationModel(
XGBoostMultinomialClassificationModel(
booster, xgboostMultinomialParams("num_class").asInstanceOf[Int], numFeatures(leapFrameIrisTrain), 0))
booster, xgboostMultinomialParams("num_class").asInstanceOf[Int], numFeatures(leapFrameMultinomial), 0))
)
}
}
15 changes: 9 additions & 6 deletions project/Dependencies.scala
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,9 @@ object Dependencies {
lazy val slf4jVersion = "1.7.25"
lazy val awsSdkVersion = "1.11.349"
val tensorflowVersion = "1.11.0"
val xgboostVersion = "0.90"
val xgboostVersion = "1.0.0"
val hadoopVersion = "2.6.5" // matches spark version
val kryoVersion = "4.0.2" // Remove upon upgrading to xgboost 1.1.1

object Compile {
val sparkMllibLocal = "org.apache.spark" %% "spark-mllib-local" % sparkVersion excludeAll(ExclusionRule(organization = "org.scalatest"))
Expand Down Expand Up @@ -67,10 +68,12 @@ object Dependencies {
"ch.qos.logback" % "logback-classic" % logbackVersion,
"com.typesafe.scala-logging" %% "scala-logging" % loggingVersion
)
val xgboostDep = "ml.dmlc" % "xgboost4j" % xgboostVersion // scala 2.11 only
val xgboostPredictorDep = "biz.k11i" % "xgboost-predictor" % "0.3.1"

val xgboostSparkDep = "ml.dmlc" % "xgboost4j-spark" % xgboostVersion // scala 2.11 only
val kryo = "com.esotericsoftware" % "kryo" % kryoVersion
val xgboostDep = "ml.dmlc" %% "xgboost4j" % xgboostVersion exclude("com.esotericsoftware.kryo", "kryo")
val xgboostPredictorDep = "ai.h2o" % "xgboost-predictor" % "0.3.17" exclude("com.esotericsoftware.kryo", "kryo")

val xgboostSparkDep = "ml.dmlc" %% "xgboost4j-spark" % xgboostVersion exclude("com.esotericsoftware.kryo", "kryo")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

did we run into issues with the kyro versions?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

They are brought in elsewhere and there is a version conflict. This is just making sure that others are chosen

val hadoop = "org.apache.hadoop" % "hadoop-client" % hadoopVersion
}

Expand Down Expand Up @@ -119,9 +122,9 @@ object Dependencies {

val tensorflow = l ++= tensorflowDeps ++ Seq(Test.scalaTest)

val xgboostRuntime = l ++= Seq(xgboostDep) ++ Seq(xgboostPredictorDep) ++ Test.spark ++ Test.sparkTest ++ Seq(Test.scalaTest)
val xgboostRuntime = l ++= Seq(xgboostDep) ++ Seq(xgboostPredictorDep) ++ Seq(kryo) ++ Test.spark ++ Test.sparkTest ++ Seq(Test.scalaTest)

val xgboostSpark = l ++= Seq(xgboostSparkDep) ++ Provided.spark ++ Test.spark ++ Test.sparkTest
val xgboostSpark = l ++= Seq(xgboostSparkDep) ++ Seq(kryo) ++ Provided.spark ++ Test.spark ++ Test.sparkTest

val serving = l ++= Seq(akkaHttp, akkaHttpSprayJson, config, Test.scalaTest, Test.akkaHttpTestkit)

Expand Down
2 changes: 2 additions & 0 deletions project/MleapProject.scala
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@ object MleapProject {
bundleHdfs,
core,
runtime,
xgboostRuntime,
xgboostSpark,
avro,
sparkBase,
sparkTestkit,
Expand Down
2 changes: 2 additions & 0 deletions project/plugins.sbt
Original file line number Diff line number Diff line change
Expand Up @@ -8,5 +8,7 @@ addSbtPlugin("com.github.gseitz" % "sbt-release" % "1.0.6")
addSbtPlugin("com.eed3si9n" % "sbt-buildinfo" % "0.6.1")
addSbtPlugin("com.typesafe.sbt" % "sbt-git" % "0.9.3")
addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.7")
addSbtPlugin("net.virtual-void" % "sbt-dependency-graph" % "0.9.2")


libraryDependencies += "com.thesamet.scalapb" %% "compilerplugin" % "0.7.1"
Loading