diff --git a/mleap-xgboost-runtime/src/main/resources/pixar_movie_mleap_bundle.zip b/mleap-xgboost-runtime/src/main/resources/pixar_movie_mleap_bundle.zip deleted file mode 100644 index 2785314ab..000000000 Binary files a/mleap-xgboost-runtime/src/main/resources/pixar_movie_mleap_bundle.zip and /dev/null differ diff --git a/mleap-xgboost-runtime/src/main/resources/reference.conf b/mleap-xgboost-runtime/src/main/resources/reference.conf index 2e8a00514..bd83e1166 100644 --- a/mleap-xgboost-runtime/src/main/resources/reference.conf +++ b/mleap-xgboost-runtime/src/main/resources/reference.conf @@ -1,5 +1,4 @@ ml.combust.mleap.xgboost.ops = [ -// "ml.combust.mleap.xgboost.runtime.bundle.ops.XGBoostPerformantClassificationOp", "ml.combust.mleap.xgboost.runtime.bundle.ops.XGBoostClassificationOp", "ml.combust.mleap.xgboost.runtime.bundle.ops.XGBoostRegressionOp" ] diff --git a/mleap-xgboost-runtime/src/main/scala/ml/combust/mleap/xgboost/runtime/MyMain.scala b/mleap-xgboost-runtime/src/main/scala/ml/combust/mleap/xgboost/runtime/MyMain.scala deleted file mode 100644 index baae13e8e..000000000 --- a/mleap-xgboost-runtime/src/main/scala/ml/combust/mleap/xgboost/runtime/MyMain.scala +++ /dev/null @@ -1,51 +0,0 @@ -package ml.combust.mleap.xgboost.runtime - -import ml.combust.mleap.tensor.{DenseTensor, Tensor} - -object MyMain { - - def main(args: Array[String]): Unit = { - - import ml.combust.bundle.BundleFile - import ml.combust.mleap.runtime.MleapSupport._ - import resource._ - - import java.nio.file.Paths - val modelArtifactName = "pixar_movie_mleap_bundle_old.zip" - val artifactLocalFile = Paths.get(this.asInstanceOf[Any].getClass.getClassLoader.getResource(modelArtifactName).toURI).toFile - - // load the Spark pipeline we saved in the previous section - val bundle = (for (bundleFile <- managed(BundleFile(s"jar:file:${artifactLocalFile.getAbsolutePath}"))) yield { - bundleFile.loadMleapBundle().get - }).tried.get - - // create a simple LeapFrame to transform - import ml.combust.mleap.core.types._ - import ml.combust.mleap.runtime.frame.{DefaultLeapFrame, Row} - - // MLeap makes extensive use of monadic types like Try - val schema = StructType( - StructField("features", TensorType(BasicType.Double, Seq(2))) - ).get -// StructField("long_feature", ScalarType.Long), -// StructField("double_feature", ScalarType.Double)).get - - val data = Seq( - Row( -// "features", -// TensorType(Double, Seq(3)) - Tensor.denseVector(Array(0.2, 0.7)) -// TensorType(Double, Some(Array(0.1, 0.2)), false) - ) - ) -// val data = Seq(Row("hello", 65L), Row("MLeap", 0.2)) - val frame = DefaultLeapFrame(schema, data) - - // transform the dataframe using our pipeline - val mleapPipeline = bundle.root - val frame2 = mleapPipeline.transform(frame).get - println(frame2.dataset) - - - } -} diff --git a/mleap-xgboost-runtime/src/main/scala/ml/combust/mleap/xgboost/runtime/XGBoostPerformantClassification.scala b/mleap-xgboost-runtime/src/main/scala/ml/combust/mleap/xgboost/runtime/XGBoostPredictorClassification.scala similarity index 59% rename from mleap-xgboost-runtime/src/main/scala/ml/combust/mleap/xgboost/runtime/XGBoostPerformantClassification.scala rename to mleap-xgboost-runtime/src/main/scala/ml/combust/mleap/xgboost/runtime/XGBoostPredictorClassification.scala index ddca3a832..89b6266e8 100644 --- a/mleap-xgboost-runtime/src/main/scala/ml/combust/mleap/xgboost/runtime/XGBoostPerformantClassification.scala +++ b/mleap-xgboost-runtime/src/main/scala/ml/combust/mleap/xgboost/runtime/XGBoostPredictorClassification.scala @@ -1,30 +1,28 @@ package ml.combust.mleap.xgboost.runtime +import biz.k11i.xgboost.util.FVec import ml.combust.mleap.core.types.NodeShape import ml.combust.mleap.runtime.function.UserDefinedFunction import ml.combust.mleap.tensor.Tensor import ml.combust.mleap.runtime.frame.{MultiTransformer, Row, Transformer} -import org.apache.spark.ml.linalg.{Vector, Vectors} -import biz.k11i.xgboost.util.FVec +import ml.combust.mleap.core.util.VectorConverters._ import XgbConverters._ -case class XGBoostPerformantClassification( +case class XGBoostPredictorClassification( override val uid: String = Transformer.uniqueName("xgboost.classification"), override val shape: NodeShape, - override val model: XGBoostPerformantClassificationModel) extends MultiTransformer { + override val model: XGBoostPredictorClassificationModel) extends MultiTransformer { override val exec: UserDefinedFunction = { - val rawPrediction = shape.getOutput("raw_prediction").map { - _ => (data: FVec) => Some(model.predictRaw(data): Vector) - }.getOrElse((_: FVec) => None) - val prediction = (data: FVec) => Some(model.predict(data)) - - val all = Seq(prediction) + // Since the Predictor is our performant implementation, we only compute probability for performance reasons. + val probability = shape.getOutput("probability").map { + _ => (data: FVec) => Some(model.predictProbabilities(data): Tensor[Double]) + }.getOrElse((_: FVec) => None) val f = (features: Tensor[Double]) => { val data: FVec = features.asXGBPredictor - val rowData = all.map(_.apply(data)).filter(_.isDefined).map(_.get) + val rowData = Seq(probability(data).get) Row(rowData: _*) } diff --git a/mleap-xgboost-runtime/src/main/scala/ml/combust/mleap/xgboost/runtime/XGBoostPerformantClassificationModel.scala b/mleap-xgboost-runtime/src/main/scala/ml/combust/mleap/xgboost/runtime/XGBoostPredictorClassificationModel.scala similarity index 60% rename from mleap-xgboost-runtime/src/main/scala/ml/combust/mleap/xgboost/runtime/XGBoostPerformantClassificationModel.scala rename to mleap-xgboost-runtime/src/main/scala/ml/combust/mleap/xgboost/runtime/XGBoostPredictorClassificationModel.scala index c4b288281..a54ea032a 100644 --- a/mleap-xgboost-runtime/src/main/scala/ml/combust/mleap/xgboost/runtime/XGBoostPerformantClassificationModel.scala +++ b/mleap-xgboost-runtime/src/main/scala/ml/combust/mleap/xgboost/runtime/XGBoostPredictorClassificationModel.scala @@ -4,11 +4,11 @@ import biz.k11i.xgboost.Predictor import biz.k11i.xgboost.util.FVec import ml.combust.mleap.core.classification.ProbabilisticClassificationModel import org.apache.spark.ml.linalg.{Vector, Vectors} +import ml.combust.mleap.core.types.{StructType, TensorType} import XgbConverters._ -import ml.combust.mleap.core.types.{ListType, ScalarType, StructType, TensorType} -trait XGBoostPerformantClassificationModelBase extends ProbabilisticClassificationModel { +trait XGBoostPredictorClassificationModelBase extends ProbabilisticClassificationModel { def predictor: Predictor def treeLimit: Int @@ -26,10 +26,10 @@ trait XGBoostPerformantClassificationModelBase extends ProbabilisticClassificati } -case class XGBoostPerformantBinaryClassificationModel( +case class XGBoostPredictorBinaryClassificationModel( override val predictor: Predictor, override val numFeatures: Int, - override val treeLimit: Int) extends XGBoostPerformantClassificationModelBase { + override val treeLimit: Int) extends XGBoostPredictorClassificationModelBase { override val numClasses: Int = 2 @@ -51,15 +51,38 @@ case class XGBoostPerformantBinaryClassificationModel( } } -case class XGBoostPerformantClassificationModel(impl: XGBoostPerformantClassificationModelBase) extends ProbabilisticClassificationModel { +case class XGBoostPredictorMultinomialClassificationModel( + override val predictor: Predictor, + override val numClasses: Int, + override val numFeatures: Int, + override val treeLimit: Int) extends XGBoostPredictorClassificationModelBase { + + override def predict(data: FVec): Double = { + probabilityToPrediction(predictProbabilities(data)) + } + + def predictProbabilities(data: FVec): Vector = { + Vectors.dense(predictor.predict(data, false, treeLimit)) + } + + def predictRaw(data: FVec): Vector = { + Vectors.dense(predictor.predict(data, true, treeLimit)) + } + + override def rawToProbabilityInPlace(raw: Vector): Vector = { + throw new Exception("XGBoost Classification model does not support \'rawToProbabilityInPlace\'") + } +} + +case class XGBoostPredictorClassificationModel(impl: XGBoostPredictorClassificationModelBase) extends ProbabilisticClassificationModel { override val numClasses: Int = impl.numClasses override val numFeatures: Int = impl.numFeatures def treeLimit: Int = impl.treeLimit def predictor: Predictor = impl.predictor - def binaryClassificationModel: XGBoostPerformantBinaryClassificationModel = impl.asInstanceOf[XGBoostPerformantBinaryClassificationModel] -// def multinomialClassificationModel: XGBoostMultinomialClassificationModel = impl.asInstanceOf[XGBoostMultinomialClassificationModel] + def binaryClassificationModel: XGBoostPredictorBinaryClassificationModel = impl.asInstanceOf[XGBoostPredictorBinaryClassificationModel] + def multinomialClassificationModel: XGBoostPredictorMultinomialClassificationModel = impl.asInstanceOf[XGBoostPredictorMultinomialClassificationModel] def predict(data: FVec): Double = impl.predict(data) @@ -75,10 +98,6 @@ case class XGBoostPerformantClassificationModel(impl: XGBoostPerformantClassific override def rawToProbabilityInPlace(raw: Vector): Vector = impl.rawToProbabilityInPlace(raw) override def outputSchema: StructType = StructType( - "raw_prediction" -> TensorType.Double(numClasses), -// "probability" -> TensorType.Double(numClasses), - "prediction" -> ScalarType.Double.nonNullable -// "leaf_prediction" -> ListType.Double, -// "contrib_prediction" -> ListType.Double).get + "probability" -> TensorType.Double(numClasses) ).get } diff --git a/mleap-xgboost-runtime/src/main/scala/ml/combust/mleap/xgboost/runtime/XgbConverters.scala b/mleap-xgboost-runtime/src/main/scala/ml/combust/mleap/xgboost/runtime/XgbConverters.scala index 631198e93..2a8cc67d6 100644 --- a/mleap-xgboost-runtime/src/main/scala/ml/combust/mleap/xgboost/runtime/XgbConverters.scala +++ b/mleap-xgboost-runtime/src/main/scala/ml/combust/mleap/xgboost/runtime/XgbConverters.scala @@ -2,18 +2,19 @@ package ml.combust.mleap.xgboost.runtime import biz.k11i.xgboost.util.FVec import ml.combust.mleap.tensor.{DenseTensor, SparseTensor, Tensor} -import ml.combust.mleap.xgboost.FVecTensorImpl +import ml.combust.mleap.xgboost.runtime.struct.{FVecFactory, FVecTensorImpl} import ml.dmlc.xgboost4j.LabeledPoint import ml.dmlc.xgboost4j.scala.DMatrix import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector} + trait XgbConverters { implicit class VectorOps(vector: Vector) { def asXGB: DMatrix = { vector match { - case SparseVector(_, indices, values) =>{ + case SparseVector(_, indices, values) => new DMatrix(Iterator(new LabeledPoint(0.0f, indices, values.map(_.toFloat)))) - } + case DenseVector(values) => new DMatrix(Iterator(new LabeledPoint(0.0f, null, values.map(_.toFloat)))) } @@ -21,12 +22,10 @@ trait XgbConverters { def asXGBPredictor: FVec = { vector match { -// case SparseVector(_, indices, values) => { -// FVec.Transformer.fromMap( -// } - case DenseVector(values) => { - FVec.Transformer.fromArray(values.map(_.toFloat), false) - } + case sparseVector: SparseVector => + FVecFactory.fromSparseVector(sparseVector) + case denseVector: DenseVector => + FVecFactory.fromDenseVector(denseVector) } } } @@ -34,25 +33,15 @@ trait XgbConverters { implicit class DoubleTensorOps(tensor: Tensor[Double]) { def asXGB: DMatrix = { tensor match { - case SparseTensor(indices, values, _) =>{ + case SparseTensor(indices, values, _) => new DMatrix(Iterator(new LabeledPoint(0.0f, indices.map(_.head).toArray, values.map(_.toFloat)))) - } - case DenseTensor(_, _) =>{ + + case DenseTensor(_, _) => new DMatrix(Iterator(new LabeledPoint(0.0f, null, tensor.toDense.rawValues.map(_.toFloat)))) - } } } - def asXGBPredictor: FVec = { - tensor match { - case sparseTensor: SparseTensor[Double] => { - FVecTensorImpl(sparseTensor) - } - case DenseTensor(_, _) => { - FVec.Transformer.fromArray(tensor.toArray.map(_.toFloat), false) - } - } - } + def asXGBPredictor: FVec = FVecTensorImpl(tensor) } } diff --git a/mleap-xgboost-runtime/src/main/scala/ml/combust/mleap/xgboost/runtime/bundle/ops/XGBoostClassificationOp.scala b/mleap-xgboost-runtime/src/main/scala/ml/combust/mleap/xgboost/runtime/bundle/ops/XGBoostClassificationOp.scala index 0ae0149c4..91342a769 100644 --- a/mleap-xgboost-runtime/src/main/scala/ml/combust/mleap/xgboost/runtime/bundle/ops/XGBoostClassificationOp.scala +++ b/mleap-xgboost-runtime/src/main/scala/ml/combust/mleap/xgboost/runtime/bundle/ops/XGBoostClassificationOp.scala @@ -7,7 +7,7 @@ import ml.combust.bundle.dsl.{Model, Value} import ml.combust.bundle.op.OpModel import ml.combust.mleap.bundle.ops.MleapOp import ml.combust.mleap.runtime.MleapContext -import ml.combust.mleap.xgboost.runtime.{XGBoostBinaryClassificationModel, XGBoostClassification, XGBoostClassificationModel, XGBoostMultinomialClassificationModel, XGBoostPerformantBinaryClassificationModel, XGBoostPerformantClassification, XGBoostPerformantClassificationModel} +import ml.combust.mleap.xgboost.runtime.{XGBoostBinaryClassificationModel, XGBoostClassification, XGBoostClassificationModel, XGBoostMultinomialClassificationModel, XGBoostPredictorBinaryClassificationModel, XGBoostPredictorClassification, XGBoostPredictorClassificationModel} import ml.dmlc.xgboost4j.scala.{Booster, XGBoost} diff --git a/mleap-xgboost-runtime/src/main/scala/ml/combust/mleap/xgboost/runtime/bundle/ops/XGBoostPerformantClassificationOp.scala b/mleap-xgboost-runtime/src/main/scala/ml/combust/mleap/xgboost/runtime/bundle/ops/XGBoostPredictorClassificationOp.scala similarity index 53% rename from mleap-xgboost-runtime/src/main/scala/ml/combust/mleap/xgboost/runtime/bundle/ops/XGBoostPerformantClassificationOp.scala rename to mleap-xgboost-runtime/src/main/scala/ml/combust/mleap/xgboost/runtime/bundle/ops/XGBoostPredictorClassificationOp.scala index c9e073a74..e9cf83ee4 100644 --- a/mleap-xgboost-runtime/src/main/scala/ml/combust/mleap/xgboost/runtime/bundle/ops/XGBoostPerformantClassificationOp.scala +++ b/mleap-xgboost-runtime/src/main/scala/ml/combust/mleap/xgboost/runtime/bundle/ops/XGBoostPredictorClassificationOp.scala @@ -14,20 +14,20 @@ import ml.combust.mleap.xgboost.runtime._ /** * Created by hollinwilkins on 9/16/17. */ -class XGBoostPerformantClassificationOp extends MleapOp[XGBoostPerformantClassification, XGBoostPerformantClassificationModel] { +class XGBoostPredictorClassificationOp extends MleapOp[XGBoostPredictorClassification, XGBoostPredictorClassificationModel] { - override val Model: OpModel[MleapContext, XGBoostPerformantClassificationModel] = new OpModel[MleapContext, XGBoostPerformantClassificationModel] { - override val klazz: Class[XGBoostPerformantClassificationModel] = classOf[XGBoostPerformantClassificationModel] + override val Model: OpModel[MleapContext, XGBoostPredictorClassificationModel] = new OpModel[MleapContext, XGBoostPredictorClassificationModel] { + override val klazz: Class[XGBoostPredictorClassificationModel] = classOf[XGBoostPredictorClassificationModel] override def opName: String = "xgboost.classifier" @throws[RuntimeException] - override def store(model: Model, obj: XGBoostPerformantClassificationModel) + override def store(model: Model, obj: XGBoostPredictorClassificationModel) (implicit context: BundleContext[MleapContext]): Model = throw new RuntimeException("The XGBoostPredictor implementation does not support storing the model.") override def load(model: Model) - (implicit context: BundleContext[MleapContext]): XGBoostPerformantClassificationModel = { + (implicit context: BundleContext[MleapContext]): XGBoostPredictorClassificationModel = { val predictor = new Predictor(Files.newInputStream(context.file("xgboost.model"))) @@ -35,11 +35,15 @@ class XGBoostPerformantClassificationOp extends MleapOp[XGBoostPerformantClassif val numFeatures = model.value("num_features").getInt val treeLimit = model.value("tree_limit").getInt - val impl = XGBoostPerformantBinaryClassificationModel(predictor, numFeatures, treeLimit) + val impl = if(numClasses == 2) { + XGBoostPredictorBinaryClassificationModel(predictor, numFeatures, treeLimit) + } else { + XGBoostPredictorMultinomialClassificationModel(predictor, numClasses, numFeatures, treeLimit) + } - XGBoostPerformantClassificationModel(impl) + XGBoostPredictorClassificationModel(impl) } } - override def model(node: XGBoostPerformantClassification): XGBoostPerformantClassificationModel = node.model + override def model(node: XGBoostPredictorClassification): XGBoostPredictorClassificationModel = node.model } diff --git a/mleap-xgboost-runtime/src/main/scala/ml/combust/mleap/xgboost/runtime/struct/FVecFactory.scala b/mleap-xgboost-runtime/src/main/scala/ml/combust/mleap/xgboost/runtime/struct/FVecFactory.scala new file mode 100644 index 000000000..e93050016 --- /dev/null +++ b/mleap-xgboost-runtime/src/main/scala/ml/combust/mleap/xgboost/runtime/struct/FVecFactory.scala @@ -0,0 +1,34 @@ +package ml.combust.mleap.xgboost.runtime.struct + +import biz.k11i.xgboost.util.FVec +import ml.combust.mleap.tensor.Tensor +import org.apache.spark.ml.linalg.{DenseVector, SparseVector} + +import scala.collection.JavaConversions.mapAsJavaMap + + +object FVecFactory { + + private def fromScalaMap(map: Map[Int, Float]): FVec = { + val javaMap = mapAsJavaMap(map).asInstanceOf[java.util.Map[java.lang.Integer, java.lang.Float]] + + FVec.Transformer.fromMap(javaMap) + } + + def fromSparseVector(sparseVector: SparseVector): FVec = { + // Casting to floats, because doubles result in compounding differences from the c++ implementation + // https://github.com/komiya-atsushi/xgboost-predictor-java/issues/21 + val scalaMap = (sparseVector.indices zip sparseVector.values.map(_.toFloat)).toMap + + FVecFactory.fromScalaMap(scalaMap) + } + + def fromDenseVector(denseVector: DenseVector): FVec = { + // Casting to floats, because doubles result in compounding differences from the c++ implementation + // https://github.com/komiya-atsushi/xgboost-predictor-java/issues/21 + FVec.Transformer.fromArray(denseVector.values.map(_.toFloat), false) + } + + def fromTensor(tensor: Tensor[Double]): FVec = FVecTensorImpl(tensor) + +} diff --git a/mleap-xgboost-runtime/src/main/scala/ml/combust/mleap/xgboost/FVecTensorImpl.scala b/mleap-xgboost-runtime/src/main/scala/ml/combust/mleap/xgboost/runtime/struct/FVecTensorImpl.scala similarity index 91% rename from mleap-xgboost-runtime/src/main/scala/ml/combust/mleap/xgboost/FVecTensorImpl.scala rename to mleap-xgboost-runtime/src/main/scala/ml/combust/mleap/xgboost/runtime/struct/FVecTensorImpl.scala index fd44a6ee9..624a1f766 100644 --- a/mleap-xgboost-runtime/src/main/scala/ml/combust/mleap/xgboost/FVecTensorImpl.scala +++ b/mleap-xgboost-runtime/src/main/scala/ml/combust/mleap/xgboost/runtime/struct/FVecTensorImpl.scala @@ -1,8 +1,9 @@ -package ml.combust.mleap.xgboost +package ml.combust.mleap.xgboost.runtime.struct import biz.k11i.xgboost.util.FVec import ml.combust.mleap.tensor.Tensor + /** * Created by hollinwilkins on 9/16/17. */ @@ -13,4 +14,3 @@ case class FVecTensorImpl(tensor: Tensor[Double]) extends FVec { // https://github.com/komiya-atsushi/xgboost-predictor-java/issues/21 override def fvalue(index: Int): Double = tensor.get(index).getOrElse(Double.NaN).toFloat } - diff --git a/mleap-xgboost-runtime/src/test/scala/ml/combust/mleap/xgboost/runtime/XGBoostClassificationModelParitySpec.scala b/mleap-xgboost-runtime/src/test/scala/ml/combust/mleap/xgboost/runtime/XGBoostClassificationModelParitySpec.scala index 2b0b743ae..29531865b 100644 --- a/mleap-xgboost-runtime/src/test/scala/ml/combust/mleap/xgboost/runtime/XGBoostClassificationModelParitySpec.scala +++ b/mleap-xgboost-runtime/src/test/scala/ml/combust/mleap/xgboost/runtime/XGBoostClassificationModelParitySpec.scala @@ -3,7 +3,7 @@ package ml.combust.mleap.xgboost.runtime import ml.combust.mleap.core.types.{BasicType, NodeShape, ScalarType, StructField, TensorType} import ml.combust.mleap.runtime.frame.{DefaultLeapFrame, Transformer} import ml.combust.mleap.tensor.SparseTensor -import ml.combust.mleap.xgboost.runtime.testing.{BoosterUtils, BundleSerializationUtils, CachedDatasetUtils, FloatingPointApproximations} +import ml.combust.mleap.xgboost.runtime.testing.{BoosterUtils, BundleSerializationUtils, CachedDatasetUtils, ClassifierUtils, FloatingPointApproximations} import ml.dmlc.xgboost4j.scala.Booster import org.apache.spark.ml.linalg.Vectors import org.scalatest.FunSpec @@ -13,38 +13,10 @@ import XgbConverters._ class XGBoostClassificationModelParitySpec extends FunSpec with BoosterUtils with CachedDatasetUtils + with ClassifierUtils with BundleSerializationUtils with FloatingPointApproximations { - def createBoosterClassifier: Transformer = { - - val booster: Booster = trainBooster(binomialDataset) - - XGBoostClassification( - "xgboostSingleThread", - NodeShape.probabilisticClassifier( - rawPredictionCol = Some("raw_prediction"), - probabilityCol = Some("probability")), - XGBoostClassificationModel( - XGBoostBinaryClassificationModel(booster, numFeatures(leapFrameLibSVMtrain), 0)) - ) - } - - def createMultinomialBoosterClassifier: Transformer ={ - - val booster: Booster = trainMultinomialBooster(multinomialDataset) - - XGBoostClassification( - "xgboostSingleThread", - NodeShape.probabilisticClassifier( - rawPredictionCol = Some("raw_prediction"), - probabilityCol = Some("probability")), - XGBoostClassificationModel( - XGBoostMultinomialClassificationModel( - booster, xgboostMultinomialParams("num_class").asInstanceOf[Int], numFeatures(leapFrameIrisTrain), 0)) - ) - } - def equalityTestRowByRow( booster: Booster, mleapTransformer: Transformer, @@ -130,13 +102,13 @@ class XGBoostClassificationModelParitySpec extends FunSpec it("Results between the XGBoost4j booster and the MLeap Transformer are the same") { val booster = trainBooster(binomialDataset) - val xgboostTransformer = createBoosterClassifier + val xgboostTransformer = trainXGBoost4jClassifier equalityTestRowByRow(booster, xgboostTransformer, leapFrameLibSVMtrain) } it("has the correct inputs and outputs with columns: prediction, probability and raw_prediction") { - val transformer = createBoosterClassifier + val transformer = trainXGBoost4jClassifier val numFeatures = transformer.asInstanceOf[XGBoostClassification].model.numFeatures assert(transformer.schema.fields == @@ -147,7 +119,7 @@ class XGBoostClassificationModelParitySpec extends FunSpec } it("Results are the same pre and post serialization") { - val xgboostTransformer = createBoosterClassifier + val xgboostTransformer = trainXGBoost4jClassifier val mleapBundle = serializeModelToMleapBundle(xgboostTransformer) val deserializedTransformer: Transformer = loadMleapTransformerFromBundle(mleapBundle) @@ -160,13 +132,13 @@ class XGBoostClassificationModelParitySpec extends FunSpec it("Results between the XGBoost4j multinomial booster and the MLeap XGBoostMultinomialClassificationModel are the same") { val multiBooster = trainMultinomialBooster(multinomialDataset) - val xgboostTransformer = createMultinomialBoosterClassifier + val xgboostTransformer = trainMultinomialXGBoost4jClassifier equalityTestRowByRowMultinomial(multiBooster, xgboostTransformer, leapFrameIrisTrain) } it("XGBoostMultinomialClassificationModel results are the same pre and post serialization") { - val xgboostTransformer = createMultinomialBoosterClassifier + val xgboostTransformer = trainMultinomialXGBoost4jClassifier val mleapBundle = serializeModelToMleapBundle(xgboostTransformer) val deserializedTransformer: Transformer = loadMleapTransformerFromBundle(mleapBundle) @@ -178,7 +150,7 @@ class XGBoostClassificationModelParitySpec extends FunSpec } it("Results pre and post serialization are the same when using a dense dataset") { - val xgboostTransformer = createBoosterClassifier + val xgboostTransformer = trainXGBoost4jClassifier val mleapBundle = serializeModelToMleapBundle(xgboostTransformer) val deserializedTransformer: Transformer = loadMleapTransformerFromBundle(mleapBundle) diff --git a/mleap-xgboost-runtime/src/test/scala/ml/combust/mleap/xgboost/runtime/XGBoostPredictorClassificationModelParitySpec.scala b/mleap-xgboost-runtime/src/test/scala/ml/combust/mleap/xgboost/runtime/XGBoostPredictorClassificationModelParitySpec.scala new file mode 100644 index 000000000..294214939 --- /dev/null +++ b/mleap-xgboost-runtime/src/test/scala/ml/combust/mleap/xgboost/runtime/XGBoostPredictorClassificationModelParitySpec.scala @@ -0,0 +1,146 @@ +package ml.combust.mleap.xgboost.runtime + +import biz.k11i.xgboost.Predictor +import ml.combust.mleap.core.types._ +import ml.combust.mleap.runtime.frame.{DefaultLeapFrame, Transformer} +import ml.combust.mleap.tensor.SparseTensor +import ml.combust.mleap.xgboost.runtime.testing.{BoosterUtils, BundleSerializationUtils, CachedDatasetUtils, ClassifierUtils, FloatingPointApproximations} +import ml.dmlc.xgboost4j.scala.Booster +import org.apache.spark.ml.linalg.Vectors +import org.scalatest.FunSpec +import XgbConverters._ + + +class XGBoostPredictorClassificationModelParitySpec extends FunSpec + with BoosterUtils + with CachedDatasetUtils + with ClassifierUtils + with BundleSerializationUtils + with FloatingPointApproximations { + + def equalityTestRowByRowMultinomialProbability( + booster: Booster, + mleapTransformer: Transformer, + leapFrameDataset: DefaultLeapFrame) = { + + + val featuresColumnIndex = leapFrameDataset.schema.indexOf("features").get + + leapFrameDataset.dataset.foreach { + r => + val mleapResult = mleapTransformer.transform(DefaultLeapFrame(leapFrameDataset.schema, Seq(r))).get + val mleapProbabilityColIndex = mleapResult.schema.indexOf("probability").get + + val singleRowDMatrix = r(featuresColumnIndex).asInstanceOf[SparseTensor[Double]].asXGB + val boosterResult = booster.predict(singleRowDMatrix, false, 0).head + val boosterProbability = Vectors.dense(boosterResult.map(_.toDouble)).toDense + + assert( + almostEqualSequences( + Seq(boosterProbability.values), + Seq(mleapResult.dataset.head.getTensor[Double](mleapProbabilityColIndex).toArray) + ) + ) + } + } + + /** + * A Predictor only provides a probability column for performance reasons + */ + def probabilityColumnEqualityTest(mleapFrame1: DefaultLeapFrame, mleapFrame2: DefaultLeapFrame) = { + val probabilityFrame1 = mleapFrame1.select("probability").get + val probabilityFrame2 = mleapFrame2.select("probability").get + + val probabilityIndex = probabilityFrame1.schema.indexOf("probability").get + + probabilityFrame1.dataset zip probabilityFrame2.dataset foreach { + case (row1, row2) => { + assert( + almostEqualSequences( + Seq(row1.getTensor[Double](probabilityIndex).toArray), + Seq(row2.getTensor[Double](probabilityIndex).toArray))) + } + } + } + + it("We can deserialize an xgboost object into a Predictor by changing the MLeapOp") { + val xgboost4jTransformer = trainXGBoost4jClassifier + + val mleapBundle = serializeModelToMleapBundle(xgboost4jTransformer) + val deserializedPredictor: XGBoostPredictorClassification = loadXGBoostPredictorFromBundle(mleapBundle) + .asInstanceOf[XGBoostPredictorClassification] + + assert(deserializedPredictor.model.impl.predictor.isInstanceOf[Predictor]) + } + + it("A pre-serialization XGBoost4j model has the same results of a deserialized Predictor model") { + val xgboost4jTransformer = trainXGBoost4jClassifier + + val mleapBundle = serializeModelToMleapBundle(xgboost4jTransformer) + val deserializedPredictor: Transformer = loadXGBoostPredictorFromBundle(mleapBundle) + + val preSerializationXGBoost4jResult = xgboost4jTransformer.transform(leapFrameLibSVMtrain).get + val predictorModelResult = deserializedPredictor.transform(leapFrameLibSVMtrain).get + + probabilityColumnEqualityTest(preSerializationXGBoost4jResult, predictorModelResult) + } + + it("A deserialized XGBoost4j has the same results of a deserialized Predictor"){ + val xgboost4jTransformer = trainXGBoost4jClassifier + + val mleapBundle = serializeModelToMleapBundle(xgboost4jTransformer) + + val deserializedXGBoost4jTransformer: Transformer = loadMleapTransformerFromBundle(mleapBundle) + val deserializedXGBoost4jResult = deserializedXGBoost4jTransformer.transform(leapFrameLibSVMtrain).get + + val deserializedPredictorTransformer: Transformer = loadXGBoostPredictorFromBundle(mleapBundle) + val deserializedPredictorResult = deserializedPredictorTransformer.transform(leapFrameLibSVMtrain).get + + probabilityColumnEqualityTest(deserializedPredictorResult, deserializedXGBoost4jResult) + } + + it("Predictor has the correct inputs and an output probability column") { + val transformer = trainXGBoost4jClassifier + val mleapBundle = serializeModelToMleapBundle(transformer) + + val deserializedPredictorTransformer: Transformer = loadXGBoostPredictorFromBundle(mleapBundle) + val numFeatures = deserializedPredictorTransformer.asInstanceOf[XGBoostPredictorClassification].model.numFeatures + + assert(deserializedPredictorTransformer.schema.fields == + Seq(StructField("features", TensorType(BasicType.Double, Seq(numFeatures))), + StructField("probability", TensorType(BasicType.Double, Seq(2))))) + } + + it("Results between the XGBoost4j multinomial booster and the XGBoostPredictorMultinomialClassificationModel are the same") { + val multiBooster = trainMultinomialBooster(multinomialDataset) + val xgboostTransformer = trainMultinomialXGBoost4jClassifier + + equalityTestRowByRowMultinomialProbability(multiBooster, xgboostTransformer, leapFrameIrisTrain) + } + + it("XGBoostPredictorMultinomialClassificationModel results are the same pre and post serialization") { + val xgboost4jTransformer = trainMultinomialXGBoost4jClassifier + + val mleapBundle = serializeModelToMleapBundle(xgboost4jTransformer) + val predictorTransformer: Transformer = loadXGBoostPredictorFromBundle(mleapBundle) + + val xgboost4jResult = xgboost4jTransformer.transform(leapFrameIrisTrain).get + val predictorResult = predictorTransformer.transform(leapFrameIrisTrain).get + + probabilityColumnEqualityTest(xgboost4jResult, predictorResult) + } + + it("XGBoost4j and Predictor results are the same when using a dense dataset") { + val xgboost4jTransformer = trainMultinomialXGBoost4jClassifier + + val mleapBundle = serializeModelToMleapBundle(xgboost4jTransformer) + val predictorTransformer: Transformer = loadXGBoostPredictorFromBundle(mleapBundle) + + val denseLeapFrame = toDenseFeaturesLeapFrame(leapFrameLibSVMtrain) + + val xgboost4jResult = xgboost4jTransformer.transform(denseLeapFrame).get + val predictorResult = predictorTransformer.transform(denseLeapFrame).get + + probabilityColumnEqualityTest(xgboost4jResult, predictorResult) + } +} diff --git a/mleap-xgboost-runtime/src/test/scala/ml/combust/mleap/xgboost/runtime/XGBoostRegressionModelParitySpec.scala b/mleap-xgboost-runtime/src/test/scala/ml/combust/mleap/xgboost/runtime/XGBoostRegressionModelParitySpec.scala index 38c2424e6..72641b2f3 100644 --- a/mleap-xgboost-runtime/src/test/scala/ml/combust/mleap/xgboost/runtime/XGBoostRegressionModelParitySpec.scala +++ b/mleap-xgboost-runtime/src/test/scala/ml/combust/mleap/xgboost/runtime/XGBoostRegressionModelParitySpec.scala @@ -15,7 +15,7 @@ class XGBoostRegressionModelParitySpec extends FunSpec with BundleSerializationUtils with FloatingPointApproximations { - def trainRegressor: Transformer ={ + def trainRegressor: Transformer = { val booster: Booster = trainBooster(binomialDataset) diff --git a/mleap-xgboost-runtime/src/test/scala/ml/combust/mleap/xgboost/runtime/testing/BundleSerializationUtils.scala b/mleap-xgboost-runtime/src/test/scala/ml/combust/mleap/xgboost/runtime/testing/BundleSerializationUtils.scala index 4000fab78..2c079414e 100644 --- a/mleap-xgboost-runtime/src/test/scala/ml/combust/mleap/xgboost/runtime/testing/BundleSerializationUtils.scala +++ b/mleap-xgboost-runtime/src/test/scala/ml/combust/mleap/xgboost/runtime/testing/BundleSerializationUtils.scala @@ -6,8 +6,10 @@ import ml.combust.bundle.BundleFile import ml.combust.bundle.serializer.SerializationFormat import ml.combust.mleap.runtime.{MleapContext, frame} import ml.combust.mleap.runtime.frame.Transformer +import ml.combust.mleap.xgboost.runtime.bundle.ops.{XGBoostClassificationOp, XGBoostPredictorClassificationOp} import resource.managed + trait BundleSerializationUtils { def serializeModelToMleapBundle(transformer: Transformer): File = { @@ -32,4 +34,15 @@ trait BundleSerializationUtils { bf.loadMleapBundle().get.root }).tried.get } -} \ No newline at end of file + + def loadXGBoostPredictorFromBundle(bundleFile: File) + (implicit context: MleapContext): frame.Transformer = { + + // Register a different Op to change the deserialization class between tests. + // Use to deserialize with Predictor rather than xgboost4j + context.bundleRegistry.register(new XGBoostPredictorClassificationOp()) + val transformer = loadMleapTransformerFromBundle(bundleFile) + context.bundleRegistry.register(new XGBoostClassificationOp()) // revert to the original Op + transformer + } +} diff --git a/mleap-xgboost-runtime/src/test/scala/ml/combust/mleap/xgboost/runtime/testing/CachedDatasetUtils.scala b/mleap-xgboost-runtime/src/test/scala/ml/combust/mleap/xgboost/runtime/testing/CachedDatasetUtils.scala index 3efa67c0f..b6c8fd2b8 100644 --- a/mleap-xgboost-runtime/src/test/scala/ml/combust/mleap/xgboost/runtime/testing/CachedDatasetUtils.scala +++ b/mleap-xgboost-runtime/src/test/scala/ml/combust/mleap/xgboost/runtime/testing/CachedDatasetUtils.scala @@ -1,6 +1,6 @@ package ml.combust.mleap.xgboost.runtime.testing -import ml.combust.mleap.core.types.{StructType, TensorType} +import ml.combust.mleap.core.types.TensorType import ml.combust.mleap.core.util.VectorConverters import ml.combust.mleap.runtime.frame.{ArrayRow, DefaultLeapFrame, Row} import ml.dmlc.xgboost4j.scala.DMatrix diff --git a/mleap-xgboost-runtime/src/test/scala/ml/combust/mleap/xgboost/runtime/testing/ClassifierUtils.scala b/mleap-xgboost-runtime/src/test/scala/ml/combust/mleap/xgboost/runtime/testing/ClassifierUtils.scala new file mode 100644 index 000000000..6b636e4e9 --- /dev/null +++ b/mleap-xgboost-runtime/src/test/scala/ml/combust/mleap/xgboost/runtime/testing/ClassifierUtils.scala @@ -0,0 +1,39 @@ +package ml.combust.mleap.xgboost.runtime.testing + +import ml.combust.mleap.core.types.NodeShape +import ml.combust.mleap.runtime.frame.Transformer +import ml.combust.mleap.xgboost.runtime.{XGBoostBinaryClassificationModel, XGBoostClassification, XGBoostClassificationModel, XGBoostMultinomialClassificationModel} +import ml.dmlc.xgboost4j.scala.Booster + + +trait ClassifierUtils extends BoosterUtils with CachedDatasetUtils { + + def trainXGBoost4jClassifier: Transformer = { + + val booster: Booster = trainBooster(binomialDataset) + + XGBoostClassification( + "xgboostSingleThread", + NodeShape.probabilisticClassifier( + rawPredictionCol = Some("raw_prediction"), + probabilityCol = Some("probability")), + XGBoostClassificationModel( + XGBoostBinaryClassificationModel(booster, numFeatures(leapFrameLibSVMtrain), 0)) + ) + } + + def trainMultinomialXGBoost4jClassifier: Transformer ={ + + val booster: Booster = trainMultinomialBooster(multinomialDataset) + + XGBoostClassification( + "xgboostSingleThread", + NodeShape.probabilisticClassifier( + rawPredictionCol = Some("raw_prediction"), + probabilityCol = Some("probability")), + XGBoostClassificationModel( + XGBoostMultinomialClassificationModel( + booster, xgboostMultinomialParams("num_class").asInstanceOf[Int], numFeatures(leapFrameIrisTrain), 0)) + ) + } +} diff --git a/project/Dependencies.scala b/project/Dependencies.scala index 7b5bf0ebd..39b02d11b 100644 --- a/project/Dependencies.scala +++ b/project/Dependencies.scala @@ -117,8 +117,7 @@ object Dependencies { val tensorflow = l ++= tensorflowDeps ++ Seq(Test.scalaTest) -// val xgboostRuntime = l ++= Seq(xgboostDep) ++ Test.spark ++ Seq(Test.scalaTest) - val xgboostRuntime = l ++= Seq(xgboostDep) ++ Seq(xgboostPredictorDep) ++ Test.spark ++ Seq(Test.scalaTest) + val xgboostRuntime = l ++= Seq(xgboostDep) ++ Seq(xgboostPredictorDep) ++ Test.spark ++ Seq(Test.scalaTest) val xgboostSpark = l ++= Seq(xgboostSparkDep) ++ Provided.spark diff --git a/project/MleapProject.scala b/project/MleapProject.scala index bad3d4888..084e2dc36 100644 --- a/project/MleapProject.scala +++ b/project/MleapProject.scala @@ -16,7 +16,6 @@ object MleapProject { sparkBase, sparkTestkit, spark, - xgboostRuntime, sparkExtension, executor, executorTestKit, diff --git a/project/plugins.sbt b/project/plugins.sbt index dd9d6acde..b7d453895 100644 --- a/project/plugins.sbt +++ b/project/plugins.sbt @@ -9,7 +9,4 @@ addSbtPlugin("com.eed3si9n" % "sbt-buildinfo" % "0.6.1") addSbtPlugin("com.typesafe.sbt" % "sbt-git" % "0.9.3") addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.7") -addSbtPlugin("com.eed3si9n" % "sbt-unique-version" % "latest.integration") // or "0.1.0" - - libraryDependencies += "com.thesamet.scalapb" %% "compilerplugin" % "0.7.1"