Added a new optional Op for unloading xgboost models as Predictor obj…

…ects
combust · Feb 24, 2020 · 8474e19 · 8474e19
1 parent 600682a
commit 8474e19
Show file tree

Hide file tree

Showing 19 changed files with 310 additions and 155 deletions.
diff --git a/mleap-xgboost-runtime/src/main/resources/pixar_movie_mleap_bundle.zip b/mleap-xgboost-runtime/src/main/resources/pixar_movie_mleap_bundle.zip
diff --git a/mleap-xgboost-runtime/src/main/resources/reference.conf b/mleap-xgboost-runtime/src/main/resources/reference.conf
@@ -1,5 +1,4 @@
 ml.combust.mleap.xgboost.ops = [
-//  "ml.combust.mleap.xgboost.runtime.bundle.ops.XGBoostPerformantClassificationOp",
   "ml.combust.mleap.xgboost.runtime.bundle.ops.XGBoostClassificationOp",
   "ml.combust.mleap.xgboost.runtime.bundle.ops.XGBoostRegressionOp"
 ]

diff --git a/mleap-xgboost-runtime/src/main/scala/ml/combust/mleap/xgboost/runtime/MyMain.scala b/mleap-xgboost-runtime/src/main/scala/ml/combust/mleap/xgboost/runtime/MyMain.scala
diff --git a/...ime/XGBoostPerformantClassification.scala → ...time/XGBoostPredictorClassification.scala b/...ime/XGBoostPerformantClassification.scala → ...time/XGBoostPredictorClassification.scala
@@ -1,30 +1,28 @@
 package ml.combust.mleap.xgboost.runtime
 
+import biz.k11i.xgboost.util.FVec
 import ml.combust.mleap.core.types.NodeShape
 import ml.combust.mleap.runtime.function.UserDefinedFunction
 import ml.combust.mleap.tensor.Tensor
 import ml.combust.mleap.runtime.frame.{MultiTransformer, Row, Transformer}
-import org.apache.spark.ml.linalg.{Vector, Vectors}
-import biz.k11i.xgboost.util.FVec
+import ml.combust.mleap.core.util.VectorConverters._
 import XgbConverters._
 
 
-case class XGBoostPerformantClassification(
+case class XGBoostPredictorClassification(
                                   override val uid: String = Transformer.uniqueName("xgboost.classification"),
                                  override val shape: NodeShape,
-                                 override val model: XGBoostPerformantClassificationModel) extends MultiTransformer {
+                                 override val model: XGBoostPredictorClassificationModel) extends MultiTransformer {
   override val exec: UserDefinedFunction = {
-    val rawPrediction = shape.getOutput("raw_prediction").map {
-      _ => (data: FVec) => Some(model.predictRaw(data): Vector)
-    }.getOrElse((_: FVec) => None)
 
-    val prediction = (data: FVec) => Some(model.predict(data))
-
-    val all = Seq(prediction)
+    // Since the Predictor is our performant implementation, we only compute probability for performance reasons.
+    val probability = shape.getOutput("probability").map {
+      _ => (data: FVec) => Some(model.predictProbabilities(data): Tensor[Double])
+    }.getOrElse((_: FVec) => None)
 
     val f = (features: Tensor[Double]) => {
       val data: FVec = features.asXGBPredictor
-      val rowData = all.map(_.apply(data)).filter(_.isDefined).map(_.get)
+      val rowData = Seq(probability(data).get)
       Row(rowData: _*)
     }
 

diff --git a/...GBoostPerformantClassificationModel.scala → ...XGBoostPredictorClassificationModel.scala b/...GBoostPerformantClassificationModel.scala → ...XGBoostPredictorClassificationModel.scala
@@ -4,11 +4,11 @@ import biz.k11i.xgboost.Predictor
 import biz.k11i.xgboost.util.FVec
 import ml.combust.mleap.core.classification.ProbabilisticClassificationModel
 import org.apache.spark.ml.linalg.{Vector, Vectors}
+import ml.combust.mleap.core.types.{StructType, TensorType}
 import XgbConverters._
-import ml.combust.mleap.core.types.{ListType, ScalarType, StructType, TensorType}
 
 
-trait XGBoostPerformantClassificationModelBase extends ProbabilisticClassificationModel {
+trait XGBoostPredictorClassificationModelBase extends ProbabilisticClassificationModel {
   def predictor: Predictor
   def treeLimit: Int
 
@@ -23,13 +23,12 @@ trait XGBoostPerformantClassificationModelBase extends ProbabilisticClassificati
 
   def predictLeaf(features: Vector): Seq[Double] = predictLeaf(features.asXGBPredictor)
   def predictLeaf(data: FVec): Seq[Double] = predictor.predictLeaf(data, treeLimit).map(_.toDouble)
-
 }
 
-case class XGBoostPerformantBinaryClassificationModel(
+case class XGBoostPredictorBinaryClassificationModel(
       override val predictor: Predictor,
       override val numFeatures: Int,
-      override val treeLimit: Int) extends XGBoostPerformantClassificationModelBase {
+      override val treeLimit: Int) extends XGBoostPredictorClassificationModelBase {
 
   override val numClasses: Int = 2
 
@@ -51,15 +50,38 @@ case class XGBoostPerformantBinaryClassificationModel(
   }
 }
 
-case class XGBoostPerformantClassificationModel(impl: XGBoostPerformantClassificationModelBase) extends ProbabilisticClassificationModel {
+case class XGBoostPredictorMultinomialClassificationModel(
+                       override val predictor: Predictor,
+                       override val numClasses: Int,
+                       override val numFeatures: Int,
+                       override val treeLimit: Int) extends XGBoostPredictorClassificationModelBase {
+
+  override def predict(data: FVec): Double = {
+    probabilityToPrediction(predictProbabilities(data))
+  }
+
+  def predictProbabilities(data: FVec): Vector = {
+    Vectors.dense(predictor.predict(data,  false,  treeLimit))
+  }
+
+  def predictRaw(data: FVec): Vector = {
+    Vectors.dense(predictor.predict(data, true, treeLimit))
+  }
+
+  override def rawToProbabilityInPlace(raw: Vector): Vector = {
+    throw new Exception("XGBoost Classification model does not support \'rawToProbabilityInPlace\'")
+  }
+}
+
+case class XGBoostPredictorClassificationModel(impl: XGBoostPredictorClassificationModelBase) extends ProbabilisticClassificationModel {
   override val numClasses: Int = impl.numClasses
   override val numFeatures: Int = impl.numFeatures
   def treeLimit: Int = impl.treeLimit
 
   def predictor: Predictor = impl.predictor
 
-  def binaryClassificationModel: XGBoostPerformantBinaryClassificationModel = impl.asInstanceOf[XGBoostPerformantBinaryClassificationModel]
-//  def multinomialClassificationModel: XGBoostMultinomialClassificationModel = impl.asInstanceOf[XGBoostMultinomialClassificationModel]
+  def binaryClassificationModel: XGBoostPredictorBinaryClassificationModel = impl.asInstanceOf[XGBoostPredictorBinaryClassificationModel]
+  def multinomialClassificationModel: XGBoostPredictorMultinomialClassificationModel = impl.asInstanceOf[XGBoostPredictorMultinomialClassificationModel]
 
   def predict(data: FVec): Double = impl.predict(data)
 
@@ -75,10 +97,6 @@ case class XGBoostPerformantClassificationModel(impl: XGBoostPerformantClassific
   override def rawToProbabilityInPlace(raw: Vector): Vector = impl.rawToProbabilityInPlace(raw)
 
   override def outputSchema: StructType = StructType(
-    "raw_prediction" -> TensorType.Double(numClasses),
-//    "probability" -> TensorType.Double(numClasses),
-    "prediction" -> ScalarType.Double.nonNullable
-//    "leaf_prediction" -> ListType.Double,
-//    "contrib_prediction" -> ListType.Double).get
+    "probability" -> TensorType.Double(numClasses)
   ).get
 }
diff --git a/mleap-xgboost-runtime/src/main/scala/ml/combust/mleap/xgboost/runtime/XgbConverters.scala b/mleap-xgboost-runtime/src/main/scala/ml/combust/mleap/xgboost/runtime/XgbConverters.scala
@@ -2,57 +2,46 @@ package ml.combust.mleap.xgboost.runtime
 
 import biz.k11i.xgboost.util.FVec
 import ml.combust.mleap.tensor.{DenseTensor, SparseTensor, Tensor}
-import ml.combust.mleap.xgboost.FVecTensorImpl
+import ml.combust.mleap.xgboost.runtime.struct.{FVecFactory, FVecTensorImpl}
 import ml.dmlc.xgboost4j.LabeledPoint
 import ml.dmlc.xgboost4j.scala.DMatrix
 import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector}
 
+
 trait XgbConverters {
   implicit class VectorOps(vector: Vector) {
     def asXGB: DMatrix = {
       vector match {
-        case SparseVector(_, indices, values) =>{
+        case SparseVector(_, indices, values) =>
           new DMatrix(Iterator(new LabeledPoint(0.0f, indices, values.map(_.toFloat))))
-        }
+
         case DenseVector(values) =>
           new DMatrix(Iterator(new LabeledPoint(0.0f, null, values.map(_.toFloat))))
       }
     }
 
     def asXGBPredictor: FVec = {
       vector match {
-//        case SparseVector(_, indices, values) => {
-//          FVec.Transformer.fromMap(
-//        }
-        case DenseVector(values) => {
-          FVec.Transformer.fromArray(values.map(_.toFloat), false)
-        }
+        case sparseVector: SparseVector =>
+          FVecFactory.fromSparseVector(sparseVector)
+        case denseVector: DenseVector =>
+          FVecFactory.fromDenseVector(denseVector)
       }
     }
   }
 
   implicit class DoubleTensorOps(tensor: Tensor[Double]) {
     def asXGB: DMatrix = {
       tensor match {
-        case SparseTensor(indices, values, _) =>{
+        case SparseTensor(indices, values, _) =>
           new DMatrix(Iterator(new LabeledPoint(0.0f, indices.map(_.head).toArray, values.map(_.toFloat))))
-        }
-        case DenseTensor(_, _) =>{
+
+        case DenseTensor(_, _) =>
           new DMatrix(Iterator(new LabeledPoint(0.0f, null, tensor.toDense.rawValues.map(_.toFloat))))
-        }
       }
     }
 
-    def asXGBPredictor: FVec = {
-      tensor match {
-        case sparseTensor: SparseTensor[Double] => {
-          FVecTensorImpl(sparseTensor)
-        }
-        case DenseTensor(_, _) => {
-          FVec.Transformer.fromArray(tensor.toArray.map(_.toFloat), false)
-        }
-      }
-    }
+    def asXGBPredictor: FVec = FVecTensorImpl(tensor)
   }
 }
 

diff --git a/.../src/main/scala/ml/combust/mleap/xgboost/runtime/bundle/ops/XGBoostClassificationOp.scala b/.../src/main/scala/ml/combust/mleap/xgboost/runtime/bundle/ops/XGBoostClassificationOp.scala
@@ -7,7 +7,7 @@ import ml.combust.bundle.dsl.{Model, Value}
 import ml.combust.bundle.op.OpModel
 import ml.combust.mleap.bundle.ops.MleapOp
 import ml.combust.mleap.runtime.MleapContext
-import ml.combust.mleap.xgboost.runtime.{XGBoostBinaryClassificationModel, XGBoostClassification, XGBoostClassificationModel, XGBoostMultinomialClassificationModel, XGBoostPerformantBinaryClassificationModel, XGBoostPerformantClassification, XGBoostPerformantClassificationModel}
+import ml.combust.mleap.xgboost.runtime.{XGBoostBinaryClassificationModel, XGBoostClassification, XGBoostClassificationModel, XGBoostMultinomialClassificationModel, XGBoostPredictorBinaryClassificationModel, XGBoostPredictorClassification, XGBoostPredictorClassificationModel}
 import ml.dmlc.xgboost4j.scala.{Booster, XGBoost}
 
 

diff --git a/...s/XGBoostPerformantClassificationOp.scala → ...ps/XGBoostPredictorClassificationOp.scala b/...s/XGBoostPerformantClassificationOp.scala → ...ps/XGBoostPredictorClassificationOp.scala
@@ -14,32 +14,36 @@ import ml.combust.mleap.xgboost.runtime._
 /**
   * Created by hollinwilkins on 9/16/17.
   */
-class XGBoostPerformantClassificationOp extends MleapOp[XGBoostPerformantClassification, XGBoostPerformantClassificationModel] {
+class XGBoostPredictorClassificationOp extends MleapOp[XGBoostPredictorClassification, XGBoostPredictorClassificationModel] {
 
-  override val Model: OpModel[MleapContext, XGBoostPerformantClassificationModel] = new OpModel[MleapContext, XGBoostPerformantClassificationModel] {
-  override val klazz: Class[XGBoostPerformantClassificationModel] = classOf[XGBoostPerformantClassificationModel]
+  override val Model: OpModel[MleapContext, XGBoostPredictorClassificationModel] = new OpModel[MleapContext, XGBoostPredictorClassificationModel] {
+  override val klazz: Class[XGBoostPredictorClassificationModel] = classOf[XGBoostPredictorClassificationModel]
 
   override def opName: String = "xgboost.classifier"
 
   @throws[RuntimeException]
-  override def store(model: Model, obj: XGBoostPerformantClassificationModel)
+  override def store(model: Model, obj: XGBoostPredictorClassificationModel)
                     (implicit context: BundleContext[MleapContext]): Model =
     throw new RuntimeException("The XGBoostPredictor implementation does not support storing the model.")
 
   override def load(model: Model)
-                   (implicit context: BundleContext[MleapContext]): XGBoostPerformantClassificationModel = {
+                   (implicit context: BundleContext[MleapContext]): XGBoostPredictorClassificationModel = {
 
     val predictor = new Predictor(Files.newInputStream(context.file("xgboost.model")))
 
     val numClasses = model.value("num_classes").getInt
     val numFeatures = model.value("num_features").getInt
     val treeLimit = model.value("tree_limit").getInt
 
-    val impl = XGBoostPerformantBinaryClassificationModel(predictor, numFeatures, treeLimit)
+    val impl = if(numClasses == 2) {
+      XGBoostPredictorBinaryClassificationModel(predictor, numFeatures, treeLimit)
+    } else {
+      XGBoostPredictorMultinomialClassificationModel(predictor, numClasses, numFeatures, treeLimit)
+    }
 
-    XGBoostPerformantClassificationModel(impl)
+    XGBoostPredictorClassificationModel(impl)
   }
 }
 
-  override def model(node: XGBoostPerformantClassification): XGBoostPerformantClassificationModel = node.model
+  override def model(node: XGBoostPredictorClassification): XGBoostPredictorClassificationModel = node.model
 }
diff --git a/...-xgboost-runtime/src/main/scala/ml/combust/mleap/xgboost/runtime/struct/FVecFactory.scala b/...-xgboost-runtime/src/main/scala/ml/combust/mleap/xgboost/runtime/struct/FVecFactory.scala
@@ -0,0 +1,33 @@
+package ml.combust.mleap.xgboost.runtime.struct
+
+import biz.k11i.xgboost.util.FVec
+import ml.combust.mleap.tensor.Tensor
+import org.apache.spark.ml.linalg.{DenseVector, SparseVector}
+
+import scala.collection.JavaConversions.mapAsJavaMap
+
+
+object FVecFactory {
+
+  private def fromScalaMap(map: Map[Int, Float]): FVec = {
+    val javaMap = mapAsJavaMap(map).asInstanceOf[java.util.Map[java.lang.Integer, java.lang.Float]]
+
+    FVec.Transformer.fromMap(javaMap)
+  }
+
+  def fromSparseVector(sparseVector: SparseVector): FVec = {
+    // Casting to floats, because doubles result in compounding differences from the c++ implementation
+    // https://github.com/komiya-atsushi/xgboost-predictor-java/issues/21
+    val scalaMap = (sparseVector.indices zip sparseVector.values.map(_.toFloat)).toMap
+
+    FVecFactory.fromScalaMap(scalaMap)
+  }
+
+  def fromDenseVector(denseVector: DenseVector): FVec = {
+    // Casting to floats, because doubles result in compounding differences from the c++ implementation
+    // https://github.com/komiya-atsushi/xgboost-predictor-java/issues/21
+    FVec.Transformer.fromArray(denseVector.values.map(_.toFloat), false)
+  }
+
+  def fromTensor(tensor: Tensor[Double]): FVec = FVecTensorImpl(tensor)
+}
diff --git a/...ombust/mleap/xgboost/FVecTensorImpl.scala → ...boost/runtime/struct/FVecTensorImpl.scala b/...ombust/mleap/xgboost/FVecTensorImpl.scala → ...boost/runtime/struct/FVecTensorImpl.scala
@@ -1,8 +1,9 @@
-package ml.combust.mleap.xgboost
+package ml.combust.mleap.xgboost.runtime.struct
 
 import biz.k11i.xgboost.util.FVec
 import ml.combust.mleap.tensor.Tensor
 
+
 /**
   * Created by hollinwilkins on 9/16/17.
   */
@@ -13,4 +14,3 @@ case class FVecTensorImpl(tensor: Tensor[Double]) extends FVec {
   // https://github.com/komiya-atsushi/xgboost-predictor-java/issues/21
   override def fvalue(index: Int): Double = tensor.get(index).getOrElse(Double.NaN).toFloat
 }
-