Skip to content

Commit

Permalink
Added a new optional Op for unloading xgboost models as Predictor obj…
Browse files Browse the repository at this point in the history
…ects
  • Loading branch information
lucagiovagnoli committed Feb 24, 2020
1 parent 600682a commit 8474e19
Show file tree
Hide file tree
Showing 19 changed files with 310 additions and 155 deletions.
Binary file not shown.
1 change: 0 additions & 1 deletion mleap-xgboost-runtime/src/main/resources/reference.conf
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
ml.combust.mleap.xgboost.ops = [
// "ml.combust.mleap.xgboost.runtime.bundle.ops.XGBoostPerformantClassificationOp",
"ml.combust.mleap.xgboost.runtime.bundle.ops.XGBoostClassificationOp",
"ml.combust.mleap.xgboost.runtime.bundle.ops.XGBoostRegressionOp"
]
Expand Down

This file was deleted.

Original file line number Diff line number Diff line change
@@ -1,30 +1,28 @@
package ml.combust.mleap.xgboost.runtime

import biz.k11i.xgboost.util.FVec
import ml.combust.mleap.core.types.NodeShape
import ml.combust.mleap.runtime.function.UserDefinedFunction
import ml.combust.mleap.tensor.Tensor
import ml.combust.mleap.runtime.frame.{MultiTransformer, Row, Transformer}
import org.apache.spark.ml.linalg.{Vector, Vectors}
import biz.k11i.xgboost.util.FVec
import ml.combust.mleap.core.util.VectorConverters._
import XgbConverters._


case class XGBoostPerformantClassification(
case class XGBoostPredictorClassification(
override val uid: String = Transformer.uniqueName("xgboost.classification"),
override val shape: NodeShape,
override val model: XGBoostPerformantClassificationModel) extends MultiTransformer {
override val model: XGBoostPredictorClassificationModel) extends MultiTransformer {
override val exec: UserDefinedFunction = {
val rawPrediction = shape.getOutput("raw_prediction").map {
_ => (data: FVec) => Some(model.predictRaw(data): Vector)
}.getOrElse((_: FVec) => None)

val prediction = (data: FVec) => Some(model.predict(data))

val all = Seq(prediction)
// Since the Predictor is our performant implementation, we only compute probability for performance reasons.
val probability = shape.getOutput("probability").map {
_ => (data: FVec) => Some(model.predictProbabilities(data): Tensor[Double])
}.getOrElse((_: FVec) => None)

val f = (features: Tensor[Double]) => {
val data: FVec = features.asXGBPredictor
val rowData = all.map(_.apply(data)).filter(_.isDefined).map(_.get)
val rowData = Seq(probability(data).get)
Row(rowData: _*)
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,11 @@ import biz.k11i.xgboost.Predictor
import biz.k11i.xgboost.util.FVec
import ml.combust.mleap.core.classification.ProbabilisticClassificationModel
import org.apache.spark.ml.linalg.{Vector, Vectors}
import ml.combust.mleap.core.types.{StructType, TensorType}
import XgbConverters._
import ml.combust.mleap.core.types.{ListType, ScalarType, StructType, TensorType}


trait XGBoostPerformantClassificationModelBase extends ProbabilisticClassificationModel {
trait XGBoostPredictorClassificationModelBase extends ProbabilisticClassificationModel {
def predictor: Predictor
def treeLimit: Int

Expand All @@ -23,13 +23,12 @@ trait XGBoostPerformantClassificationModelBase extends ProbabilisticClassificati

def predictLeaf(features: Vector): Seq[Double] = predictLeaf(features.asXGBPredictor)
def predictLeaf(data: FVec): Seq[Double] = predictor.predictLeaf(data, treeLimit).map(_.toDouble)

}

case class XGBoostPerformantBinaryClassificationModel(
case class XGBoostPredictorBinaryClassificationModel(
override val predictor: Predictor,
override val numFeatures: Int,
override val treeLimit: Int) extends XGBoostPerformantClassificationModelBase {
override val treeLimit: Int) extends XGBoostPredictorClassificationModelBase {

override val numClasses: Int = 2

Expand All @@ -51,15 +50,38 @@ case class XGBoostPerformantBinaryClassificationModel(
}
}

case class XGBoostPerformantClassificationModel(impl: XGBoostPerformantClassificationModelBase) extends ProbabilisticClassificationModel {
case class XGBoostPredictorMultinomialClassificationModel(
override val predictor: Predictor,
override val numClasses: Int,
override val numFeatures: Int,
override val treeLimit: Int) extends XGBoostPredictorClassificationModelBase {

override def predict(data: FVec): Double = {
probabilityToPrediction(predictProbabilities(data))
}

def predictProbabilities(data: FVec): Vector = {
Vectors.dense(predictor.predict(data, false, treeLimit))
}

def predictRaw(data: FVec): Vector = {
Vectors.dense(predictor.predict(data, true, treeLimit))
}

override def rawToProbabilityInPlace(raw: Vector): Vector = {
throw new Exception("XGBoost Classification model does not support \'rawToProbabilityInPlace\'")
}
}

case class XGBoostPredictorClassificationModel(impl: XGBoostPredictorClassificationModelBase) extends ProbabilisticClassificationModel {
override val numClasses: Int = impl.numClasses
override val numFeatures: Int = impl.numFeatures
def treeLimit: Int = impl.treeLimit

def predictor: Predictor = impl.predictor

def binaryClassificationModel: XGBoostPerformantBinaryClassificationModel = impl.asInstanceOf[XGBoostPerformantBinaryClassificationModel]
// def multinomialClassificationModel: XGBoostMultinomialClassificationModel = impl.asInstanceOf[XGBoostMultinomialClassificationModel]
def binaryClassificationModel: XGBoostPredictorBinaryClassificationModel = impl.asInstanceOf[XGBoostPredictorBinaryClassificationModel]
def multinomialClassificationModel: XGBoostPredictorMultinomialClassificationModel = impl.asInstanceOf[XGBoostPredictorMultinomialClassificationModel]

def predict(data: FVec): Double = impl.predict(data)

Expand All @@ -75,10 +97,6 @@ case class XGBoostPerformantClassificationModel(impl: XGBoostPerformantClassific
override def rawToProbabilityInPlace(raw: Vector): Vector = impl.rawToProbabilityInPlace(raw)

override def outputSchema: StructType = StructType(
"raw_prediction" -> TensorType.Double(numClasses),
// "probability" -> TensorType.Double(numClasses),
"prediction" -> ScalarType.Double.nonNullable
// "leaf_prediction" -> ListType.Double,
// "contrib_prediction" -> ListType.Double).get
"probability" -> TensorType.Double(numClasses)
).get
}
Original file line number Diff line number Diff line change
Expand Up @@ -2,57 +2,46 @@ package ml.combust.mleap.xgboost.runtime

import biz.k11i.xgboost.util.FVec
import ml.combust.mleap.tensor.{DenseTensor, SparseTensor, Tensor}
import ml.combust.mleap.xgboost.FVecTensorImpl
import ml.combust.mleap.xgboost.runtime.struct.{FVecFactory, FVecTensorImpl}
import ml.dmlc.xgboost4j.LabeledPoint
import ml.dmlc.xgboost4j.scala.DMatrix
import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector}


trait XgbConverters {
implicit class VectorOps(vector: Vector) {
def asXGB: DMatrix = {
vector match {
case SparseVector(_, indices, values) =>{
case SparseVector(_, indices, values) =>
new DMatrix(Iterator(new LabeledPoint(0.0f, indices, values.map(_.toFloat))))
}

case DenseVector(values) =>
new DMatrix(Iterator(new LabeledPoint(0.0f, null, values.map(_.toFloat))))
}
}

def asXGBPredictor: FVec = {
vector match {
// case SparseVector(_, indices, values) => {
// FVec.Transformer.fromMap(
// }
case DenseVector(values) => {
FVec.Transformer.fromArray(values.map(_.toFloat), false)
}
case sparseVector: SparseVector =>
FVecFactory.fromSparseVector(sparseVector)
case denseVector: DenseVector =>
FVecFactory.fromDenseVector(denseVector)
}
}
}

implicit class DoubleTensorOps(tensor: Tensor[Double]) {
def asXGB: DMatrix = {
tensor match {
case SparseTensor(indices, values, _) =>{
case SparseTensor(indices, values, _) =>
new DMatrix(Iterator(new LabeledPoint(0.0f, indices.map(_.head).toArray, values.map(_.toFloat))))
}
case DenseTensor(_, _) =>{

case DenseTensor(_, _) =>
new DMatrix(Iterator(new LabeledPoint(0.0f, null, tensor.toDense.rawValues.map(_.toFloat))))
}
}
}

def asXGBPredictor: FVec = {
tensor match {
case sparseTensor: SparseTensor[Double] => {
FVecTensorImpl(sparseTensor)
}
case DenseTensor(_, _) => {
FVec.Transformer.fromArray(tensor.toArray.map(_.toFloat), false)
}
}
}
def asXGBPredictor: FVec = FVecTensorImpl(tensor)
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ import ml.combust.bundle.dsl.{Model, Value}
import ml.combust.bundle.op.OpModel
import ml.combust.mleap.bundle.ops.MleapOp
import ml.combust.mleap.runtime.MleapContext
import ml.combust.mleap.xgboost.runtime.{XGBoostBinaryClassificationModel, XGBoostClassification, XGBoostClassificationModel, XGBoostMultinomialClassificationModel, XGBoostPerformantBinaryClassificationModel, XGBoostPerformantClassification, XGBoostPerformantClassificationModel}
import ml.combust.mleap.xgboost.runtime.{XGBoostBinaryClassificationModel, XGBoostClassification, XGBoostClassificationModel, XGBoostMultinomialClassificationModel, XGBoostPredictorBinaryClassificationModel, XGBoostPredictorClassification, XGBoostPredictorClassificationModel}
import ml.dmlc.xgboost4j.scala.{Booster, XGBoost}


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,32 +14,36 @@ import ml.combust.mleap.xgboost.runtime._
/**
* Created by hollinwilkins on 9/16/17.
*/
class XGBoostPerformantClassificationOp extends MleapOp[XGBoostPerformantClassification, XGBoostPerformantClassificationModel] {
class XGBoostPredictorClassificationOp extends MleapOp[XGBoostPredictorClassification, XGBoostPredictorClassificationModel] {

override val Model: OpModel[MleapContext, XGBoostPerformantClassificationModel] = new OpModel[MleapContext, XGBoostPerformantClassificationModel] {
override val klazz: Class[XGBoostPerformantClassificationModel] = classOf[XGBoostPerformantClassificationModel]
override val Model: OpModel[MleapContext, XGBoostPredictorClassificationModel] = new OpModel[MleapContext, XGBoostPredictorClassificationModel] {
override val klazz: Class[XGBoostPredictorClassificationModel] = classOf[XGBoostPredictorClassificationModel]

override def opName: String = "xgboost.classifier"

@throws[RuntimeException]
override def store(model: Model, obj: XGBoostPerformantClassificationModel)
override def store(model: Model, obj: XGBoostPredictorClassificationModel)
(implicit context: BundleContext[MleapContext]): Model =
throw new RuntimeException("The XGBoostPredictor implementation does not support storing the model.")

override def load(model: Model)
(implicit context: BundleContext[MleapContext]): XGBoostPerformantClassificationModel = {
(implicit context: BundleContext[MleapContext]): XGBoostPredictorClassificationModel = {

val predictor = new Predictor(Files.newInputStream(context.file("xgboost.model")))

val numClasses = model.value("num_classes").getInt
val numFeatures = model.value("num_features").getInt
val treeLimit = model.value("tree_limit").getInt

val impl = XGBoostPerformantBinaryClassificationModel(predictor, numFeatures, treeLimit)
val impl = if(numClasses == 2) {
XGBoostPredictorBinaryClassificationModel(predictor, numFeatures, treeLimit)
} else {
XGBoostPredictorMultinomialClassificationModel(predictor, numClasses, numFeatures, treeLimit)
}

XGBoostPerformantClassificationModel(impl)
XGBoostPredictorClassificationModel(impl)
}
}

override def model(node: XGBoostPerformantClassification): XGBoostPerformantClassificationModel = node.model
override def model(node: XGBoostPredictorClassification): XGBoostPredictorClassificationModel = node.model
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
package ml.combust.mleap.xgboost.runtime.struct

import biz.k11i.xgboost.util.FVec
import ml.combust.mleap.tensor.Tensor
import org.apache.spark.ml.linalg.{DenseVector, SparseVector}

import scala.collection.JavaConversions.mapAsJavaMap


object FVecFactory {

private def fromScalaMap(map: Map[Int, Float]): FVec = {
val javaMap = mapAsJavaMap(map).asInstanceOf[java.util.Map[java.lang.Integer, java.lang.Float]]

FVec.Transformer.fromMap(javaMap)
}

def fromSparseVector(sparseVector: SparseVector): FVec = {
// Casting to floats, because doubles result in compounding differences from the c++ implementation
// https://github.com/komiya-atsushi/xgboost-predictor-java/issues/21
val scalaMap = (sparseVector.indices zip sparseVector.values.map(_.toFloat)).toMap

FVecFactory.fromScalaMap(scalaMap)
}

def fromDenseVector(denseVector: DenseVector): FVec = {
// Casting to floats, because doubles result in compounding differences from the c++ implementation
// https://github.com/komiya-atsushi/xgboost-predictor-java/issues/21
FVec.Transformer.fromArray(denseVector.values.map(_.toFloat), false)
}

def fromTensor(tensor: Tensor[Double]): FVec = FVecTensorImpl(tensor)
}
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
package ml.combust.mleap.xgboost
package ml.combust.mleap.xgboost.runtime.struct

import biz.k11i.xgboost.util.FVec
import ml.combust.mleap.tensor.Tensor


/**
* Created by hollinwilkins on 9/16/17.
*/
Expand All @@ -13,4 +14,3 @@ case class FVecTensorImpl(tensor: Tensor[Double]) extends FVec {
// https://github.com/komiya-atsushi/xgboost-predictor-java/issues/21
override def fvalue(index: Int): Double = tensor.get(index).getOrElse(Double.NaN).toFloat
}

0 comments on commit 8474e19

Please sign in to comment.