Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding support for LightGBM ( Classificaction & Regression) #711

Open
wants to merge 11 commits into
base: master
Choose a base branch
from
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
package ml.combust.mleap.core.classification

import com.microsoft.ml.spark.lightgbm.LightGBMBooster
import org.apache.spark.ml.linalg.{Vector, Vectors}

object LightGBMClassifierModel{
def apply(model: String,
labelColName: String,
featuresColName: String,
predictionColName: String,
probColName: String,
rawPredictionColName: String,
actualNumClasses: Int): LightGBMClassifierModel = LightGBMClassifierModel(
model, labelColName, featuresColName, predictionColName, probColName, rawPredictionColName,
actualNumClasses = actualNumClasses)
}

case class LightGBMClassifierModel(
override val booster: LightGBMBooster,
override val labelColName: String,
override val featuresColName: String,
override val predictionColName: String,
override val probColName: String,
override val rawPredictionColName: String,
override val thresholdValues: Option[Seq[Double]],
override val actualNumClasses: Int)
extends ProbabilisticClassificationModel with LightGBMClassifierModelBase with Serializable {
override val numClasses: Int = actualNumClasses

override def rawToProbabilityInPlace(raw: Vector): Vector = {
throw new NotImplementedError("Unexpected error in LightGBMClassificationModel:" +
" raw2probabilityInPlace should not be called!")
}

override def predictRaw(features: Vector): Vector = {
Vectors.dense(booster.score(features, true, true))
}

override def predictProbabilities(features: Vector): Vector = {
Vectors.dense(booster.score(features, false, true))
}

override def predict(features: Vector): Double = {
rawToPrediction(predictRaw(features))
}

override val numFeatures: Int = 0
}

trait LightGBMClassifierModelBase {
def booster: LightGBMBooster
def labelColName: String
def featuresColName: String
def predictionColName: String
def probColName: String
def rawPredictionColName: String
def thresholdValues: Option[Seq[Double]]
def actualNumClasses: Int
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
package ml.combust.mleap.core.regression

import com.microsoft.ml.spark.lightgbm.LightGBMBooster
import ml.combust.mleap.core.Model
import ml.combust.mleap.core.types.{ScalarType, StructType, TensorType}
import org.apache.spark.ml.linalg.{Vector, Vectors}

object LightGBMRegressionModel{
def apply(model: String,
featuresColName: String,
predictionColName: String): LightGBMRegressionModel = LightGBMRegressionModel(
model, featuresColName, predictionColName)
}

case class LightGBMRegressionModel(
override val booster: LightGBMBooster,
override val featuresColName: String,
override val predictionColName: String)
extends LightGBMRegressionModelBase with Model {

override def inputSchema: StructType = StructType("features" -> TensorType.Double()).get

override def outputSchema: StructType = StructType("prediction" -> ScalarType.Double.nonNullable).get

def predict(features: Vector): Double = {
booster.score(features, false, false)(0)
}
}

trait LightGBMRegressionModelBase {
def booster: LightGBMBooster
def featuresColName: String
def predictionColName: String
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
package ml.combust.mleap.bundle.ops.classification

import com.microsoft.ml.spark.lightgbm.LightGBMBooster
import ml.combust.bundle.BundleContext
import ml.combust.bundle.dsl.{Model, Value}
import ml.combust.bundle.op.OpModel
import ml.combust.mleap.bundle.ops.MleapOp
import ml.combust.mleap.core.classification.LightGBMClassifierModel
import ml.combust.mleap.runtime.MleapContext
import ml.combust.mleap.runtime.transformer.classification.LightGBMClassifier

class LightGBMClassificationModelOp extends MleapOp[LightGBMClassifier, LightGBMClassifierModel] {
override val Model: OpModel[MleapContext, LightGBMClassifierModel] =
new OpModel[MleapContext, LightGBMClassifierModel] {
override val klazz: Class[LightGBMClassifierModel] = classOf[LightGBMClassifierModel]

override def opName: String = "lightgbm_classifier"

override def store(model: Model, obj: LightGBMClassifierModel)(
implicit context: BundleContext[MleapContext]): Model = {
model
.withValue("booster", Value.string(obj.booster.model))
.withValue("labelColName", Value.string(obj.labelColName))
.withValue("featuresColName", Value.string(obj.featuresColName))
.withValue("predictionColName", Value.string(obj.predictionColName))
.withValue("probColName", Value.string(obj.probColName))
.withValue(
"rawPredictionColName",
Value.string(obj.rawPredictionColName)
)
.withValue(
"thresholdValues",
obj.thresholds.map(_.toSeq).map(Value.doubleList)
).withValue("actualNumClasses", Value.int(obj.numClasses))
}

override def load(model: Model)(implicit context: BundleContext[MleapContext]): LightGBMClassifierModel =
{
val booster = new LightGBMBooster(model.value("booster").getString)
new LightGBMClassifierModel(
booster,
model.value("labelColName").getString,
model.value("featuresColName").getString,
model.value("predictionColName").getString,
model.value("probColName").getString,
model.value("rawPredictionColName").getString,
model.getValue("thresholdValues").map(_.getDoubleList.toArray),
model.value("actualNumClasses").getInt
)

}
}

override def model(node: LightGBMClassifier): LightGBMClassifierModel = node.model
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
package ml.combust.mleap.bundle.ops.regression

import com.microsoft.ml.spark.lightgbm.LightGBMBooster
import ml.combust.bundle.BundleContext
import ml.combust.bundle.dsl.{Model, Value}
import ml.combust.bundle.op.OpModel
import ml.combust.mleap.bundle.ops.MleapOp
import ml.combust.mleap.core.regression.LightGBMRegressionModel
import ml.combust.mleap.runtime.MleapContext
import ml.combust.mleap.runtime.transformer.regression.LightGBMRegression

class LightGBMRegressionModelOp extends MleapOp[LightGBMRegression, LightGBMRegressionModel] {
override val Model: OpModel[MleapContext, LightGBMRegressionModel] =
new OpModel[MleapContext, LightGBMRegressionModel] {
override val klazz: Class[LightGBMRegressionModel] = classOf[LightGBMRegressionModel]

override def opName: String = "lightgbm_regression"

override def store(model: Model, obj: LightGBMRegressionModel)(
implicit context: BundleContext[MleapContext]): Model = {
model
.withValue("booster", Value.string(obj.booster.model))
.withValue("featuresColName", Value.string(obj.featuresColName))
.withValue("predictionColName", Value.string(obj.predictionColName))
}

override def load(model: Model)(implicit context: BundleContext[MleapContext]): LightGBMRegressionModel =
{
val booster = new LightGBMBooster(model.value("booster").getString)
new LightGBMRegressionModel(
booster,
model.value("featuresColName").getString,
model.value("predictionColName").getString)
}
}

override def model(node: LightGBMRegression): LightGBMRegressionModel = node.model
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
package ml.combust.mleap.runtime.transformer.classification

import ml.combust.mleap.core.annotation.SparkCode
import ml.combust.mleap.core.classification.LightGBMClassifierModel
import ml.combust.mleap.core.types.NodeShape
import ml.combust.mleap.core.util.VectorConverters._
import ml.combust.mleap.runtime.frame.{MultiTransformer, Row, Transformer}
import ml.combust.mleap.runtime.function.UserDefinedFunction
import ml.combust.mleap.tensor.Tensor
import org.apache.spark.ml.linalg.{Vector, Vectors}

@SparkCode(uri = "https://github.com/Azure/mmlspark/blob/f07e5584459e909223a470e6d2e11135b292f3ea/" +
"src/main/scala/com/microsoft/ml/spark/lightgbm/LightGBMClassifier.scala")
case class LightGBMClassifier(override val uid: String = Transformer.uniqueName("lightgbm_classifier"),
override val shape: NodeShape,
override val model: LightGBMClassifierModel) extends MultiTransformer {
override val exec: UserDefinedFunction = {
val f = (features: Tensor[Double]) => {

if (model.thresholdValues.isDefined) {
require(model.thresholdValues.get.length == model.numClasses, this.getClass.getSimpleName +
".transform() called with non-matching numClasses and thresholds.length." +
s" numClasses=$model.numClasses, but thresholds has length ${model.thresholdValues.get.length}")
}

val rawPrediction: Vector =
if (shape.getOutput("raw_prediction").nonEmpty)
model.predictRaw(features)
else
Vectors.dense(Array.empty[Double])

val probability: Vector =
if (shape.getOutput("probability").nonEmpty)
model.predictProbabilities(features)
else
Vectors.dense(Array.empty[Double])

val prediction =
if (shape.getOutput("prediction").isDefined) {
if (shape.getOutput("raw_prediction").nonEmpty && model.thresholdValues.isEmpty) {
// Note: Only call raw2prediction if thresholds not defined
model.rawToPrediction(rawPrediction)
} else if (shape.getOutput("prediction").nonEmpty) {
model.probabilityToPrediction(probability)
} else {
model.predict(features)
}
}
else
Double.NaN

Row(rawPrediction: Tensor[Double], probability: Tensor[Double], prediction)
}
UserDefinedFunction(f, outputSchema, inputSchema)
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
package ml.combust.mleap.runtime.transformer.regression

import ml.combust.mleap.core.regression.LightGBMRegressionModel
import ml.combust.mleap.core.types._
import ml.combust.mleap.core.util.VectorConverters._
import ml.combust.mleap.runtime.frame.{MultiTransformer, Row, Transformer}
import ml.combust.mleap.runtime.function.UserDefinedFunction
import ml.combust.mleap.tensor.Tensor

case class LightGBMRegression(override val uid: String = Transformer.uniqueName("lightgbm_regression"),
override val shape: NodeShape,
override val model: LightGBMRegressionModel) extends MultiTransformer {
override val exec: UserDefinedFunction = {
val f = (features: Tensor[Double]) => {
val prediction = model.predict(features)
Row(prediction: Double)
}
UserDefinedFunction(f, outputSchema, inputSchema)
}
}
2 changes: 2 additions & 0 deletions mleap-spark/src/main/resources/reference.conf
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ ml.combust.mleap.spark.registry.builtin-ops = [
"org.apache.spark.ml.bundle.ops.classification.MultiLayerPerceptronClassifierOp",
"org.apache.spark.ml.bundle.ops.classification.OneVsRestOp",
"org.apache.spark.ml.bundle.ops.classification.RandomForestClassifierOp",
"org.apache.spark.ml.bundle.ops.classification.LightGBMClassificationModelOp",

"org.apache.spark.ml.bundle.ops.clustering.GaussianMixtureOp",
"org.apache.spark.ml.bundle.ops.clustering.KMeansOp",
Expand Down Expand Up @@ -42,6 +43,7 @@ ml.combust.mleap.spark.registry.builtin-ops = [
"org.apache.spark.ml.bundle.ops.regression.IsotonicRegressionOp",
"org.apache.spark.ml.bundle.ops.regression.LinearRegressionOp",
"org.apache.spark.ml.bundle.ops.regression.RandomForestRegressionOp",
"org.apache.spark.ml.bundle.ops.regression.LightGBMRegressionModelOp",

"org.apache.spark.ml.bundle.ops.recommendation.ALSOp",

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
package org.apache.spark.ml.bundle.ops.classification

import com.microsoft.ml.spark.lightgbm.{LightGBMBooster, LightGBMClassificationModel}
import ml.combust.bundle.BundleContext
import ml.combust.bundle.dsl.{Model, NodeShape, Value}
import ml.combust.bundle.op.OpModel
import org.apache.spark.ml.bundle._

class LightGBMClassificationModelOp
extends SimpleSparkOp[LightGBMClassificationModel] {
override def sparkInputs(obj: LightGBMClassificationModel): Seq[ParamSpec] = {
Seq("features" -> obj.featuresCol)
}
override def sparkOutputs(obj: LightGBMClassificationModel): Seq[ParamSpec] = {
Seq(
"raw_prediction" -> obj.rawPredictionCol,
"probability" -> obj.probabilityCol,
"prediction" -> obj.predictionCol
)
}
override def sparkLoad(
uid: String,
shape: NodeShape,
model: LightGBMClassificationModel
): LightGBMClassificationModel = {
val booster = new LightGBMBooster(model.getModel.model)
new LightGBMClassificationModel(
"",
booster,
model.getLabelCol,
model.getFeaturesCol,
model.getPredictionCol,
model.getProbabilityCol,
model.getRawPredictionCol,
Some(model.getThresholds),
model.numClasses
)
}

override val Model: OpModel[SparkBundleContext, LightGBMClassificationModel] =
new OpModel[SparkBundleContext, LightGBMClassificationModel] {
override val klazz: Class[LightGBMClassificationModel] =
classOf[LightGBMClassificationModel]
override def opName: String = "lightgbm_classifier"
override def store(model: Model, obj: LightGBMClassificationModel)(
implicit context: BundleContext[SparkBundleContext]
): Model = {
assert(
context.context.dataset.isDefined,
BundleHelper.sampleDataframeMessage(klazz)
)
val thresholds = if (obj.isSet(obj.thresholds)) {
Some(obj.getThresholds)
} else None

model
.withValue("booster", Value.string(obj.getModel.model))
.withValue("labelColName", Value.string(obj.getLabelCol))
.withValue("featuresColName", Value.string(obj.getFeaturesCol))
.withValue("predictionColName", Value.string(obj.getPredictionCol))
.withValue("probColName", Value.string(obj.getProbabilityCol))
.withValue("rawPredictionColName", Value.string(obj.getRawPredictionCol))
.withValue("thresholdValues", thresholds.map(_.toSeq).map(Value.doubleList))
.withValue("actualNumClasses", Value.int(obj.numClasses))
}

override def load(model: Model)(
implicit context: BundleContext[SparkBundleContext]
): LightGBMClassificationModel = {
val booster = new LightGBMBooster(model.value("booster").getString)
new LightGBMClassificationModel(
"",
booster,
model.value("labelColName").getString,
model.value("featuresColName").getString,
model.value("predictionColName").getString,
model.value("probColName").getString,
model.value("rawPredictionColName").getString,
model.getValue("thresholdValues").map(_.getDoubleList.toArray),
model.value("actualNumClasses").getInt
)
}
}
}
Loading