Skip to content

Commit

Permalink
[SW-2559] Expose Scoring History and Variable Importances on H2OMOJOM…
Browse files Browse the repository at this point in the history
…odel (#2525)

* [SW-2559] Expose Scoring History and Variable Importances on H2OMOJOModel

* Deprecate variable importances on H2ODeepLearning

* Rename variable importances to feature importances

* Extend documentation

* Fix deserialization problem

* spotless apply

* Fix python tests and deprecation on DeepLearningMOJOModel

* fix python tests

* fix R tests

* typo

* fix R tests

* spotless apply

* null handling

(cherry picked from commit 3fab572)

# Conflicts:
#	api-generation/src/main/scala/ai/h2o/sparkling/api/generation/common/AlgorithmConfigurations.scala
  • Loading branch information
mn-mikke committed May 24, 2021
1 parent 068f707 commit 403b90f
Show file tree
Hide file tree
Showing 20 changed files with 337 additions and 24 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -99,13 +99,22 @@ trait AlgorithmConfigurations {

val noDeprecation = Seq.empty

// Deep Learning deprecations: `variableImportances` is superseded by `calculateFeatureImportances`.
val dlDeprecations = Seq(
  DeprecatedField(
    h2oName = "variable_importances",
    implementation = "HasDeprecatedVariableImportances",
    sparkName = "variableImportances",
    version = "3.36",
    replacement = Some("calculateFeatureImportances"),
    mojoImplementation = Some("HasDeprecatedVariableImportancesOnMOJO")))

val algorithmParameters = Seq[(String, Class[_], Class[_], Seq[ExplicitField], Seq[DeprecatedField])](
("H2OXGBoostParams", classOf[XGBParamsV3], classOf[XGBoostParameters], xgboostFields, noDeprecation),
("H2OGBMParams", classOf[GBMV3.GBMParametersV3], classOf[GBMParameters], gbmFields, noDeprecation),
("H2ODRFParams", classOf[DRFV3.DRFParametersV3], classOf[DRFParameters], drfFields, noDeprecation),
("H2OGLMParams", classOf[GLMV3.GLMParametersV3], classOf[GLMParameters], glmFields, glmDeprecations),
("H2OGAMParams", classOf[GAMV3.GAMParametersV3], classOf[GAMParameters], gamFields, noDeprecation),
("H2ODeepLearningParams", classOf[DLParamsV3], classOf[DeepLearningParameters], dlFields, noDeprecation),
("H2ODeepLearningParams", classOf[DLParamsV3], classOf[DeepLearningParameters], dlFields, dlDeprecations),
("H2OKMeansParams", classOf[KMeansParamsV3], classOf[KMeansParameters], kmeansFields, kmeansDeprecations),
("H2OCoxPHParams", classOf[CoxPHParametersV3], classOf[CoxPHParameters], coxPHFields, noDeprecation),
("H2OIsolationForestParams", classOf[IFParamsV3], classOf[IsolationForestParameters], ifFields, noDeprecation))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,8 @@ object ParameterNameConverter {
"colsample_bynode" -> "colSampleByNode",
"rand_family" -> "randomFamily",
"rand_link" -> "randomLink",
"calibration_frame" -> "calibrationDataFrame")
"calibration_frame" -> "calibrationDataFrame",
"variable_importances" -> "calculateFeatureImportances")

val conversionRules: Map[String, String] = Map("Column" -> "Col")

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,13 @@ case class ExplicitField(
sparkName: Option[String] = None,
mojoImplementation: Option[String] = None)

case class DeprecatedField(h2oName: String, implementation: String, sparkName: String, version: String)
/**
 * Describes a deprecated parameter that the API generator still needs to expose.
 *
 * @param h2oName            name of the parameter on the H2O side (e.g. "variable_importances")
 * @param implementation     name of the entity providing the deprecated accessors
 *                           (e.g. "HasDeprecatedVariableImportances"); presumably mixed into the
 *                           generated algorithm - confirm against the templates
 * @param sparkName          name of the parameter in the Sparkling Water API (e.g. "variableImportances")
 * @param version            version quoted in the generated deprecation warning as the removal version
 * @param replacement        optional name of the superseding parameter; when defined, the Python
 *                           algorithm template forwards the deprecated kwarg value to it
 * @param mojoImplementation optional name of an entity added to the parents of the generated MOJO model
 */
case class DeprecatedField(
h2oName: String,
implementation: String,
sparkName: String,
version: String,
replacement: Option[String] = None,
mojoImplementation: Option[String] = None)

object DefaultValueSource extends Enumeration {
type DefaultValueSource = Value
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,13 @@ trait AlgorithmTemplateBase extends PythonEntityTemplate {
.map { param =>
val version = param.version
val name = param.sparkName
s""" if '$name' in kwargs:
val valuePropagation = param.replacement match {
case Some(replacement) =>
s"""\n if '$replacement' not in kwargs:
| kwargs['$replacement'] = kwargs['$name']""".stripMargin
case None => ""
}
s""" if '$name' in kwargs:$valuePropagation
| del kwargs['$name']
| warn("The parameter '$name' is deprecated and will be removed in the version $version.")""".stripMargin
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,8 @@ object MOJOModelTemplate
val entityName = algorithmSubstitutionContext.entityName
val namespace = algorithmSubstitutionContext.namespace
val algorithmType = algorithmSubstitutionContext.algorithmType.replace("Algorithm", "MOJOModelParams")
val explicitFields = parameterSubstitutionContext.explicitFields.flatMap(_.mojoImplementation)
val explicitFields = parameterSubstitutionContext.explicitFields.flatMap(_.mojoImplementation) ++
parameterSubstitutionContext.deprecatedFields.flatMap(_.mojoImplementation)
val parents = Seq(algorithmType) ++ explicitFields

val imports = Seq(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,8 @@ object MOJOModelTemplate
.filter(parameter =>
!IgnoredParameters.ignoredInMOJOs(algorithmSubstitutionContext.entityName).contains(parameter.h2oName))

val explicitFieldImplementations = parameterSubstitutionContext.explicitFields.flatMap(_.mojoImplementation)
val explicitFieldImplementations = parameterSubstitutionContext.explicitFields.flatMap(_.mojoImplementation) ++
parameterSubstitutionContext.deprecatedFields.flatMap(_.mojoImplementation)

val imports = Seq(
"ai.h2o.sparkling.ml.params.ParameterConstructorMethods",
Expand Down
16 changes: 16 additions & 0 deletions booklet/src/sections/productionazing.tex
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,22 @@ \subsubsection{Obtaining Model Category}

The method \texttt{getModelCategory} can be used to get the model category (such as \texttt{binomial}, \texttt{multinomial} etc).

\subsubsection{Obtaining Feature Types}

The method \texttt{getFeatureTypes} returns a map/dictionary from a feature name to a corresponding feature type
[\texttt{enum} (categorical), \texttt{numeric}, \texttt{string}, etc.]. This information helps to understand how individual
columns of the training dataset were treated during the model training.

\subsubsection{Obtaining Feature Importances}

The method \texttt{getFeatureImportances} returns a data frame describing the importance of each feature. The importance is expressed
by several numbers (Relative Importance, Scaled Importance and Percentage).

\subsubsection{Obtaining Scoring History}

The method \texttt{getScoringHistory} returns a data frame describing how the model evolved during the training process
according to certain training and validation metrics.

\subsubsection{Obtaining Training Params}

The method \texttt{getTrainingParams} can be used to get map containing all training parameters used in the H2O. It is a map
Expand Down
14 changes: 14 additions & 0 deletions doc/src/site/sphinx/deployment/load_mojo.rst
Original file line number Diff line number Diff line change
Expand Up @@ -341,6 +341,20 @@ The method ``getFeatureTypes`` returns a map/dictionary from a feature name to a
[``enum`` (categorical), ``numeric``, ``string``, etc.]. This information helps to understand how individual columns of
the training dataset were treated during the model training.

Obtaining Feature Importances
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

The method ``getFeatureImportances`` returns a data frame describing the importance of each feature. The importance is expressed
by several numbers (Relative Importance, Scaled Importance and Percentage). `H2O-3 documentation
<https://h2o-release.s3.amazonaws.com/h2o/rel-SUBST_H2O_RELEASE_NAME/SUBST_H2O_BUILD_NUMBER/docs-website/h2o-docs/variable-importance.html>`__
describes how the numbers are calculated.


Obtaining Scoring History
^^^^^^^^^^^^^^^^^^^^^^^^^

The method ``getScoringHistory`` returns a data frame describing how the model evolved during the training process according to
certain training and validation metrics.

Obtaining Metrics
^^^^^^^^^^^^^^^^^
Expand Down
6 changes: 6 additions & 0 deletions doc/src/site/sphinx/migration_guide.rst
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,12 @@ From 3.34 to 3.36
- The methods ``getWithDetailedPredictionCol`` and ``setWithDetailedPredictionCol`` on all SW Algorithms and
MOJO models were removed without replacement.

- The parameter ``variableImportances`` of ``H2ODeepLearning`` has been replaced with ``calculateFeatureImportances``.
Likewise, the methods ``getVariableImportances`` and ``setVariableImportances`` on ``H2ODeepLearning`` have been replaced
with ``getCalculateFeatureImportances`` and ``setCalculateFeatureImportances``.

- The method ``getVariableImportances`` of ``H2ODeepLearningMOJOModel`` has been replaced with ``getCalculateFeatureImportances``.

From 3.32.1 to 3.34
-------------------

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package ai.h2o.sparkling.ml.params

import ai.h2o.sparkling.macros.DeprecatedMethod
import org.apache.spark.expose.Logging

/**
 * Keeps the deprecated `variableImportances` accessors available by delegating them to the
 * `calculateFeatureImportances` accessors declared below.
 */
trait HasDeprecatedVariableImportances extends Logging {

  // Abstract replacement accessors; the deprecated methods below forward to these.
  def getCalculateFeatureImportances(): Boolean

  def setCalculateFeatureImportances(value: Boolean): this.type

  // Deprecated aliases, each annotated with the name of its replacement and the version "3.36".
  @DeprecatedMethod("getCalculateFeatureImportances", "3.36")
  def getVariableImportances(): Boolean = getCalculateFeatureImportances()

  @DeprecatedMethod("setCalculateFeatureImportances", "3.36")
  def setVariableImportances(value: Boolean): this.type = setCalculateFeatureImportances(value)
}
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,12 @@ def getTrainingParams(self):
def getModelCategory(self):
return self._java_obj.getModelCategory()

def getScoringHistory(self):
    """
    Returns the scoring history reported by the underlying Java model,
    passed through H2OTypeConverters.scalaToPythonDataFrame.
    """
    javaScoringHistory = self._java_obj.getScoringHistory()
    return H2OTypeConverters.scalaToPythonDataFrame(javaScoringHistory)

def getFeatureImportances(self):
    """
    Returns the feature importances reported by the underlying Java model,
    passed through H2OTypeConverters.scalaToPythonDataFrame.
    """
    javaFeatureImportances = self._java_obj.getFeatureImportances()
    return H2OTypeConverters.scalaToPythonDataFrame(javaFeatureImportances)


class HasOffsetCol:

Expand Down
12 changes: 11 additions & 1 deletion py-scoring/src/ai/h2o/sparkling/ml/params/H2OTypeConverters.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
from pyspark.ml.linalg import DenseVector, DenseMatrix
from pyspark.ml.param import TypeConverters
from pyspark.ml.util import _jvm
from pyspark.sql import DataFrame
from pyspark.sql import DataFrame, SparkSession


class H2OTypeConverters(object):
Expand Down Expand Up @@ -500,3 +500,13 @@ def scalaArrayToPythonArray(array):
return [v for v in array]
else:
raise TypeError("Invalid type.")

@staticmethod
def scalaToPythonDataFrame(jdf):
    """
    Wraps a JVM-side data frame reference into a PySpark DataFrame.

    :param jdf: None or a JavaObject referencing the data frame.
    :return: None when jdf is None, otherwise a DataFrame built around jdf.
    :raises TypeError: when jdf is neither None nor a JavaObject.
    """
    if jdf is None:
        return None
    if not isinstance(jdf, JavaObject):
        raise TypeError("Invalid type.")
    # NOTE(review): '_wrapped' is a private attribute of SparkSession - presumably the SQL context
    # expected by the DataFrame constructor; confirm it exists on every supported Spark version.
    sqlContext = SparkSession.builder.getOrCreate()._wrapped
    return DataFrame(jdf, sqlContext)
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from pyspark.ml.param import *
import warnings

class HasDeprecatedVariableImportances(Params):
    """
    Mixin keeping the deprecated 'variableImportances' accessors available.
    Both methods emit a warning and delegate to the 'calculateFeatureImportances' accessors,
    which must be available on the object this mixin is combined with.
    """

    def getVariableImportances(self):
        """
        Deprecated alias of getCalculateFeatureImportances.

        :return: the value returned by getCalculateFeatureImportances()
        """
        # The trailing space keeps the two sentences of the message separated.
        warnings.warn("The method 'getVariableImportances' is deprecated and will be removed in the version 3.36. " +
                      "The replacement is 'getCalculateFeatureImportances'.")
        return self.getCalculateFeatureImportances()

    def setVariableImportances(self, value):
        """
        Deprecated alias of setCalculateFeatureImportances.

        :param value: value forwarded to setCalculateFeatureImportances
        :return: self, to allow call chaining
        """
        # The trailing space keeps the two sentences of the message separated.
        warnings.warn("The method 'setVariableImportances' is deprecated and will be removed in the version 3.36. " +
                      "The replacement is 'setCalculateFeatureImportances'.")
        self.setCalculateFeatureImportances(value)
        return self
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from pyspark.ml.param import *
import warnings

class HasDeprecatedVariableImportancesOnMOJO(Params):
    """
    Mixin keeping the deprecated 'getVariableImportances' getter available.
    The getter emits a warning and delegates to getCalculateFeatureImportances,
    which must be available on the object this mixin is combined with.
    """

    def getVariableImportances(self):
        """
        Deprecated alias of getCalculateFeatureImportances.

        :return: the value returned by getCalculateFeatureImportances()
        """
        # The trailing space keeps the two sentences of the message separated.
        warnings.warn("The method 'getVariableImportances' is deprecated and will be removed in the version 3.36. " +
                      "The replacement is 'getCalculateFeatureImportances'.")
        return self.getCalculateFeatureImportances()
27 changes: 27 additions & 0 deletions py/tests/unit/with_runtime_sparkling/test_mojo.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,33 @@ def testFeatureTypes(gbmModel):
assert len(types) == 9


def testScoringHistory(gbmModel):
    # The scoring history must contain at least one row and at least one column.
    history = gbmModel.getScoringHistory()
    assert history.count() > 0
    assert len(history.columns) > 0


def testFeatureImportances(gbmModel):
    # The 'Variable' column must list exactly the feature columns of the model.
    featureImportancesDF = gbmModel.getFeatureImportances()
    # list.sort() sorts in place and returns None, so comparing its results would always pass;
    # extract the plain values from the collected rows and compare sorted copies instead.
    variables = [row["Variable"] for row in featureImportancesDF.select("Variable").collect()]
    assert sorted(variables) == sorted(gbmModel.getFeaturesCols())
    assert len(featureImportancesDF.columns) == 4


def testFeatureImportancesAndScoringHistoryAreSameAfterSerde(gbmModel):
    # Capture both frames before the model goes through a save/load round trip.
    scoringHistoryBefore = gbmModel.getScoringHistory()
    featureImportancesBefore = gbmModel.getFeatureImportances()

    targetPath = "file://" + os.path.abspath("build/scoringHistoryAndFeatureImportancesSerde")
    gbmModel.write().overwrite().save(targetPath)
    reloadedModel = H2OMOJOModel.load(targetPath)

    scoringHistoryAfter = reloadedModel.getScoringHistory()
    featureImportancesAfter = reloadedModel.getFeatureImportances()

    # The reloaded model has to expose frames identical to the ones captured before saving.
    unit_test_utils.assert_data_frames_are_identical(scoringHistoryBefore, scoringHistoryAfter)
    unit_test_utils.assert_data_frames_are_identical(featureImportancesBefore, featureImportancesAfter)


def getCurrentMetrics():
metrics = gbmModel.getCurrentMetrics()
assert metrics == gbmModel.getTrainingMetrics()
Expand Down
10 changes: 8 additions & 2 deletions r/src/R/ai/h2o/sparkling/ml/models/H2OMOJOModel.R
Original file line number Diff line number Diff line change
Expand Up @@ -39,11 +39,9 @@ H2OMOJOModel <- setRefClass("H2OMOJOModel", contains = ("H2OMOJOModelBase"), met
getModelDetails = function() {
invoke(.self$jmojo, "getModelDetails")
},

getDomainValues = function() {
invoke(.self$jmojo, "getDomainValues")
},

getTrainingMetrics = function() {
invoke(.self$jmojo, "getTrainingMetrics")
},
Expand All @@ -61,6 +59,14 @@ H2OMOJOModel <- setRefClass("H2OMOJOModel", contains = ("H2OMOJOModelBase"), met
},
getModelCategory = function() {
invoke(.self$jmojo, "getModelCategory")
},
getScoringHistory = function() {
  # Invoke getScoringHistory on the JVM-side model and pass the resulting frame to sdf_register.
  sdf_register(invoke(.self$jmojo, "getScoringHistory"))
},
getFeatureImportances = function() {
  # Invoke getFeatureImportances on the JVM-side model and pass the resulting frame to sdf_register.
  sdf_register(invoke(.self$jmojo, "getFeatureImportances"))
}
))

Expand Down
21 changes: 21 additions & 0 deletions r/src/tests/testthat/testMojo.R
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,27 @@ test_that("test getDomainValues", {
expect_true(is.null(domainValues[["ID"]]))
})

test_that("test getScoringHistory", {
  # The scoring history of the prostate MOJO must contain at least one record.
  mojoPath <- paste0("file://", normalizePath("../../../../../ml/src/test/resources/binom_model_prostate.mojo"))
  model <- H2OMOJOModel.createFromMojo(mojoPath)

  tally <- dplyr::collect(dplyr::tally(model$getScoringHistory()))
  recordCount <- as.double(tally[[1]])

  expect_true(recordCount > 0)
})

test_that("test getFeatureImportances", {
  # The feature importances frame must have one record per feature column of the model.
  mojoPath <- paste0("file://", normalizePath("../../../../../ml/src/test/resources/binom_model_prostate.mojo"))
  model <- H2OMOJOModel.createFromMojo(mojoPath)
  importances <- model$getFeatureImportances()
  featureCount <- length(model$getFeaturesCols())

  tally <- dplyr::collect(dplyr::tally(importances))
  recordCount <- as.double(tally[[1]])

  expect_equal(recordCount, featureCount)
})

test_that("test training params", {
model <- H2OMOJOModel.createFromMojo(paste0("file://", normalizePath("../../../../../ml/src/test/resources/binom_model_prostate.mojo")))
params <- model$getTrainingParams()
Expand Down

0 comments on commit 403b90f

Please sign in to comment.