
Commit 3fab572

[SW-2559] Expose Scoring History and Variable Importances on H2OMOJOModel (#2525)

* [SW-2559] Expose Scoring History and Variable Importances on H2OMOJOModel

* Deprecate variable importances on H2ODeepLearning

* Rename variable importances to feature importances

* Extend documentation

* Fix deserialization problem

* spotless apply

* Fix python tests and deprecation on DeepLearningMOJOModel

* fix python tests

* fix R tests

* typo

* fix R tests

* spotless apply

* null handling
mn-mikke committed May 24, 2021
1 parent 177ec6e commit 3fab572
Showing 20 changed files with 337 additions and 24 deletions.
@@ -93,13 +93,22 @@ trait AlgorithmConfigurations {

  val noDeprecation = Seq.empty

  val dlDeprecations = Seq(
    DeprecatedField(
      "variable_importances",
      "HasDeprecatedVariableImportances",
      "variableImportances",
      "3.36",
      Some("calculateFeatureImportances"),
      Some("HasDeprecatedVariableImportancesOnMOJO")))

  val algorithmParameters = Seq[(String, Class[_], Class[_], Seq[ExplicitField], Seq[DeprecatedField])](
    ("H2OXGBoostParams", classOf[XGBParamsV3], classOf[XGBoostParameters], xgboostFields, noDeprecation),
    ("H2OGBMParams", classOf[GBMV3.GBMParametersV3], classOf[GBMParameters], gbmFields, noDeprecation),
    ("H2ODRFParams", classOf[DRFV3.DRFParametersV3], classOf[DRFParameters], drfFields, noDeprecation),
    ("H2OGLMParams", classOf[GLMV3.GLMParametersV3], classOf[GLMParameters], glmFields, noDeprecation),
    ("H2OGAMParams", classOf[GAMV3.GAMParametersV3], classOf[GAMParameters], gamFields, noDeprecation),
    ("H2ODeepLearningParams", classOf[DLParamsV3], classOf[DeepLearningParameters], dlFields, noDeprecation),
    ("H2ODeepLearningParams", classOf[DLParamsV3], classOf[DeepLearningParameters], dlFields, dlDeprecations),
    ("H2OKMeansParams", classOf[KMeansParamsV3], classOf[KMeansParameters], kmeansFields, noDeprecation),
    ("H2OCoxPHParams", classOf[CoxPHParametersV3], classOf[CoxPHParameters], coxPHFields, noDeprecation),
    ("H2OIsolationForestParams", classOf[IFParamsV3], classOf[IsolationForestParameters], ifFields, noDeprecation))
@@ -29,7 +29,8 @@ object ParameterNameConverter {
    "colsample_bynode" -> "colSampleByNode",
    "rand_family" -> "randomFamily",
    "rand_link" -> "randomLink",
    "calibration_frame" -> "calibrationDataFrame")
    "calibration_frame" -> "calibrationDataFrame",
    "variable_importances" -> "calculateFeatureImportances")

  val conversionRules: Map[String, String] = Map("Column" -> "Col")

@@ -40,7 +40,13 @@ case class ExplicitField(
    sparkName: Option[String] = None,
    mojoImplementation: Option[String] = None)

case class DeprecatedField(h2oName: String, implementation: String, sparkName: String, version: String)
case class DeprecatedField(
    h2oName: String,
    implementation: String,
    sparkName: String,
    version: String,
    replacement: Option[String] = None,
    mojoImplementation: Option[String] = None)

object DefaultValueSource extends Enumeration {
  type DefaultValueSource = Value
@@ -41,7 +41,13 @@ trait AlgorithmTemplateBase extends PythonEntityTemplate {
.map { param =>
val version = param.version
val name = param.sparkName
s""" if '$name' in kwargs:
val valuePropagation = param.replacement match {
case Some(replacement) =>
s"""\n if '$replacement' not in kwargs:
| kwargs['$replacement'] = kwargs['$name']""".stripMargin
case None => ""
}
s""" if '$name' in kwargs:$valuePropagation
| del kwargs['$name']
| warn("The parameter '$name' is deprecated and will be removed in the version $version.")""".stripMargin
}
@@ -33,7 +33,8 @@ object MOJOModelTemplate
    val entityName = algorithmSubstitutionContext.entityName
    val namespace = algorithmSubstitutionContext.namespace
    val algorithmType = algorithmSubstitutionContext.algorithmType.replace("Algorithm", "MOJOModelParams")
    val explicitFields = parameterSubstitutionContext.explicitFields.flatMap(_.mojoImplementation)
    val explicitFields = parameterSubstitutionContext.explicitFields.flatMap(_.mojoImplementation) ++
      parameterSubstitutionContext.deprecatedFields.flatMap(_.mojoImplementation)
    val parents = Seq(algorithmType) ++ explicitFields

    val imports = Seq(
@@ -34,7 +34,8 @@ object MOJOModelTemplate
      .filter(parameter =>
        !IgnoredParameters.ignoredInMOJOs(algorithmSubstitutionContext.entityName).contains(parameter.h2oName))

    val explicitFieldImplementations = parameterSubstitutionContext.explicitFields.flatMap(_.mojoImplementation) ++
      parameterSubstitutionContext.deprecatedFields.flatMap(_.mojoImplementation)

    val imports = Seq(
      "ai.h2o.sparkling.ml.params.ParameterConstructorMethods",
16 changes: 16 additions & 0 deletions booklet/src/sections/productionazing.tex
@@ -198,6 +198,22 @@ \subsubsection{Obtaining Model Category}

The method \texttt{getModelCategory} can be used to get the model category (such as \texttt{binomial}, \texttt{multinomial}, etc.).
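
For instance, a minimal PySparkling sketch (the MOJO path is a placeholder and a running Spark session is assumed):

    from pysparkling.ml import H2OMOJOModel

    model = H2OMOJOModel.createFromMojo("file:///tmp/prostate_gbm.mojo")
    print(model.getModelCategory())  # e.g. 'Binomial' for a binary classification model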

\subsubsection{Obtaining Feature Types}

The method \texttt{getFeatureTypes} returns a map/dictionary from a feature name to the corresponding feature type
[\texttt{enum} (categorical), \texttt{numeric}, \texttt{string}, etc.]. This information helps to understand how individual
columns of the training dataset were treated during the model training.
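
For illustration, a minimal PySparkling sketch (the MOJO path is a placeholder and a running Spark session is assumed):

    from pysparkling.ml import H2OMOJOModel

    # Load a MOJO exported from H2O; the path is a placeholder.
    model = H2OMOJOModel.createFromMojo("file:///tmp/prostate_gbm.mojo")

    # Dictionary from column name to its H2O feature type, e.g. {'AGE': 'numeric', 'RACE': 'enum'}.
    for name, featureType in model.getFeatureTypes().items():
        print(name, featureType)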

\subsubsection{Obtaining Feature Importances}

The method \texttt{getFeatureImportances} returns a data frame describing the importance of each feature. The importance is
expressed by several numbers (Relative Importance, Scaled Importance, and Percentage).
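
For a quick illustration, a PySparkling sketch (the MOJO path is a placeholder; the column names follow the description above):

    from pysparkling.ml import H2OMOJOModel

    model = H2OMOJOModel.createFromMojo("file:///tmp/prostate_gbm.mojo")

    # One row per feature; columns: Variable, Relative Importance, Scaled Importance, Percentage.
    featureImportancesDF = model.getFeatureImportances()
    featureImportancesDF.orderBy("Percentage", ascending=False).show()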

\subsubsection{Obtaining Scoring History}

The method \texttt{getScoringHistory} returns a data frame describing how the model evolved during the training process
according to various training and validation metrics.
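
For example, a short sketch (the MOJO path is a placeholder; the exact metric columns depend on the algorithm and on whether a
validation frame was used):

    from pysparkling.ml import H2OMOJOModel

    model = H2OMOJOModel.createFromMojo("file:///tmp/prostate_gbm.mojo")

    # One row per scoring event recorded during training.
    scoringHistoryDF = model.getScoringHistory()
    scoringHistoryDF.show(truncate=False)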

\subsubsection{Obtaining Training Params}

The method \texttt{getTrainingParams} can be used to get a map containing all training parameters used in H2O. It is a map
14 changes: 14 additions & 0 deletions doc/src/site/sphinx/deployment/load_mojo.rst
@@ -341,6 +341,20 @@ The method ``getFeatureTypes`` returns a map/dictionary from a feature name to a corresponding feature type
[``enum`` (categorical), ``numeric``, ``string``, etc.]. This information helps to understand how individual columns of
the training dataset were treated during the model training.
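
A short PySparkling sketch (the MOJO path is a placeholder) that picks out the categorical columns:

    from pysparkling.ml import H2OMOJOModel

    model = H2OMOJOModel.createFromMojo("file:///tmp/prostate_gbm.mojo")

    # List the columns that H2O treated as categorical (enum) during training.
    featureTypes = model.getFeatureTypes()
    print([name for name, featureType in featureTypes.items() if featureType == "enum"])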

Obtaining Feature Importances
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

The method ``getFeatureImportances`` returns a data frame describing the importance of each feature. The importance is expressed
by several numbers (Relative Importance, Scaled Importance, and Percentage). The `H2O-3 documentation
<https://h2o-release.s3.amazonaws.com/h2o/rel-SUBST_H2O_RELEASE_NAME/SUBST_H2O_BUILD_NUMBER/docs-website/h2o-docs/variable-importance.html>`__
describes how the numbers are calculated.
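
A small sketch (the MOJO path is a placeholder; ``Variable`` and ``Percentage`` are column names of the returned frame):

    from pysparkling.ml import H2OMOJOModel

    model = H2OMOJOModel.createFromMojo("file:///tmp/prostate_gbm.mojo")

    importancesDF = model.getFeatureImportances()
    # The feature with the highest share of the overall importance.
    print(importancesDF.orderBy("Percentage", ascending=False).first()["Variable"])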


Obtaining Scoring History
^^^^^^^^^^^^^^^^^^^^^^^^^

The method ``getScoringHistory`` returns a data frame describing how the model evolved during the training process according to
various training and validation metrics.
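
For example, a sketch (the MOJO path is a placeholder; which metric columns are present depends on the algorithm and on whether
a validation frame was provided):

    from pysparkling.ml import H2OMOJOModel

    model = H2OMOJOModel.createFromMojo("file:///tmp/prostate_gbm.mojo")

    scoringHistoryDF = model.getScoringHistory()
    # Inspect which metrics were recorded and how many scoring events took place.
    print(scoringHistoryDF.columns)
    print(scoringHistoryDF.count())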

Obtaining Metrics
^^^^^^^^^^^^^^^^^
6 changes: 6 additions & 0 deletions doc/src/site/sphinx/migration_guide.rst
@@ -9,6 +9,12 @@ From 3.34 to 3.36
- The methods ``getWithDetailedPredictionCol`` and ``setWithDetailedPredictionCol`` on all SW Algorithms and
MOJO models were removed without replacement.

- The parameter ``variableImportances`` of ``H2ODeepLearning`` has been replaced with ``calculateFeatureImportances``, and
  the methods ``getVariableImportances`` and ``setVariableImportances`` on ``H2ODeepLearning`` have been replaced with
  ``getCalculateFeatureImportances`` and ``setCalculateFeatureImportances``.

- The method ``getVariableImportances`` of ``H2ODeepLearningMOJOModel`` has been replaced with ``getCalculateFeatureImportances``.
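
A hypothetical before/after sketch of the rename in PySparkling (``labelCol`` and its value are placeholders):

    from pysparkling.ml import H2ODeepLearning

    # Old name: still accepted, but only with a deprecation warning.
    algo = H2ODeepLearning(labelCol="CAPSULE", variableImportances=True)

    # New name and accessors from 3.36 on.
    algo = H2ODeepLearning(labelCol="CAPSULE", calculateFeatureImportances=True)
    algo.setCalculateFeatureImportances(True)
    print(algo.getCalculateFeatureImportances())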

From 3.32.1 to 3.34
-------------------

@@ -0,0 +1,34 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package ai.h2o.sparkling.ml.params

import ai.h2o.sparkling.macros.DeprecatedMethod
import org.apache.spark.expose.Logging

trait HasDeprecatedVariableImportances extends Logging {

  def getCalculateFeatureImportances(): Boolean

  @DeprecatedMethod("getCalculateFeatureImportances", "3.36")
  def getVariableImportances(): Boolean = getCalculateFeatureImportances()

  def setCalculateFeatureImportances(value: Boolean): this.type

  @DeprecatedMethod("setCalculateFeatureImportances", "3.36")
  def setVariableImportances(value: Boolean): this.type = setCalculateFeatureImportances(value)
}
@@ -46,6 +46,12 @@ def getTrainingParams(self):
    def getModelCategory(self):
        return self._java_obj.getModelCategory()

    def getScoringHistory(self):
        return H2OTypeConverters.scalaToPythonDataFrame(self._java_obj.getScoringHistory())

    def getFeatureImportances(self):
        return H2OTypeConverters.scalaToPythonDataFrame(self._java_obj.getFeatureImportances())


class HasOffsetCol:

12 changes: 11 additions & 1 deletion py-scoring/src/ai/h2o/sparkling/ml/params/H2OTypeConverters.py
@@ -19,7 +19,7 @@
from pyspark.ml.linalg import DenseVector, DenseMatrix
from pyspark.ml.param import TypeConverters
from pyspark.ml.util import _jvm
from pyspark.sql import DataFrame
from pyspark.sql import DataFrame, SparkSession


class H2OTypeConverters(object):
@@ -500,3 +500,13 @@ def scalaArrayToPythonArray(array):
            return [v for v in array]
        else:
            raise TypeError("Invalid type.")

    @staticmethod
    def scalaToPythonDataFrame(jdf):
        if jdf is None:
            return None
        elif isinstance(jdf, JavaObject):
            sqlContext = SparkSession.builder.getOrCreate()._wrapped
            return DataFrame(jdf, sqlContext)
        else:
            raise TypeError("Invalid type.")
@@ -0,0 +1,32 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from pyspark.ml.param import *
import warnings

class HasDeprecatedVariableImportances(Params):

    def getVariableImportances(self):
        warnings.warn("The method 'getVariableImportances' is deprecated and will be removed in the version 3.36. " +
                      "The replacement is 'getCalculateFeatureImportances'.")
        return self.getCalculateFeatureImportances()

    def setVariableImportances(self, value):
        warnings.warn("The method 'setVariableImportances' is deprecated and will be removed in the version 3.36. " +
                      "The replacement is 'setCalculateFeatureImportances'.")
        self.setCalculateFeatureImportances(value)
        return self
@@ -0,0 +1,26 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from pyspark.ml.param import *
import warnings

class HasDeprecatedVariableImportancesOnMOJO(Params):

    def getVariableImportances(self):
        warnings.warn("The method 'getVariableImportances' is deprecated and will be removed in the version 3.36. " +
                      "The replacement is 'getCalculateFeatureImportances'.")
        return self.getCalculateFeatureImportances()
27 changes: 27 additions & 0 deletions py/tests/unit/with_runtime_sparkling/test_mojo.py
@@ -80,6 +80,33 @@ def testFeatureTypes(gbmModel):
    assert len(types) == 9


def testScoringHistory(gbmModel):
    scoringHistoryDF = gbmModel.getScoringHistory()
    assert scoringHistoryDF.count() > 0
    assert len(scoringHistoryDF.columns) > 0


def testFeatureImportances(gbmModel):
    featureImportancesDF = gbmModel.getFeatureImportances()
    assert sorted(r["Variable"] for r in featureImportancesDF.select("Variable").collect()) == sorted(gbmModel.getFeaturesCols())
    assert len(featureImportancesDF.columns) == 4


def testFeatureImportancesAndScoringHistoryAreSameAfterSerde(gbmModel):
    expectedScoringHistoryDF = gbmModel.getScoringHistory()
    expectedFeatureImportancesDF = gbmModel.getFeatureImportances()

    filePath = "file://" + os.path.abspath("build/scoringHistoryAndFeatureImportancesSerde")
    gbmModel.write().overwrite().save(filePath)
    loadedModel = H2OMOJOModel.load(filePath)

    loadedScoringHistoryDF = loadedModel.getScoringHistory()
    loadedFeatureImportancesDF = loadedModel.getFeatureImportances()

    unit_test_utils.assert_data_frames_are_identical(expectedScoringHistoryDF, loadedScoringHistoryDF)
    unit_test_utils.assert_data_frames_are_identical(expectedFeatureImportancesDF, loadedFeatureImportancesDF)


def getCurrentMetrics():
    metrics = gbmModel.getCurrentMetrics()
    assert metrics == gbmModel.getTrainingMetrics()
10 changes: 8 additions & 2 deletions r/src/R/ai/h2o/sparkling/ml/models/H2OMOJOModel.R
@@ -39,11 +39,9 @@ H2OMOJOModel <- setRefClass("H2OMOJOModel", contains = ("H2OMOJOModelBase"), met
  getModelDetails = function() {
    invoke(.self$jmojo, "getModelDetails")
  },

  getDomainValues = function() {
    invoke(.self$jmojo, "getDomainValues")
  },

  getTrainingMetrics = function() {
    invoke(.self$jmojo, "getTrainingMetrics")
  },
@@ -61,6 +59,14 @@
  },
  getModelCategory = function() {
    invoke(.self$jmojo, "getModelCategory")
  },
  getScoringHistory = function() {
    outputFrame <- invoke(.self$jmojo, "getScoringHistory")
    sdf_register(outputFrame)
  },
  getFeatureImportances = function() {
    outputFrame <- invoke(.self$jmojo, "getFeatureImportances")
    sdf_register(outputFrame)
  }
))

21 changes: 21 additions & 0 deletions r/src/tests/testthat/testMojo.R
@@ -69,6 +69,27 @@ test_that("test getDomainValues", {
  expect_true(is.null(domainValues[["ID"]]))
})

test_that("test getScoringHistory", {
  model <- H2OMOJOModel.createFromMojo(paste0("file://", normalizePath("../../../../../ml/src/test/resources/binom_model_prostate.mojo")))
  scoringHistory <- model$getScoringHistory()

  numberOfRecordsFrame <- dplyr::tally(scoringHistory)
  count <- as.double(dplyr::collect(numberOfRecordsFrame)[[1]])

  expect_true(count > 0)
})

test_that("test getFeatureImportances", {
  model <- H2OMOJOModel.createFromMojo(paste0("file://", normalizePath("../../../../../ml/src/test/resources/binom_model_prostate.mojo")))
  featureImportances <- model$getFeatureImportances()
  expectedCount <- length(model$getFeaturesCols())

  numberOfRecordsFrame <- dplyr::tally(featureImportances)
  count <- as.double(dplyr::collect(numberOfRecordsFrame)[[1]])

  expect_equal(count, expectedCount)
})

test_that("test training params", {
  model <- H2OMOJOModel.createFromMojo(paste0("file://", normalizePath("../../../../../ml/src/test/resources/binom_model_prostate.mojo")))
  params <- model$getTrainingParams()
