Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ML] adds support for transforming LGBMRegressor models #247

Merged
merged 5 commits into from
Aug 11, 2020
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions eland/ml/_model_serializer.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,10 @@ def __init__(
self._leaf_value = leaf_value
self._default_left = default_left

@property
def node_idx(self) -> int:
    """The node's index within its tree (serialized as ``node_index`` in ``to_dict``)."""
    return self._node_idx

def to_dict(self) -> Dict[str, Any]:
d: Dict[str, Any] = {}
add_if_exists(d, "node_index", self._node_idx)
Expand Down
23 changes: 21 additions & 2 deletions eland/ml/imported_ml_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,10 @@
from xgboost import XGBRegressor, XGBClassifier # type: ignore # noqa: F401
except ImportError:
pass
try:
from lightgbm import LGBMRegressor  # type: ignore # noqa: F401
except ImportError:
pass


class ImportedMLModel(MLModel):
Expand All @@ -59,14 +63,28 @@ class ImportedMLModel(MLModel):
- sklearn.tree.DecisionTreeRegressor
- sklearn.ensemble.RandomForestRegressor
- sklearn.ensemble.RandomForestClassifier
- lightgbm.LGBMRegressor
- Only the following objectives are supported
- "regression"
- "regression_l1"
- "huber"
- "fair"
- "quantile"
- "mape"
- The following booster types are supported
benwtrent marked this conversation as resolved.
Show resolved Hide resolved
- "gbdt"
- "rf"
- "dart"
- "goss"
- Categorical fields are expected to already be processed
benwtrent marked this conversation as resolved.
Show resolved Hide resolved
- xgboost.XGBClassifier
- only the following operators are supported:
- only the following objectives are supported:
- "binary:logistic"
- "binary:hinge"
- "multi:softmax"
- "multi:softprob"
- xgboost.XGBRegressor
- only the following operators are supportd:
- only the following objectives are supported:
tveasey marked this conversation as resolved.
Show resolved Hide resolved
- "reg:squarederror"
- "reg:linear"
- "reg:squaredlogerror"
Expand Down Expand Up @@ -130,6 +148,7 @@ def __init__(
"RandomForestClassifier",
"XGBClassifier",
"XGBRegressor",
"LGBMRegressor",
],
feature_names: List[str],
classification_labels: Optional[List[str]] = None,
Expand Down
13 changes: 13 additions & 0 deletions eland/ml/transformers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,3 +82,16 @@ def get_model_transformer(model: Any, **kwargs: Any) -> ModelTransformer:
_MODEL_TRANSFORMERS.update(_XGBOOST_MODEL_TRANSFORMERS)
except ImportError:
pass

try:
from .lightgbm import (
LGBMRegressor,
LGBMForestTransformer,
LGBMRegressorTransformer,
_MODEL_TRANSFORMERS as _LGBOOST_MODEL_TRANSFORMERS,
benwtrent marked this conversation as resolved.
Show resolved Hide resolved
)

__all__ += ["LGBMRegressor", "LGBMForestTransformer", "LGBMRegressorTransformer"]
_MODEL_TRANSFORMERS.update(_LGBOOST_MODEL_TRANSFORMERS)
except ImportError:
pass
202 changes: 202 additions & 0 deletions eland/ml/transformers/lightgbm.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,202 @@
# Licensed to Elasticsearch B.V. under one or more contributor
# license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright
# ownership. Elasticsearch B.V. licenses this file to you under
# the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

from typing import Optional, List, Dict, Any, Type
from .base import ModelTransformer
from .._model_serializer import Ensemble, Tree, TreeNode
from ..ml_model import MLModel
from .._optional import import_optional_dependency

import_optional_dependency("lightgbm", on_version="warn")

from lightgbm import Booster, LGBMRegressor # type: ignore


def transform_decider(decider: str) -> str:
    """Map a LightGBM split ``decision_type`` operator to its Elasticsearch name.

    :param decider: the operator from LightGBM's model dump: "<=", "<", ">" or ">="
    :return: the Elasticsearch decision type: "lte", "lt", "gt" or "gte"
    :raises ValueError: if the operator is not one of the four supported ones
    """
    decider_map = {"<=": "lte", "<": "lt", ">": "gt", ">=": "gte"}
    if decider not in decider_map:
        # Interpolate the offending operator; the previous message left a
        # literal "%s" in the text because the format args were never applied.
        raise ValueError(
            f"Unsupported splitting decider: {decider}. "
            "Only <=, <, >=, and > are allowed."
        )
    return decider_map[decider]


class Counter:
    """A minimal monotonically increasing integer counter.

    Used to hand out sequential node ids while walking a LightGBM tree dump.
    """

    def __init__(self, start: int = 0):
        self._count = start

    def inc(self) -> int:
        """Advance the counter by one and return the new value."""
        self._count = self._count + 1
        return self._count

    def value(self) -> int:
        """Return the current value without advancing the counter."""
        return self._count


class LGBMForestTransformer(ModelTransformer):
    """
    Base class for transforming LightGBM models into ensemble models supported
    by Elasticsearch.

    warning: do not use directly. Use a derived class instead.
    """

    def __init__(
        self,
        model: Booster,
        feature_names: List[str],
        classification_labels: Optional[List[str]] = None,
        classification_weights: Optional[List[float]] = None,
    ):
        super().__init__(
            model, feature_names, classification_labels, classification_weights
        )
        self._node_decision_type = "lte"
        # NOTE(review): assumes a fitted booster always carries an "objective"
        # entry in its params — confirm for boosters restored from file.
        self._objective = model.params["objective"]

    def build_tree(self, tree_json_obj: dict) -> Tree:
        """Transform one tree from LightGBM's ``dump_model`` JSON into a Tree.

        LightGBM's dump carries no node indices, so sequential ids are assigned
        during the walk; the Python ``id()`` of each node's dict is used to
        remember ids already handed out. Children are numbered before they are
        visited so that a parent can reference its children's ids.
        """
        tree_nodes = list()
        next_id = Counter()
        py_id_to_node_id = dict()

        def add_tree_node(tree_node_json_obj: dict, counter: Counter):
            # The id for this node was assigned by its parent (or the seed
            # below for the root) before this call.
            curr_id = py_id_to_node_id[id(tree_node_json_obj)]
            if "leaf_value" in tree_node_json_obj:
                # Leaf node: only an index and the predicted value.
                tree_nodes.append(
                    TreeNode(
                        node_idx=curr_id,
                        leaf_value=float(tree_node_json_obj["leaf_value"]),
                    )
                )
                return
            left_py_id = id(tree_node_json_obj["left_child"])
            right_py_id = id(tree_node_json_obj["right_child"])
            parse_left = False
            parse_right = False
            if left_py_id not in py_id_to_node_id:
                parse_left = True
                py_id_to_node_id[left_py_id] = counter.inc()
            if right_py_id not in py_id_to_node_id:
                parse_right = True
                py_id_to_node_id[right_py_id] = counter.inc()

            tree_nodes.append(
                TreeNode(
                    node_idx=curr_id,
                    default_left=tree_node_json_obj["default_left"],
                    split_feature=tree_node_json_obj["split_feature"],
                    threshold=float(tree_node_json_obj["threshold"]),
                    decision_type=transform_decider(
                        tree_node_json_obj["decision_type"]
                    ),
                    left_child=py_id_to_node_id[left_py_id],
                    right_child=py_id_to_node_id[right_py_id],
                )
            )
            if parse_left:
                add_tree_node(tree_node_json_obj["left_child"], counter)
            if parse_right:
                add_tree_node(tree_node_json_obj["right_child"], counter)

        # Seed the root with id 0, then walk the structure recursively.
        py_id_to_node_id[id(tree_json_obj["tree_structure"])] = next_id.value()
        add_tree_node(tree_json_obj["tree_structure"], next_id)
        # Nodes are serialized ordered by their index.
        tree_nodes.sort(key=lambda n: n.node_idx)
        return Tree(
            self._feature_names,
            target_type=self.determine_target_type(),
            tree_structure=tree_nodes,
        )

    def build_forest(self) -> List[Tree]:
        """
        This builds out the forest of trees as described by LightGBM into a format
        supported by Elasticsearch

        :return: A list of Tree objects
        """
        self.check_model_booster()
        json_dump = self._model.dump_model()
        return [self.build_tree(t) for t in json_dump["tree_info"]]

    def build_aggregator_output(self) -> Dict[str, Any]:
        """Subclasses return the ensemble's output aggregator definition."""
        raise NotImplementedError("build_aggregator_output must be implemented")

    def determine_target_type(self) -> str:
        """Subclasses return the target type ("regression", "classification", ...)."""
        raise NotImplementedError("determine_target_type must be implemented")

    def is_objective_supported(self) -> bool:
        # Base class supports nothing; subclasses whitelist their objectives.
        return False

    def check_model_booster(self) -> None:
        """Subclasses raise if the model's boosting type is unsupported."""
        raise NotImplementedError("check_model_booster must be implemented")

    def transform(self) -> Ensemble:
        """Validate the model and transform it into an Ensemble.

        :raises ValueError: if the booster type or objective is unsupported
        """
        self.check_model_booster()

        if not self.is_objective_supported():
            raise ValueError(f"Unsupported objective '{self._objective}'")

        forest = self.build_forest()
        return Ensemble(
            feature_names=self._feature_names,
            trained_models=forest,
            output_aggregator=self.build_aggregator_output(),
            classification_labels=self._classification_labels,
            classification_weights=self._classification_weights,
            target_type=self.determine_target_type(),
        )


class LGBMRegressorTransformer(LGBMForestTransformer):
    """Transforms a fitted ``lightgbm.LGBMRegressor`` into an Elasticsearch ensemble."""

    def __init__(self, model: LGBMRegressor, feature_names: List[str]):
        # The underlying Booster carries the trees and training params.
        super().__init__(model.booster_, feature_names)

    def is_objective_supported(self) -> bool:
        # Only these regression objectives have an Elasticsearch equivalent.
        supported_objectives = {
            "regression",
            "regression_l1",
            "huber",
            "fair",
            "quantile",
            "mape",
        }
        return self._objective in supported_objectives

    def check_model_booster(self) -> None:
        booster_type = self._model.params["boosting_type"]
        if booster_type not in {"gbdt", "rf", "dart", "goss"}:
            raise ValueError(
                f"boosting type must exist and be of type 'gbdt', 'rf', 'dart', or 'goss'"
                f", was {booster_type!r}"
            )

    def determine_target_type(self) -> str:
        return "regression"

    def build_aggregator_output(self) -> Dict[str, Any]:
        # Regression forests sum the (weighted) tree outputs.
        return {"weighted_sum": {}}

    @property
    def model_type(self) -> str:
        return MLModel.TYPE_REGRESSION


# Registry of supported LightGBM model classes to their transformer
# implementations; merged into the package-level transformer registry
# (presumably consumed by get_model_transformer — see transformers/__init__).
_MODEL_TRANSFORMERS: Dict[type, Type[ModelTransformer]] = {
    LGBMRegressor: LGBMRegressorTransformer
}
43 changes: 43 additions & 0 deletions eland/tests/ml/test_imported_ml_model_pytest.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,13 @@
except ImportError:
HAS_XGBOOST = False

try:
from lightgbm import LGBMRegressor

HAS_LGBOOST = True
except ImportError:
HAS_LGBOOST = False


requires_sklearn = pytest.mark.skipif(
not HAS_SKLEARN, reason="This test requires 'scikit-learn' package to run"
Expand All @@ -50,6 +57,10 @@
reason="This test requires 'scikit-learn' and 'xgboost' to not be installed",
)

requires_lgboost = pytest.mark.skipif(
benwtrent marked this conversation as resolved.
Show resolved Hide resolved
not HAS_LGBOOST, reason="This test requires 'lightgbm' package to run"
)


class TestImportedMLModel:
@requires_no_ml_extras
Expand Down Expand Up @@ -322,3 +333,35 @@ def test_predict_single_feature_vector(self):

# Clean up
es_model.delete_model()

@requires_lgboost
@pytest.mark.parametrize("compress_model_definition", [True, False])
def test_lgbm_regressor(self, compress_model_definition):
    """Round-trip an LGBMRegressor through Elasticsearch and check that
    inference results match the local model's predictions."""
    # Train model
    training_data = datasets.make_regression(n_features=5)
    regressor = LGBMRegressor()
    regressor.fit(training_data[0], training_data[1])

    # Get some test results
    test_data = [[0.1, 0.2, 0.3, -0.5, 1.0], [1.6, 2.1, -10, 50, -1.0]]
    test_results = regressor.predict(np.asarray(test_data))

    # Serialise the models to Elasticsearch
    feature_names = ["Column_0", "Column_1", "Column_2", "Column_3", "Column_4"]
    model_id = "test_lgbm_regressor"

    es_model = ImportedMLModel(
        ES_TEST_CLIENT,
        model_id,
        regressor,
        feature_names,
        overwrite=True,
        es_compress_model_definition=compress_model_definition,
    )

    es_results = es_model.predict(test_data)

    # Serialized thresholds/leaves lose precision, so compare loosely.
    np.testing.assert_almost_equal(test_results, es_results, decimal=2)

    # Clean up
    es_model.delete_model()
1 change: 1 addition & 0 deletions requirements-dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,4 @@ numpydoc>=0.9.0
scikit-learn>=0.22.1
xgboost>=1
nox
lightgbm>=2.3.0