Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Support for isolation forests #322

Merged
merged 12 commits into from
Nov 3, 2021
Merged
Show file tree
Hide file tree
Changes from 11 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 30 additions & 0 deletions include/treelite/c_api.h
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,36 @@ TREELITE_DLL int TreeliteLoadSKLearnRandomForestRegressor(
const double** value, const int64_t** n_node_samples, const double** impurity,
ModelHandle* out);

/*!
 * \brief Load a scikit-learn isolation forest model from a collection of arrays. Refer to
 * https://scikit-learn.org/stable/auto_examples/tree/plot_unveil_tree_structure.html to
 * learn the meaning of the arrays in detail.
 * \param n_estimators number of trees in the isolation forest
 * \param n_features number of features in the training data
 * \param node_count node_count[i] stores the number of nodes in the i-th tree
 * \param children_left children_left[i][k] stores the ID of the left child node of node k of the
 * i-th tree. This is only defined if node k is an internal (non-leaf) node.
 * \param children_right children_right[i][k] stores the ID of the right child node of node k of the
 * i-th tree. This is only defined if node k is an internal (non-leaf) node.
 * \param feature feature[i][k] stores the ID of the feature used in the binary tree split at node k
 * of the i-th tree. This is only defined if node k is an internal (non-leaf) node.
 * \param threshold threshold[i][k] stores the threshold used in the binary tree split at node k of
 * the i-th tree. This is only defined if node k is an internal (non-leaf) node.
 * \param value value[i][k] stores the expected isolation depth of node k of the i-th tree. This is
 * only defined if node k is a leaf node.
 * \param n_node_samples n_node_samples[i][k] stores the number of data samples associated with
 * node k of the i-th tree.
 * \param impurity not used, but must be passed as array of arrays for each tree and node.
 * \param ratio_c standardizing constant to use for calculation of the anomaly score.
 * \param out pointer to store the loaded model
 * \return 0 for success, -1 for failure
 */
TREELITE_DLL int TreeliteLoadSKLearnIsolationForest(
int n_estimators, int n_features, const int64_t* node_count, const int64_t** children_left,
const int64_t** children_right, const int64_t** feature, const double** threshold,
const double** value, const int64_t** n_node_samples, const double** impurity,
const double ratio_c, ModelHandle* out);

/*!
* \brief Load a scikit-learn random forest classifier model from a collection of arrays. Refer to
* https://scikit-learn.org/stable/auto_examples/tree/plot_unveil_tree_structure.html to
Expand Down
8 changes: 8 additions & 0 deletions include/treelite/c_api_runtime.h
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,14 @@ TREELITE_DLL int TreelitePredictorQueryPredTransform(PredictorHandle handle, con
* \return 0 for success, -1 for failure
*/
TREELITE_DLL int TreelitePredictorQuerySigmoidAlpha(PredictorHandle handle, float* out);
/*!
 * \brief Get the c value (standardizing constant) of the exponential standard ratio
 * transformation used to train the loaded model
 * \param handle predictor
 * \param out pointer to store the c value of the transformation
 * \return 0 for success, -1 for failure
 */
TREELITE_DLL int TreelitePredictorQueryRatioC(PredictorHandle handle, float* out);

/*!
* \brief Get global bias which adjusting predicted margin scores
Expand Down
28 changes: 28 additions & 0 deletions include/treelite/frontend.h
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,34 @@ std::unique_ptr<treelite::Model> LoadSKLearnRandomForestRegressor(
int n_estimators, int n_features, const int64_t* node_count, const int64_t** children_left,
const int64_t** children_right, const int64_t** feature, const double** threshold,
const double** value, const int64_t** n_node_samples, const double** impurity);
/*!
 * \brief Load a scikit-learn isolation forest model from a collection of arrays. Refer to
 * https://scikit-learn.org/stable/auto_examples/tree/plot_unveil_tree_structure.html to
 * learn the meaning of the arrays in detail.
 * \param n_estimators number of trees in the isolation forest
 * \param n_features number of features in the training data
 * \param node_count node_count[i] stores the number of nodes in the i-th tree
 * \param children_left children_left[i][k] stores the ID of the left child node of node k of the
 * i-th tree. This is only defined if node k is an internal (non-leaf) node.
 * \param children_right children_right[i][k] stores the ID of the right child node of node k of the
 * i-th tree. This is only defined if node k is an internal (non-leaf) node.
 * \param feature feature[i][k] stores the ID of the feature used in the binary tree split at node k
 * of the i-th tree. This is only defined if node k is an internal (non-leaf) node.
 * \param threshold threshold[i][k] stores the threshold used in the binary tree split at node k of
 * the i-th tree. This is only defined if node k is an internal (non-leaf) node.
 * \param value value[i][k] stores the expected isolation depth of node k of the i-th tree. This is
 * only defined if node k is a leaf node.
 * \param n_node_samples n_node_samples[i][k] stores the number of data samples associated with
 * node k of the i-th tree.
 * \param impurity not used, but must be passed as array of arrays for each tree and node.
 * \param ratio_c standardizing constant to use for calculation of the anomaly score.
 * \return loaded model
 */
std::unique_ptr<treelite::Model> LoadSKLearnIsolationForest(
int n_estimators, int n_features, const int64_t* node_count, const int64_t** children_left,
const int64_t** children_right, const int64_t** feature, const double** threshold,
const double** value, const int64_t** n_node_samples, const double** impurity,
const double ratio_c);
/*!
* \brief Load a scikit-learn random forest classifier model from a collection of arrays. Refer to
* https://scikit-learn.org/stable/auto_examples/tree/plot_unveil_tree_structure.html to
Expand Down
8 changes: 8 additions & 0 deletions include/treelite/predictor.h
Original file line number Diff line number Diff line change
Expand Up @@ -199,6 +199,13 @@ class Predictor {
inline float QuerySigmoidAlpha() const {
return sigmoid_alpha_;
}
/*!
 * \brief Get the c value (standardizing constant) of the exponential standard ratio
 *        transformation used to train the loaded model
 * \return c value in exponential standard ratio transformation
 */
inline float QueryRatioC() const {
return ratio_c_;
}
/*!
* \brief Get global bias which adjusting predicted margin scores
* \return global bias
Expand Down Expand Up @@ -240,6 +247,7 @@ class Predictor {
size_t num_feature_;
std::string pred_transform_;
float sigmoid_alpha_;
float ratio_c_;
float global_bias_;
int num_worker_thread_;
TypeInfo threshold_type_;
Expand Down
10 changes: 9 additions & 1 deletion include/treelite/tree.h
Original file line number Diff line number Diff line change
Expand Up @@ -612,6 +612,14 @@ struct ModelParam {
* It must be strictly positive; if unspecified, it is set to 1.0.
*/
float sigmoid_alpha;
/*!
* \brief scaling parameter for exponential standard ratio transformation
* `expstdratio(x) = exp2(-x / c)`
*
* This parameter is used only when `pred_transform` is set to `'exponential_standard_ratio'`.
* If unspecified, it is set to 1.0.
*/
float ratio_c;
/*!
* \brief global bias of the model
*
Expand All @@ -621,7 +629,7 @@ struct ModelParam {
float global_bias;
/*! \} */

// Defaults: sigmoid_alpha = 1.0, ratio_c = 1.0, global_bias = 0.0,
// pred_transform = "identity".
ModelParam() : sigmoid_alpha(1.0f), ratio_c(1.0f), global_bias(0.0f) {
// Zero the whole buffer first so pred_transform is always NUL-terminated,
// even though strncpy would not terminate a maximum-length string.
std::memset(pred_transform, 0, TREELITE_MAX_PRED_TRANSFORM_LENGTH * sizeof(char));
std::strncpy(pred_transform, "identity", sizeof(pred_transform));
}
Expand Down
3 changes: 3 additions & 0 deletions include/treelite/tree_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -303,6 +303,8 @@ ModelParam::InitAllowUnknown(const Container& kwargs) {
this->pred_transform[TREELITE_MAX_PRED_TRANSFORM_LENGTH - 1] = '\0';
} else if (e.first == "sigmoid_alpha") {
this->sigmoid_alpha = std::stof(e.second, nullptr);
} else if (e.first == "ratio_c") {
this->ratio_c = std::stof(e.second, nullptr);
} else if (e.first == "global_bias") {
this->global_bias = std::stof(e.second, nullptr);
}
Expand All @@ -315,6 +317,7 @@ ModelParam::__DICT__() const {
std::map<std::string, std::string> ret;
ret.emplace("pred_transform", std::string(this->pred_transform));
ret.emplace("sigmoid_alpha", GetString(this->sigmoid_alpha));
ret.emplace("ratio_c", GetString(this->ratio_c));
ret.emplace("global_bias", GetString(this->global_bias));
return ret;
}
Expand Down
64 changes: 59 additions & 5 deletions python/treelite/sklearn/importer.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

import ctypes
import numpy as np
from scipy.special import psi

from ..util import TreeliteError
from ..core import _LIB, c_array, _check_call
Expand Down Expand Up @@ -38,12 +39,43 @@ def as_c_array(self):
"""Prepare the collection to pass as an argument of a C function"""
return c_array(self.ptr_type, self.collection)

# Helpers for isolation forests
def harmonic(number):
    """Return the n-th harmonic number H(n) = 1 + 1/2 + ... + 1/n.

    Computed via the digamma identity H(n) = psi(n + 1) + Euler-Mascheroni
    constant, which also extends smoothly to non-integer arguments.
    """
    return np.euler_gamma + psi(number + 1)

def expected_depth(n_remainder):
    """Expected isolation depth c(n) for ``n_remainder`` uniformly distributed points.

    Returns 0 for at most one point and 1 for exactly two; otherwise uses the
    closed form 2 * (H(n) - 1), where H is the harmonic number.
    """
    if n_remainder <= 1:
        return 0
    return 1 if n_remainder == 2 else float(2 * (harmonic(n_remainder) - 1))

def calculate_depths(isolation_depths, tree, curr_node, curr_depth):
    """Fill in an array of isolation depths for a scikit-learn isolation forest model.

    Recurses over the tree; at each leaf stores the depth of the leaf plus the
    expected isolation depth of the samples remaining in it.
    NOTE(review): recursion depth equals tree depth — presumably bounded by
    sklearn's max_samples cap, so Python's recursion limit is not an issue here.
    """
    left_child = tree.children_left[curr_node]
    if left_child == -1:  # leaf node
        isolation_depths[curr_node] = (
            curr_depth + expected_depth(tree.n_node_samples[curr_node]))
        return
    calculate_depths(isolation_depths, tree, left_child, curr_depth + 1)
    calculate_depths(
        isolation_depths, tree, tree.children_right[curr_node], curr_depth + 1)


def import_model(sklearn_model):
# pylint: disable=R0914,R0912,R0915
"""
Load a tree ensemble model from a scikit-learn model object

Note
----
For 'IsolationForest', it will calculate the outlier score using the standardized ratio as
proposed in the original reference, which matches with
'IsolationForest._compute_chunked_score_samples' but is a bit different from
'IsolationForest.decision_function'.

Parameters
----------
sklearn_model : object of type \
Expand All @@ -52,7 +84,8 @@ def import_model(sklearn_model):
:py:class:`~sklearn.ensemble.ExtraTreesRegressor` / \
:py:class:`~sklearn.ensemble.ExtraTreesClassifier` / \
:py:class:`~sklearn.ensemble.GradientBoostingRegressor` / \
:py:class:`~sklearn.ensemble.GradientBoostingClassifier`
:py:class:`~sklearn.ensemble.GradientBoostingClassifier` / \
:py:class:`~sklearn.ensemble.IsolationForest`
Python handle to scikit-learn model

Returns
Expand Down Expand Up @@ -82,11 +115,12 @@ def import_model(sklearn_model):
from sklearn.ensemble import ExtraTreesClassifier as ExtraTreesC
from sklearn.ensemble import GradientBoostingRegressor as GradientBoostingR
from sklearn.ensemble import GradientBoostingClassifier as GradientBoostingC
from sklearn.ensemble import IsolationForest
except ImportError as e:
raise TreeliteError('This function requires scikit-learn package') from e

if isinstance(sklearn_model,
(RandomForestR, ExtraTreesR, GradientBoostingR, GradientBoostingC)):
(RandomForestR, ExtraTreesR, GradientBoostingR, GradientBoostingC, IsolationForest)):
leaf_value_expected_shape = lambda node_count: (node_count, 1, 1)
elif isinstance(sklearn_model, (RandomForestC, ExtraTreesC)):
leaf_value_expected_shape = lambda node_count: (node_count, 1, sklearn_model.n_classes_)
Expand All @@ -97,6 +131,9 @@ def import_model(sklearn_model):
(GradientBoostingR, GradientBoostingC)) and sklearn_model.init != 'zero':
raise TreeliteError("Gradient boosted trees must be trained with the option init='zero'")

if isinstance(sklearn_model, IsolationForest):
ratio_c = expected_depth(sklearn_model.max_samples)

node_count = []
children_left = ArrayOfArrays(dtype=np.int64)
children_right = ArrayOfArrays(dtype=np.int64)
Expand All @@ -112,16 +149,26 @@ def import_model(sklearn_model):
else:
estimator_range = [estimator]
learning_rate = 1.0
if isinstance(sklearn_model, IsolationForest):
isolation_depths = np.zeros(
estimator.tree_.n_node_samples.shape[0],
dtype = 'float64'
)
calculate_depths(isolation_depths, estimator.tree_, 0, 0.0)
for sub_estimator in estimator_range:
tree = sub_estimator.tree_
node_count.append(tree.node_count)
children_left.add(tree.children_left, expected_shape=(tree.node_count,))
children_right.add(tree.children_right, expected_shape=(tree.node_count,))
feature.add(tree.feature, expected_shape=(tree.node_count,))
threshold.add(tree.threshold, expected_shape=(tree.node_count,))
# Note: for gradient boosted trees, we shrink each leaf output by the learning rate
value.add(tree.value * learning_rate,
expected_shape=leaf_value_expected_shape(tree.node_count))
if not isinstance(sklearn_model, IsolationForest):
# Note: for gradient boosted trees, we shrink each leaf output by the learning rate
value.add(tree.value * learning_rate,
expected_shape=leaf_value_expected_shape(tree.node_count))
else:
value.add(isolation_depths.reshape((-1,1,1)),
expected_shape=leaf_value_expected_shape(tree.node_count))
n_node_samples.add(tree.n_node_samples, expected_shape=(tree.node_count,))
impurity.add(tree.impurity, expected_shape=(tree.node_count,))

Expand All @@ -133,6 +180,13 @@ def import_model(sklearn_model):
children_right.as_c_array(), feature.as_c_array(), threshold.as_c_array(),
value.as_c_array(), n_node_samples.as_c_array(), impurity.as_c_array(),
ctypes.byref(handle)))
elif isinstance(sklearn_model, IsolationForest):
_check_call(_LIB.TreeliteLoadSKLearnIsolationForest(
ctypes.c_int(sklearn_model.n_estimators), ctypes.c_int(sklearn_model.n_features_),
c_array(ctypes.c_int64, node_count), children_left.as_c_array(),
children_right.as_c_array(), feature.as_c_array(), threshold.as_c_array(),
value.as_c_array(), n_node_samples.as_c_array(), impurity.as_c_array(),
ctypes.c_double(ratio_c), ctypes.byref(handle)))
elif isinstance(sklearn_model, (RandomForestC, ExtraTreesC)):
_check_call(_LIB.TreeliteLoadSKLearnRandomForestClassifier(
ctypes.c_int(sklearn_model.n_estimators), ctypes.c_int(sklearn_model.n_features_),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ public class Predictor implements Serializable, KryoSerializable {
private transient int num_feature;
private transient String pred_transform;
private transient float sigmoid_alpha;
private transient float ratio_c;
private transient float global_bias;
private transient int num_thread;
private transient boolean verbose;
Expand Down Expand Up @@ -109,6 +110,9 @@ private void initNativeLibrary(String libpath) throws TreeliteError {
TreeliteJNI.checkCall(TreeliteJNI.TreelitePredictorQuerySigmoidAlpha(
handle, fp_out));
sigmoid_alpha = fp_out[0];
TreeliteJNI.checkCall(TreeliteJNI.TreelitePredictorQueryRatioC(
handle, fp_out));
ratio_c = fp_out[0];
TreeliteJNI.checkCall(TreeliteJNI.TreelitePredictorQueryGlobalBias(
handle, fp_out));
global_bias = fp_out[0];
Expand Down Expand Up @@ -158,6 +162,15 @@ public float GetSigmoidAlpha() {
return this.sigmoid_alpha;
}

/**
 * Get c value in exponential standard ratio transformation used to train the loaded model.
 *
 * @return c value of the exponential standard ratio transformation
 */
public float GetRatioC() {
return this.ratio_c;
}

/**
* Get global bias which adjusting predicted margin scores.
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,9 @@ public static native int TreelitePredictorQueryPredTransform(
public static native int TreelitePredictorQuerySigmoidAlpha(
long handle, float[] out);

/**
 * Native wrapper: stores the c value of the exponential standard ratio
 * transformation of the predictor in out[0]; returns 0 on success, -1 on failure.
 */
public static native int TreelitePredictorQueryRatioC(
long handle, float[] out);

public static native int TreelitePredictorQueryGlobalBias(
long handle, float[] out);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,9 @@ class Predictor private[treelite4j](private[treelite4j] val pred: JPredictor)
@throws(classOf[TreeliteError])
def sigmoidAlpha: Float = pred.GetSigmoidAlpha()

/** Get the c value of the exponential standard ratio transformation
  * used to train the loaded model.
  */
@throws(classOf[TreeliteError])
def ratioC: Float = pred.GetRatioC()

@throws(classOf[TreeliteError])
def globalBias: Float = pred.GetGlobalBias()

Expand Down
19 changes: 19 additions & 0 deletions runtime/java/treelite4j/src/native/treelite4j.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -366,6 +366,25 @@ Java_ml_dmlc_treelite4j_java_TreeliteJNI_TreelitePredictorQuerySigmoidAlpha(
return static_cast<jint>(ret);
}

/*
 * Class:     ml_dmlc_treelite4j_java_TreeliteJNI
 * Method:    TreelitePredictorQueryRatioC
 * Signature: (J[F)I
 */
JNIEXPORT jint JNICALL
Java_ml_dmlc_treelite4j_java_TreeliteJNI_TreelitePredictorQueryRatioC(
    JNIEnv* jenv, jclass jcls, jlong jpredictor, jfloatArray jout) {
  auto predictor = reinterpret_cast<PredictorHandle>(jpredictor);
  // NaN marks "not filled in" in case the query call fails.
  float queried_value = std::numeric_limits<float>::quiet_NaN();
  const int status = TreelitePredictorQueryRatioC(predictor, &queried_value);
  // Copy the result into the caller-provided single-element output array.
  jfloat* out_buf = jenv->GetFloatArrayElements(jout, nullptr);
  out_buf[0] = static_cast<jfloat>(queried_value);
  jenv->ReleaseFloatArrayElements(jout, out_buf, 0);
  return static_cast<jint>(status);
}

/*
* Class: ml_dmlc_treelite4j_java_TreeliteJNI
* Method: TreelitePredictorQueryGlobalBias
Expand Down
8 changes: 8 additions & 0 deletions runtime/java/treelite4j/src/native/treelite4j.h

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ public void testPredictorBasic() throws TreeliteError {
TestCase.assertEquals(127, predictor.GetNumFeature());
TestCase.assertEquals("sigmoid", predictor.GetPredTransform());
TestCase.assertEquals(1.0f, predictor.GetSigmoidAlpha());
TestCase.assertEquals(1.0f, predictor.GetRatioC());
TestCase.assertEquals(0.0f, predictor.GetGlobalBias());
}

Expand Down Expand Up @@ -98,6 +99,7 @@ public void testSerialization() throws TreeliteError, IOException, ClassNotFound
TestCase.assertEquals(predictor.GetNumClass(), predictor2.GetNumClass());
TestCase.assertEquals(predictor.GetPredTransform(), predictor2.GetPredTransform());
TestCase.assertEquals(predictor.GetSigmoidAlpha(), predictor2.GetSigmoidAlpha());
TestCase.assertEquals(predictor.GetRatioC(), predictor2.GetRatioC());
TestCase.assertEquals(predictor.GetGlobalBias(), predictor2.GetGlobalBias());

List<DataPoint> dataset = DMatrixBuilder.LoadDatasetFromLibSVM(mushroomTestDataLocation);
Expand Down
Loading