# Comparing Isolation Forest implementations

This notebook continues the comparison of isolation forest implementations from [an earlier notebook](https://github.com/david-cortes/isotree/blob/master/example/comparison_model_quality.ipynb), this time including the H2O package (version 3.34.0.1 at the time of writing). For more details, see that notebook and the [GitHub repository](https://www.github.com/david-cortes/isotree).

In [1]:
import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate
from scipy.io import loadmat

The H2O library does not follow a scikit-learn-compatible interface design, so a wrapper needs to be made around it:

In [2]:
import h2o
from h2o.estimators import (
    H2OIsolationForestEstimator,
    H2OExtendedIsolationForestEstimator
)
# Turn off H2O's progress bars so they do not clutter the cell outputs.
h2o.no_progress()

In [3]:
%%capture
# `os` and `contextlib` are imported here and are also used by the wrapper
# class defined further down to silence H2O's console output.
import os, contextlib
# Initialise the H2O session with stdout redirected to the null device, on top
# of the cell-level %%capture, so that no start-up banner is shown.
with open(os.devnull, "w") as f, contextlib.redirect_stdout(f):
    h2o.init()

In [4]:
from sklearn.base import BaseEstimator

class H2O_iso_sk_compat(BaseEstimator):
    """Scikit-learn-style adapter around H2O's isolation forest estimators.

    Parameters
    ----------
    sample_size : int, default=256
        Forwarded as ``sample_size`` to the H2O estimator. For the plain
        isolation forest it also determines the tree depth limit,
        ``ceil(log2(sample_size))``.
    ntrees : int, default=100
        Forwarded as ``ntrees`` to the H2O estimator.
    seed : int, default=123
        Forwarded as ``seed`` to the H2O estimator.
    extension_level : int, default=0
        ``0`` selects ``H2OIsolationForestEstimator``; any other value selects
        ``H2OExtendedIsolationForestEstimator`` and is forwarded to it as
        ``extension_level``.
    """

    def __init__(self, sample_size=256, ntrees=100, seed=123, extension_level=0):
        # Constructor arguments are stored verbatim under their own names,
        # which is what BaseEstimator's get_params / set_params rely on.
        self.extension_level = extension_level
        self.seed = seed
        self.ntrees = ntrees
        self.sample_size = sample_size

    def fit(self, X, y=None):
        """Train the selected H2O estimator on ``X`` and return ``self``.

        ``y`` is accepted for scikit-learn API compatibility and is not used.
        """
        frame = h2o.H2OFrame(X)
        # Arguments common to both estimator flavours.
        shared_args = dict(
            training_frame=frame,
            ntrees=self.ntrees,
            sample_size=self.sample_size,
            seed=self.seed,
        )
        # Anything H2O prints while building/training goes to the null device.
        with open(os.devnull, "w") as sink, contextlib.redirect_stdout(sink):
            if self.extension_level != 0:
                self._model = H2OExtendedIsolationForestEstimator(
                    extension_level=self.extension_level, **shared_args
                )
            else:
                depth_cap = int(np.ceil(np.log2(self.sample_size)))
                self._model = H2OIsolationForestEstimator(
                    max_depth=depth_cap, **shared_args
                )
            self._model.train()
        return self

    def decision_function(self, X):
        """Return the fitted model's score for each row of ``X`` as a 1-D array.

        The score is read from the ``"predict"`` column of the H2O prediction
        frame when ``extension_level == 0`` and from ``"anomaly_score"``
        otherwise (presumably higher means more anomalous in both cases;
        confirm against the H2O documentation).
        """
        score_column = "predict" if self.extension_level == 0 else "anomaly_score"
        predictions = self._model.predict(test_data=h2o.H2OFrame(X))
        return predictions[score_column].as_data_frame().to_numpy().reshape(-1)

<a id="p1"></a>
## Satellite (6435 rows, 36 columns)

In [5]:
# Load the Satellite data from its MATLAB file. The features are converted to
# a Fortran-ordered float64 matrix and the "y" entry is flattened to a 1-D
# float64 vector (used below as the ground truth for ROC AUC).
satellite = loadmat("satellite.mat")
X = np.asfortranarray(satellite["X"]).astype(np.float64)
y = satellite["y"].astype(np.float64).reshape(-1)
X.shape

(6435, 36)

In [6]:
# Baseline: wrapper with its default hyperparameters, fitted and scored on the
# same rows, so this is an in-sample ROC AUC.
p = H2O_iso_sk_compat().fit(X).decision_function(X)
roc_auc_score(y, p)

0.6621688220800316

In [7]:
# 10-fold stratified cross-validation of the default configuration; the fixed
# random_state keeps the fold assignment reproducible between runs.
# (`cross_validate` is already imported in the notebook's first code cell, so
# it is not re-imported here.)
cv_res = cross_validate(H2O_iso_sk_compat(), X, y, scoring="roc_auc",
                        cv=StratifiedKFold(n_splits=10, shuffle=True, random_state=1))
cv_res["test_score"].mean()

0.6569443499759784

In [8]:
# Hyperparameter grid: three sub-sample sizes, each tried with the plain (0)
# and the extended (1) isolation forest.
params_try = dict(sample_size=[256, 1024, 5000], extension_level=[0, 1])
# Same 10-fold stratified splitter as in the previous cell; refit=True re-trains
# the best configuration on the full data once the search is done.
cv_model = GridSearchCV(
    H2O_iso_sk_compat(),
    params_try,
    scoring="roc_auc",
    refit=True,
    cv=StratifiedKFold(n_splits=10, shuffle=True, random_state=1),
)
cv_model.fit(X, y)
cv_model.best_params_

{'extension_level': 1, 'sample_size': 5000}

In [9]:
# Score the full data with the grid search's refitted best model; this is an
# in-sample ROC AUC, comparable to the baseline figure above.
pred_tuned = cv_model.decision_function(X)
roc_auc_score(y, pred_tuned)

0.7484739342885127

<a id="p2"></a>
## Annthyroid (7200 rows, 6 columns)

In [10]:
# Load the Annthyroid data from its MATLAB file. The features are converted to
# a Fortran-ordered float64 matrix and the "y" entry is flattened to a 1-D
# float64 vector (used below as the ground truth for ROC AUC).
annthyroid = loadmat("annthyroid.mat")
X = np.asfortranarray(annthyroid["X"]).astype(np.float64)
y = annthyroid["y"].astype(np.float64).reshape(-1)
X.shape

(7200, 6)

Checking the H2O library:

In [11]:
# Baseline: wrapper with its default hyperparameters, fitted and scored on the
# same rows, so this is an in-sample ROC AUC.
p = H2O_iso_sk_compat().fit(X).decision_function(X)
roc_auc_score(y, p)

0.8001398454452188

In [12]:
# 10-fold stratified cross-validation of the default configuration; the fixed
# random_state keeps the fold assignment reproducible between runs.
cv_res = cross_validate(
    estimator=H2O_iso_sk_compat(),
    X=X,
    y=y,
    scoring="roc_auc",
    cv=StratifiedKFold(n_splits=10, shuffle=True, random_state=1),
)
cv_res["test_score"].mean()

0.8239561166873454

In [13]:
# Hyperparameter grid: three sub-sample sizes, each tried with the plain (0)
# and the extended (1) isolation forest.
params_try = dict(sample_size=[256, 1024, 5000], extension_level=[0, 1])
# 10-fold stratified splitter with a fixed seed; refit=True re-trains the best
# configuration on the full data once the search is done.
cv_model = GridSearchCV(
    H2O_iso_sk_compat(),
    params_try,
    scoring="roc_auc",
    refit=True,
    cv=StratifiedKFold(n_splits=10, shuffle=True, random_state=1),
)
cv_model.fit(X, y)
cv_model.best_params_

{'extension_level': 0, 'sample_size': 256}

In [14]:
# Score the full data with the grid search's refitted best model; this is an
# in-sample ROC AUC, comparable to the baseline figure above.
pred_tuned = cv_model.decision_function(X)
roc_auc_score(y, pred_tuned)

0.8001398454452188

<a id="p3"></a>
## Pendigits (6870 rows, 16 columns)

In [15]:
# Load the Pendigits data from its MATLAB file. The features are converted to
# a Fortran-ordered float64 matrix and the "y" entry is flattened to a 1-D
# float64 vector (used below as the ground truth for ROC AUC).
pendigits = loadmat("pendigits.mat")
X = np.asfortranarray(pendigits["X"]).astype(np.float64)
y = pendigits["y"].astype(np.float64).reshape(-1)
X.shape

(6870, 16)

Checking the H2O library:

In [16]:
# Baseline: wrapper with its default hyperparameters, fitted and scored on the
# same rows, so this is an in-sample ROC AUC.
p = H2O_iso_sk_compat().fit(X).decision_function(X)
roc_auc_score(y, p)

0.8747283708744834

In [17]:
# 10-fold stratified cross-validation of the default configuration; the fixed
# random_state keeps the fold assignment reproducible between runs.
cv_res = cross_validate(
    estimator=H2O_iso_sk_compat(),
    X=X,
    y=y,
    scoring="roc_auc",
    cv=StratifiedKFold(n_splits=10, shuffle=True, random_state=1),
)
cv_res["test_score"].mean()

0.9117002061008209

In [18]:
# Hyperparameter grid: three sub-sample sizes, each tried with the plain (0)
# and the extended (1) isolation forest.
params_try = dict(sample_size=[256, 1024, 5000], extension_level=[0, 1])
# 10-fold stratified splitter with a fixed seed; refit=True re-trains the best
# configuration on the full data once the search is done.
cv_model = GridSearchCV(
    H2O_iso_sk_compat(),
    params_try,
    scoring="roc_auc",
    refit=True,
    cv=StratifiedKFold(n_splits=10, shuffle=True, random_state=1),
)
cv_model.fit(X, y)
cv_model.best_params_

{'extension_level': 1, 'sample_size': 256}

In [19]:
# Score the full data with the grid search's refitted best model; this is an
# in-sample ROC AUC, comparable to the baseline figure above.
pred_tuned = cv_model.decision_function(X)
roc_auc_score(y, pred_tuned)

0.9677892730841792