# Timing Isolation Forest libraries

This notebook continues the timings from [this notebook](https://github.com/david-cortes/isotree/blob/master/timings/timings_python.ipynb), which compares different implementations of isolation forests, this time adding the H2O package (version 3.34.0.1 at the time of writing). For more details, see the link above and the [GitHub repository](https://www.github.com/david-cortes/isotree).

Note that H2O always runs multi-threaded, so no separate comparison in single-threaded mode was performed.

In [1]:
import numpy as np
from scipy.io import loadmat

In [2]:
import h2o
from h2o.estimators import (
    H2OIsolationForestEstimator,
    H2OExtendedIsolationForestEstimator
)
# Switch off H2O's progress reporting up front so that it does not end up
# in the outputs of the cells below.
h2o.no_progress()

In [3]:
%%capture
import os, contextlib
with open(os.devnull, "w") as f, contextlib.redirect_stdout(f):
    h2o.init()

### Small dataset: Satellite (6435 rows, 36 columns)

Data was taken from the ODDS repository - [link](http://odds.cs.stonybrook.edu/satellite-dataset/).

In [4]:
# Read the Satellite data from the local .mat file, take its "X" entry as a
# Fortran-ordered float64 array, and wrap it in an H2OFrame bound to `X`.
satellite = loadmat("satellite.mat")
X = h2o.H2OFrame(
    np.asfortranarray(satellite["X"]).astype(np.float64)
)
# Last expression: show the frame's shape as the cell output.
X.shape

(6435, 36)

In [5]:
%%timeit
with open(os.devnull, "w") as f, contextlib.redirect_stdout(f):
    iso = H2OIsolationForestEstimator(
        training_frame=X,
        ntrees=100,
        sample_size=256,
        max_depth=int(np.ceil(np.log2(256))),
        seed=1
    )
    iso.train()

585 ms ± 95.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [6]:
%%timeit
with open(os.devnull, "w") as f, contextlib.redirect_stdout(f):
    iso = H2OIsolationForestEstimator(
        training_frame=X,
        ntrees=100,
        sample_size=1024,
        max_depth=int(np.ceil(np.log2(1024))),
        seed=1
    )
    iso.train()

950 ms ± 541 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [7]:
%%timeit
with open(os.devnull, "w") as f, contextlib.redirect_stdout(f):
    iso = H2OIsolationForestEstimator(
        training_frame=X,
        ntrees=100,
        sample_size=6435,
        max_depth=int(np.ceil(np.log2(6435))),
        seed=1
    )
    iso.train()

1.4 s ± 124 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [8]:
%%timeit
with open(os.devnull, "w") as f, contextlib.redirect_stdout(f):
    iso = H2OExtendedIsolationForestEstimator(
        training_frame=X,
        ntrees=100,
        sample_size=256,
        extension_level=1,
        seed=1
    )
    iso.train()

456 ms ± 72.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [9]:
%%timeit
with open(os.devnull, "w") as f, contextlib.redirect_stdout(f):
    iso = H2OExtendedIsolationForestEstimator(
        training_frame=X,
        ntrees=100,
        sample_size=1024,
        extension_level=1,
        seed=1
    )
    iso.train()

1.25 s ± 851 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [10]:
%%timeit
with open(os.devnull, "w") as f, contextlib.redirect_stdout(f):
    iso = H2OExtendedIsolationForestEstimator(
        training_frame=X,
        ntrees=100,
        sample_size=6435,
        extension_level=1,
        seed=1
    )
    iso.train()

6.2 s ± 30.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


### Mid-sized dataset: CovType (581,012 rows, 54 columns)

In [11]:
from sklearn.datasets import fetch_covtype

# Fetch CovType as (features, labels); the features are converted to a
# Fortran-ordered float64 array and wrapped in an H2OFrame that replaces `X`.
X, y = fetch_covtype(return_X_y=True)
X = h2o.H2OFrame(
    np.asfortranarray(X).astype(np.float64)
)
# Last expression: show the frame's shape as the cell output.
X.shape

(581012, 54)

In [12]:
%%timeit
with open(os.devnull, "w") as f, contextlib.redirect_stdout(f):
    iso = H2OIsolationForestEstimator(
        training_frame=X,
        ntrees=100,
        sample_size=256,
        max_depth=int(np.ceil(np.log2(256))),
        seed=1
    )
    iso.train()

7.32 s ± 26.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [13]:
%%timeit
with open(os.devnull, "w") as f, contextlib.redirect_stdout(f):
    iso = H2OIsolationForestEstimator(
        training_frame=X,
        ntrees=100,
        sample_size=1024,
        max_depth=int(np.ceil(np.log2(1024))),
        seed=1
    )
    iso.train()

8.79 s ± 317 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [14]:
%%timeit
with open(os.devnull, "w") as f, contextlib.redirect_stdout(f):
    iso = H2OIsolationForestEstimator(
        training_frame=X,
        ntrees=100,
        sample_size=10000,
        max_depth=int(np.ceil(np.log2(10000))),
        seed=1
    )
    iso.train()

11.8 s ± 1.06 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [15]:
%%timeit
with open(os.devnull, "w") as f, contextlib.redirect_stdout(f):
    iso = H2OExtendedIsolationForestEstimator(
        training_frame=X,
        ntrees=100,
        sample_size=256,
        extension_level=1,
        seed=1
    )
    iso.train()

651 ms ± 71 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [16]:
%%timeit
with open(os.devnull, "w") as f, contextlib.redirect_stdout(f):
    iso = H2OExtendedIsolationForestEstimator(
        training_frame=X,
        ntrees=100,
        sample_size=1024,
        extension_level=1,
        seed=1
    )
    iso.train()

2.14 s ± 992 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [17]:
%%timeit
with open(os.devnull, "w") as f, contextlib.redirect_stdout(f):
    iso = H2OExtendedIsolationForestEstimator(
        training_frame=X,
        ntrees=100,
        sample_size=10000,
        extension_level=1,
        seed=1
    )
    iso.train()

18.1 s ± 521 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
