# Isolation forest

An anomaly detection method. 

The idea is beautifully simple. For each node, until a stopping criterion such as the maximum depth is reached:

1. select a column, $x$, uniformly at random
2. select a split point between $\min x$ and $\max x$ uniformly at random (so don't sample from the available observations but create an array that linearly chunks the dimension and sample from that)
3. split into left and right and recurse from 1.

## References

* [Liu et al. 2008, Isolation Forest](https://ieeexplore.ieee.org/abstract/document/4781136)

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import math

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn.datasets as sk_datasets

from random_tree_models.models.decisiontree.visualize import show_tree
import random_tree_models.models.isolationforest as iforest
from random_tree_models.params import ColumnSelectionMethod, ThresholdSelectionMethod

In [None]:
# Shared, seeded NumPy random state so the synthetic data set is reproducible.
rng = np.random.RandomState(seed=42)

## Outlier detection

In [None]:
# Synthetic two-class data set with two features and no redundant ones;
# `class_sep=2` and the shared `rng` are passed straight to scikit-learn.
X, y = sk_datasets.make_classification(
    n_samples=1_000,
    n_features=2,
    n_classes=2,
    n_redundant=0,
    class_sep=2,
    random_state=rng,
)
# Scatter of the two features coloured by label; the trailing semicolon
# suppresses the notebook's echo of the returned object.
sns.scatterplot(x=X[:, 0], y=X[:, 1], hue=y, alpha=0.3);

Let's say `y == 1` is anomalous, i.e. the model is fitted on the `y == 0` samples only.

In [None]:
# Sub-sampling fractions handed to the IsolationForest constructor below.
frac_features = 1  # alternative: math.sqrt(X.shape[1]) / X.shape[1]
frac_subsamples = 2 / 3

# How a split threshold is drawn:
#   * ThresholdSelectionMethod.uniform - random threshold from the linear
#     space between the min and max values in X
#   * ThresholdSelectionMethod.random  - random threshold from the values in X
# NOTE: the introduction describes the uniform variant; this notebook runs
# with the `random` one.
threshold_method = ThresholdSelectionMethod.random

column_method = ColumnSelectionMethod.random

In [None]:
# Split the samples by label: 0 -> inlier, 1 -> outlier, then echo both shapes.
X_inlier, X_outlier = (X[y == label] for label in (0, 1))
X_inlier.shape, X_outlier.shape

In [None]:
# Isolation forest with `n_trees=100` and `max_depth=10`, configured with the
# sampling fractions and selection methods chosen above. The integer
# `random_state` seeds the model independently of the data-generation `rng`.
model = iforest.IsolationForest(
    max_depth=10,
    n_trees=100,
    random_state=42,
    threshold_method=threshold_method,
    # NOTE(review): presumably the number of candidate thresholds per split — confirm
    n_thresholds=1,
    frac_subsamples=frac_subsamples,
    frac_features=frac_features,
    column_method=column_method,
)

In [None]:
# Train on the inliers only; the outliers are kept aside and only scored later.
model.fit(X_inlier)

In [None]:
# Visualize the first tree of the fitted forest (`model.trees_[0]`).
show_tree(model.trees_[0])

In [None]:
# Score every sample (inliers and outliers alike) and peek at the first five
# predictions. The plot titles further down label these values as the mean
# tree depth, so presumably smaller values mean "more anomalous".
y_pred = model.predict(X)
y_pred[:5]

In [None]:
# 100 evenly spaced values per feature, spanning that feature's observed range.
x0, x1 = (np.linspace(X[:, j].min(), X[:, j].max(), 100) for j in (0, 1))

# Full 100 x 100 grid, flattened to one (feature 0, feature 1) pair per row.
X0, X1 = np.meshgrid(x0, x1)
X_plot = np.vstack((X0.ravel(), X1.ravel())).T

In [None]:
# Score the grid points; `y_pred` is overwritten here and feeds the
# pcolormesh plots. The last line echoes the first five predictions.
y_pred = model.predict(X_plot)
y_pred[:5]

In [None]:
# Background: model prediction on the grid; foreground: the samples coloured
# by label.
fig, ax = plt.subplots()
# The flat grid predictions are reshaped back to the meshgrid layout.
im = ax.pcolormesh(X0, X1, y_pred.reshape(X0.shape), alpha=0.2)
fig.colorbar(im, ax=ax)
sns.scatterplot(x=X[:, 0], y=X[:, 1], hue=y, ax=ax, alpha=0.3)
ax.set_title("mean tree depth")
plt.show()

In [None]:
# Same layout as the depth plot, but showing the reciprocal of the prediction
# so that larger values mark more anomalous regions.
fig, ax = plt.subplots()
# NOTE(review): assumes no grid prediction is exactly zero, otherwise the
# division yields inf — confirm.
im = ax.pcolormesh(X0, X1, 1 / y_pred.reshape(X0.shape), alpha=0.2)
fig.colorbar(im, ax=ax)
sns.scatterplot(x=X[:, 0], y=X[:, 1], hue=y, ax=ax, alpha=0.3)
ax.set_title("Anomaly score (1/mean tree depth)")
plt.show()

In [None]:
# Score inliers and outliers separately so their predictions can be compared.
y_pred_inlier, y_pred_outlier = map(model.predict, (X_inlier, X_outlier))

In [None]:
# Summary statistics of the predictions for the inliers (the data the model
# was fitted on).
pd.Series(y_pred_inlier).describe().to_frame()

In [None]:
# Summary statistics of the predictions for the outliers (not seen during
# fitting).
pd.Series(y_pred_outlier).describe().to_frame()

In [None]:
# Overlay the prediction histograms of both groups on one axis.
fig, ax = plt.subplots(figsize=(10, 5))

for scores, label, color in (
    (y_pred_inlier, "inlier", "C0"),
    (y_pred_outlier, "outlier", "C1"),
):
    sns.histplot(x=scores, ax=ax, label=label, color=color, alpha=0.5)

ax.legend()
plt.tight_layout()