# Robust random cut forest

An anomaly detection method that builds on the isolation forest.

Definition 1 of the Robust Random Cut Forest paper (Guha et al. 2016) introduces the Robust Random Cut Tree, which seems to differ from an isolation tree only in how it chooses the column to split on. The isolation tree samples the column uniformly at random. The robust random cut tree instead weights that sampling by the difference between the maximum and minimum values observed for each feature, i.e.
$$ \text{weight of column } i = \frac{l_i}{\sum_j l_j}$$

where $l_i = \max(X_i) - \min(X_i)$ is computed over the values of column $i$ that are available to the current node.

## References

* [Guha et al. 2016, Robust Random Cut Forest Based Anomaly Detection On Streams](https://proceedings.mlr.press/v48/guha16.html)

In [None]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell runs and library edits are picked up without a
# kernel restart.
%load_ext autoreload
%autoreload 2

In [None]:
import math

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn.datasets as sk_datasets

import random_tree_models.decisiontree as dtree
import random_tree_models.isolationforest as iforest

In [None]:
# Seeded random state shared by the notebook so the generated data is reproducible.
rng = np.random.RandomState(seed=42)

## Outlier detection

In [None]:
# Generate a two-feature, two-class toy data set and plot it coloured by class.
# The shared `rng` is passed as random_state, so the draw depends on rng's
# current state.
X, y = sk_datasets.make_classification(
    n_samples=1_000,
    n_features=2,
    n_classes=2,
    n_redundant=0,
    class_sep=2,
    random_state=rng,
)
# The trailing semicolon suppresses the notebook's textual echo of the call's return value.
sns.scatterplot(x=X[:, 0], y=X[:, 1], hue=y, alpha=0.3);

Let's treat the samples with `y == 1` as anomalous and the samples with `y == 0` as inliers.

In [None]:
# Settings that are passed to the IsolationForest constructor further down.
frac_subsamples = 2 / 3
frac_features = 1  # math.sqrt(X.shape[1]) / X.shape[1]

threshold_method = "uniform"  # selects a random threshold from the linear space between the min and max values in X
# threshold_method="random" # selects a random threshold from the values in X
column_method = "largest_delta"  # selects the column with the largest difference between min and max values in X
# NOTE(review): the introduction describes range-weighted sampling of columns,
# whereas the comment above describes always taking the largest range - confirm
# which of the two "largest_delta" implements in random_tree_models.

# Echo the fractions and the number of features. Jupyter only displays the
# last expression of a cell, so this tuple has to come after the assignments.
frac_subsamples, frac_features, X.shape[1]

In [None]:
# Split the samples by label: class 0 rows are the inliers, class 1 rows the outliers.
X_inlier, X_outlier = X[y == 0], X[y == 1]
# Displayed as the cell output: the shapes of both groups.
(X_inlier.shape, X_outlier.shape)

In [None]:
# Configure the forest from random_tree_models.isolationforest using the
# threshold/column methods and sampling fractions chosen earlier in the notebook.
# NOTE(review): max_depth, n_trees and n_thresholds are presumably the per-tree
# depth cap, the ensemble size and the number of candidate thresholds per
# split - confirm against the library.
model = iforest.IsolationForest(
    max_depth=10,
    n_trees=100,
    random_state=42,
    threshold_method=threshold_method,
    n_thresholds=1,
    frac_subsamples=frac_subsamples,
    frac_features=frac_features,
    column_method=column_method,
)

In [None]:
# Train on the inlier samples only; the outlier samples are not shown to the model.
model.fit(X_inlier)

In [None]:
# Pass the first tree of the fitted forest to the decisiontree module's show_tree helper.
dtree.show_tree(model.trees_[0])

In [None]:
# Predict for every sample (inliers and outliers) and peek at the first five values.
# NOTE: y_pred is reassigned with the grid predictions before it is used again.
y_pred = model.predict(X)
y_pred[:5]

In [None]:
# Regular 100 x 100 grid spanning the observed range of both features,
# flattened into one (x0, x1) coordinate pair per row for prediction.
x0, x1 = (
    np.linspace(X[:, col].min(), X[:, col].max(), num=100) for col in (0, 1)
)
X0, X1 = np.meshgrid(x0, x1)
X_plot = np.vstack((X0.ravel(), X1.ravel())).T

In [None]:
# Predict on the plotting grid; these values become the background of the
# pcolormesh plots below. The first five values are echoed as a sanity check.
y_pred = model.predict(X_plot)
y_pred[:5]

In [None]:
# Display the growth_params_ attribute stored on the first tree of the forest.
model.trees_[0].growth_params_

In [None]:
# Background: the grid predictions reshaped to the mesh (labelled as mean tree
# depth by the title). Foreground: the labelled samples.
fig, ax = plt.subplots()
im = ax.pcolormesh(X0, X1, y_pred.reshape(X0.shape), alpha=0.2)
fig.colorbar(im, ax=ax)
# The scatter is drawn after the mesh so the points sit on top of the background.
sns.scatterplot(x=X[:, 0], y=X[:, 1], hue=y, ax=ax, alpha=0.3)
ax.set_title("mean tree depth")
plt.show()

In [None]:
# Same layout as the depth plot, but the background shows the reciprocal of the
# grid predictions, which the title calls the anomaly score.
fig, ax = plt.subplots()
im = ax.pcolormesh(X0, X1, 1 / y_pred.reshape(X0.shape), alpha=0.2)
fig.colorbar(im, ax=ax)
# The scatter is drawn after the mesh so the points sit on top of the background.
sns.scatterplot(x=X[:, 0], y=X[:, 1], hue=y, ax=ax, alpha=0.3)
ax.set_title("Anomaly score (1/mean tree depth)")
plt.show()

In [None]:
# Predict separately for the inlier and the outlier samples (in that order)
# so that the two groups can be compared.
y_pred_inlier, y_pred_outlier = (
    model.predict(group) for group in (X_inlier, X_outlier)
)

In [None]:
# Summary statistics of the inlier predictions, shown as a one-column table.
pd.Series(y_pred_inlier).describe().to_frame()

In [None]:
# Summary statistics of the outlier predictions, shown as a one-column table.
pd.Series(y_pred_outlier).describe().to_frame()

In [None]:
# Overlay the prediction histograms of both groups on a single axis.
fig, ax = plt.subplots(figsize=(10, 5))

# Shared bin edges from 0 up to the largest prediction found in either group.
bins = np.linspace(0, max(y_pred_inlier.max(), y_pred_outlier.max()), num=100)
# Inliers are drawn first, then outliers; underscore-prefixed loop names keep
# the notebook's other globals untouched.
for _values, _label, _color in (
    (y_pred_inlier, "inlier", "C0"),
    (y_pred_outlier, "outlier", "C1"),
):
    sns.histplot(
        x=_values, bins=bins, color=_color, alpha=0.5, label=_label, ax=ax
    )
ax.legend(title="group")
ax.set_title("Mean path length distributions")
plt.tight_layout()

In [None]:
# Overlay the histograms of the reciprocal predictions (the anomaly score
# named in the title) of both groups on a single axis.
fig, ax = plt.subplots(figsize=(10, 5))

# Fixed bin edges covering the interval [0, 1].
bins = np.linspace(0, 1, num=100)
# Inliers are drawn first, then outliers; underscore-prefixed loop names keep
# the notebook's other globals untouched.
for _values, _label, _color in (
    (y_pred_inlier, "inlier", "C0"),
    (y_pred_outlier, "outlier", "C1"),
):
    sns.histplot(
        x=1 / _values, bins=bins, color=_color, alpha=0.5, label=_label, ax=ax
    )
ax.legend(title="group")
ax.set_title("Anomaly score (1/mean path length) distributions")
plt.tight_layout()