In [1]:
import numpy as np
from sklearn.datasets import make_blobs
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KDTree

In [2]:
# Synthetic data: 10,000 points in 100 dimensions drawn around 20 blob
# centres, seeded so the notebook is reproducible.
X, y = make_blobs(
    n_samples=10000,
    n_features=100,
    centers=20,
    random_state=42,
)

In [3]:
# X.shape[1] is the feature count (100), so this evaluates to
# int(0.01 * 100) == 1.
# NOTE(review): if "1% of the samples" was intended this should use
# X.shape[0] instead - confirm before changing, since every recorded
# output in this notebook depends on the current value.
leaf_size = int(0.01 * X.shape[1])
tree = KDTree(X, leaf_size=leaf_size, metric='euclidean')

In [4]:
# Copy the tree's per-node bounding boxes into an ndarray.
# Axis 0 is (lower, upper), axis 1 the node index, axis 2 the feature.
node_bounds = tree.node_bounds
bounds = np.array(node_bounds)
bounds.shape

(2, 16383, 100)

In [5]:
# (lower, upper) bound of feature 0 for node 0.
bounds[:, 0][:, 0]

array([-12.21081851,  12.57636634])

In [6]:
# Range of feature 0 over the whole data set, for comparison with the
# bounds stored on node 0.
X[:, 0].min(), X[:, 0].max()

(-12.210818514035115, 12.576366337001314)

In [7]:
# (lower, upper) bound of feature 0 for node 1.
bounds[:, 1][:, 0]

array([-12.0838077 ,   7.19676458])

In [8]:
# Midpoint of every node's bounding box: the average of its lower and
# upper bound in each feature.
np.mean(bounds, axis=0)

array([[ 0.18277391, -0.51436642, -0.68210713, ...,  0.50199554,
        -0.49262899,  0.21379267],
       [-2.44352156, -0.24541801, -1.48062001, ...,  0.13923568,
        -0.33295147,  0.58457503],
       [ 0.18277391, -0.51436642, -0.68210713, ...,  0.50199554,
        -0.79917621, -1.48478893],
       ...,
       [ 9.10086032,  4.80224593, -3.74003928, ..., -5.89386368,
        -2.16831912,  5.79801827],
       [10.41554162,  5.39352011, -1.28739736, ..., -4.60830813,
        -3.37786247,  5.91997936],
       [10.73240005,  5.10310719, -3.49041989, ..., -6.30168432,
        -2.33656841,  5.16790956]])

In [9]:
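# NOTE: disabled experiment, kept for reference. The bandwidth 0.01428489 is
# unexplained (TODO: document where it comes from, or remove this cell).
# density = tree.kernel_density(bounds.mean(axis=0), 0.01428489, kernel='linear')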

In [10]:
# Radius of each node = half the diagonal of its bounding box, so a ball
# of that radius centred on the box midpoint encloses the whole box.
box_extent = bounds[1] - bounds[0]
radii = np.sqrt((box_extent ** 2).sum(axis=1)) / 2

# Number of data points falling inside each node's enclosing ball.
node_centers = bounds.mean(axis=0)
counts = tree.query_radius(node_centers, radii, count_only=True)
counts

array([10000, 10000, 10000, ...,     1,     1,     0])

In [11]:
# Distribution of the per-node point counts over 10 equal-width bins
# (10 is np.histogram's default, spelled out here).
np.histogram(counts, bins=10)

(array([16169,   138,     5,     7,     8,     1,     2,     3,     3,
           47]),
 array([    0.,  1000.,  2000.,  3000.,  4000.,  5000.,  6000.,  7000.,
         8000.,  9000., 10000.]))

In [12]:
# Copy of the tree's index array.
# NOTE(review): presumably the sample indices of X in tree order - confirm
# against the scikit-learn documentation.
np.array(tree.idx_array, copy=True)

array([6022, 3363,  956, ..., 4648,  795, 7036], dtype=int64)

In [14]:
def nth_sum(n):
    """Return 2**0 + 2**1 + ... + 2**n.

    This is the number of nodes in a perfect binary tree with ``n + 1``
    levels. It uses the closed form ``2**(n + 1) - 1`` rather than
    recursion, so large ``n`` cannot exhaust the recursion limit.

    Parameters
    ----------
    n : int
        Highest exponent in the sum; must be non-negative.

    Returns
    -------
    int
        The value ``2**(n + 1) - 1``.

    Raises
    ------
    ValueError
        If ``n`` is negative (the recursive version never terminated).
    """
    if n < 0:
        raise ValueError(f"n must be non-negative, got {n!r}")
    return 2 ** (n + 1) - 1

# Node totals for trees with 1..15 levels, to compare with bounds.shape[1].
list(map(nth_sum, range(15)))

[1, 3, 7, 15, 31, 63, 127, 255, 511, 1023, 2047, 4095, 8191, 16383, 32767]

In [31]:
# scikit-learn stores the KD-tree as a perfect binary tree in heap order,
# so the leaves are the last level and there are (n_nodes + 1) // 2 of them.
# The previous "(n_nodes + 1) // 2 - 1" is the number of *internal* nodes
# and silently dropped the first leaf.
n_nodes = bounds.shape[1]
n_leaves = (n_nodes + 1) // 2
leaf_bounds = bounds[:, -n_leaves:, :]
leaf_bounds.shape

(2, 8192, 100)

In [32]:
# Centre of each leaf box, and the smallest half-width of that box over
# all features (distance from the centre to the nearest lower face).
midpoints = np.mean(leaf_bounds, axis=0)
half_widths = midpoints - leaf_bounds[0]
r = half_widths.min(axis=1)

In [36]:
# Number of nodes whose lower and upper bounds differ in *every* feature.
# .all(axis=1) replaces the comparison against a hard-coded 100, so the
# check keeps working if the number of features changes.
(bounds[0] != bounds[1]).all(axis=1).sum()

9999