In [1]:
from sklearn.datasets import make_blobs

from divik.cluster._kmeans._initialization import PercentileInitialization, KDTreeInitialization

In [2]:
def init(n_samples, n_features, initialize, n_clusters=10):
    """Generate a synthetic blob dataset and run an initializer on it.

    The dataset is regenerated on every call, so timing ``init`` (e.g. with
    ``%timeit``) measures ``make_blobs`` plus the initializer, not the
    initializer alone.

    Parameters
    ----------
    n_samples : int
        Forwarded to ``make_blobs`` as ``n_samples``.
    n_features : int
        Forwarded to ``make_blobs`` as ``n_features``.
    initialize : callable
        Invoked as ``initialize(X, n_clusters)``; its result is discarded.
    n_clusters : int, default 10
        Used both as the number of blob centers to generate and as the second
        argument passed to ``initialize``, keeping the two in sync.

    Returns
    -------
    None
    """
    # Fixed seed so every benchmark run sees the same data for a given shape.
    X, _ = make_blobs(
        n_samples=n_samples,
        n_features=n_features,
        centers=n_clusters,
        random_state=42,
    )
    initialize(X, n_clusters)

In [3]:
# One initializer per (strategy, metric) pair; the benchmark cells below
# refer to these four names.
perc_corr, perc_euc = (
    PercentileInitialization(distance=metric)
    for metric in ('correlation', 'euclidean')
)
kdtree_corr, kdtree_euc = (
    KDTreeInitialization(metric)
    for metric in ('correlation', 'euclidean')
)

# Percentile initialization, correlation distance (`perc_corr`)

In [4]:
# perc_corr on 1,000 samples x 5,000 features (timing includes data generation).
%timeit init(1_000, 5_000, perc_corr)

3.35 s ± 1.67 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [5]:
# perc_corr on 10,000 samples x 1,000 features (timing includes data generation).
%timeit init(10_000, 1_000, perc_corr)

2.32 s ± 223 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [6]:
# perc_corr on 100,000 samples x 100 features (timing includes data generation).
%timeit init(100_000, 100, perc_corr)

2.56 s ± 1.11 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [7]:
# perc_corr on 1,000,000 samples x 100 features (timing includes data generation).
%timeit init(1_000_000, 100, perc_corr)

21.7 s ± 163 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


# Percentile initialization, Euclidean distance (`perc_euc`)

In [8]:
# perc_euc on 1,000 samples x 5,000 features (timing includes data generation).
%timeit init(1_000, 5_000, perc_euc)

1.98 s ± 264 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [9]:
# perc_euc on 10,000 samples x 1,000 features (timing includes data generation).
%timeit init(10_000, 1_000, perc_euc)

2.08 s ± 56.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [10]:
# perc_euc on 100,000 samples x 100 features (timing includes data generation).
%timeit init(100_000, 100, perc_euc)

1.5 s ± 52.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [11]:
# perc_euc on 1,000,000 samples x 100 features (timing includes data generation).
%timeit init(1_000_000, 100, perc_euc)

19.1 s ± 255 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


# KDTree initialization, correlation (`kdtree_corr`)

In [12]:
# kdtree_corr on 1,000 samples x 5,000 features (timing includes data generation).
%timeit init(1_000, 5_000, kdtree_corr)

439 ms ± 7.59 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [13]:
# kdtree_corr on 10,000 samples x 1,000 features (timing includes data generation).
%timeit init(10_000, 1_000, kdtree_corr)

754 ms ± 34.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [14]:
# kdtree_corr on 100,000 samples x 100 features (timing includes data generation).
%timeit init(100_000, 100, kdtree_corr)

729 ms ± 11.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [15]:
# kdtree_corr on 1,000,000 samples x 100 features (timing includes data generation).
%timeit init(1_000_000, 100, kdtree_corr)

7.19 s ± 29.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


# KDTree initialization, Euclidean (`kdtree_euc`)

In [16]:
# kdtree_euc on 1,000 samples x 5,000 features (timing includes data generation).
%timeit init(1_000, 5_000, kdtree_euc)

414 ms ± 12.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [17]:
# kdtree_euc on 10,000 samples x 1,000 features (timing includes data generation).
%timeit init(10_000, 1_000, kdtree_euc)

735 ms ± 46.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [18]:
# kdtree_euc on 100,000 samples x 100 features (timing includes data generation).
%timeit init(100_000, 100, kdtree_euc)

713 ms ± 15 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [19]:
# kdtree_euc on 1,000,000 samples x 100 features (timing includes data generation).
%timeit init(1_000_000, 100, kdtree_euc)

7.24 s ± 102 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [20]:
%load_ext line_profiler
import divik.cluster._kmeans._initialization

In [21]:
# Line-by-line profile of make_tree and KDTreeInitialization.__call__ while
# running kdtree_euc on 1,000,000 samples x 100 features (the same call that
# is timed with %timeit for kdtree_euc).
%lprun \
    -f divik.cluster._kmeans._initialization.make_tree \
    -f KDTreeInitialization.__call__ \
    init(1_000_000, 100, kdtree_euc)

Timer unit: 1e-06 s

Total time: 1.56127 s
File: /app/divik/cluster/_kmeans/_initialization.py
Function: make_tree at line 130

Line #      Hits         Time  Per Hit   % Time  Line Contents
   130                                           def make_tree(X, leaf_size: int, _feature_idx: int = 0) -> KDTree:
   131                                               """Make KDTree out of the data
   132                                           
   133                                               Construct a KDTree out of data using mean as a pivoting element.
   134                                               Each split makes two segments. The result doesn't contain the original
   135                                               data, just centroids in each box and count of items.
   136                                           
   137                                               Parameters
   139                                               X : array_like, (n_samples, n_features)
   1