In [1]:
from sklearn.datasets import make_blobs

from divik.cluster._kmeans._initialization import PercentileInitialization, KDTreeInitialization

In [2]:
def init(n_samples, n_features, initialize, n_clusters=10, random_state=42):
    """Generate a synthetic blob dataset and run a centroid initializer on it.

    The dataset is created inside this function, so timing ``init`` measures
    ``make_blobs`` data generation in addition to the initializer itself.

    Parameters
    ----------
    n_samples : int
        Number of observations to generate.
    n_features : int
        Number of features per observation.
    initialize : callable
        Invoked as ``initialize(X, n_clusters)`` on the generated data.
    n_clusters : int, default 10
        Number of blob centers to generate; the same value is passed to
        ``initialize`` as the number of centroids requested.
    random_state : int, default 42
        Seed forwarded to ``make_blobs`` so every run sees the same data.

    Returns
    -------
    None
        The result of ``initialize`` is discarded.
    """
    X, _ = make_blobs(
        n_samples=n_samples,
        n_features=n_features,
        centers=n_clusters,
        random_state=random_state,
    )
    # A single value drives both the generated centers and the requested
    # centroids, so the two can never drift apart.
    initialize(X, n_clusters)

In [3]:
# Build every initializer once, outside the timed cells, so that constructing
# the objects is not part of what the benchmarks measure.
# Two initialization strategies are compared, each with the 'correlation' and
# the 'euclidean' distance setting.
perc_corr = PercentileInitialization(distance='correlation')
perc_euc = PercentileInitialization(distance='euclidean')
# NOTE(review): the metric is passed positionally here but as `distance=` above;
# presumably it is the same parameter -- confirm against the class signature.
kdtree_corr = KDTreeInitialization('correlation')
kdtree_euc = KDTreeInitialization('euclidean')

# Percentile Correlation

In [4]:
%timeit init(1_000, 5_000, perc_corr)

2.54 s ± 615 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [5]:
%timeit init(10_000, 1_000, perc_corr)

2.27 s ± 316 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [6]:
%timeit init(100_000, 100, perc_corr)

1.78 s ± 82.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [7]:
%timeit init(1_000_000, 100, perc_corr)

22.1 s ± 1.25 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


# Percentile Euclidean

In [8]:
%timeit init(1_000, 5_000, perc_euc)

1.77 s ± 312 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [9]:
%timeit init(10_000, 1_000, perc_euc)

2.09 s ± 250 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [10]:
%timeit init(100_000, 100, perc_euc)

1.44 s ± 58.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [11]:
%timeit init(1_000_000, 100, perc_euc)

21.1 s ± 1.88 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


# KDTree Correlation

In [12]:
%timeit init(1_000, 5_000, kdtree_corr)

558 ms ± 33.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [13]:
%timeit init(10_000, 1_000, kdtree_corr)

917 ms ± 65.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [14]:
%timeit init(100_000, 100, kdtree_corr)

873 ms ± 60.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [15]:
%timeit init(1_000_000, 100, kdtree_corr)

8.63 s ± 175 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


# KDTree Euclidean

In [16]:
%timeit init(1_000, 5_000, kdtree_euc)

513 ms ± 40.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [17]:
%timeit init(10_000, 1_000, kdtree_euc)

970 ms ± 86.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [18]:
%timeit init(100_000, 100, kdtree_euc)

801 ms ± 27.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [19]:
%timeit init(1_000_000, 100, kdtree_euc)

8.43 s ± 68.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [20]:
%load_ext line_profiler
import divik.cluster._kmeans._initialization

In [22]:
%lprun \
    -f divik.cluster._kmeans._initialization.make_tree \
    -f KDTreeInitialization.__call__ \
    init(1_000_000, 100, kdtree_euc)

Timer unit: 1e-06 s

Total time: 2.5209 s
File: /app/divik/cluster/_kmeans/_initialization.py
Function: make_tree at line 130

Line #      Hits         Time  Per Hit   % Time  Line Contents
   130                                           def make_tree(X, leaf_size: int, _feature_idx: int = 0) -> KDTree:
   131                                               """Make KDTree out of the data
   132                                           
   133                                               Construct a KDTree out of data using mean as a pivoting element.
   134                                               Each split makes two segments. The result doesn't contain the original
   135                                               data, just centroids in each box and count of items.
   136                                           
   137                                               Parameters
   139                                               X : array_like, (n_samples, n_features)
   14