In [1]:
# Register the line_profiler IPython extension, which provides the %lprun
# magic used by the timing cells in this notebook.
%load_ext line_profiler

In [2]:
from sklearn.datasets import make_blobs

from divik.cluster import KMeans, DunnDiviK
import divik.cluster._kmeans._core
import divik.cluster._kmeans._gap
import divik.cluster._kmeans._initialization
import divik.cluster._divik._dunn
import divik.score._sampled_gap
import divik.score._gap

In [3]:
# Synthetic benchmark input: 100,000 samples with 100 features generated
# around 10 centers. random_state is pinned so that repeated runs of the
# notebook profile the same data.
X, y = make_blobs(
    n_samples=100_000,
    n_features=100,
    centers=10,
    random_state=42,
)

In [4]:
# KMeans estimator with 10 clusters (same count as the generated centers).
# NOTE(review): init='kdtree' presumably selects the KDTreeInitialization
# that the %lprun cells profile -- confirm against divik.
kmeans = KMeans(
    n_clusters=10,
    init='kdtree',
)

In [5]:
# DunnDiviK model whose fit() is line-profiled in the last timing cell.
# NOTE(review): n_jobs=1 presumably keeps all work in this process so that
# line_profiler can observe it -- confirm.
mdl = DunnDiviK(
    distance='euclidean',
    filter_type='auto',
    n_jobs=1,
    features_percentage=0.2,
    verbose=True,
)

# Generic timing: KMeans fit (initialization and core routine)

In [6]:
# Line-profile one KMeans fit on X, collecting per-line timings for the
# KD-tree initializer's __call__ and for the core _KMeans.__call__.
%lprun \
    -f divik.cluster._kmeans._initialization.KDTreeInitialization.__call__ \
    -f divik.cluster._kmeans._core._KMeans.__call__ \
    kmeans.fit(X)

Timer unit: 1e-06 s

Total time: 0.545768 s
File: /app/divik/cluster/_kmeans/_core.py
Function: __call__ at line 113

Line #      Hits         Time  Per Hit   % Time  Line Contents
   113                                               def __call__(self, data: Data, number_of_clusters: int) \
   114                                                       -> Tuple[IntLabels, Centroids]:
   115         1         21.0     21.0      0.0          _validate_kmeans_input(data, number_of_clusters)
   116         1         12.0     12.0      0.0          if number_of_clusters == 1:
   117                                                       return np.zeros((data.shape[0], 1), dtype=int), \
   118                                                              np.mean(data, axis=0, keepdims=True)
   119         1         70.0     70.0      0.0          data = data.reshape(data.shape, order='C')
   120         1         13.0     13.0      0.0          if self.normalize_rows:
   121                     

# Initialization timing

In [7]:
# Line-profile the initialization helpers during one KMeans fit on X:
# KDTreeInitialization.__call__, _find_residuals and make_tree.
%lprun \
    -f divik.cluster._kmeans._initialization.KDTreeInitialization.__call__ \
    -f divik.cluster._kmeans._initialization._find_residuals \
    -f divik.cluster._kmeans._initialization.make_tree \
    kmeans.fit(X)

Timer unit: 1e-06 s

Total time: 0.008778 s
File: /app/divik/cluster/_kmeans/_initialization.py
Function: _find_residuals at line 26

Line #      Hits         Time  Per Hit   % Time  Line Contents
    26                                           def _find_residuals(data: Data, sample_weight=None) -> np.ndarray:
    27         1         10.0     10.0      0.1      features = data.T
    28         1         11.0     11.0      0.1      assumed_ys = features[0]
    29         1         40.0     40.0      0.5      modelled_xs = np.hstack([np.ones((data.shape[0], 1)),
    30         1        186.0    186.0      2.1                              features[1:].T])
    31         1         20.0     20.0      0.2      lr = LinearRegression().fit(modelled_xs, assumed_ys,
    32         1       7563.0   7563.0     86.2                                  sample_weight=sample_weight)
    33         1        940.0    940.0     10.7      residuals = np.abs(lr.predict(modelled_xs) - assumed_ys)
    34     

In [8]:
# Line-profile the same three initialization helpers, this time while
# fitting the DunnDiviK model on X.
%lprun \
    -f divik.cluster._kmeans._initialization.KDTreeInitialization.__call__ \
    -f divik.cluster._kmeans._initialization._find_residuals \
    -f divik.cluster._kmeans._initialization.make_tree \
    mdl.fit(X)

  0%|          | 0/100000 [00:00<?, ?it/s]
  0%|          | 0/2 [00:00<?, ?it/s][A
 50%|█████     | 1/2 [00:15<00:15, 15.81s/it][A
100%|██████████| 2/2 [00:24<00:00, 13.81s/it][A
                                             [A
  0%|          | 0/9 [00:00<?, ?it/s][A
 11%|█         | 1/9 [00:00<00:01,  4.69it/s][A
 22%|██▏       | 2/9 [00:00<00:01,  4.94it/s][A
 33%|███▎      | 3/9 [00:00<00:01,  5.20it/s][A
 44%|████▍     | 4/9 [00:00<00:00,  5.22it/s][A
 56%|█████▌    | 5/9 [00:00<00:00,  4.92it/s][A
 67%|██████▋   | 6/9 [00:01<00:00,  4.82it/s][A
 78%|███████▊  | 7/9 [00:01<00:00,  4.65it/s][A
 89%|████████▉ | 8/9 [00:01<00:00,  4.88it/s][A
100%|██████████| 9/9 [00:01<00:00,  4.98it/s][A
                                             [A
  0%|          | 0/2 [00:00<?, ?it/s][A
 50%|█████     | 1/2 [00:07<00:07,  7.86s/it][A
 10%|█         | 10000/100000 [00:39<05:58, 251.00it/s]
  0%|          | 0/2 [00:00<?, ?it/s][A
 50%|█████     | 1/2 [00:05<00:05,  5.45s/it][A
 2

Timer unit: 1e-06 s

Total time: 0.496525 s
File: /app/divik/cluster/_kmeans/_initialization.py
Function: _find_residuals at line 26

Line #      Hits         Time  Per Hit   % Time  Line Contents
    26                                           def _find_residuals(data: Data, sample_weight=None) -> np.ndarray:
    27       140       1694.0     12.1      0.3      features = data.T
    28       140       2023.0     14.4      0.4      assumed_ys = features[0]
    29       140       5511.0     39.4      1.1      modelled_xs = np.hstack([np.ones((data.shape[0], 1)),
    30       140       8945.0     63.9      1.8                              features[1:].T])
    31       140       3451.0     24.6      0.7      lr = LinearRegression().fit(modelled_xs, assumed_ys,
    32       140     425789.0   3041.3     85.8                                  sample_weight=sample_weight)
    33       140      47720.0    340.9      9.6      residuals = np.abs(lr.predict(modelled_xs) - assumed_ys)
    34     