In [1]:
import numpy as np

from sklearn.datasets import make_blobs
from sklearn.linear_model import LinearRegression

from divik.cluster._kmeans._initialization import _residuals_numba, _lstsq_numba

In [2]:
# Synthetic benchmark input: 100,000 samples with 100 features, generated
# around 15 blob centers. random_state is fixed so every run sees the same data.
# The labels `y` are not used by the cells shown in this notebook.
X, y = make_blobs(n_samples=100_000, n_features=100, centers=15, random_state=42)

In [3]:
def _find_residuals(data: np.ndarray) -> np.ndarray:
    """Fit column 0 of ``data`` against the remaining columns; return residuals.

    The design matrix is the remaining columns of ``data`` with a column of
    ones prepended (the intercept term). Coefficients are obtained from
    ``_lstsq_numba`` and the returned array is whatever ``_residuals_numba``
    produces for that design matrix, target and coefficient vector.

    NOTE(review): whether the residuals are signed or absolute is decided by
    ``_residuals_numba``, which is not visible here -- confirm before
    comparing against other implementations.

    Parameters
    ----------
    data : np.ndarray
        2-D array; rows are observations, columns are features.

    Returns
    -------
    np.ndarray
        Output of ``_residuals_numba`` for the fitted model.
    """
    n_rows = data.shape[0]
    # First column plays the role of the dependent variable.
    target = data[:, 0]
    # Intercept column of ones followed by every remaining feature column.
    intercept = np.ones((n_rows, 1))
    design = np.concatenate((intercept, data[:, 1:]), axis=1)
    weights = _lstsq_numba(design, target)
    return _residuals_numba(design, target, weights)

In [4]:
# Run once before timing. Presumably this lets numba JIT-compile the helpers
# so the %timeit cell measures steady-state speed -- TODO confirm.
# The trailing `None` makes the cell's last expression None, so the large
# residual array is not echoed as cell output.
_find_residuals(X)
None

In [5]:
# Time the numba-helper implementation on the full synthetic dataset.
%timeit _find_residuals(X)

1.23 s ± 76.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [6]:
def _find_residuals_sklearn(data: np.ndarray) -> np.ndarray:
    """Fit column 0 of ``data`` against the remaining columns with scikit-learn.

    A default ``LinearRegression`` is fitted with the first column as the
    target and all other columns as predictors. The function returns the
    absolute difference between the model's predictions on those same
    predictors and the target.

    Parameters
    ----------
    data : np.ndarray
        2-D array; rows are observations, columns are features.

    Returns
    -------
    np.ndarray
        Absolute residuals, one per row of ``data``.
    """
    target = data[:, 0]
    predictors = data[:, 1:]
    model = LinearRegression()
    model.fit(predictors, target)
    fitted = model.predict(predictors)
    return np.abs(fitted - target)

In [7]:
# Time the scikit-learn LinearRegression implementation on the same dataset.
%timeit _find_residuals_sklearn(X)

1.51 s ± 130 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
