In [1]:
import numpy as np
import dask.array as da

# PLEASE
Run only the cells you need; several of the benchmarks below take minutes to finish. Thanks.

## Simultaneous max and min

In [24]:
def minmax(x):
    """Return the minimum and maximum of ``x`` along axis 1 with one reduction.

    A sign axis is inserted so that a single ``np.max`` call reduces both
    ``-x`` and ``x``; the maximum of ``-x`` is the negated minimum of ``x``,
    so that half of the result is negated back before returning.

    Parameters
    ----------
    x : np.ndarray
        3-D array of shape ``(n, k, c)``; the reduction runs over the ``k`` axis.

    Returns
    -------
    np.ndarray
        Array of shape ``(n, 2, c)`` with the minima in ``[:, 0]`` and the
        maxima in ``[:, 1]`` (the same layout that ``minmax2`` produces).
    """
    signs = np.array([-1, 1])[:, None, None]
    y = np.max(x[:, None, :] * signs, axis=2)
    # y[:, 0] currently holds max(-x) == -min(x); flip the sign to get min(x).
    y[:, 0] *= -1
    return y

def minmax2(x):
    """Return the minimum and maximum of ``x`` along axis 1, stacked together.

    The two reductions are computed separately and joined on a new axis at
    position 1, so ``result[:, 0]`` holds the minima and ``result[:, 1]``
    the maxima.
    """
    lows = np.min(x, axis=1)
    highs = np.max(x, axis=1)
    return np.stack((lows, highs), axis=1)

In [32]:
# 100000 groups of 50 rows x 2 columns; both functions reduce over the 50-row axis.
x = np.arange(10000000).reshape(-1,50,2)

# Equivalence check left disabled; re-enable it only once minmax and minmax2
# are confirmed to return the same values.
#assert np.allclose(minmax(x), minmax2(x))

# Time both implementations on the same input.
%timeit minmax(x)
%timeit minmax2(x)

633 ms ± 90.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
267 ms ± 19.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## Is padding with `np.nan` as fast as using a shorter matrix?

In [14]:
# Baseline: 500000 x 2 integer matrix.
x = np.arange(1000000).reshape(-1,2)
print(x.shape)

%timeit x + 2 - np.array([2,20])[None]

# Same data padded with 500000 all-NaN rows; the NaN fill makes y a float array,
# so this case differs from the baseline in dtype as well as in length.
y = np.vstack([x, np.full((500000, 2), np.nan)])
print(y.shape)

%timeit y + 2 - np.array([2,20])[None]

# Same shape as y, but holding only real (integer) values: x stacked on itself.
z = np.vstack([x, x])
print(z.shape)

%timeit z + 2 - np.array([2,20])[None]

(500000, 2)
9.49 ms ± 1.15 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)
(1000000, 2)
21 ms ± 3.95 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
(1000000, 2)
21 ms ± 2.16 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


## Fastest way to check that a row is all `False` in NumPy

In [3]:
# 10000 x 1000 boolean matrix that is False everywhere except two entries,
# so exactly two rows (2 and 1000) contain a True value.
x = np.full((10000,1000), False)
x[2,5] = True
x[1000,10] = True

In [7]:
# Four ways to flag the rows of x that contain no True value.
# Negate every element first, then reduce each row with all().
%timeit np.all(np.logical_not(x), axis=1)
# Reduce each row with any() first, then negate the much shorter result.
%timeit np.logical_not(np.any(x, axis=1))
%timeit ~np.any(x, axis=1)
# Operator/method spelling of the negate-then-reduce variant.
%timeit (~x).all(axis=1)

6 ms ± 1.5 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)
1.24 ms ± 78.1 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
1.14 ms ± 5.85 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
4.26 ms ± 39.6 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


## Is `map_blocks` better than the standard operation?

In [6]:
y = da.from_array(np.ones((1000,1000,200)))

print('Simple subtraction')
%timeit (y - 10).compute()
%timeit y.map_blocks(lambda x: x - 10).compute()

print('Some references to y')
%timeit (y - 10 + 2*y).compute()
%timeit y.map_blocks(lambda x: x - 10 + 2*x).compute()

print('Multiple references to y')
%timeit (y + 2*y - 3*y + 4*y - 5*y).compute()
%timeit y.map_blocks(lambda x: x + 2*x - 3*x + 4*x - 5*x).compute()

print('NumPy function')
%timeit (np.exp(y+2)).compute()
%timeit y.map_blocks(lambda x: np.exp(x)).compute()

Simple subtraction
9.77 s ± 2.38 s per loop (mean ± std. dev. of 7 runs, 1 loop each)
8.31 s ± 873 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
Some references to y
14.2 s ± 1.56 s per loop (mean ± std. dev. of 7 runs, 1 loop each)
14.9 s ± 1.49 s per loop (mean ± std. dev. of 7 runs, 1 loop each)
Multiple references to y
28.9 s ± 3.69 s per loop (mean ± std. dev. of 7 runs, 1 loop each)
10.7 s ± 458 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
NumPy function
10.5 s ± 1.54 s per loop (mean ± std. dev. of 7 runs, 1 loop each)
9.45 s ± 976 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## Is `map_blocks` better/worse than the corresponding Dask function?

In [7]:
# 1000 x 1000 x 20 array filled with 1.5, wrapped as a Dask array
# (no explicit chunks are passed to from_array).
y = da.from_array(np.ones((1000,1000,20)) + 0.5)

# NumPy's floor applied per block through map_blocks vs. Dask's own floor.
%timeit y.map_blocks(lambda x: np.floor(x)).compute()
%timeit da.floor(y).compute()

282 ms ± 87.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
178 ms ± 18.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## Does `da.unique` work?

In [16]:
# Every value of arange(10000) is distinct, so the first-occurrence indices
# returned by return_index=True should simply be 0..9999.
y = da.from_array(np.arange(10000))
da.unique(y, return_index=True)[1].compute()

array([   0,    1,    2, ..., 9997, 9998, 9999])

## What about `argsort`?

In [18]:
y = da.from_array(np.arange(10000))
# The missing attribute is the point of this cell: catch the expected
# AttributeError and display it, so that Restart & Run All can continue
# past this cell instead of stopping here.
try:
    y = y[da.argsort(y)]
except AttributeError as err:
    print('{}: {}'.format(type(err).__name__, err))

AttributeError: module 'dask.array' has no attribute 'argsort'

## What's the best way to create a matrix such that different submatrices are filled with different rules?

In [13]:
def fill1():
    """Build the 9900 x 100 test matrix by assigning into a preallocated array.

    The matrix consists of 99 consecutive blocks of 100 rows. Every row of
    block ``i`` (1-based) is ``i * arange(100*(i-1), 100*i)`` reversed; the
    single row is broadcast over the whole block by the slice assignment.

    Returns
    -------
    np.ndarray
        Float array of shape ``(9900, 100)``.
    """
    n_blocks, block_len = 99, 100
    out = np.zeros((n_blocks * block_len, block_len), dtype=float)
    for k in range(n_blocks):
        start, stop = k * block_len, (k + 1) * block_len
        scale = k + 1
        row = scale * np.arange(start, stop)[::-1]
        # Assigning one row to a 100-row slice repeats it via broadcasting.
        out[start:stop] = row
    return out

def fill2():
    """Build the 9900 x 100 test matrix by stacking repeated rows.

    For each block ``i`` in 1..99 the row ``i * arange(100*(i-1), 100*i)``
    reversed is repeated 100 times, and the resulting blocks are stacked
    vertically. No float conversion is applied, so the result keeps the
    integer dtype produced by ``np.arange``.

    Returns
    -------
    np.ndarray
        Integer array of shape ``(9900, 100)``.
    """
    blocks = []
    for i in range(1, 100):
        row = i * np.arange(i * 100 - 100, i * 100)[::-1]
        # row[None] has shape (1, 100); repeating along axis 0 gives (100, 100).
        blocks.append(np.repeat(row[None], repeats=100, axis=0))
    return np.vstack(blocks)


# Slice assignment into a preallocated array (fill1) vs. repeat + vstack (fill2).
%timeit fill1()
%timeit fill2()

3.02 ms ± 24.9 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
8.46 ms ± 73.8 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


## What's the best way to fill a matrix taking values from a vector according to some indexing?

In [50]:
from itertools import chain

def generate_binified_points_matrix(pts, indexes, bins_size):
    """Yield, for every bin, its points followed by the zero rows that pad it.

    Parameters
    ----------
    pts : np.ndarray
        2-D array of points, one point per row.
    indexes : sequence of index arrays
        One entry per bin; each entry selects the rows of ``pts`` that belong
        to that bin.
    bins_size : int
        Number of rows every padded bin must have. Bins with more than
        ``bins_size`` members are not supported (the padding slice bound
        would become negative).

    Yields
    ------
    tuple of (np.ndarray, np.ndarray)
        ``(pts[bin], zeros)`` where the two arrays together have exactly
        ``bins_size`` rows.
    """
    # A full bin's worth of zero rows, so that even an empty bin can be padded
    # up to bins_size rows (a buffer of bins_size - 1 rows falls one short).
    zero_rows = np.zeros((bins_size, pts.shape[1]))
    for bin_indexes in indexes:
        yield pts[bin_indexes], zero_rows[: bins_size - len(bin_indexes)]
        
def bins1(pts, indexes_inside_bins, biggest_bin):
    """Build the padded (bins, biggest_bin, components) array via one vstack.

    The generator supplies, per bin, the selected points and their zero
    padding; all pieces are flattened into one tuple, stacked vertically and
    then reshaped so that each bin occupies one slab of ``biggest_bin`` rows.
    """
    per_bin_pieces = generate_binified_points_matrix(
        pts, indexes_inside_bins, biggest_bin
    )
    # Flatten the (points, padding) pairs into a single sequence of arrays.
    flat_pieces = tuple(chain.from_iterable(per_bin_pieces))
    stacked = np.vstack(flat_pieces)
    n_bins = len(indexes_inside_bins)
    return stacked.reshape(n_bins, biggest_bin, pts.shape[1])

def bins2(pts, indexes_inside_bins, biggest_bin):
    """Build the padded (bins, biggest_bin, components) array by direct assignment.

    A zero array of the final shape is allocated up front; the points of each
    bin are then copied into the leading rows of that bin's slab, leaving the
    remaining rows as zero padding.
    """
    n_components = pts.shape[1]
    padded = np.zeros((len(indexes_inside_bins), biggest_bin, n_components))
    for slot, members in enumerate(indexes_inside_bins):
        selected = pts[members]
        padded[slot, : len(selected)] = selected
    return padded

In [53]:
for n_components in (1,2,3,5,10,100):
    pts = np.random.rand(10000, n_components)
    indexes_inside_bins = np.arange(10000)[::-1]
    split_sizes = [100,200,400,50, 50, 1000, 1500, 500, 900, 100, 500, 1000, 500, 100, 200, 200, 
                   500, 1000, 1, 1, 1, 7, 80, 5]
    indexes_inside_bins = np.split(indexes_inside_bins, np.cumsum(split_sizes))

    M = np.max(split_sizes)
    
    print('n components = {}'.format(n_components))
    %timeit bins1(pts, indexes_inside_bins, M)
    %timeit bins2(pts, indexes_inside_bins, M)

n components = 1
172 µs ± 2.28 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
116 µs ± 933 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
n components = 2
408 µs ± 770 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)
346 µs ± 962 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)
n components = 3
372 µs ± 1.53 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
329 µs ± 31.9 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
n components = 5
437 µs ± 3.59 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
359 µs ± 1.64 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
n components = 10
1.09 ms ± 3.8 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
963 µs ± 2.06 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
n components = 100
12.5 ms ± 676 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
9.85 ms ± 136 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
