In [None]:
# Measuring speed in GFLOPS

In [22]:
import numpy as np
# 1000 x 1000 matrix of ones with dtype 'f8'; it is used as both operands below.
x = np.ones((1000, 1000), dtype='f8')
# Time the product of x with itself; %timeit reports mean ± std. dev. per loop.
%timeit x.dot(x)  # Matrix-matrix multiplication

95.1 ms ± 14.2 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [23]:
def gflops(operations, seconds):
    """Return the throughput, in GFLOPS, of `operations` operations done in `seconds`.

    GFLOPS = operations / seconds / 10**9. Giga means 10**9; note that the
    float literal ``10e9`` would be 10**10, not 10**9.

    Parameters
    ----------
    operations : int or float
        Number of operations performed.
    seconds : int or float
        Elapsed time in seconds; must be positive.

    Returns
    -------
    float
        Billions of operations per second.

    Raises
    ------
    ValueError
        If `seconds` is zero or negative.
    """
    if seconds <= 0:
        raise ValueError("seconds must be positive")
    return operations / seconds / 10**9


# Multiplying two n x n matrices is counted here as n**3 operations (one
# multiply-add per step); counting multiplications and additions separately
# would give 2 * n**3 and double the figure.
# Worst case = mean + one standard deviation of the %timeit result, in ms.
# NOTE(review): 103 ms and 6.71 ms were typed in by hand from a %timeit run —
# confirm they match the timing this notebook currently reports.
gflops(1000**3, (103 + 6.71) / 1000)

9.114939385653086

# Measuring the same thing on dask

In [24]:
import dask.array as da

# Array of ones with shape (10000, 1000, 1000). This operand is not the
# 1000 x 1000 matrix that was timed with NumPy.
# NOTE(review): dask arrays are normally evaluated lazily, so without a
# .compute() call this %timeit presumably measures only the construction of
# the task graph, not the multiplication itself — confirm before comparing
# the result with the NumPy timing.
images = da.ones((10000, 1000, 1000))
%timeit images.dot(images)

  intermediate = blockwise(
  intermediate = blockwise(
  intermediate = blockwise(
  intermediate = blockwise(
  intermediate = blockwise(
  intermediate = blockwise(
  intermediate = blockwise(


181 ms ± 23.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


  intermediate = blockwise(


In [25]:
# Worst-case GFLOPS estimate for the dask run:
# operations / (mean + std. dev., converted from ms to s) / 10**9.
# NOTE(review): 1000**3 is the operation count used for a single 1000 x 1000
# product, while `images` has shape (10000, 1000, 1000); 221 ms and 28.8 ms
# were also typed in by hand — confirm both against the timing cell.
1000**3 / ((221 + 28.8) / 1000) / 1e9

4.00320256204964

We actually had a decrease in performance, but why?

The first clue is the warning raised during the run (it comes from dask, not from Python itself): `Increasing number of chunks by factor of 160`.
Probably dask's default chunk size is small and, on a supercomputer, it would be configured to a larger one.

As we can see, we can't actually execute the task above, because it would require almost 80 GB of RAM (the input array alone is 74.51 GiB).

In [26]:
import h5py

In [27]:
# Open 'myfile.hdf5' in append mode ('a'); the handle `f` is reused by the cells below.
f = h5py.File('myfile.hdf5', mode='a')

In [30]:
# To rerun the tests, the datasets left over from a previous run must be
# removed before they are created again.
# Each delete is guarded by a membership test: on a fresh file (or a fresh
# kernel run from the top) 'A' and 'B' do not exist yet, and an unconditional
# `del f[...]` would raise KeyError.
for dataset_name in ('A', 'B'):
    if dataset_name in f:
        del f[dataset_name]

In [31]:
# Dataset 'A': shape (200000, 4000), dtype 'f8', stored in 250 x 250 chunks,
# with a fill value of 1.0.
A = f.create_dataset(
    'A',
    shape=(200000, 4000),
    dtype='f8',
    chunks=(250, 250),
    fillvalue=1.0,
)

In [32]:
# Dataset 'B': shape (4000, 4000), dtype 'f8', stored in 250 x 250 chunks,
# with a fill value of 1.0.
B = f.create_dataset(
    'B',
    shape=(4000, 4000),
    dtype='f8',
    chunks=(250, 250),
    fillvalue=1.0,
)

I don't know what he meant by "doing blaze things", or why we need blaze.

In [33]:
import blaze

In [34]:
# Print the built-in help for the blaze package to see which submodules and
# names it provides.
help(blaze)

Help on package blaze:

NAME
    blaze

PACKAGE CONTENTS
    _version
    cached
    compatibility
    compute (package)
    deprecation
    dispatch
    expr (package)
    index
    interactive
    mongo
    partition
    server (package)
    sql
    tests (package)
    types
    utils

SUBMODULES
    datetime

DATA
    Cheap = (<class 'blaze.expr.collections.Head'>, <class 'blaze.expr.exp...
    Sequence = (<class 'tuple'>, <class 'list'>, <class 'collections.abc.I...
    abs = <dispatched abs>
        Multiply dispatched method: abs
        
        Inputs: <object>
        -----------------
        Return the absolute value of the argument.
        
        Inputs: <ndarray>
        ------------------
        absolute(x, /, out=None, *, where=True, casting='same_kind', order='K', dtype=None, subok=True[, signature, extobj])
        
        Calculate the absolute value element-wise.
        
        ``np.abs`` is a shorthand for this function.
        
        Parameters
        --

In [38]:
from dask.array.core import Array
from blaze import data, into

# Ask blaze's `into` to produce dask Arrays from the HDF5 datasets /A and /B
# in myfile.hdf5, requesting 1000 x 1000 blocks.
# NOTE(review): as recorded in this cell's output, this raises NodeNotFound
# (the h5py Dataset source or the dask Array target is not in the conversion
# graph). dask's own `da.from_array(f['A'], chunks=(1000, 1000))` might be a
# replacement that avoids the conversion — confirm before switching.
a = into(Array, 'myfile.hdf5::/A', blockshape=(1000, 1000))  # dask things
b = into(Array, 'myfile.hdf5::/B', blockshape=(1000, 1000))
# Wrap the results with blaze's `data`. This rebinds A and B, which earlier
# cells bound to the h5py datasets returned by create_dataset.
A = data(a)  # Blaze things
B = data(b)

NodeNotFound: Either source <class 'h5py._hl.dataset.Dataset'> or target <class 'dask.array.core.Array'> is not in G

In [21]:
import dask.array as da

# Re-create the (10000, 1000, 1000) array of ones. Leaving `images` as the
# last expression displays dask's summary (bytes, shape, chunks, dtype).
images = da.ones((10_000, 1_000, 1_000))
images

Unnamed: 0,Array,Chunk
Bytes,74.51 GiB,119.21 MiB
Shape,"(10000, 1000, 1000)","(250, 250, 250)"
Count,640 Tasks,640 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 74.51 GiB 119.21 MiB Shape (10000, 1000, 1000) (250, 250, 250) Count 640 Tasks 640 Chunks Type float64 numpy.ndarray",1000  1000  10000,

Unnamed: 0,Array,Chunk
Bytes,74.51 GiB,119.21 MiB
Shape,"(10000, 1000, 1000)","(250, 250, 250)"
Count,640 Tasks,640 Chunks
Type,float64,numpy.ndarray


Now, it works!