## Keypoints

* HDF5 is a fast format for storing numerical data
* Dask lets you do larger-than-memory computations
* The abstraction dask.array is much like a numpy.array

In [1]:
import numpy as np
import h5py
import dask.array as da
import glob
import timeit

In [2]:
import matplotlib.pyplot as plt

In [2]:
%matplotlib inline

In [3]:
!du -h data/weather-big/*

8.2G	data/weather-big/2014-01-01.hdf5
8.2G	data/weather-big/2014-01-02.hdf5
8.2G	data/weather-big/2014-01-03.hdf5
8.2G	data/weather-big/2014-01-04.hdf5
8.2G	data/weather-big/2014-01-05.hdf5
8.2G	data/weather-big/2014-01-06.hdf5
8.2G	data/weather-big/2014-01-07.hdf5


In [4]:
!du -h data/weather-small/2014-01-01.csv

1.6M	data/weather-small/2014-01-01.csv


In [5]:
!du -h data/weather-small/2014-01-01.hdf5

84K	data/weather-small/2014-01-01.hdf5


In [6]:
%timeit temps = np.loadtxt('data/weather-small/2014-01-01.csv', delimiter=',')

59.2 ms ± 1.64 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [7]:
%timeit temps = h5py.File('data/weather-small/2014-01-02.hdf5')['/t2m'][...]

2.05 ms ± 153 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [27]:
dset = h5py.File('data/weather-big/2014-01-01.hdf5')['/t2m']

In [28]:
print(dset)

<HDF5 dataset "t2m": shape (23040, 46080), type "<f8">


In [32]:
temps = dset[...] # gets numpy array from HDF5 dataset

In [31]:
temps.shape

(23040, 46080)

In [13]:
print(temps)

[[ 81.  81.  81. ...,  81.  81.  81.]
 [ 81.  81.  81. ...,  81.  81.  81.]
 [ 81.  81.  81. ...,  81.  81.  81.]
 ..., 
 [ 83.  83.  83. ...,  83.  83.  83.]
 [ 83.  83.  83. ...,  83.  83.  83.]
 [ 83.  83.  83. ...,  83.  83.  83.]]


Next line takes a couple of minute to run

In [None]:
#plt.imshow(temps)
#plt.colorbar()

We can get away with plotting a smaller array:

In [None]:
plt.imshow(temps[::100, ::100])
plt.colorbar()

What if we wanted the mean across all days?

In [50]:
file_list = glob.glob('data/weather-big/*.hdf5')

t1 = timeit.default_timer()
temp_mean = 0
for filename in file_list:
    print(filename)
    dset = h5py.File(filename)['/t2m']
    temps = dset[...]
    temp_mean += temps.sum()
t1 = timeit.default_timer()
print(temp_mean/len(file_list))

data/weather-big/2014-01-06.hdf5
data/weather-big/2014-01-02.hdf5
data/weather-big/2014-01-01.hdf5
data/weather-big/2014-01-04.hdf5
data/weather-big/2014-01-03.hdf5
data/weather-big/2014-01-07.hdf5
data/weather-big/2014-01-05.hdf5
171381091767.0


In [39]:
temps.ravel().shape

(1061683200,)

In [45]:
print(temp_mean/(temps.ravel().shape[0]*len(file_list)))

161.423946208


In [55]:
dsets = [da.from_array(h5py.File(filename)['/t2m'], chunks=(5000, 5000)) for filename in file_list]
temps = da.stack(dsets)
print(temps.mean)

<bound method mean of dask.array<stack, shape=(7, 23040, 46080), dtype=float64, chunksize=(1, 10000, 10000)>>


In [56]:
t1 = timeit.default_timer()
temps.mean().compute()
t2 = timeit.default_timer()
print(t2-t1)

70.35648456995841
