Problem:  

How to handle data processing if your dataset(200GB) is bigger than your memory size (64GB)?

In [56]:
import numpy as np

shape = (20000,64,64,64)
l = reduce(lambda x,y:x*y, shape)
data = np.arange(l, dtype=np.float32).reshape(shape)


npy_file = './test_chunks.npy'
time_start = time.clock()
np.save(npy_file, data)
print('  save npy took {}s'.format(time.clock() - time_start))
print('  raw data is {} GB'.format(data.nbytes/1024/1024/1024.))


  save npy took 16.20359s
  raw data is 19.53125 GB


In [57]:
import os
import h5py
import time

file_chunks_true = './test_chunks_true.hdf5'

def save_default_chunk():
    time_start = time.clock()
    with h5py.File(file_chunks_true, 'w') as outfile:
        dset = outfile.create_dataset('test_data', data=data, chunks=True, compression='lzf')
        print('saving chunks=True took {}s'.format(time.clock() - time_start))


def read(f):
    data = h5py.File(f, 'r')
    return data['test_data']

def load(f):
    time_start = time.clock()
    data = read(f)
    print('read {} {}s'.format(f, time.clock() - time_start))

    time_start = time.clock()
    num_slices = 128
    import random
    r = random.sample(range(1, shape[0]), num_slices)
    r.sort()
    vol = data[r,:,:,:]
    print('load {} slices {}s'.format(num_slices, time.clock() - time_start))

save_default_chunk()   # 73s to save
load(file_chunks_true)   # takes 322s to load slice




saving chunks=True took 75.789275s
read ./test_chunks_true.hdf5 0.000770999999986s
load 128 slices 31.665066s


In [58]:
file_manual_chunks = './test_chunks_manual.hdf5'  # manually set chunk_size

def save_manual_chunk():    
    time_start = time.clock()
    with h5py.File(file_manual_chunks, 'w') as outfile:
        dset = outfile.create_dataset('test_data', data=data, chunks=(1,64,64,64), compression='lzf')  #manual chunks
        print('saving {} {}s'.format(file, time.clock() - time_start))

save_manual_chunk()   # 55s to save
load(file_manual_chunks)   # takes 68s to load 10*128*(64,64,64) slices


saving /data/luna16/test_chunks_manual.hdf5 53.264433s
read ./test_chunks_manual.hdf5 0.000420000000076s
load 128 slices 7.777437s


In [2]:
ls -hl *chunks*

-rw-rw-r-- 1 jack jack 635M  7月  7 17:25 test_chunks_manual.hdf5
-rw-rw-r-- 1 jack jack  20G  7月  7 17:22 test_chunks.npy
-rw-rw-r-- 1 jack jack 5.4G  7月  7 17:24 test_chunks_true.hdf5


In [None]:
Contact:  