# Creating datasets

Creating datasets with PyTables via NumPy arrays is easy:

In [1]:
import numpy as np
import tables as tb

# Create a new file
f = tb.open_file("atest.h5", "w")

# Create a NumPy array
a = np.arange(100).reshape(20,5)

# Save the array
f.create_array(f.root, "array1", a)

/array1 (Array(20, 5)) ''
  atom := Int64Atom(shape=(), dflt=0)
  maindim := 0
  flavor := 'numpy'
  byteorder := 'little'
  chunkshape := None

In [2]:
# Peek data
f.root.array1[:]

array([[ 0,  1,  2,  3,  4],
       [ 5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14],
       [15, 16, 17, 18, 19],
       [20, 21, 22, 23, 24],
       [25, 26, 27, 28, 29],
       [30, 31, 32, 33, 34],
       [35, 36, 37, 38, 39],
       [40, 41, 42, 43, 44],
       [45, 46, 47, 48, 49],
       [50, 51, 52, 53, 54],
       [55, 56, 57, 58, 59],
       [60, 61, 62, 63, 64],
       [65, 66, 67, 68, 69],
       [70, 71, 72, 73, 74],
       [75, 76, 77, 78, 79],
       [80, 81, 82, 83, 84],
       [85, 86, 87, 88, 89],
       [90, 91, 92, 93, 94],
       [95, 96, 97, 98, 99]])

In [3]:
# Slice and dice (only these slices are loaded into memory)
ta = f.root.array1
ta[1:10:3,2:5]

array([[ 7,  8,  9],
       [22, 23, 24],
       [37, 38, 39]])

In [4]:
# Make sure that the read data is the same than the original
np.allclose(ta[1:10:3,2:5], a[1:10:3,2:5])

True

In [5]:
# Create another array
ta2 = f.create_array(f.root, "array2", np.arange(10))

In [6]:
# Let's have a look at the size of the underlying file
!ls -l atest.h5

-rw-r--r--  1 Gonzalo  staff  0 Oct 19 10:41 atest.h5


In [7]:
# Flush data to the file (very important to keep all your data safe!)
f.flush()

In [8]:
!ls -l atest.h5

-rw-r--r--  1 Gonzalo  staff  3024 Oct 19 10:48 atest.h5


In [9]:
f.close()  # close access to file

In [10]:
# Look at its contents by using `ptdump` utility
! ptdump atest.h5

/ (RootGroup) ''
/array1 (Array(20, 5)) ''
/array2 (Array(10,)) ''


In [11]:
# Reopen the file and revisit the datasets there
f = tb.open_file("atest.h5", mode="r")  # note the 'r'ead mode

In [12]:
# Get the summary of the contents
f

File(filename=atest.h5, title='', mode='r', root_uep='/', filters=Filters(complevel=0, shuffle=False, fletcher32=False, least_significant_digit=None))
/ (RootGroup) ''
/array1 (Array(20, 5)) ''
  atom := Int64Atom(shape=(), dflt=0)
  maindim := 0
  flavor := 'numpy'
  byteorder := 'little'
  chunkshape := None
/array2 (Array(10,)) ''
  atom := Int64Atom(shape=(), dflt=0)
  maindim := 0
  flavor := 'numpy'
  byteorder := 'little'
  chunkshape := None

In [13]:
f.root.array1

/array1 (Array(20, 5)) ''
  atom := Int64Atom(shape=(), dflt=0)
  maindim := 0
  flavor := 'numpy'
  byteorder := 'little'
  chunkshape := None

In [14]:
f.root.array1[:]

array([[ 0,  1,  2,  3,  4],
       [ 5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14],
       [15, 16, 17, 18, 19],
       [20, 21, 22, 23, 24],
       [25, 26, 27, 28, 29],
       [30, 31, 32, 33, 34],
       [35, 36, 37, 38, 39],
       [40, 41, 42, 43, 44],
       [45, 46, 47, 48, 49],
       [50, 51, 52, 53, 54],
       [55, 56, 57, 58, 59],
       [60, 61, 62, 63, 64],
       [65, 66, 67, 68, 69],
       [70, 71, 72, 73, 74],
       [75, 76, 77, 78, 79],
       [80, 81, 82, 83, 84],
       [85, 86, 87, 88, 89],
       [90, 91, 92, 93, 94],
       [95, 96, 97, 98, 99]])

In [15]:
# Always close your files when you are done (or use contexts)
f.close()

## Exercise

Create a new HDF5 file with 2 arrays on it.  One should be 2-dimensional and the other the result of summing the 2nd dimension (.sum(axis=1)).  Use contexts so that you don't have to close the file explicitly.

### Solution

In [24]:
import tables as tb
import numpy as np

nevt = 1000
NROWS = 1000
NCOLS = 100

try:
    h5f.close()
except:
    pass

h5f = tb.open_file('/Users/Gonzalo/github/Training-Next-Collaboration/data/test1.h5','w',
                   filters=tb.Filters(complib='zlib', complevel=1))

h5f.create_group(h5f.root,'DATA')
arr2 = np.random.randint(0,high=100,size=(NROWS,NCOLS) )
arr1 = arr2.sum(axis=1)

array2 = h5f.create_array(h5f.root.DATA, "D2", arr2)
array1 = h5f.create_array(h5f.root.DATA, "D1", arr1)
    
array2.flush()
array1.flush()
h5f.close()


In [22]:
import tables as tb
import numpy as np

nevt = 1000
NROWS = 1000
NCOLS = 100

try:
    h5f.close()
except:
    pass

h5f = tb.open_file('/Users/Gonzalo/github/Training-Next-Collaboration/data/test1.h5','w',
                   filters=tb.Filters(complib='zlib', complevel=1))

h5f.create_group(h5f.root,'DATA')

array2 = h5f.create_earray(h5f.root.DATA, "D2",
                           atom=tb.Int16Atom(),     #not Float32! bad for compression
                           shape=(0, NROWS, NCOLS),
                           expectedrows=nevt)

array1 = h5f.create_earray(h5f.root.DATA, "D1",
                           atom=tb.Int16Atom(),     #not Float32! bad for compression
                           shape=(0, NROWS),
                           expectedrows=nevt)



for i in range(nevt):
    if not i%100:print('evt {}'.format(i))
    numbers = np.random.randint(0,high=100,size=(NROWS,NCOLS) )
    sumnumb = numbers.sum(axis=1)
    array2.append( numbers.reshape(1,NROWS,NCOLS) )
    array1.append( sumnumb.reshape(1,NROWS) )
    
array2.flush()
array1.flush()
h5f.close()


evt 0
evt 100
evt 200
evt 300
evt 400
evt 500
evt 600
evt 700
evt 800
evt 900


# Playing with the object tree

In [25]:
# Re-open the existing file in 'a'ppend mode
f = tb.open_file("atest.h5", "a")

In [26]:
f

File(filename=atest.h5, title='', mode='a', root_uep='/', filters=Filters(complevel=0, shuffle=False, fletcher32=False, least_significant_digit=None))
/ (RootGroup) ''
/array1 (Array(20, 5)) ''
  atom := Int64Atom(shape=(), dflt=0)
  maindim := 0
  flavor := 'numpy'
  byteorder := 'little'
  chunkshape := None
/array2 (Array(10,)) ''
  atom := Int64Atom(shape=(), dflt=0)
  maindim := 0
  flavor := 'numpy'
  byteorder := 'little'
  chunkshape := None

In [27]:
# We can get a shortened view too:
print(f)

atest.h5 (File) ''
Last modif.: 'Wed Oct 19 10:48:42 2016'
Object Tree: 
/ (RootGroup) ''
/array1 (Array(20, 5)) ''
/array2 (Array(10,)) ''



In [28]:
# Add a new group
f.create_group(f.root, 'group1', 'Title for group1')

/group1 (Group) 'Title for group1'
  children := []

In [29]:
f

File(filename=atest.h5, title='', mode='a', root_uep='/', filters=Filters(complevel=0, shuffle=False, fletcher32=False, least_significant_digit=None))
/ (RootGroup) ''
/array1 (Array(20, 5)) ''
  atom := Int64Atom(shape=(), dflt=0)
  maindim := 0
  flavor := 'numpy'
  byteorder := 'little'
  chunkshape := None
/array2 (Array(10,)) ''
  atom := Int64Atom(shape=(), dflt=0)
  maindim := 0
  flavor := 'numpy'
  byteorder := 'little'
  chunkshape := None
/group1 (Group) 'Title for group1'

In [30]:
f.move_node(f.root.array1, f.root.group1)

In [31]:
f

File(filename=atest.h5, title='', mode='a', root_uep='/', filters=Filters(complevel=0, shuffle=False, fletcher32=False, least_significant_digit=None))
/ (RootGroup) ''
/array2 (Array(10,)) ''
  atom := Int64Atom(shape=(), dflt=0)
  maindim := 0
  flavor := 'numpy'
  byteorder := 'little'
  chunkshape := None
/group1 (Group) 'Title for group1'
/group1/array1 (Array(20, 5)) ''
  atom := Int64Atom(shape=(), dflt=0)
  maindim := 0
  flavor := 'numpy'
  byteorder := 'little'
  chunkshape := None

In [32]:
# Create a very nested group (note the `createparents` parameter)
f.create_group('/g1/g2/g3/g4', 'g5', createparents=True)

/g1/g2/g3/g4/g5 (Group) ''
  children := []

In [33]:
print(f)

atest.h5 (File) ''
Last modif.: 'Wed Oct 19 10:48:42 2016'
Object Tree: 
/ (RootGroup) ''
/array2 (Array(10,)) ''
/g1 (Group) ''
/group1 (Group) 'Title for group1'
/group1/array1 (Array(20, 5)) ''
/g1/g2 (Group) ''
/g1/g2/g3 (Group) ''
/g1/g2/g3/g4 (Group) ''
/g1/g2/g3/g4/g5 (Group) ''



In [34]:
# Add an array in the 'very nested' group
f.create_array(f.root.g1.g2.g3.g4.g5, 'array2', np.arange(10))

/g1/g2/g3/g4/g5/array2 (Array(10,)) ''
  atom := Int64Atom(shape=(), dflt=0)
  maindim := 0
  flavor := 'numpy'
  byteorder := 'little'
  chunkshape := None

In [35]:
print(f)

atest.h5 (File) ''
Last modif.: 'Wed Oct 19 10:48:42 2016'
Object Tree: 
/ (RootGroup) ''
/array2 (Array(10,)) ''
/g1 (Group) ''
/group1 (Group) 'Title for group1'
/group1/array1 (Array(20, 5)) ''
/g1/g2 (Group) ''
/g1/g2/g3 (Group) ''
/g1/g2/g3/g4 (Group) ''
/g1/g2/g3/g4/g5 (Group) ''
/g1/g2/g3/g4/g5/array2 (Array(10,)) ''



In [36]:
# Removing nodes is very easy
f.remove_node(f.root.g1.g2.g3.g4.g5.array2)

In [37]:
print(f)

atest.h5 (File) ''
Last modif.: 'Wed Oct 19 11:18:12 2016'
Object Tree: 
/ (RootGroup) ''
/array2 (Array(10,)) ''
/g1 (Group) ''
/group1 (Group) 'Title for group1'
/group1/array1 (Array(20, 5)) ''
/g1/g2 (Group) ''
/g1/g2/g3 (Group) ''
/g1/g2/g3/g4 (Group) ''
/g1/g2/g3/g4/g5 (Group) ''



In [38]:
# Show the PyTables File object working as an iterator
for n in f:
    print(n)

/ (RootGroup) ''
/array2 (Array(10,)) ''
/g1 (Group) ''
/group1 (Group) 'Title for group1'
/g1/g2 (Group) ''
/group1/array1 (Array(20, 5)) ''
/g1/g2/g3 (Group) ''
/g1/g2/g3/g4 (Group) ''
/g1/g2/g3/g4/g5 (Group) ''


In [39]:
# The `File.walk_nodes` method offers more flexibility
for n in f.walk_nodes():
    print(n)

/ (RootGroup) ''
/array2 (Array(10,)) ''
/g1 (Group) ''
/group1 (Group) 'Title for group1'
/g1/g2 (Group) ''
/group1/array1 (Array(20, 5)) ''
/g1/g2/g3 (Group) ''
/g1/g2/g3/g4 (Group) ''
/g1/g2/g3/g4/g5 (Group) ''


In [40]:
# Get info from a certain point of the hierarchy on
for n in f.walk_nodes(f.root.group1):
    print(n)

/group1 (Group) 'Title for group1'
/group1/array1 (Array(20, 5)) ''


In [41]:
# walknodes allows to iterate over specific classes
for n in f.walk_nodes(f.root.group1, classname="Array"):
    print(n[:2])

[[0 1 2 3 4]
 [5 6 7 8 9]]


In [42]:
f.close()

## Exercise

Use the file that you created in the previous exercise and create a new group called 'reduced' and titled 'My Reduced data' and move the 1-dimensional array there.  Look at the final contents with the ptdump utility.  

### Solution

In [49]:
import tables as tb
import numpy as np

h5f = tb.open_file('/Users/Gonzalo/github/Training-Next-Collaboration/data/test1.h5','r+',
                   filters=tb.Filters(complib='zlib', complevel=1))

if not '/REDUCED' in h5f:
    h5f.create_group(h5f.root,'REDUCED')
h5f.move_node(h5f.root.DATA.D1, h5f.root.REDUCED)
print(h5f)
h5f.close()

NoSuchNodeError: group ``/DATA`` does not have a child named ``D1``

# HDF5 atributes

In [93]:
# Re-open the file
f = tb.open_file("atest.h5", "a")

In [94]:
print(f)

atest.h5 (File) ''
Last modif.: 'Wed Oct 19 12:20:00 2016'
Object Tree: 
/ (RootGroup) ''
/array2 (Array(10,)) ''
/g1 (Group) ''
/group1 (Group) 'Title for group1'
/group1/array1 (Array(20, 5)) ''
/g1/g2 (Group) ''
/g1/g2/g3 (Group) ''
/g1/g2/g3/g4 (Group) ''
/g1/g2/g3/g4/g5 (Group) ''



In [95]:
# Print the attrs in /array2
f.root.array2.attrs

/array2._v_attrs (AttributeSet), 5 attributes:
   [CLASS := 'ARRAY',
    FLAVOR := 'numpy',
    TITLE := '',
    VERSION := '2.4',
    myattr := 12.300000000000001]

In [96]:
# Add a new attribute to /array2
f.root.array2.attrs.myattr = "Hello World!"

In [97]:
f.root.array2.attrs

/array2._v_attrs (AttributeSet), 5 attributes:
   [CLASS := 'ARRAY',
    FLAVOR := 'numpy',
    TITLE := '',
    VERSION := '2.4',
    myattr := 'Hello World!']

In [98]:
# Has the modification arrived to disk yet? 
!ptdump -a atest.h5:/array2  # note the -a flag and node specification

/array2 (Array(10,)) ''
  /array2._v_attrs (AttributeSet), 5 attributes:
   [CLASS := 'ARRAY',
    FLAVOR := 'numpy',
    TITLE := '',
    VERSION := '2.4',
    myattr := 12.300000000000001]


In [99]:
# Nope, so force a flush
f.flush()

In [100]:
!ptdump -a atest.h5:/array2

/array2 (Array(10,)) ''
  /array2._v_attrs (AttributeSet), 5 attributes:
   [CLASS := 'ARRAY',
    FLAVOR := 'numpy',
    TITLE := '',
    VERSION := '2.4',
    myattr := 'Hello World!']


In [101]:
# Attributes can also be general arrays
f.root.array2.attrs.myarray = np.arange(10)
f.flush()

In [102]:
!ptdump -a atest.h5:/array2

/array2 (Array(10,)) ''
  /array2._v_attrs (AttributeSet), 6 attributes:
   [CLASS := 'ARRAY',
    FLAVOR := 'numpy',
    TITLE := '',
    VERSION := '2.4',
    myarray := array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
    myattr := 'Hello World!']


In [103]:
# Get a shortcut to the attribute handler
attrs = f.root.array2.attrs

In [104]:
attrs

/array2._v_attrs (AttributeSet), 6 attributes:
   [CLASS := 'ARRAY',
    FLAVOR := 'numpy',
    TITLE := '',
    VERSION := '2.4',
    myarray := array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
    myattr := 'Hello World!']

In [81]:
# Removing an attribute
del attrs.myarray
attrs

/array2._v_attrs (AttributeSet), 5 attributes:
   [CLASS := 'ARRAY',
    FLAVOR := 'numpy',
    TITLE := '',
    VERSION := '2.4',
    myattr := 'Hello World!']

In [82]:
# Overwrite an existing one (be careful with this feature!)
attrs.myattr = 12.3
attrs

/array2._v_attrs (AttributeSet), 5 attributes:
   [CLASS := 'ARRAY',
    FLAVOR := 'numpy',
    TITLE := '',
    VERSION := '2.4',
    myattr := 12.300000000000001]

In [83]:
# Print the attributes for all the arrays in the object tree
for n in f.walk_nodes(f.root.group1, classname="Array"):
    print(repr(n.attrs))

/group1/array1._v_attrs (AttributeSet), 4 attributes:
   [CLASS := 'ARRAY',
    FLAVOR := 'numpy',
    TITLE := '',
    VERSION := '2.4']


In [84]:
f.close()

## Exercise

Use the file in the previous exercise and add an attribute to the 1-dimensional array specifying the mean and the standard deviation. Use ptdump -a to check that the attributes are there.

### Solution

In [105]:
h5f = tb.open_file('/Users/Gonzalo/github/Training-Next-Collaboration/data/test1.h5','r+')

array = h5f.root.REDUCED.D1
array.attrs.mean = np.mean(array)
array.attrs.stddev = np.std(array)
array.flush()
print(repr(array.attrs))
h5f.close()

! ptdump -a /Users/Gonzalo/github/Training-Next-Collaboration/data/test1.h5:/REDUCED/D1

/REDUCED/D1._v_attrs (AttributeSet), 6 attributes:
   [CLASS := 'ARRAY',
    FLAVOR := 'numpy',
    TITLE := '',
    VERSION := '2.4',
    mean := 4941.3069999999998,
    stddev := 291.55082704564569]
/REDUCED/D1 (Array(1000,)) ''
  /REDUCED/D1._v_attrs (AttributeSet), 6 attributes:
   [CLASS := 'ARRAY',
    FLAVOR := 'numpy',
    TITLE := '',
    VERSION := '2.4',
    mean := 4941.3069999999998,
    stddev := 291.55082704564569]


# Chunked datasets

In [106]:
f = tb.open_file('ctest.h5', 'w')

In [107]:
# Create an un-initialized CArray (Compressible Array)
f.create_carray(f.root, 'carray', tb.Float64Atom(), (10000,1000))

/carray (CArray(10000, 1000)) ''
  atom := Float64Atom(shape=(), dflt=0.0)
  maindim := 0
  flavor := 'numpy'
  byteorder := 'little'
  chunkshape := (16, 1000)

In [108]:
# Flush everything to disk
f.flush()

In [109]:
# The container is there, but not the data (yet)
!ls -lh ctest.h5

-rw-r--r--  1 Gonzalo  staff   1.7K Oct 19 12:31 ctest.h5


In [110]:
# Push some data into this carray container
ca = f.root.carray
na = np.linspace(0, 1, 1e7).reshape(10000,1000)
%time ca[:] = na

CPU times: user 25.2 ms, sys: 67.6 ms, total: 92.8 ms
Wall time: 182 ms


In [111]:
# Flush (we can specify which node should be flushed)
ca.flush()

In [112]:
!ls -lh ctest.h5

-rw-r--r--  1 Gonzalo  staff    76M Oct 19 12:31 ctest.h5


In [113]:
np.prod(ca.shape) * ca.dtype.itemsize / 2**20.

76.2939453125

In [114]:
# Retrieve only part of the data
ca[:10,::2]

array([[  0.00000000e+00,   2.00000020e-07,   4.00000040e-07, ...,
          9.94000099e-05,   9.96000100e-05,   9.98000100e-05],
       [  1.00000010e-04,   1.00200010e-04,   1.00400010e-04, ...,
          1.99400020e-04,   1.99600020e-04,   1.99800020e-04],
       [  2.00000020e-04,   2.00200020e-04,   2.00400020e-04, ...,
          2.99400030e-04,   2.99600030e-04,   2.99800030e-04],
       ..., 
       [  7.00000070e-04,   7.00200070e-04,   7.00400070e-04, ...,
          7.99400080e-04,   7.99600080e-04,   7.99800080e-04],
       [  8.00000080e-04,   8.00200080e-04,   8.00400080e-04, ...,
          8.99400090e-04,   8.99600090e-04,   8.99800090e-04],
       [  9.00000090e-04,   9.00200090e-04,   9.00400090e-04, ...,
          9.99400100e-04,   9.99600100e-04,   9.99800100e-04]])

In [115]:
f.close()

## Using compression with chunked arrays

In [116]:
f = tb.open_file('ctest-zlib.h5', 'w')

In [117]:
# Create a CArray (Compressible Array) using the zlib compressor
filters = tb.Filters(complib='zlib', complevel=1)
ca = f.create_carray(f.root, 'carray', tb.Float64Atom(), (10000,1000),
                     filters=filters)

In [118]:
# Push some data on this carray container
na = np.linspace(0, 1, 1e7).reshape(10000,1000)
%time ca[:] = na

CPU times: user 417 ms, sys: 21.9 ms, total: 439 ms
Wall time: 457 ms


In [119]:
# Flush the carray container only
ca.flush()
!ls -lh ctest-zlib.h5

-rw-r--r--  1 Gonzalo  staff   6.7M Oct 19 12:54 ctest-zlib.h5


In [120]:
np.prod(ca.shape) * ca.dtype.itemsize / 2**20.

76.2939453125

In [121]:
f.close()

In [122]:
# Look at the file with a native HDF5 tool
!h5ls -v ctest-zlib.h5

Opened "ctest-zlib.h5" with sec2 driver.
carray                   Dataset {10000/10000, 1000/1000}
    Attribute: CLASS scalar
        Type:      6-byte null-terminated ASCII string
        Data:  "CARRAY"
    Attribute: TITLE null
        Type:      1-byte null-terminated ASCII string

    Attribute: VERSION scalar
        Type:      3-byte null-terminated ASCII string
        Data:  "1.1"
    Location:  1:1024
    Links:     1
    Chunks:    {16, 1000} 128000 bytes
    Storage:   80000000 logical bytes, 7004415 allocated bytes, 1142.14% utilization
    Filter-0:  shuffle-2 OPT {8}
    Filter-1:  deflate-1 OPT {1}
    Type:      native double
H5tools-DIAG: Error detected in HDF5:tools (1.8.17) thread 140735105355776:
  #000: h5tools_dump.c line 1836 in h5tools_dump_mem(): H5Sis_simple failed
    major: Failure in tools library
    minor: error in function


## Using compression (Blosc)

In [127]:
f = tb.open_file('ctest-blosc.h5', 'w')

In [128]:
# Create a CArray (Compressible Array) using the Blosc compressor
filters = tb.Filters(complib='blosc:lz4', complevel=9)
ca = f.create_carray(f.root, 'carray', tb.Float64Atom(), (10000,1000),
                     filters=filters)

In [129]:
# Push some data on this carray container
na = np.linspace(0, 1, 1e7).reshape(10000,1000)
%time ca[:] = na

CPU times: user 89.6 ms, sys: 14.8 ms, total: 104 ms
Wall time: 105 ms


**Note how witing a compressed carray is faster in the this case than both the uncompressed case above (~500 ms) and with using zlib (~750 ms).**

In [130]:
f.close()
!ls -lh ctest-blosc.h5

-rw-r--r--  1 Gonzalo  staff   7.7M Oct 19 12:58 ctest-blosc.h5


As you see, the compression ratio is quite the same than with zlib.

## Exercise

PyTables comes with support for different compressors, namely 'zlib' (the default), 'bzip2' and 'blosc:X' where X is a codec can be one of 'blosclz', 'lz4', 'lz4hc', 'snappy', 'zlib' (and 'zstd' from PyTables 3.3 on).  Based on the example above, do a small study on which ones work best.

* Which one compresses best?
* Which one compresses faster?
* Which one shows the best balance?

### Solution

# Specifying chunk size

When creating a chunked dataset, a chunksize is chosen automatically based on some heuristics.  However, you may want to specify your own chunksize and see the best for you:

In [None]:
na = np.linspace(0, 1, 1e7).reshape(10000,1000)
for nrows in range(10, 210, 30):
    with tb.open_file("chunk_study.h5", "w") as f:
        chunkshape = (nrows, 1000)
        print("chunkshape:", chunkshape)
        filters = tb.Filters(complib="blosc:lz4", complevel=9)
        ca = f.create_carray(f.root, 'carray', tb.Float64Atom(), (10000,1000),
                            filters=filters, chunkshape=chunkshape)
        %time ca[:] = na
    !ls -lh chunk_study.h5

# Using ptrepack

You can use the 'ptrepack' utility to copy HDF5 whole files (or only parts) and change different parameters during the copy process.

In [None]:
!ptrepack -o --complib zlib --complevel 1 ctest.h5 ctest-repacked-zlib.h5

In [None]:
!ptrepack -o --complib blosc:lz4 --complevel 9 ctest.h5 ctest-repacked-blosc.h5

In [None]:
!ptrepack -o --complib blosc:lz4 --complevel 9 --chunkshape '(1000,1000)' ctest.h5 ctest-repacked-blosc-chunkshape.h5

# Queries in Table objects

In [None]:
# The description for the tabular data
class TabularData(tb.IsDescription):
    col1 = tb.StringCol(200)
    col2 = tb.IntCol()
    col3 = tb.FloatCol()

In [None]:
# Open a file and create the Table container
f = tb.open_file('atable.h5', 'w')
t = f.create_table(f.root, 'table', TabularData, 'table title',
                   filters=tb.Filters(9, 'blosc'))

In [None]:
t

In [None]:
%%time
#  Fill the table with some 1 million rows
r = t.row
for i in range(1000*1000):
    r['col1'] = str(i)
    r['col2'] = i + 1
    r['col3'] = i * (i + 1)
    r.append()
t.flush()

In [None]:
t

In [None]:
# Size on disk
!ls -lh atable.h5

In [None]:
# Real size
np.prod(t.shape) * t.dtype.itemsize / 2**20.

In [None]:
# Do a query (regular)
%time [r['col1'] for r in t if r['col2'] < 5]

In [None]:
# Repeat the query, but using in-kernel method
%time [r['col1'] for r in t.where('col2 < 5')]

In [None]:
# Performing complex conditions (regular query)
%time [r['col1'] for r in t if r['col2'] < 5 and r['col3'] < 10]

In [None]:
# Complex, in-kernel queries
%time [r['col1'] for r in t.where('(col2 < 5) & (col3 < 10)')]

In [None]:
# Get a structured array out of disk
sa = t[:]
sa

In [None]:
# Perform the query in-memory using pure NumPy machinery 
%time sa[((sa['col2'] < 5) & (sa['col3'] < 10))]['col1']

In [None]:
# Create an index for the on-disk table
%time t.cols.col2.create_csindex()

In [None]:
# Repeat the complex query (indexed)
%time [r['col1'] for r in t.where('(col2 < 5) & (col3 < 10)')]

Indexing normally offers the best speed for doing queries.

In [None]:
f.close()

# Exercise

Open the 'ic_dst...' file in the data/ directory:

In [None]:
f = tb.open_file("../data/ic_dst_NEXT_v0_08_02_Kr_ACTIVE_0_0_5bar_MCRD_10000.root.h5")

In [None]:
print(f)


* Determine the chunksize of the /MLR/mau and /RD/pmtrwf datasets

* Copy them to another (new) HDF5 file using different chunksizes and compressors.  Determine the ones that offers best ratio and speed. (use ptrepack).

* Use the /TWF/TWF and /Sensors/DataSiPM and do some small analysis (e.g. plotting the times for TWF, or query them based on some conditions that make sense).

### Solution