In [22]:
import tempfile
import io
import h5py
import numpy as np
import pandas as pd

pd.set_option('io.hdf.default_format','table')

filepath1 =  'hdf5.test.1.h5'
filepath2 =  'hdf5.test.2.h5'
filepath3 =  'hdf5.test.3.h5'

In [4]:
# Accessing ndarray data in H5 dataset.
A = np.arange(64).reshape(8,8)

with io.BytesIO() as bio:
    # Using in-memory file to mock file object
    with h5py.File(bio, 'w') as f:
        f['dataset'] = A
        print("Get dataset properties as ndarray properties")
        print(f['dataset'].shape, f['dataset'].size)
        print("Index ndarray")
        print(f['dataset'][-1,-1], f['dataset'][-1])
        print("Get the entire ndarray")
        Ap = f['dataset'][()]
        print(type(Ap))
        print(Ap)

Get dataset properties as ndarray properties
(8, 8) 64
Index ndarray
63 [56 57 58 59 60 61 62 63]
Get the entire ndarray
<class 'numpy.ndarray'>
[[ 0  1  2  3  4  5  6  7]
 [ 8  9 10 11 12 13 14 15]
 [16 17 18 19 20 21 22 23]
 [24 25 26 27 28 29 30 31]
 [32 33 34 35 36 37 38 39]
 [40 41 42 43 44 45 46 47]
 [48 49 50 51 52 53 54 55]
 [56 57 58 59 60 61 62 63]]


Writing/reading ndarray and DataFrame to/from H5 files

In [42]:
# Documentation on pandas.HDFStore:
# https://www.kite.com/python/docs/pandas.HDFStore

# save ndarray to H5 file
a = np.array([
    [0,1,2],
    [2,3,1],
    [1,3,4]])
# write a H5 file
with h5py.File(filepath1, 'w') as f:
    # create a dataset
    f.create_dataset('test0', data=a)
# read a H5 file
with h5py.File(filepath1, 'r') as f:
    print("h5 file's keys: ", *f.keys())
    # get h5py.Dataset
    dset = f['test0']
    # get ndarray from h5py.Dataset
    ap = dset[()]
    print(ap)
    
# save pd.DataFrame to H5 file using pd.HDFStore
d = {'patch_size': [1024, 1024, 512],
     'magnification': [10, 20, 40],
     'dataset_name': ['tn', 'ovr_mmrd', 'mmrd']}
df = pd.DataFrame(d)
with pd.HDFStore(filepath1) as s:
    s['test1'] = df

# must use pd.HDFStore to read DataFrame data
with pd.HDFStore(filepath1, 'r') as s:
    print(s['test1'])

# adding pd.DataFrame to H5 file using pd.HDFStore needs 'a' mode.
d = {'patch_size': [2048, 64],
     'magnification': [80, 5],
     'dataset_name': ['tn', 'ovr']}
df = pd.DataFrame(d)
with pd.HDFStore(filepath1, 'a') as s:
    s['gp1/test2'] = df

# get info and pd.DataFrame values from H5 
with pd.HDFStore(filepath1) as s:
    print()
    print(s.info())
    print()
    print(s['gp1/test2'])
    print()
    print(s['test1'])

# must use h5py.File to read ndarray data
with h5py.File(filepath1, 'r') as f:
    print(f['test0'][()])

h5 file's keys:  test0
[[0 1 2]
 [2 3 1]
 [1 3 4]]
   patch_size  magnification dataset_name
0        1024             10           tn
1        1024             20     ovr_mmrd
2         512             40         mmrd

<class 'pandas.io.pytables.HDFStore'>
File path: hdf5.test.1.h5
/gp1/test2            frame_table  (typ->appendable,nrows->2,ncols->3,indexers->[index],dc->[])
/test1                frame_table  (typ->appendable,nrows->3,ncols->3,indexers->[index],dc->[])

   patch_size  magnification dataset_name
0        2048             80           tn
1          64              5          ovr

   patch_size  magnification dataset_name
0        1024             10           tn
1        1024             20     ovr_mmrd
2         512             40         mmrd
[[0 1 2]
 [2 3 1]
 [1 3 4]]


In [47]:
# appending to an existing dataframe
# Need to set `pd.set_option('io.hdf.default_format','table')` to append a dataframe
# To not overwrite, pd.HDFStore must be constructed with the append 'a' mode.
# Based on:
# https://www.kite.com/python/docs/pandas.HDFStore
# https://stackoverflow.com/questions/39638179/pandas-hdf5-append-time-series-fails

d = {'patch_size': [1024, 1024, 512],
     'magnification': [10, 20, 40],
     'dataset_name': ['tn', 'ovr_mmrd', 'mmrd']}
df1 = pd.DataFrame(d)
d = {'patch_size': [2048, 64],
     'magnification': [80, 5],
     'dataset_name': ['tn', 'ovr']}
df2 = pd.DataFrame(d)

with pd.HDFStore(filepath1, 'w') as s:
    s.put('test', df1, format='t', append=True, data_columns=True)
    # s.append('test', df1)
with pd.HDFStore(filepath1, 'a') as s:
    s.put('test', df2, format='t', append=True, data_columns=True)
    # s.append('test', df2)
with pd.HDFStore(filepath1, 'a') as s:
    s['test'].reset_index(drop=True, inplace=True)
with pd.HDFStore(filepath1, 'r') as s:
    print(s['test'])

   patch_size  magnification dataset_name
0        1024             10           tn
1        1024             20     ovr_mmrd
2         512             40         mmrd
0        2048             80           tn
1          64              5          ovr


In [20]:
# save ndarray to H5 file
a = np.array([
    [0,1,2],
    [2,3,1],
    [1,3,4]])
# write a H5 file
with h5py.File(filepath2, 'w') as f:
    try:
        # can't access groups or datasets that have not been created yet
        f['gr1/gr2/test0']
    except Exception as e:
        print(e)
    # create a group
    gr = f.create_group("gr1/gr2")
    # create a dataset
    gr.create_dataset('test0', data=a)
    
# read a H5 file
with h5py.File(filepath2, 'r') as f:
    print("h5 file's keys: ", *f.keys())
    print("gr1 in f?", 'gr1' in f)
    print("gr1/gr2 in f?", 'gr1/gr2' in f)
    print("gr1/gr2/test0 in f?", 'gr1/gr2/test0' in f)
    print("gr1/gr2/test0 is dataset?", isinstance(f['gr1/gr2/test0'], h5py.Dataset))
    print("gr1/gr2/test0 is group?", isinstance(f['gr1/gr2/test0'], h5py.Group))
    # get h5py.Dataset
    dset = f['gr1/gr2/test0']
    # get ndarray from h5py.Dataset
    ap = dset[()]
    print(ap)
    group = f['gr1/gr2']
    print("test0 in g?", 'test0' in group)
    print("test0 is dataset?", isinstance(group['test0'], h5py.Dataset))
    

'Unable to open object (component not found)'
h5 file's keys:  gr1
gr1 in f? True
gr1/gr2 in f? True
gr1/gr2/test0 in f? True
gr1/gr2/test0 is dataset? True
gr1/gr2/test0 is group? False
[[0 1 2]
 [2 3 1]
 [1 3 4]]
test0 in g? True
test0 is dataset? True


In [21]:
# save ndarray to H5 file
width = 4
height = 3
# write a H5 file
with h5py.File(filepath2, 'w') as f:
    f.create_dataset('test', (height, width,), dtype='i')
with h5py.File(filepath2, 'a') as f:
    dset = f['test']
    # write array horizontally
    dset[1,1:3] = np.array([-1,-1])
    # write array vertically
    dset[0:2,3] = np.array([1,1])
    # write number
    dset[0,1] = 2
    # write a single cell
    dset[0,2] = np.array([3])
with h5py.File(filepath2, 'r') as f:
    dset = f['test']
    print(dset[()])

[[ 0  2  3  1]
 [ 0 -1 -1  1]
 [ 0  0  0  0]]


In [30]:
a1 = np.array([
    [0,1],
    [2,3]])
a2 = np.array([
    [1,0],
    [0,1]])
a3 = np.array([
    [0,1],
    [1,0]])
a4 = np.array([
    [1,-1],
    [-1,1]])

with h5py.File(filepath3, 'w') as f:
    gr = f.create_group("gr11/gr21")
    dset = gr.create_dataset('a1', data=a1)
    # cannot create group gr11 since already exist
    # but can create group gr11/gr22 since diff. inner group
    try:
        gr = f.create_group('gr11')
    except ValueError as e:
        print(e)
    gr = f.create_group("gr11/gr22")
    gr.create_dataset('a2', data=a2)
    # can create dataset directly within groups
    dset = f.create_dataset('gr12/gr2/a3', data=a3)
    dset = f.create_dataset('gr13/gr2/gr3/a4', data=a4)

def get_h5_keys(f, key=''):
    keys = [ ]
    if isinstance(f, h5py.Dataset):
        return [key]
    else:
        for k, v in f.items():
            if key:
                tmp_key = f"{key}/{k}"
            else:
                tmp_key = k
            tmp_keys = get_h5_keys(v, key=tmp_key)
            keys.extend(tmp_keys)
        return keys

print("\nget the IDs in h5 file")
with h5py.File(filepath3, 'r') as f:
    for k in get_h5_keys(f):
        print(k)
    
print("\nwalk words in h5 file")
with h5py.File(filepath3, 'r') as f:
    for k1, v1 in f.items():
        print(k1)
        for k2, v2 in v1.items():
            print(' ', k2)
            for k3, v3 in v2.items():
                print('    ', k3)
                if isinstance(v3, h5py.Group):
                    for k4, v4 in v3.items():
                        print('      ', k4)

print("\nVisit all groups and datasets in file")
with h5py.File(filepath3, 'r') as f:
    f.visititems(lambda _name, _obj: print(_name, '     ', _obj))
    # alernatively use the below to print the IDs
    # f.visit(print)

Unable to create group (name already exists)

get the IDs in h5 file
gr11/gr21/a1
gr11/gr22/a2
gr12/gr2/a3
gr13/gr2/gr3/a4

walk words in h5 file
gr11
  gr21
     a1
  gr22
     a2
gr12
  gr2
     a3
gr13
  gr2
     gr3
       a4

Visit all groups and datasets in file
gr11       <HDF5 group "/gr11" (2 members)>
gr11/gr21       <HDF5 group "/gr11/gr21" (1 members)>
gr11/gr21/a1       <HDF5 dataset "a1": shape (2, 2), type "<i8">
gr11/gr22       <HDF5 group "/gr11/gr22" (1 members)>
gr11/gr22/a2       <HDF5 dataset "a2": shape (2, 2), type "<i8">
gr12       <HDF5 group "/gr12" (1 members)>
gr12/gr2       <HDF5 group "/gr12/gr2" (1 members)>
gr12/gr2/a3       <HDF5 dataset "a3": shape (2, 2), type "<i8">
gr13       <HDF5 group "/gr13" (1 members)>
gr13/gr2       <HDF5 group "/gr13/gr2" (1 members)>
gr13/gr2/gr3       <HDF5 group "/gr13/gr2/gr3" (1 members)>
gr13/gr2/gr3/a4       <HDF5 dataset "a4": shape (2, 2), type "<i8">
