# Chapter 1. Introduction

In [2]:
import h5py
import numpy as np
temperature = np.random.random(1024)

f = h5py.File("weather.hdf5")
f["/15/temperature"] = temperature
f["/15/temperature"].attrs["dt"] = 10.0
f["/15/temperature"].attrs["start_time"] = 1375204299

wind = np.random.random(2048)
dt_wind = 5.0 # Wind sampled every 5 seconds

f["/15/wind"] = wind
f["/15/wind"].attrs["dt"] = 5.0

  f = h5py.File("weather.hdf5")


In [5]:
dataset = f["/15/temperature"]
dataset

<HDF5 dataset "temperature": shape (1024,), type "<f8">

In [7]:
dataset.attrs["dt"]

10.0

In [8]:
dataset.attrs["start_time"]

1375204299

In [9]:
dataset = f["/15/temperature"]
for key, value in dataset.attrs.items():
    print("%s: %s" % (key, value))

dt: 10.0
start_time: 1375204299


In [10]:
dataset[0:10]

array([0.49415441, 0.12350843, 0.36382387, 0.21723826, 0.39345324,
       0.66432891, 0.60911616, 0.95404353, 0.27932092, 0.0866344 ])

In [11]:
big_dataset = f.create_dataset("big", shape=(1024, 1024, 1024, 512), dtype='float32')

In [12]:
big_dataset[344, 678, 23, 36] = 42.0

In [13]:
compressed_dataset = f.create_dataset("comp", shape=(1024,), dtype='int32', compression='gzip')

In [14]:
compressed_dataset[:] = np.arange(1024)
compressed_dataset[:]

array([   0,    1,    2, ..., 1021, 1022, 1023], dtype=int32)

In [15]:
f.keys()

<KeysViewHDF5 ['15', 'big', 'comp']>

In [16]:
f["/15"].keys()

<KeysViewHDF5 ['temperature', 'wind']>

# Chapter 2. Getting Started

In [19]:
h5py.File.close?

[0;31mSignature:[0m [0mh5py[0m[0;34m.[0m[0mFile[0m[0;34m.[0m[0mclose[0m[0;34m([0m[0mself[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m Close the file.  All open objects become invalid 
[0;31mFile:[0m      ~/miniconda3/lib/python3.8/site-packages/h5py/_hl/files.py
[0;31mType:[0m      function


In [22]:
from timeit import timeit
import time
time.sleep(0.1), timeit("time.sleep(0.1)", number=5)

(None, 0.5036407000006875)

In [23]:
%timeit time.sleep(0.1)

101 ms ± 74.9 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [28]:
f = h5py.File("name.hdf5")
f.close()

  f = h5py.File("name.hdf5")


In [29]:
f = h5py.File("name.hdf5", "w") # New file overwriting any existing file
f = h5py.File("name.hdf5", "r") # Open read-only (must exist)
f = h5py.File("name.hdf5", "r+") # Open read-write (must exist)
f = h5py.File("name.hdf5", "a") # Open read-write (create if doesn't exist)

In [31]:
name = u"name_eta_\u03b7"
f = h5py.File(name)
print(f.filename)

name_eta_η


  f = h5py.File(name)


In [32]:
with h5py.File("name.hdf5", "w") as f:
    print(f["missing_dataset"])

KeyError: "Unable to open object (object 'missing_dataset' doesn't exist)"

In [33]:
print(f)

<Closed HDF5 file>


In [39]:
f = h5py.File("name.hdf5", driver="core")

  f = h5py.File("name.hdf5", driver="core")


In [40]:
f = h5py.File("name.hdf5", driver="core", backing_store=True)

  f = h5py.File("name.hdf5", driver="core", backing_store=True)


In [38]:
h5py.get_config().default_file_mode

# Chapter 3. Working with Datasets

In [75]:
f = h5py.File("testfile.hdf5")
arr = np.ones((5,2))
f["my dataset"] = arr
dset = f["my dataset"]
dset

  f = h5py.File("testfile.hdf5")


<HDF5 dataset "my dataset": shape (5, 2), type "<f8">

In [45]:
dset.dtype

dtype('<f8')

In [46]:
dset.shape

(5, 2)

In [47]:
out = dset[...]
out

array([[1., 1.],
       [1., 1.],
       [1., 1.],
       [1., 1.],
       [1., 1.]])

In [48]:
type(out)

numpy.ndarray

In [49]:
dset[1:4,1] = 2.0
dset[...]

array([[1., 1.],
       [1., 2.],
       [1., 2.],
       [1., 2.],
       [1., 1.]])

In [50]:
dset = f.create_dataset("big dataset", (1024**3,), dtype=np.float32)
dset[0:1024] = np.arange(1024)

In [51]:
#we also ask HDF5 to flush its buffers and actually write to disk:
f.flush()

In [52]:
%ls -lh testfile.hdf5

-rwxrwxrwx 1 dan dan 4.1G Feb 11 13:00 [0m[01;32mtestfile.hdf5[0m*


In [53]:
bigdata = np.ones((100,1000))
bigdata.dtype

dtype('float64')

In [54]:
bigdata.shape

(100, 1000)

In [55]:
with h5py.File('big1.hdf5','w') as f1:
    f1['big'] = bigdata

In [3]:
%ls -lh big1.hdf5

 Volume in drive D is Data
 Volume Serial Number is C839-DC37

 Directory of D:\books\python\0. HDF5


 Directory of D:\books\python\0. HDF5

02/11/2022  10:17 PM           802,048 big1.hdf5
               1 File(s)        802,048 bytes
               0 Dir(s)  635,343,654,912 bytes free


In [57]:
with h5py.File('big2.hdf5','w') as f2:
    f2.create_dataset('big', data=bigdata, dtype=np.float32)

In [58]:
%ls -lh big2.hdf5

-rwxrwxrwx 1 dan dan 393K Feb 11 13:42 [0m[01;32mbig2.hdf5[0m*


In [59]:
f1 = h5py.File("big1.hdf5")
f2 = h5py.File("big2.hdf5")
f1['big'].dtype, f2['big'].dtype

  f1 = h5py.File("big1.hdf5")
  f2 = h5py.File("big2.hdf5")


(dtype('<f8'), dtype('<f4'))

In [60]:
dset = f2['big']
dset.dtype

dtype('<f4')

In [61]:
dset.shape

(100, 1000)

In [62]:
big_out = np.empty((100, 1000), dtype=np.float64)

In [63]:
dset.read_direct(big_out)
dset.shape

(100, 1000)

In [64]:
dset.dtype

dtype('<f4')

In [65]:
with dset.astype('float64'):
    out = dset[0,:]
out.dtype

dtype('float64')

In [66]:
f.create_dataset('x', data=1e256, dtype=np.float64)
print(f['x'][...])

1e+256


In [67]:
f.create_dataset('y', data=1e256, dtype=np.float32)
print(f['y'][...])

inf


In [68]:
# empty default value is 0
dset = f.create_dataset('empty', (2,2), dtype=np.int32)
dset[...]

array([[0, 0],
       [0, 0]], dtype=int32)

In [69]:
dset = f.create_dataset('filled', (2,2), dtype=np.int32, fillvalue=42)
dset[...]

array([[42, 42],
       [42, 42]], dtype=int32)

In [70]:
dset.fillvalue

42

In [71]:
dset = f2['big']
dset

<HDF5 dataset "big": shape (100, 1000), type "<f4">

In [72]:
out = dset[0:10, 20:70]
out.shape

(10, 50)

In [76]:
# Ellipsis ...
dset = f.create_dataset('range', data=np.arange(10))
dset[...]

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [77]:
f

<HDF5 file "testfile.hdf5" (mode r+)>

In [78]:
dset[:]

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [79]:
dset = f.create_dataset('4d', shape=(100, 80, 50, 20))
dset[...].shape, dset[0,...,0].shape

((100, 80, 50, 20), (80, 50))

In [80]:
dset = f.create_dataset('1d', shape=(1,), data=42)
dset.shape, dset[0], dset[...]

((1,), 42, array([42]))

In [81]:
dset = f.create_dataset('0d', data=42)
dset.shape, dset[...]

((), array(42))

In [83]:
dset[()]

42

In [84]:
dset = f['range']
dset[...], dset[ [1,2,7] ]

(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), array([1, 2, 7]))

In [88]:
dset = f2['big']
dset.dtype, dset.shape

(dtype('<f4'), (100, 1000))

In [89]:
out = np.empty((100, 1000), dtype=np.float64)
dset.read_direct(out)
dset.dtype

dtype('<f4')

In [91]:
# np.s_ is a gadget that takes slices, in the ordinary array-slicing syntax,
# and returns a NumPy slice object with the corresponding information.
np.s_[0,:]

(0, slice(None, None, None))

In [90]:
dset.read_direct(out, source_sel=np.s_[0,:], dest_sel=np.s_[50,:])

In [92]:
dset = f.create_dataset('perftest', (10000, 10000), dtype=np.float32)
dset[:] = np.random.random(10000) # note the use of broadcasting!
def time_simple():
    dset[:,0:500].mean(axis=1)
out = np.empty((10000, 500), dtype=np.float32)
def time_direct():
    dset.read_direct(out, np.s_[:,0:500])
    out.mean(axis=1)

In [93]:
timeit(time_simple, number=100), timeit(time_direct, number=100)

(9.971557299999404, 9.53366140000071)

In [94]:
a = np.ones((1000,1000), dtype='<f4') # Little-endian 4-byte float
b = np.ones((1000,1000), dtype='>f4') # Big-endian 4-byte float
timeit(a.mean, number=1000), timeit(b.mean, number=1000)

(0.2721551999966323, 0.5773685000021942)

In [95]:
c = b.view("float32")
c[:] = b
b = c
b.dtype, c.dtype, timeit(b.mean, number=1000), timeit(c.mean, number=1000)

(dtype('float32'), dtype('float32'), 0.24609680000139633, 0.2194777000004251)

In [96]:
dset = f.create_dataset('fixed', (2,2))
dset.shape, dset.maxshape

((2, 2), (2, 2))

In [99]:
dset = f.create_dataset('resizable1', (2,2), maxshape=(2,2))
dset.shape, dset.maxshape

((2, 2), (2, 2))

In [100]:
dset.resize((1,1))

In [101]:
dset.shape

(1, 1)

In [102]:
dset.resize((2,2))
dset.shape

(2, 2)

In [103]:
dset = f.create_dataset('unlimited', (2,2), maxshape=(2, None))
dset.shape

(2, 2)

In [104]:
dset.maxshape

(2, None)

In [105]:
dset.resize((2,3))
dset.shape

(2, 3)

In [106]:
a = np.array([ [1, 2], [3, 4] ])
a.shape, print(a)

[[1 2]
 [3 4]]


((2, 2), None)

In [108]:
a.resize((1,4))
print(a), a.shape

[[1 2 3 4]]


(None, (1, 4))

In [109]:
a.resize((1,10))
print(a), a.shape

[[1 2 3 4 0 0 0 0 0 0]]


(None, (1, 10))

In [110]:
dset = f.create_dataset('sizetest', (2,2), dtype=np.int32, maxshape=(None, None))
dset[...] = [ [1, 2], [3, 4] ]
dset[...]

array([[1, 2],
       [3, 4]], dtype=int32)

In [111]:
dset.resize((1,4))
dset[...]

array([[1, 2, 0, 0]], dtype=int32)

In [112]:
dset.resize((1,10))
dset[...]

array([[1, 2, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=int32)

# Chapter 4. How Chunking and Compression Can Help You

In [113]:
f = h5py.File("imagetest.hdf5")
dset = f.create_dataset("Images", (100, 480, 640), dtype='uint8')
image = dset[0, :, :]
image.shape

  f = h5py.File("imagetest.hdf5")


(480, 640)

In [114]:
tile = dset[0,0:64,0:64]
tile.shape

(64, 64)

In [115]:
dset = f.create_dataset('chunked', (100,480,640), dtype='i1', chunks=(1,64,64))

In [116]:
dset.chunks

(1, 64, 64)

In [117]:
# how to “append” to a dataset 

dset1 = f.create_dataset('timetraces', (1,1000), maxshape=(None, 1000))
def add_trace_1(arr):
    dset1.resize( (dset1.shape[0]+1, 1000) )
    dset1[-1,:] = arr

In [118]:
dset2 = f.create_dataset('timetraces2', (5000, 1000), maxshape=(None, 1000))
ntraces = 0
def add_trace_2(arr):
    global ntraces
    dset2[ntraces,:] = arr
    ntraces += 1
def done():
    dset2.resize((ntraces,1000))

In [122]:
def setup():
    """ Re-initialize both datasets for the tests """
    global data, N, dset1, dset2, ntraces
    data = np.random.random(1000)
    N = 10000 # Number of iterations
    dset1.resize((1,1000))
    dset2.resize((10001,1000))
    ntraces = 0
def test1():
    """ Add N traces to the first dataset """
    for idx in range(N):
        add_trace_1(data)
def test2():
    """ Add N traces to the second dataset, and then trim it """
    for idx in range(N):
        add_trace_2(data)
    done()

In [123]:
timeit(test1, setup=setup, number=1)

1.2162422000001243

In [124]:
timeit(test2, setup=setup, number=1)

1.0335919999997714

In [125]:
dset1.shape

(1, 1000)

In [126]:
dset1.chunks, dset2.chunks

((1, 1000), (157, 63))

In [127]:
dset = f.create_dataset("BigDataset", (1000,1000), dtype='f', compression="gzip")
dset.compression

'gzip'

In [128]:
# Compression is transparent; data is read and written normally
dset[...] = 42.0
dset[0,0]

42.0

In [129]:
dset.compression_opts

4

In [131]:
dset.shape, dset.chunks

((1000, 1000), (63, 125))

In [132]:
dset = f.create_dataset("Dataset2", (1000,), compression=9)
dset.compression

'gzip'

In [133]:
dset.compression_opts

9

# Chapter 5. Groups, Links, and Iteration: The “H” in HDF5

In [135]:
f = h5py.File("Groups.hdf5")
subgroup = f.create_group("SubGroup")
subgroup

  f = h5py.File("Groups.hdf5")


<HDF5 group "/SubGroup" (0 members)>

In [136]:
subgroup.name

'/SubGroup'

In [137]:
subsubgroup = subgroup.create_group("AnotherGroup")
subsubgroup.name

'/SubGroup/AnotherGroup'

In [138]:
f["Dataset1"] = 1.0
f["Dataset2"] = 2.0
f["Dataset3"] = 3.0
subgroup["Dataset4"] = 4.0

In [139]:
dset1 = f["Dataset1"]
dset1

<HDF5 dataset "Dataset1": shape (), type "<f8">

In [140]:
dset4 = f["SubGroup/Dataset4"] # Right
dset4
# dset4 = f["SubGroup"]["Dataset4"] # Works, but inefficient

<HDF5 dataset "Dataset4": shape (), type "<f8">

In [141]:
out = f.get("BadName")
print(out)

None


In [142]:
len(f), len(f["SubGroup"])

(4, 2)

In [143]:
f = h5py.File('propdemo.hdf5','w')
grp = f.create_group('hello')
grp.file == f

True

In [144]:
grp.parent

<HDF5 group "/" (1 members)>

In [145]:
f = h5py.File('linksdemo.hdf5','w')
grpx = f.create_group('x')
grpx.name

'/x'

In [146]:
f['y'] = grpx
grpy = f['y']
grpy == grpx

True

In [147]:
grpx.name, grpy.name

('/x', '/y')

In [148]:
grpz = f.create_group(None)
print(grpz.name)

None


In [149]:
f['z'] = grpz
grpz.name

'/z'

In [151]:
del f['y']

In [152]:
del f['x'] # Last hard link; the group is deleted in the file

In [153]:
f = h5py.File('test.hdf5','w')
grp = f.create_group('mygroup')
dset = grp.create_dataset('dataset', (100,))

In [154]:
f['hardlink'] = dset
f['hardlink'] == grp['dataset']

True

In [155]:
grp.move('dataset', 'new_dataset_name')
f['hardlink'] == grp['new_dataset_name']

True

In [156]:
grp.move('new_dataset_name', 'dataset')
f['softlink'] = h5py.SoftLink('/mygroup/dataset')
f['softlink'] == grp['dataset']

True

In [157]:
f['softlink']

<HDF5 dataset "softlink": shape (100,), type "<f4">

In [159]:
softlink = h5py.SoftLink('/some/path')
softlink

<SoftLink to "/some/path">

In [160]:
softlink.path

'/some/path'

In [161]:
#if we move the dataset and replace it with something else, /softlink would then point to the new object
grp.move('dataset', 'new_dataset_name')
dset2 = grp.create_dataset('dataset', (50,))
f['softlink'] == dset, f['softlink'] == dset2

(False, True)

In [162]:
grp = f.create_group(u'e_with_accent_\u00E9')
print(grp.name)

/e_with_accent_é


In [164]:
grp2 = f.create_group('e_with_accent_\u00EA')
print(grp2.name)

/e_with_accent_ê


In [165]:
f = h5py.File('get_demo.hdf5','w')
f.create_group('subgroup')
f.create_dataset('dataset', (100,))

<HDF5 dataset "dataset": shape (100,), type "<f4">

In [166]:
for name in f:
    print(name, f.get(name, getclass=True))

dataset <class 'h5py._hl.dataset.Dataset'>
subgroup <class 'h5py._hl.group.Group'>


In [168]:
f['softlink'] = h5py.SoftLink('/subgroup')
with h5py.File('get_demo_ext.hdf5','w') as f2:
    f2.create_group('egroup')
f['extlink'] = h5py.ExternalLink('get_demo_ext.hdf5','/egroup')
for name in f:
    print(name, f.get(name, getlink=True))

dataset <h5py._hl.group.HardLink object at 0x7f447d1ab940>
extlink <ExternalLink to "/egroup" in file "get_demo_ext.hdf5"
softlink <SoftLink to "/subgroup">
subgroup <h5py._hl.group.HardLink object at 0x7f4502d29610>


In [170]:
f.create_dataset('dataset2', (100,), dtype='i')
f.require_dataset('dataset2', (100,), dtype='i')

<HDF5 dataset "dataset2": shape (100,), type "<i4">

# Chapter 6. Storing Metadata with Attributes

In [2]:
import h5py
import numpy as np
f = h5py.File('attrsdemo.hdf5','w')
dset = f.create_dataset('dataset',(100,))

In [3]:
dset.attrs

<Attributes of HDF5 object at 139988506886608>

In [4]:
dset.attrs['title'] = "Dataset from third round of experiments"
dset.attrs['sample_rate'] = 100e6 # 100 MHz digitizer setting
dset.attrs['run_id'] = 144

In [5]:
dset.attrs['title']

'Dataset from third round of experiments'

In [6]:
dset.attrs['sample_rate']

100000000.0

In [7]:
[x for x in dset.attrs]

['run_id', 'sample_rate', 'title']

In [8]:
dset.attrs['another_id'] = 42
dset.attrs['another_id'] = 100

In [9]:
del dset.attrs['another_id']
dset.attrs['another_id']

KeyError: "Can't open attribute (can't locate attribute: 'another_id')"

In [12]:
[(name, val) for name, val in dset.attrs.items()]

[('run_id', 144),
 ('sample_rate', 100000000.0),
 ('title', 'Dataset from third round of experiments')]

In [13]:
dset.attrs.get('run_id')

144

In [14]:
print(dset.attrs.get('missing'))

None


In [15]:
dset.dtype

dtype('<f4')

In [16]:
f.flush()

In [4]:
%h5ls -vlr attrsdemo.hdf5

UsageError: Line magic function `%h5ls` not found.


# Chapter 7. More About Types

In [2]:
import h5py
import numpy as np
f = h5py.File("typesdemo.hdf5")
dset = f.create_dataset('smallint', (10,), dtype=np.int8)
dset[0] = 300
dset[0]

  f = h5py.File("typesdemo.hdf5")


127

In [3]:
a = np.zeros((10,), dtype=np.int8)
a[0] = 300
a[0]

44

In [4]:
dset = f.create_dataset('half_float', (100,100,100), dtype=np.float16)
a = dset[...]
a = a.astype(np.float32)

In [6]:
dt = np.dtype('S3')
a = np.array( [ "a", "ab", "abc", "abcd" ], dtype=dt)
a

array([b'a', b'ab', b'abc', b'abc'], dtype='|S3')

In [7]:
dt = h5py.special_dtype(vlen=str)
dt

dtype('O')

In [8]:
dset = f.create_dataset('vlen_dataset', (100,), dtype=dt)

In [9]:
dset[0] = "Hello"
dset[1] = np.string_("Hello2")
dset[3] = "X"*10000

In [10]:
out = dset[0]
type(out)

str

In [11]:
dset[0:2]

array(['Hello', 'Hello2'], dtype=object)

In [12]:
out = dset[0:1]
out.dtype

dtype('O')

# Chapter 8. Organizing Data with References, Types, and Dimension Scales

In [13]:
f = h5py.File('refs_demo.hdf5','w')
grp1 = f.create_group('group1')
grp2 = f.create_group('group2')
dset = f.create_dataset('mydata', shape=(100,))

In [14]:
grp1.ref

<HDF5 object reference>

In [15]:
out = f[grp1.ref]
out == grp1

True

In [16]:
isinstance(grp1.ref, h5py.Reference)

True

In [17]:
out = grp2[grp1.ref]
out == grp1

True