In [1]:
# Based on O'Reilly Webcast
# https://www.youtube.com/watch?v=wZEFoVUu8h0
import numpy as np
import h5py

In [2]:
# Sample payload for the demo: the integers 0..9 as a 1-D NumPy array
data = np.arange(0, 10)

# Echo the array so the notebook displays its contents
data

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [3]:
# Create an HDF5 file object, opening "demo.hdf5" with mode "w"
f = h5py.File("demo.hdf5", "w")

# Show what f is
f

# Some suspiciously familiar-looking names live on f, like
# - f.keys
# - f.items
# - f.values
# - ...
# (under Python 2, h5py also offered f.iterkeys / f.iteritems)
# These are things that you would find in Python dictionaries.
# First piece of information on how the HDF5 file interface works:
# container objects (files and groups) work like Python dictionaries

<HDF5 file "demo.hdf5" (mode r+)>

In [4]:
# Recall what the NumPy array 'data' looks like before storing it
data

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [5]:
# Store the array in the file under the name 'mydata'
# (dictionary-style assignment on the file object)
f['mydata'] = data

In [6]:
# Looking the name up again returns a proxy object for the stored dataset
# It allows partial I/O & attribute access
dset = f['mydata']
dset

<HDF5 dataset "mydata": shape (10,), type "<i8">

In [7]:
# Investigating this object further
# Data shape (just like a NumPy array)
dset.shape

(10,)

In [8]:
# Data type (just like a NumPy array)
dset.dtype

dtype('int64')

In [9]:
# Read the whole dataset back as an array (NumPy full-slice syntax)
dset[:]

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [10]:
# Supports partial I/O via NumPy slicing syntax (start:stop:step).
# Uses HDF5 capabilities: only the selected entries are read,
# which is very efficient for large datasets
dset[0:6:2]

array([0, 2, 4])

In [11]:
# In NumPy you can specify exactly which indices you want
# by passing a list of positions (here on the in-memory array 'data')
data[[0,2,6]]

array([0, 2, 6])

In [12]:
# The same list-of-indices syntax works on the HDF5 dataset proxy;
# HDF5 only pulls out these elements
dset[[1,2,6]]

array([1, 2, 6])

In [13]:
# Attributes (metadata) attached to our dataset object
# The attrs object behaves like a Python dictionary
dset.attrs

<Attributes of HDF5 object at 139811803988760>

In [14]:
# Attach some metadata to the dataset as name/value attributes
dset.attrs['sampling rate'] = 100e6 # float literal; read back later as 100000000.0
dset.attrs['pressure'] = 15 # integer literal; read back later as 15

In [15]:
# keys() returns a view object rather than a plain list of names
dset.attrs.keys()


KeysView(<Attributes of HDF5 object at 139811803988760>)

In [16]:
# items() returns a view object; its repr does not show the pairs themselves
dset.attrs.items()

ItemsViewHDF5(<Attributes of HDF5 object at 139811803988760>)

In [17]:
# Iterating over attrs yields the attribute names
list(dset.attrs)

['sampling rate', 'pressure']

In [18]:
# The bare items() view again: still only an opaque view object
dset.attrs.items()

ItemsViewHDF5(<Attributes of HDF5 object at 139811803988760>)

In [19]:
# Wrap the view in list() to see the (name, value) pairs
list(dset.attrs.items())

[('sampling rate', 100000000.0), ('pressure', 15)]

In [20]:
# Close the file (it is reopened further down to inspect what was stored)
f.close()

In [21]:
# At this point you may want to inspect the file on disk
# with HDFView, a viewer for .hdf5 files

In [22]:
# Open up the existing file again, this time without truncating it.
# The mode is given explicitly: "a" means read/write, creating the file if
# it is missing. Relying on the default is not portable, because h5py 3.0
# changed it from "a" to "r" (read-only), and the cells that follow add new
# groups and datasets to this file, which fails on a read-only handle.
f = h5py.File("demo.hdf5", "a")

In [23]:
# List the names stored at the top level of the reopened file
list(f.keys())

['mydata']

In [24]:
# Let's look at the dataset object again:
# fetch a fresh proxy from the reopened file
dset = f['mydata']

In [25]:
# Check the name of the dataset
dset.name

'/mydata'

In [26]:
# So we've got a slash '/' at the front of the name.
# In HDF5 the data is organised in a hierarchy
# (HDF5: the H stands for "hierarchical").
# Every object has a full pathname
# starting with '/', which is the root group.

# Root group
root = f['/']

In [27]:
# Display the root group object
root

<HDF5 group "/" (1 members)>

In [28]:
# The root group lists the same members as the file object itself
list(root.keys())

['mydata']

In [29]:
# Let's try to create a dataset which is not in the root group.
# Assigning to a nested path also brings the intermediate group 'path'
# into existence (it shows up as an HDF5 group when retrieved later on)
f['/path/dataset'] = data

In [30]:
# Get the proxy object for /path/dataset
# (the leading '/' is omitted here; the lookup is done on the file object f)
dset2 = f['path/dataset']

In [31]:
# Full pathname of the dset2 dataset
dset2.name

'/path/dataset'

In [32]:
# Retrieve the intermediate group '/path' from the file
grp = f['/path']

In [33]:
# Display the group object
grp

<HDF5 group "/path" (1 members)>

In [34]:
# What's inside this group?
list(grp.keys())

['dataset']

In [35]:
# You can use standard Python membership testing (the `in` operator)
# Example 1: a name that exists at the top level of the file
'mydata' in f

True

In [36]:
# Example 2: a name that was never stored in the file
'mydata2' in f

False

In [37]:
# Example 3: a full path into a nested group can be tested as well
'/path/dataset' in f

True

In [38]:
########################################################
# Okay, so far we've used NumPy arrays, which
# seem to work nicely with HDF5.
# But we don't need a ready-made NumPy array to make a dataset!
# It's very common in analysis/research to create
# an empty dataset and then fill it up.

# Create a dataset on the fly
# How do you do that?
# Check out the manual: the trailing '?' is IPython syntax that shows
# the docstring of f.create_dataset (it is not valid in plain Python).
f.create_dataset?

In [39]:
# Create a really BIG dataset: 1000**4 = 10**12 elements, gzip-compressed.
# dtype 'f' is reported as float32 when dset3.dtype is inspected, so a fully
# populated array would hold roughly 4 TB of raw values.
# NOTE(review): this presumably only works because HDF5 allocates storage
# lazily, for chunks that are actually written — confirm in the h5py docs.
dset3 = f.create_dataset('BIG', (1000,1000,1000,1000), dtype='f', compression='gzip')

In [40]:
# Check the shape: it should match the tuple passed to create_dataset
dset3.shape

(1000, 1000, 1000, 1000)

In [41]:
# Check the type that the 'f' type code was turned into
dset3.dtype

dtype('float32')

In [43]:
# Random access to the dataset: write a single element at an arbitrary index
dset3[456,892,344,12] = 42

In [47]:
# Create a smaller dataset: 1000 x 1000 values of dtype 'f4', gzip-compressed
dset4 = f.create_dataset('smaller', (1000,1000), dtype='f4', compression='gzip')

In [50]:
# Assign 42.0 to every cell using a full-slice assignment
dset4[:] = 42.0

In [53]:
# Flush the file (presumably pushes buffered writes out to disk — see h5py docs)
f.flush()

In [54]:
# Close the file; this is the last use of f in the notebook
f.close()