In [1]:
import h5py

In [3]:
# Open the HDF5 file read-only, much like the built-in open(file, mode).
# Other modes include 'r+' (read/write an existing file) and 'w' (create a
# file, truncating it if it already exists). The full list of modes is at
# http://docs.h5py.org/en/stable/high/file.html
DATASET_PATH = '/mnt/act3/users/yilun/data/dataset.h5'
dataset = h5py.File(DATASET_PATH, mode='r')

In [4]:
# The opened file behaves like a Python dictionary with data stored in it.
# Its keys are the names of the groups I created:
dataset.keys()

[u'test', u'train', u'validate']

In [5]:
# To open a given group, say 'train', index the file by the group name.
group = dataset['train']

In [6]:
# The group again behaves like a Python dictionary. Each key in this group
# is a unique identifier for the data of one detector. There are too many
# keys to show them all, so only the first 10 are printed as a demo.
#
# list() keeps det_keys indexable/sliceable even where keys() returns a
# view object instead of a list.
det_keys = list(group.keys())
# Slicing (rather than indexing range(10)) avoids an IndexError when the
# group holds fewer than 10 entries.
for det_key in det_keys[:10]:
    print(det_key)

0.1
0.10
0.100
0.1001
0.1002
0.1004
0.1005
0.1006
0.1008
0.1009


In [7]:
# Count the entries in the training group: one key per detector data set
# (50880 of them in this file).
num_detectors = len(group.keys())
print(num_detectors)

50880


In [8]:
# Now we look at the data of a single detector. Pick any detector to look at,
# say the one corresponding to key '0.1005'.
det_data = group['0.1005']

In [9]:
# This object is now an actual h5py dataset rather than a group.
type(det_data)

h5py._hl.dataset.Dataset

In [10]:
# To actually load the data from this detector dataset into memory,
# simply slice it with [:]
data = det_data[:]

In [11]:
# Show the Python type of the loaded object, followed by its contents.
print(type(data))
print(data)

<type 'numpy.ndarray'>
[-0.91825491 -0.90685265 -0.90975888 ..., -0.98669338 -0.99271015
 -0.97558444]


In [12]:
# Now you can see that it is loaded as a numpy array that you can use for further processing.
# There is also metadata associated with each detector's data, contained in a
# dictionary-like object under attrs. I only included the 10 parameters that Jesse suggested,
# but it's easy to add more parameters as metadata.
det_data.attrs.keys()

[u'corrLive',
 u'rmsLive',
 u'kurtLive',
 u'DELive',
 u'MFELive',
 u'skewLive',
 u'normLive',
 u'darkRatioLive',
 u'jumpLive',
 u'gainLive',
 u'label']

In [13]:
# Look up the value stored for one metadata parameter by its name.
jump_live = det_data.attrs['jumpLive']
print(jump_live)

1.19359e-12


In [14]:
# The 'label' attribute contains the 'sel' parameter, i.e. the truth:
# 0 means unselected (bad), 1 means selected (good).
det_data.attrs['label']

0

In [None]:
# to add a new parameter as an attribute you can do
# > det_data.attrs['your_cool_new_feature'] = somevalue
# make sure you open the file in read/write mode ('r+'); note that
# opening an existing file with 'w' would truncate it
# and then make sure you close the file properly with
# > dataset.close()

In [None]:
# A sample workflow: load every detector dataset in the training group,
# generate a new feature from each one, and store it as metadata.
#
# The file is opened in read/write mode ('r+') so attributes can be added
# to existing datasets. The `with` block guarantees the file is closed even
# if feature generation raises part-way through the loop.
with h5py.File('dataset.h5', 'r+') as dataset:
    # suppose we are working on the train set
    grp = dataset['train']
    for key in grp.keys():
        # get the h5py dataset for this detector
        det_data = grp[key]

        # retrieve its contents as a numpy array
        data = det_data[:]

        # do something with the data to generate your cool feature
        # (generate_your_cool_feature is a placeholder you supply yourself)
        new_feature = generate_your_cool_feature(data)

        # save the new feature as metadata; attrs belong to the h5py
        # dataset (det_data), not to the numpy array (data), which has
        # no .attrs
        det_data.attrs['your_cool_feature_name'] = new_feature

# leaving the `with` block has closed the file, so you should be done