In [None]:
#default_exp crono

In [None]:
#hide 
%load_ext autoreload
%autoreload 2

In [None]:
#hide 
# NOTE(review): hard-coded absolute path to a local data folder; the cells
# below use a relative file name, so they only run unchanged on a machine
# where this directory exists.
import os 
os.chdir('/home/frank/Work/Projecten/DoRe/data/maxrf/crono')

# Exploring the Crono MA-XRF HDF5 file format 

> A peek inside  

The HDF5 file format is an open, self-documenting data container file format. It contains Groups and Datasets that can be understood as (sub)folders and files. An excellent Python package that can be used to explore and read the contents of an HDF5 file is [h5py](https://docs.h5py.org/en/stable/index.html) by Andrew Collette. If you want to read more, the book *Python and HDF5 - Unlocking scientific data* by Andrew Collette is an excellent introduction. 

<a href="https://www.oreilly.com/library/view/python-and-hdf5/9781491944981/"><img width="5%" src="https://learning.oreilly.com/library/cover/9781491944981/250w/"></a>

As it turns out, the Crono HDF5 file format is rather complicated and therefore hard to inspect with standard `h5py` functions. This is due to the complicated internal structures of the datasets inside the file. Furthermore, some of the datasets contain binary blobs for which it is still unclear how we can decode them. 

The function `read_datasets()` can be used to obtain a list of all dataset objects for further processing. To inspect the contents of these datasets I have created the function `peek_inside()` to inspect the contents of a single dataset and the function `report()` to print an extended summary of the full contents of a Crono HDF5 file. 

In [None]:
from cronomaxrf import read_datasets, peek_inside, report

# example file, given as a relative name (resolved against the working directory)
crono_filename = '14200215102021-blindTest2AgedDetail.HDF5'   # 50 Mb 
datasets = read_datasets(crono_filename)

# list every dataset together with its index in `datasets`
for i, d in enumerate(datasets): 
    print(f'[{i}] {d.name}')

[0] /Configuration/FinalSystemStatus
[1] /Configuration/InitialSystemStatus
[2] /Configuration/Settings
[3] /CreatedDateTime
[4] /CreatedWithSoftwareVersion
[5] /FileStructureVersion
[6] /Images/Documentation
[7] /Images/ImageAdjusting/ImagePixelSizeRatio
[8] /Images/Static
[9] /Images/StitchedImage
[10] /Images/VideoStreams
[11] /XRF/CalibrationPoints
[12] /XRF/DPPStartTimestamp
[13] /XRF/Detected
[14] /XRF/EnergyVector
[15] /XRF/LiveTimes
[16] /XRF/Maps/ComputedMaps
[17] /XRF/MotorsPositions
[18] /XRF/MotorsPositionsStats
[19] /XRF/MotorsTimestamps
[20] /XRF/OutputCountRates
[21] /XRF/ROIs/ROIsEnergies
[22] /XRF/ROIs/ROIsNames
[23] /XRF/ROIs/ROIsObjects
[24] /XRF/RealTimes
[25] /XRF/ResetCounterValue
[26] /XRF/Spectra
[27] /XRF/SpectraIDs
[28] /XRF/SpectraSelectedIndex
[29] /XRF/Timestamps
[30] /XRF/XComputedMapsCoordinate
[31] /XRF/YComputedMapsCoordinate


Below we will need to scroll through quite some data, so let's just pick one dataset `[14]` to start to inspect its contents. A dataset can contain attributes and values. Normally attributes are used to store metadata and the values part is used to store the actual measured values... 

In [None]:
# index 14 is `/XRF/EnergyVector` in the listing printed above
peek_inside(datasets[14])

/XRF/EnergyVector:

+ATTRIBUTES: (none)

+SHAPE: (4096,) DTYPE: 'float32'

+VALUES: 
'array([-1.1700948, -1.1581414, -1.146188 , ..., 47.755295 , 47.767246 ,
       47.7792   ], dtype=float32)'




Ok, this dataset `/XRF/EnergyVector` contains no metadata attributes. It contains a single vector with 4096 floating point numbers. From the context it is clear that these are the energy values in keV units for the detector channels. 

Ok, now let's see the complete contents of the file.  

In [None]:
# print the dataset listing followed by a summary of each dataset
report(crono_filename)

CONTENTS OF CRONO HDF5 FILE: '14200215102021-blindTest2AgedDetail.HDF5'
(32 DATASETS)

[0] /Configuration/FinalSystemStatus
[1] /Configuration/InitialSystemStatus
[2] /Configuration/Settings
[3] /CreatedDateTime
[4] /CreatedWithSoftwareVersion
[5] /FileStructureVersion
[6] /Images/Documentation
[7] /Images/ImageAdjusting/ImagePixelSizeRatio
[8] /Images/Static
[9] /Images/StitchedImage
[10] /Images/VideoStreams
[11] /XRF/CalibrationPoints
[12] /XRF/DPPStartTimestamp
[13] /XRF/Detected
[14] /XRF/EnergyVector
[15] /XRF/LiveTimes
[16] /XRF/Maps/ComputedMaps
[17] /XRF/MotorsPositions
[18] /XRF/MotorsPositionsStats
[19] /XRF/MotorsTimestamps
[20] /XRF/OutputCountRates
[21] /XRF/ROIs/ROIsEnergies
[22] /XRF/ROIs/ROIsNames
[23] /XRF/ROIs/ROIsObjects
[24] /XRF/RealTimes
[25] /XRF/ResetCounterValue
[26] /XRF/Spectra
[27] /XRF/SpectraIDs
[28] /XRF/SpectraSelectedIndex
[29] /XRF/Timestamps
[30] /XRF/XComputedMapsCoordinate
[31] /XRF/YComputedMapsCoordinate

-----------------------------------------

[8] /Images/Static:

+ATTRIBUTES: (none)

+SHAPE: (13358,) DTYPE: 'object' SUBSHAPES: [(0,), (2565,), (2572,), (2525,), '....'] SUB_DTYPE: '|V1'

+VALUES: 
'b'''


--------------------------------------------------------------------------------
[9] /Images/StitchedImage:

+ATTRIBUTES: (none)

+SHAPE: (1,) DTYPE: 'object' SUBSHAPES: [(0,)] SUB_DTYPE: '|V1'

+VALUES: 
'b'''


--------------------------------------------------------------------------------
[10] /Images/VideoStreams:

+ATTRIBUTES: (none)

+SHAPE: (1,) DTYPE: 'object' SUBSHAPES: [(0,)] SUB_DTYPE: '|V1'

+VALUES: 
'b'''


--------------------------------------------------------------------------------
[11] /XRF/CalibrationPoints:

+ATTRIBUTES: 
        - CalibrationBins: 4096
        - CalibrationEquation: 0

+SHAPE: (2, 2) DTYPE: 'float32'

+VALUES: 
'array([[ 771.   ,    8.046],
       [1952.   ,   22.163]], dtype=float32)'


--------------------------------------------------------------------------------
[12] /XRF/DPPStar

# API 

In [None]:
#export 

import h5py 
import numpy as np 

def read_datasets(crono_filename): 
    '''Collect all dataset objects stored in the HDF5 file `crono_filename`.

    The file is opened read-only and is not closed here, because the
    returned `h5py.Dataset` objects belong to that open file handle.

    Returns a list of `h5py.Dataset` objects in the order in which `h5py`
    visits the members of the file.
    '''

    handle = h5py.File(crono_filename, mode='r')

    # collect the names of all members (groups as well as datasets)
    member_names = []
    handle.visit(member_names.append)

    datasets = []
    for member_name in member_names:
        member = handle[member_name]

        # groups are skipped, only datasets are kept
        if type(member) == h5py.Dataset:
            datasets.append(member)

    return datasets


def report(crono_filename): 
    '''Print info about structure and content of datasets in `crono_filename`.

    First prints an indexed list of all dataset names, then a ruler followed
    by the `peek_inside()` summary for every dataset.

    The HDF5 file opened through `read_datasets()` is closed again before
    returning, also when printing one of the summaries raises. 

    Returns None.
    '''

    datasets = read_datasets(crono_filename)

    try:
        # print list of datasets
        print(f'CONTENTS OF CRONO HDF5 FILE: \'{crono_filename}\'')
        print(f'({len(datasets)} DATASETS)\n')

        for i, d in enumerate(datasets):
            print(f'[{i}] {d.name}')

        print()

        ruler = '-'*80

        # print the summary of each dataset, prefixed with its index
        for i, d in enumerate(datasets):
            print(ruler)
            print(f'[{i}] ', end='')
            peek_inside(d)

    finally:
        # `read_datasets()` does not hand out its file handle, so the file is
        # reached through one of the datasets. If the file holds no datasets
        # there is nothing to reach it through and the handle stays open.
        if datasets:
            datasets[0].file.close()
        



def peek_inside(dataset, _print=True):
    '''Summarize name, attributes, shape and values of a `dataset`.

    With `_print=True` (default) the summary is printed and None is returned,
    otherwise the summary string is returned without printing.
    '''

    shape_line, outer_dtype, inner_dtype = _nesting(dataset)

    single_char = np.dtype('S1')

    if outer_dtype == single_char:
        # array of single characters: glue them into one string
        rendered = ''.join(dataset[...].astype(str))

    elif inner_dtype == np.dtype('V1'):
        # nested binary data: show only the bytes of the first element
        rendered = bytes(dataset[...][0])

    elif inner_dtype == single_char:
        # nested character arrays: one line of text per element
        rendered = '\n'.join(''.join(chars.astype(str)) for chars in dataset[...])

    else:
        # anything else: fall back on the numpy repr string
        rendered = repr(dataset[...])

    # compose summary
    title = dataset.name
    attr_block = _get_attrs(dataset)

    summary = f'{title}:\n\n{attr_block}\n{shape_line}\n\n+VALUES: \n\'{rendered}\'\n\n'

    if not _print:
        return summary

    print(summary)
    return None
    
        
def _nesting(dataset): 
    '''Report shape or nested shapes of numpy arrays in `dataset`.'''
    
    v = dataset[...]
    shape = v.shape 
    dtype = v.dtype 
    
    shape_str = f'+SHAPE: {v.shape} DTYPE: \'{v.dtype}\'' 
    
    sub_dtype = None 
    
    if dtype == np.dtype('O'):
        
        subshape_list = [a.shape for a in v] 
        sub_dtype = v[-1].dtype 
        n_sub = len(subshape_list)
        
        if n_sub > 4: 
            
            subshape_list = subshape_list[0:4]
            subshape_list.append(f'....')
        
        subshape_str = f'{subshape_list}' 
        shape_str = shape_str + ' SUBSHAPES: ' + subshape_str + f' SUB_DTYPE: \'{sub_dtype}\''
    
    return shape_str, dtype, sub_dtype 


def _get_attrs(dataset): 
    '''Report information from `dataset` attributes.'''
    
    attr_items = dataset.attrs.items()
    attr_keys = []
    attr_values = []

    for k, v in attr_items:
        attr_keys.append(k) 

        # get rid of outer array 
        if v[0].dtype == np.dtype('S1'):       
            v = ''.join(v[0].astype(str))  
        if type(v) == np.ndarray: 
            v = v[0]
        
        if k == 'MapSetup': 
            v = bytes(v)
        
        if k == 'TubeTemperature': 
            v = f'{v:0.2f}'
            
        attr_values.append(v) 
            

    attr_dict = dict(zip(attr_keys, attr_values))
    
    if len(attr_dict) > 0: 
    
        repr_string = '+ATTRIBUTES: \n'

        for k, v in attr_dict.items():

            repr_string = repr_string + f'        - {k}: {v}\n'
            
    else: 
        repr_string = '+ATTRIBUTES: (none)\n'
        
    #print(repr_string) 
        
   
    return repr_string