In [None]:
#default_exp crono

In [None]:
#hide 
%load_ext autoreload
%autoreload 2

In [None]:
#hide 
# Switch the working directory to the folder holding the example data, so that the
# bare file names used further down in this notebook can be resolved.
# NOTE(review): this is an absolute, machine-specific path; the cell fails on any
# other machine. Consider a configurable data directory instead.
import os 
os.chdir('/home/frank/Work/Projecten/DoRe/data/maxrf/crono')

# Exploring the Crono MA-XRF HDF5 file format 

> A peek inside 

HDF5 is an open, self-describing data container file format. An HDF5 file contains Groups and Datasets, which can be understood as (sub)folders and files. An excellent Python package for exploring and reading the contents of an HDF5 file is [h5py](https://docs.h5py.org/en/stable/index.html) by Andrew Collette. If you want to read more, his book *Python and HDF5 - Unlocking scientific data* is an excellent introduction. 

<a href="https://www.oreilly.com/library/view/python-and-hdf5/9781491944981/"><img width="10%" src="https://learning.oreilly.com/library/cover/9781491944981/250w/"></a>

In [None]:
from cronomaxrf import report

In [None]:
# Example file; the bare file name is resolved relative to the current working directory.
crono_filename = '14200215102021-blindTest2AgedDetail.HDF5' # 50 Mb 

# Print the list of datasets in the file, followed by their content.
report(crono_filename)

--------------------------------------------------------------------------------
CRONO FILE: 14200215102021-blindTest2AgedDetail.HDF5

DATASETS: 

[0*] /Configuration/FinalSystemStatus
[1*] /Configuration/InitialSystemStatus
[2*] /Configuration/Settings
[3] /CreatedDateTime
[4] /CreatedWithSoftwareVersion
[5] /FileStructureVersion
[6*] /Images/Documentation
[7] /Images/ImageAdjusting/ImagePixelSizeRatio
[8*] /Images/Static
[9*] /Images/StitchedImage
[10*] /Images/VideoStreams
[11] /XRF/CalibrationPoints
[12] /XRF/DPPStartTimestamp
[13] /XRF/Detected
[14] /XRF/EnergyVector
[15] /XRF/LiveTimes
[16] /XRF/Maps/ComputedMaps
[17] /XRF/MotorsPositions
[18] /XRF/MotorsPositionsStats
[19] /XRF/MotorsTimestamps
[20] /XRF/OutputCountRates
[21] /XRF/ROIs/ROIsEnergies
[22*] /XRF/ROIs/ROIsNames
[23*] /XRF/ROIs/ROIsObjects
[24] /XRF/RealTimes
[25] /XRF/ResetCounterValue
[26] /XRF/Spectra
[27] /XRF/SpectraIDs
[28] /XRF/SpectraSelectedIndex
[29] /XRF/Timestamps
[30] /XRF/XComputedMapsCoordinate
[31] /X

     array([array([], dtype='|V1'),
       array([b'\x00', b'\x01', b'\x00', ..., b'\xFF', b'\xD9', b'\x0B'],
             dtype='|V1')                                                ,
       array([b'\x00', b'\x01', b'\x00', ..., b'\xFF', b'\xD9', b'\x0B'],
             dtype='|V1')                                                ,
       ...,
       array([b'\x00', b'\x01', b'\x00', ..., b'\xFF', b'\xD9', b'\x0B'],
             dtype='|V1')                                                ,
       array([b'\x00', b'\x01', b'\x00', ..., b'\xFF', b'\xD9', b'\x0B'],
             dtype='|V1')                                                ,
       array([b'\x00', b'\x01', b'\x00', ..., b'\xFF', b'\xD9', b'\x0B'],
             dtype='|V1')                                                ],
      dtype=object)

[9*] /Images/StitchedImage

     array([array([], dtype='|V1')], dtype=object)

[10*] /Images/VideoStreams

     array([array([], dtype='|V1')], dtype=object)

[11] /XRF/CalibrationPoin

# API 

In [None]:
#export 

import h5py 
import numpy as np 



def report(crono_filename, content=True): 
    '''Print datasets and their content from `crono_filename`.

    First a numbered list of all datasets found in the HDF5 file is printed.
    Datasets with an object dtype are flagged with a `*` behind their index.
    If `content` is true, a second section follows that prints for every dataset
    either its attributes (when it has any) or otherwise the repr of its data.

    Parameters
    ----------
    crono_filename : path of the HDF5 file; the file is opened read-only.
    content : if False, only the dataset listing is printed.

    Returns None; all output is written to stdout.
    '''

    hr = '-'*80

    # The context manager guarantees the file handle is closed again, also when
    # reading or printing one of the datasets fails halfway.
    with h5py.File(crono_filename, mode='r') as fh:

        # visit() walks the file recursively and passes the name of every group
        # and dataset to the callback; appending returns None so the walk continues.
        names = []
        fh.visit(names.append)

        # look up every object once and keep the datasets only (groups are skipped)
        members = [fh[name] for name in names]
        datasets = [m for m in members if isinstance(m, h5py.Dataset)]

        # one label per dataset, shared by the listing and the content section
        labels = []
        for i, d in enumerate(datasets):
            star = '*' if d.dtype == np.dtype('O') else ''
            labels.append(f'[{i}{star}] {d.name}')

        # print dataset names
        print(hr)
        print(f'CRONO FILE: {crono_filename}\n')
        print('DATASETS: \n')
        for label in labels:
            print(label)
        print('')

        # without content there is nothing to announce, so skip the header as well
        if not content:
            return

        # print dataset names and their content
        print(hr)
        print('CONTENT:\n')

        for label, d in zip(labels, datasets):
            print(label)

            if len(d.attrs) > 0:
                # datasets that carry attributes are described by those attributes
                for k in d.attrs.keys():
                    value, dtype = _simplify_repr(d.attrs[k])
                    print()
                    print(f'    {k}: {value}')
                    print(f'      dtype={dtype}')
            else:
                # d[...] reads the complete dataset into memory before taking its repr
                print()
                print(f'     {repr(d[...])}')

            print('')
            
def _simplify_repr(value):
    '''Return a printable `(repr_string, dtype)` pair for the dataset attribute `value`.

    If `value` is a numpy array whose first element is itself an array of single
    bytes (dtype `S1`), for example
    `array([array([b'b', b'l', b'a', b'h'], dtype='|S1')], dtype=object)`,
    then the characters of that first element are joined into one plain string and
    the dtype is reported as the label `'nested_array_of_characters'`.

    Otherwise the default `repr(value)` is returned, together with `value.dtype`, or
    with the name of the Python type if `value` has no dtype (e.g. a plain `str`).'''

    # Only a non-empty array with at least one dimension can be indexed with [0].
    # Scalars, strings and empty arrays go straight to the default representation
    # instead of raising an IndexError or AttributeError.
    first = None
    if isinstance(value, np.ndarray) and value.ndim > 0 and value.size > 0:
        first = value[0]

    # Require the first element to be an array itself: a flat S1 array is not
    # nested and would otherwise be reduced to its first character only.
    if isinstance(first, np.ndarray) and first.dtype == np.dtype('S1'):
        repr_string = ''.join(first.astype(str))
        dtype = 'nested_array_of_characters'
    else:
        repr_string = repr(value)
        dtype = getattr(value, 'dtype', type(value).__name__)

    return repr_string, dtype
     