In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import h5py


In [None]:
# Directory of the run being inspected. Kept as a plain string with a trailing
# slash because the file name is appended to it with `+` further down.
path = "./data/2025-03-25_13-37-56/HC-5059/run6_Pup2/_/"

In [None]:
# Print the names at the root of the file, then the member names of each
# root-level entry.
with h5py.File(path + 'hc5059-run6-run6_Pup2.h5', 'r') as hdf:
    print(hdf.keys())
    for key in hdf:
        print(hdf[key].keys())

<KeysViewHDF5 ['entry_0000: run6 - run6_Pup2']>
<KeysViewHDF5 ['end_time', 'folder_path', 'instrument', 'measurement', 'proposal', 'sample', 'start_time']>


In [14]:
def make_dataframe(file, save_as_csv=False, path_to_save=None):
    """
    Build a DataFrame from the datasets stored in the file's "measurement" groups.

    The file is walked recursively, so a group whose name contains
    "measurement" (case-insensitive) is found whether it sits at the root of
    the file or nested inside another group (e.g. 'entry_0000: .../measurement').
    Every dataset directly inside such a group becomes one column named after
    the dataset. If the same dataset name occurs in several measurement
    groups, the one visited last wins.

    Parameters:
        file (str): Path to the HDF5 file.
        save_as_csv (bool): If True, also write the DataFrame to `path_to_save`.
        path_to_save (str): Destination of the CSV file; required when
            `save_as_csv` is True.

    Returns:
        pandas.DataFrame: One column per measurement dataset. Empty when no
        measurement group is found or the group holds no datasets.

    Raises:
        ValueError: If `save_as_csv` is True but `path_to_save` is None.
    """
    # Fail before touching the file: to_csv(None) would only return a string
    # and silently save nothing.
    if save_as_csv and path_to_save is None:
        raise ValueError("path_to_save must be provided when save_as_csv=True")

    columns = {}

    def collect(group, group_name):
        """Gather datasets of measurement groups found in `group` and below it."""
        is_measurement = 'measurement' in group_name.lower()
        for child_name, child in group.items():
            if isinstance(child, h5py.Group):
                collect(child, child_name)
            elif is_measurement and isinstance(child, h5py.Dataset):
                values = child[:]
                if 'timestamp' in child_name.lower():
                    # Timestamps are stored as byte strings; turn them into text.
                    values = [
                        t.decode('utf-8') if isinstance(t, bytes) else t
                        for t in values
                    ]
                columns[child_name] = values

    # All data is copied into memory inside the `with`, so the file can be
    # closed before the frame is assembled.
    with h5py.File(file, 'r') as hdf:
        collect(hdf, '')

    # Build the frame once instead of inserting one column at a time.
    df = pd.DataFrame(columns)
    if save_as_csv:
        df.to_csv(path_to_save, index=False)
    return df

In [16]:
# Load the measurement datasets of the run file into a DataFrame.
df = make_dataframe(file=path + 'hc5059-run6-run6_Pup2.h5')

In [18]:
def explore_hdf5_file(filepath):
    """
    Print an outline of every group and dataset contained in an HDF5 file.

    Groups are listed by name; datasets are listed with their shape and dtype.

    Parameters:
        filepath (str): Path to the HDF5 file.
    """
    def describe(node_name, node):
        # Callback for visititems; it must return None so the walk continues.
        if isinstance(node, h5py.Group):
            line = f"Group: {node_name}"
        elif isinstance(node, h5py.Dataset):
            line = f"Dataset: {node_name}, Shape: {node.shape}, Dtype: {node.dtype}"
        else:
            return None
        print(line)
        return None

    with h5py.File(filepath, 'r') as handle:
        print(f"\nContents of HDF5 file: {filepath}")
        handle.visititems(describe)

In [19]:
# Print the group/dataset outline of the run file.
explore_hdf5_file(filepath=path + 'hc5059-run6-run6_Pup2.h5')


Contents of HDF5 file: ./data/2025-03-25_13-37-56/HC-5059/run6_Pup2/_/hc5059-run6-run6_Pup2.h5
Group: entry_0000: run6 - run6_Pup2
Dataset: entry_0000: run6 - run6_Pup2/end_time, Shape: (), Dtype: |S26
Dataset: entry_0000: run6 - run6_Pup2/folder_path, Shape: (), Dtype: |S44
Group: entry_0000: run6 - run6_Pup2/instrument
Dataset: entry_0000: run6 - run6_Pup2/instrument/name, Shape: (), Dtype: |S8
Group: entry_0000: run6 - run6_Pup2/instrument/source
Dataset: entry_0000: run6 - run6_Pup2/instrument/source/current, Shape: (), Dtype: |S5
Dataset: entry_0000: run6 - run6_Pup2/instrument/source/mode, Shape: (), Dtype: |S5
Group: entry_0000: run6 - run6_Pup2/measurement
Dataset: entry_0000: run6 - run6_Pup2/proposal, Shape: (), Dtype: |S6
Group: entry_0000: run6 - run6_Pup2/sample
Dataset: entry_0000: run6 - run6_Pup2/sample/description, Shape: (), Dtype: |S5
Dataset: entry_0000: run6 - run6_Pup2/sample/name, Shape: (), Dtype: |S4
Dataset: entry_0000: run6 - run6_Pup2/start_time, Shape: (),

In [22]:
def explore_and_read_hdf5(filepath, max_elements=10):
    """
    Explore HDF5 file structure and print the contents of small datasets.

    Every group and dataset is listed. For scalar datasets and for 1-D
    datasets with at most `max_elements` entries the value is printed as
    well: byte strings are decoded as UTF-8, anything else (numbers, numeric
    arrays) is printed as it is read.

    Parameters:
        filepath (str): Path to the HDF5 file.
        max_elements (int): Maximum number of elements to print from a dataset.
    """
    def to_printable(value):
        """Decode byte strings (single or in a 1-D array); pass other values through."""
        if isinstance(value, bytes):
            return value.decode('utf-8')
        if isinstance(value, np.ndarray) and value.ndim == 1 and value.dtype.kind in ('S', 'O'):
            return [to_printable(item) for item in value.tolist()]
        # Numeric scalars and arrays have no .decode(); print them unchanged
        # instead of reporting them as unreadable.
        return value

    def print_structure_and_data(name, obj):
        if isinstance(obj, h5py.Group):
            print(f"Group: {name}")
        elif isinstance(obj, h5py.Dataset):
            print(f"Dataset: {name}, Shape: {obj.shape}, Dtype: {obj.dtype}")
            try:
                # Only print if dataset is scalar or small
                if obj.shape == () or (len(obj.shape) == 1 and obj.shape[0] <= max_elements):
                    print(f"  Value: {to_printable(obj[()])}")
            except Exception as e:
                # Best-effort explorer: report the problem and keep walking.
                print(f"  Could not read dataset: {e}")

    with h5py.File(filepath, 'r') as f:
        print(f"\nContents of HDF5 file: {filepath}")
        f.visititems(print_structure_and_data)


In [23]:
# Print the outline of the run file together with the values of its small datasets.
explore_and_read_hdf5(filepath=path + 'hc5059-run6-run6_Pup2.h5')


Contents of HDF5 file: ./data/2025-03-25_13-37-56/HC-5059/run6_Pup2/_/hc5059-run6-run6_Pup2.h5
Group: entry_0000: run6 - run6_Pup2
Dataset: entry_0000: run6 - run6_Pup2/end_time, Shape: (), Dtype: |S26
  Value: 2023-02-26T17:28:38.753767
Dataset: entry_0000: run6 - run6_Pup2/folder_path, Shape: (), Dtype: |S44
  Value: /data/visitor/hc5059/id06-lvp/run6/run6_Pup2
Group: entry_0000: run6 - run6_Pup2/instrument
Dataset: entry_0000: run6 - run6_Pup2/instrument/name, Shape: (), Dtype: |S8
  Value: id06-lvp
Group: entry_0000: run6 - run6_Pup2/instrument/source
Dataset: entry_0000: run6 - run6_Pup2/instrument/source/current, Shape: (), Dtype: |S5
  Value: ERROR
Dataset: entry_0000: run6 - run6_Pup2/instrument/source/mode, Shape: (), Dtype: |S5
  Value: ERROR
Group: entry_0000: run6 - run6_Pup2/measurement
Dataset: entry_0000: run6 - run6_Pup2/proposal, Shape: (), Dtype: |S6
  Value: hc5059
Group: entry_0000: run6 - run6_Pup2/sample
Dataset: entry_0000: run6 - run6_Pup2/sample/description, S

In [39]:
# Synthetic demo data: 100 daily rows with one float and one integer column.
# A seeded generator is used so that re-running the notebook from a fresh
# kernel produces the same values every time.
rng = np.random.default_rng(seed=42)
df1 = pd.DataFrame({
    'Sensor_1': rng.standard_normal(100),        # standard-normal floats
    'Sensor_2': rng.integers(0, 100, size=100),  # integers in [0, 100)
    'Date': pd.date_range('20230101', periods=100)
})

In [40]:
# Store df1 under the key 'sensor_readings'; mode='w' starts a new file.
df1.to_hdf(
    'example_pandas_to_hdf.h5',
    key='sensor_readings',
    mode='w',
)

In [41]:
# Read the stored object back; no key is passed to read_hdf here.
data = pd.read_hdf(path_or_buf='example_pandas_to_hdf.h5')

In [42]:
# Show the frame read back from 'example_pandas_to_hdf.h5' as the cell output.
data

Unnamed: 0,Sensor_1,Sensor_2,Date
0,-0.414846,77,2023-01-01
1,1.148940,69,2023-01-02
2,0.101305,11,2023-01-03
3,-0.586764,33,2023-01-04
4,-0.680189,69,2023-01-05
...,...,...,...
95,-0.455919,5,2023-04-06
96,0.642878,27,2023-04-07
97,0.419097,3,2023-04-08
98,1.269762,15,2023-04-09


In [43]:
# Second demo frame: a single 'Pressure' column holding the integers 1 through 11.
df2 = pd.DataFrame(data={'Pressure': range(1, 12)})

In [45]:
# Put both demo frames into one HDF5 file, each under its own key.
with pd.HDFStore(path='multi_data.h5', mode='w') as store:
    store.put(key='df1', value=df1)
    store.put(key='df2', value=df2)

In [48]:
# Open the store read-only and list the keys it contains.
with pd.HDFStore(path='multi_data.h5', mode='r') as store:
    print(store.keys())

['/df1', '/df2']


In [51]:
# The file holds more than one object, so the key selects which one to read.
data = pd.read_hdf(path_or_buf='multi_data.h5', key='df1')

In [52]:
# Show the frame read from key 'df1' of 'multi_data.h5' as the cell output.
data

Unnamed: 0,Sensor_1,Sensor_2,Date
0,-0.414846,77,2023-01-01
1,1.148940,69,2023-01-02
2,0.101305,11,2023-01-03
3,-0.586764,33,2023-01-04
4,-0.680189,69,2023-01-05
...,...,...,...
95,-0.455919,5,2023-04-06
96,0.642878,27,2023-04-07
97,0.419097,3,2023-04-08
98,1.269762,15,2023-04-09


In [58]:
# Write df1 in 'table' format with data_columns=True; the file is read back
# with a `where` filter on one of its columns further down.
df1.to_hdf(
    'data_query.h5',
    key='sensor_readings',
    format='table',
    mode='w',
    data_columns=True,
)

In [59]:
# Open the query file read-only and list the keys it contains.
with pd.HDFStore(path='data_query.h5', mode='r') as store:
    print(store.keys())

['/sensor_readings']


In [60]:
# Read only the rows that satisfy the `where` condition on Sensor_2.
result = pd.read_hdf(
    'data_query.h5',
    key='/sensor_readings',
    where='Sensor_2 > 25',
)

In [61]:
# Show the rows returned by the `where='Sensor_2 > 25'` query as the cell output.
result

Unnamed: 0,Sensor_1,Sensor_2,Date
0,-0.414846,77,2023-01-01
1,1.148940,69,2023-01-02
3,-0.586764,33,2023-01-04
4,-0.680189,69,2023-01-05
6,-0.225088,64,2023-01-07
...,...,...,...
89,0.098666,93,2023-03-31
92,-0.444442,66,2023-04-03
93,-0.659863,78,2023-04-04
94,-0.738505,49,2023-04-05
