In [1]:
from pathlib import Path

# Directory holding the exported measurement files, resolved relative to the
# current working directory.
data_dir = Path.cwd() / "data"
# is_dir() rather than exists(): a regular file named "data" would pass an
# existence check and only fail later, when the directory is globbed.
assert data_dir.is_dir(), f"Data directory does not exist: {data_dir}"

In [2]:
import json
from copy import copy

import numpy as np
import h5py

from rich.progress import track

In [3]:
# Load a single measurement file to explore its structure.
# The raw bytes are handed to json.loads so that it detects the encoding
# (UTF-8/16/32) itself; read_text() would decode with the platform's locale
# encoding instead.
first = json.loads(
    (data_dir / "Versuch_V4_T2_A1_Dec-18-25__19_26.json").read_bytes())

In [4]:
# Number of top-level entries in the loaded file.
len(first)

500

In [5]:
# Top-level keys of the first entry.
first[0].keys()

dict_keys(['meta_data', 'expert_knowledge', 'data'])

In [6]:
# Inspect the metadata record of the first entry.
first[0]["meta_data"]

{'dataset': 'SPP2422_experiment_tdms',
 'origin': 'experiment',
 'program': 'LabVIEW/NI TDMS',
 'description': 'TDMS measurement V4_T2_A1 | file=Versuch_V4_T2_A1_Dec-18-25__19_26.tdms',
 'created_utc': '2026-01-11T12:39:38.292358Z',
 'source_file': 'Versuch_V4_T2_A1_Dec-18-25__19_26.tdms',
 'group_name': 'Messergebnisse_V4_T2_A1',
 'V': 4,
 'T': 2,
 'A': 1,
 'export': {'target_length': 1200, 'max_hubs_per_file': None},
 'channel_mapping': {'K1_Ch1_Mod2/AI0': 'K1_Ch1_Mod2__AI0',
  'K1_Ch2_Mod2/AI1': 'K1_Ch2_Mod2__AI1',
  'K1_Ch3_Mod2/AI2': 'K1_Ch3_Mod2__AI2',
  'K2_Ch1_Mod2/AI3': 'K2_Ch1_Mod2__AI3',
  'K2_Ch2_Mod2/AI4': 'K2_Ch2_Mod2__AI4',
  'K2_Ch3_Mod2/AI5': 'K2_Ch3_Mod2__AI5',
  'K3_Ch1_Mod2/AI6': 'K3_Ch1_Mod2__AI6',
  'K3_Ch2_Mod2/AI7': 'K3_Ch2_Mod2__AI7',
  'K3_Ch3_Mod2/AI8': 'K3_Ch3_Mod2__AI8',
  'Mod2/AI9': 'Mod2__AI9',
  'Mod7 A0': 'Mod7_A0',
  'Mod7 A1': 'Mod7_A1',
  'Mod7 A2': 'Mod7_A2',
  'Mod7 A3': 'Mod7_A3'},
 'hub_index': 1,
 'hub_range_samples': [197, 10453],
 'stroke': 0

In [7]:
# Inspect the "expert_knowledge" field of the first entry.
first[0]["expert_knowledge"]

[{'textual_insight': '', 'extracted_features': {}}]

In [8]:
# Keys of the first entry's "data" payload.
first[0]["data"].keys()

dict_keys(['name', 'label', 'label_unit', 'level', 'origin', 'i.O/n.i.O', 'time_s', 'signals'])

In [9]:
# Shallow-copy the first entry's payload and drop the two bulky fields so that
# the remaining scalar fields can be displayed compactly. The source dict in
# `first` is left untouched.
simpler = dict(first[0]["data"])
simpler.pop("time_s")
simpler.pop("signals")
simpler

{'name': 'Versuch_V4_T2_A1_Dec-18-25__19_26__hub0001',
 'label': None,
 'label_unit': None,
 'level': 1,
 'origin': 'experiment',
 'i.O/n.i.O': None}

In [10]:
# Time axis of the first entry: show its shape and its min / mean / max.
t = np.asarray(first[0]["data"]["time_s"])
(t.shape, [reduce().item() for reduce in (t.min, t.mean, t.max)])

((1200,), [0.0, 0.5, 1.0])

In [11]:
# Names of the signal channels stored in the first entry.
first[0]["data"]["signals"].keys()

dict_keys(['K1_Ch1_Mod2__AI0', 'K1_Ch2_Mod2__AI1', 'K1_Ch3_Mod2__AI2', 'K2_Ch1_Mod2__AI3', 'K2_Ch2_Mod2__AI4', 'K2_Ch3_Mod2__AI5', 'K3_Ch1_Mod2__AI6', 'K3_Ch2_Mod2__AI7', 'K3_Ch3_Mod2__AI8', 'Mod2__AI9', 'Mod7_A0', 'Mod7_A1', 'Mod7_A2', 'Mod7_A3'])

In [12]:
# One example channel of the first entry: show its shape and its min / mean / max.
s = np.asarray(first[0]["data"]["signals"]["K1_Ch1_Mod2__AI0"])
(s.shape, [reduce().item() for reduce in (s.min, s.mean, s.max)])

((1200,), [-0.09008782699368648, 0.4326112404590785, 6.710733563229876])

In [13]:
agg = []

# Sort the matches: glob() yields paths in an arbitrary, filesystem-dependent
# order, and the order of `agg` defines the row order of everything built from it.
file_paths = sorted(data_dir.glob("Versuch_*.json"))
assert file_paths, f"No Versuch_*.json files found in {data_dir}"

# We iterate over all files in the data directory and collect all data
for file_path in track(file_paths):
    # Pass bytes so json.loads detects the encoding (UTF-8/16/32) itself instead
    # of relying on the locale default used by read_text().
    data = json.loads(file_path.read_bytes())

    for i, data_entry in enumerate(data):
        agg.append({
            # Identify each entry by its source file and its position in that file.
            "meta_data": {
                "Versuch": file_path.stem,
                "index": i,
                "V": data_entry["meta_data"]["V"],
                "T": data_entry["meta_data"]["T"],
                "A": data_entry["meta_data"]["A"],
            },
            # Time axis first, followed by one array per signal channel.
            "data": {
                key: np.array(value)
                for key, value in {
                    "time_s": data_entry["data"]["time_s"],
                    **data_entry["data"]["signals"]
                }.items()
            }
        })

assert agg, "The input files contain no entries"

# Get signal keys from first entry
signal_keys = list(agg[0]["data"].keys())
n_entries = len(agg)
n_points = len(agg[0]["data"]["time_s"])

# Every entry must expose the same channels with the same number of samples;
# otherwise the entries cannot be stacked into one rectangular array later on
# (extra channels would be dropped silently, missing ones raise a bare KeyError).
for entry in agg:
    origin = f'{entry["meta_data"]["Versuch"]}[{entry["meta_data"]["index"]}]'
    assert set(entry["data"]) == set(signal_keys), \
        f"Signal keys of {origin} differ from the first entry"
    assert all(arr.shape == (n_points,) for arr in entry["data"].values()), \
        f"Signals of {origin} do not all have {n_points} points"

print(f"Number of entries: {n_entries}")
print(f"Number of time points per entry: {n_points}")
print(f"Number of signals: {len(signal_keys)}")

Output()

Number of entries: 5000
Number of time points per entry: 1200
Number of signals: 15


In [14]:
# Stack all data into single arrays
# Shape: (n_entries, n_signals, n_points)
data_array = np.stack([
    np.stack([entry["data"][key] for key in signal_keys])
    for entry in agg
])

# Metadata arrays
# Run names are stored as fixed-width byte strings. NumPy silently truncates
# values that exceed the declared width, so the field is widened beyond the
# default 64 bytes whenever a name requires it. Encoding explicitly as UTF-8
# also keeps non-ASCII names from failing the implicit ASCII conversion.
versuch_names = [entry["meta_data"]["Versuch"].encode("utf-8")
                 for entry in agg]
versuch_width = max(64, max(len(name) for name in versuch_names))
versuch_array = np.array(versuch_names, dtype=f"S{versuch_width}")
index_array = np.array([entry["meta_data"]["index"]
                       for entry in agg], dtype=np.int32)
v_array = np.array([entry["meta_data"]["V"] for entry in agg], dtype=np.int32)
t_array = np.array([entry["meta_data"]["T"] for entry in agg], dtype=np.int32)
a_array = np.array([entry["meta_data"]["A"] for entry in agg], dtype=np.int32)

print(f"Data array shape: {data_array.shape}")

# Dump that into an HDF5 file for easier access later
with h5py.File(data_dir / "all_data.h5", "w") as h5f:
    # Store signal data as single dataset
    h5f.create_dataset("data", data=data_array, compression="gzip")

    # Store signal names as attribute (as bytes; readers have to decode them)
    h5f["data"].attrs["signal_keys"] = [k.encode() for k in signal_keys]

    # Store metadata as datasets, one per field, aligned with axis 0 of "data"
    meta_grp = h5f.create_group("meta_data")
    meta_grp.create_dataset("Versuch", data=versuch_array)
    meta_grp.create_dataset("index", data=index_array)
    meta_grp.create_dataset("V", data=v_array)
    meta_grp.create_dataset("T", data=t_array)
    meta_grp.create_dataset("A", data=a_array)

Data array shape: (5000, 15, 1200)


In [15]:
# The export cell writes "all_data.h5". Use a single path for both the size
# check and the read-back so that the file being verified is the one just written.
h5_path = data_dir / "all_data.h5"

# Print the file size
h5_file_size = h5_path.stat().st_size / (1024 ** 2)
print(f"HDF5 file size: {h5_file_size:.2f} MB")

# Verify that we can read back the data correctly
with h5py.File(h5_path, "r") as h5f:
    read_data = h5f["data"][:]
    # The keys were written as encoded bytes, so decode them before comparing
    # against the in-memory list of str (bytes never compare equal to str).
    read_signal_keys = [
        k.decode() if isinstance(k, bytes) else k
        for k in h5f["data"].attrs["signal_keys"]
    ]

    assert np.array_equal(data_array, read_data), "Data mismatch"
    assert read_signal_keys == signal_keys, "Signal keys mismatch"
    assert np.array_equal(h5f["meta_data/Versuch"][:],
                          versuch_array), "Versuch mismatch"
    assert np.array_equal(h5f["meta_data/index"][:],
                          index_array), "index mismatch"
    assert np.array_equal(h5f["meta_data/V"][:], v_array), "V mismatch"
    assert np.array_equal(h5f["meta_data/T"][:], t_array), "T mismatch"
    assert np.array_equal(h5f["meta_data/A"][:], a_array), "A mismatch"

# NOTE: after this statement the cell cannot be re-run without first re-running
# the cells that build `agg` and `data_array`.
del agg, data_array  # Free memory

print("HDF5 file verified successfully.")

HDF5 file size: 490.74 MB
HDF5 file verified successfully.
