# Step 1: Findability

The goal is to make your dataset easy to identify and search for by both humans and machines.

In [6]:
import pandas as pd
import h5py

In [2]:
data = pd.read_csv('./data/NOx_data.csv')

In [4]:
data = data[data.columns[:-6]]

In [31]:
with h5py.File('NOx.h5', 'w') as hdf:
    # meta_group = hdf.create_group("metadata", track_order=True)
    hdf.attrs["ID"] = "smoke_2024_10_01"
    hdf.attrs["title"] = "Smoke Data of 2024-10-01"
    hdf.attrs["description"] = "This dataset contains the NOx and Effectiveness data for further ML"
    hdf.attrs["author"] = "D. Druzhbin"
    hdf.attrs["email"] = ""
    hdf.attrs["organization"] = "Halias Technologies"
    hdf.attrs["keywords"] = "NOx, Effectiveness, Smoke, 2024-10-01, ML"
    hdf.attrs["date_created"] = "2024-10-01"
    hdf.attrs["data_format"] = "hdf5"
    hdf.attrs["license"] = ""

    sensor_readings = hdf.create_dataset("sensor_readings", data=data.values, dtype='float32')
    sensor_readings.attrs["columns"] = ", ".join(data.columns)

In [32]:
with h5py.File('NOx.h5', 'r') as hdf:
    sensor_readings = hdf['sensor_readings']
    print(sensor_readings.attrs['columns'])

O2, CO, NOx, T.Gas, T.Air, Draught, CO2, Eff., Losses, Excess Air, Dew point, T. Sensor, T1 DeltaT, T2 DeltaT, Air pressure, I.Flow


In [33]:
with h5py.File("NOx.h5", "r") as hdf:
    print("\nGlobal Metadata:")
    for key, value in hdf.attrs.items():
        print(f"{key}: {value}")

    print("\nDataset Metadata:")
    dataset = hdf["sensor_readings"]
    print("Columns:", dataset.attrs["columns"])  # Retrieve column names



Global Metadata:
ID: smoke_2024_10_01
author: D. Druzhbin
data_format: hdf5
date_created: 2024-10-01
description: This dataset contains the NOx and Effectiveness data for further ML
email: 
keywords: NOx, Effectiveness, Smoke, 2024-10-01, ML
license: 
organization: Halias Technologies
title: Smoke Data of 2024-10-01

Dataset Metadata:
Columns: O2, CO, NOx, T.Gas, T.Air, Draught, CO2, Eff., Losses, Excess Air, Dew point, T. Sensor, T1 DeltaT, T2 DeltaT, Air pressure, I.Flow


In [35]:
import h5py
import hashlib

def compute_checksum(file_path):
    """Compute SHA256 checksum of the file to ensure data integrity."""
    sha256_hash = hashlib.sha256()
    with open(file_path, "rb") as f:
        for byte_block in iter(lambda: f.read(4096), b""):
            sha256_hash.update(byte_block)
    return sha256_hash.hexdigest()

def check_fair_compliance(hdf5_file):
    """Checks FAIR compliance for an HDF5 dataset."""
    required_metadata = [
        "ID", "title", "description", "author", "organization",
        "keywords", "date_created", "data_format", "license"
    ]
    
    with h5py.File(hdf5_file, "r") as hdf:
        print("\nChecking FAIR Compliance for:", hdf5_file)

        # Step 1: Check Global Metadata
        print("\nGlobal Metadata Verification:")
        missing_metadata = []
        for key in required_metadata:
            if key in hdf.attrs:
                print(f"{key}: {hdf.attrs[key]}")
            else:
                print(f"MISSING: {key}")
                missing_metadata.append(key)

        # Step 2: Check Dataset Structure
        print("\nDataset Verification:")
        if "sensor_readings" in hdf:
            dataset = hdf["sensor_readings"]
            print("Dataset 'sensor_readings' found.")

            # Ensure column names metadata exists
            if "columns" in dataset.attrs:
                print(f"Columns: {dataset.attrs['columns']}")
            else:
                print("MISSING: Column metadata")
        else:
            print("Dataset 'sensor_readings' NOT found!")

        # Step 3: Compute Checksum for Data Integrity
        checksum = compute_checksum(hdf5_file)
        print("\nData Integrity Check:")
        print(f"SHA256 Checksum: {checksum}")

        # Step 4: FAIR Compliance Summary
        print("\nFAIR Compliance Report:")
        if not missing_metadata and "sensor_readings" in hdf and "columns" in dataset.attrs:
            print("The dataset meets FAIR principles!")
        else:
            print("Some FAIR requirements are missing. Please update your metadata.")

# Run the compliance check on your HDF5 file
hdf5_filename = "NOx.h5"  # Update this if needed
check_fair_compliance(hdf5_filename)


Checking FAIR Compliance for: NOx.h5

Global Metadata Verification:
ID: smoke_2024_10_01
title: Smoke Data of 2024-10-01
description: This dataset contains the NOx and Effectiveness data for further ML
author: D. Druzhbin
organization: Halias Technologies
keywords: NOx, Effectiveness, Smoke, 2024-10-01, ML
date_created: 2024-10-01
data_format: hdf5
license: 

Dataset Verification:
Dataset 'sensor_readings' found.
Columns: O2, CO, NOx, T.Gas, T.Air, Draught, CO2, Eff., Losses, Excess Air, Dew point, T. Sensor, T1 DeltaT, T2 DeltaT, Air pressure, I.Flow

Data Integrity Check:
SHA256 Checksum: ef6174db00a5a406e3fca84fba8283961359f1e24d69a07f388a895102fd8107

FAIR Compliance Report:
The dataset meets FAIR principles!
