In [1]:
import h5py
import numpy as np
import pandas as pd

## Extract & Filter Data
## Convert HDF5 to CSV
## Compute Statistical Insights
## Detect Anomalies (Outliers)
Write a function that loads an HDF5 file and filters its measurements to a given temperature range.
Convert an HDF5 dataset to a CSV file.

In [73]:
def make_dataframe(file, save_as_csv=False, path_to_save=None):
    """Load every "measurement" group of an HDF5 file into one DataFrame.

    Each top-level key whose name contains "measurement" (case-insensitive)
    is read, and every dataset inside it becomes a column named after the
    dataset. Datasets whose name contains "timestamp" have their byte values
    decoded as UTF-8; values that are already ``str`` are kept unchanged.

    Parameters
    ----------
    file
        Passed straight to ``h5py.File(file, 'r')``.
    save_as_csv : bool, default False
        When true, the resulting frame is also written to ``path_to_save``
        (without the index).
    path_to_save : optional
        CSV destination; required when ``save_as_csv`` is true.

    Returns
    -------
    pandas.DataFrame
        One column per dataset; empty if no matching group exists.

    Raises
    ------
    ValueError
        If ``save_as_csv`` is true but ``path_to_save`` is ``None``.
    """
    # DataFrame.to_csv(None) only returns the CSV text, so without this check
    # a missing path would silently write nothing. Fail before touching the file.
    if save_as_csv and path_to_save is None:
        raise ValueError("path_to_save must be provided when save_as_csv=True")

    columns = {}
    with h5py.File(file, 'r') as hdf:
        for group_name in hdf.keys():
            if 'measurement' not in group_name.lower():
                continue
            group = hdf[group_name]
            for dataset_name in group.keys():
                values = group[dataset_name][:]
                if 'timestamp' in dataset_name.lower():
                    # Decode only genuine byte strings so datasets that already
                    # yield str do not raise AttributeError.
                    values = [
                        v.decode('utf-8') if isinstance(v, bytes) else v
                        for v in values
                    ]
                columns[dataset_name] = values

    # Build the frame once instead of growing it column by column.
    df = pd.DataFrame(columns)
    if save_as_csv:
        df.to_csv(path_to_save, index=False)
    return df
    
def filter_hdf5_data(file, temp_min, temp_max):
    """Return the measurement rows whose temperature lies in a given range.

    The file is loaded with ``make_dataframe`` and the first column whose
    name contains "temperature" (case-insensitive) is used for filtering.
    Rows are selected with ``Series.between(temp_min, temp_max)``.

    Returns
    -------
    pandas.DataFrame or str
        The filtered rows, or the string ``'No temperature column found'``
        when no column name contains "temperature".
    """
    frame = make_dataframe(file)

    matching_names = list(
        filter(lambda name: 'temperature' in name.lower(), frame.columns)
    )
    if len(matching_names) == 0:
        # Callers receive a message string rather than a DataFrame here.
        return 'No temperature column found'

    temperature = frame[matching_names[0]]
    in_range = temperature.between(temp_min, temp_max)
    return frame[in_range]
    

def stats_hdf5_data(file):
    """Return ``DataFrame.describe()`` for the file's measurements, minus its first row.

    The file is loaded with ``make_dataframe``; the first row of the
    ``describe()`` summary is dropped by position and the rest is returned.
    """
    summary = make_dataframe(file).describe()
    # Positional slice: skips whatever describe() reports first
    # (the 'count' row for numeric data).
    return summary.iloc[1:]

def get_outliers(file, threshold):
    """Return the rows in which any numeric column is a z-score outlier.

    The file is loaded with ``make_dataframe``. Each numeric column is
    standardised with ``scipy.stats.zscore`` and a row is reported when the
    absolute z-score of at least one of its numeric values exceeds
    ``threshold``, so unusually low values are flagged as well as high ones.

    Parameters
    ----------
    file
        Forwarded to ``make_dataframe``.
    threshold : float
        Minimum absolute z-score for a value to count as an outlier.

    Returns
    -------
    pandas.DataFrame
        The outlier rows with all original columns; empty when nothing
        qualifies or when there are no rows or no numeric columns.
    """
    # Function-local import, as in the original, so this definition does not
    # depend on the notebook's import cell.
    from scipy.stats import zscore

    df = make_dataframe(file)
    numerical_cols = df.select_dtypes(include=[np.number]).columns
    if df.empty or len(numerical_cols) == 0:
        # Nothing to standardise; return an empty frame with the same columns.
        return df.iloc[0:0]

    # Use a plain float array so the resulting mask is positional.
    values = df[numerical_cols].to_numpy(dtype=float, na_value=np.nan)
    z_scores = zscore(values)
    # Compare magnitudes: a signed comparison would miss low-side outliers.
    is_outlier = (np.abs(z_scores) > threshold).any(axis=1)
    return df[is_outlier]
    

In [74]:
# Load every measurement dataset from the sample file into a DataFrame and display it.
make_dataframe('beamline_experiment.h5')

Unnamed: 0,Intensity (counts),Temperature (°C),Timestamp
0,1200.0,295.0,2025-03-16T14:05:00
1,1250.0,310.0,2025-03-16T14:10:00
2,1350.0,320.0,2025-03-16T14:15:00
3,1400.0,330.0,2025-03-16T14:20:00
4,1450.0,340.0,2025-03-16T14:25:00
5,1500.0,350.0,2025-03-16T14:30:00


In [75]:
# Keep only the rows whose temperature column lies between 200 and 300.
filter_hdf5_data('beamline_experiment.h5', 200, 300)

Unnamed: 0,Intensity (counts),Temperature (°C),Timestamp
0,1200.0,295.0,2025-03-16T14:05:00


In [59]:
# Summary statistics of the measurements (describe() output without its first row).
stats_hdf5_data('beamline_experiment.h5')

Unnamed: 0,Intensity (counts),Temperature (°C)
mean,1358.333374,324.166656
std,115.830338,20.103897
min,1200.0,295.0
25%,1275.0,312.5
50%,1375.0,325.0
75%,1437.5,337.5
max,1500.0,350.0


In [64]:
# Flag rows using a z-score threshold of 3 on the numeric columns.
get_outliers('beamline_experiment.h5', 3)

Unnamed: 0,Intensity (counts),Temperature (°C),Timestamp
