In [2]:
import h5py
import pandas as pd
from collections import Counter
import os
import re

# Directory that is scanned for input files (cut CSVs, the ID table, README).
directory = '/data3/afisher/cdmslite-run3-cuts-output/'

# Bare names (not full paths) of every regular file directly inside
# `directory`; sub-directories are left out.
file_names = [
    entry
    for entry in os.listdir(directory)
    if os.path.isfile(os.path.join(directory, entry))
]

def print_structure(name, obj):
    """Print a one-entry description of an HDF5 object.

    Has the ``(name, obj)`` signature expected by ``visititems``. Groups are
    reported by name only; datasets are reported with their shape, dtype and
    complete contents. Any other object type is ignored. Always returns
    ``None``.

    Args:
        name: Path of the object inside the HDF5 file.
        obj: The visited object (``h5py.Group`` or ``h5py.Dataset``).
    """
    if isinstance(obj, h5py.Group):
        print(f'Group: {name}')
        return

    if not isinstance(obj, h5py.Dataset):
        return

    print(f"Dataset: {name}, Shape: {obj.shape}, Data type: {obj.dtype}")
    # Reads the entire dataset into memory; no size check is made, so this is
    # only practical for small datasets.
    values = obj[()]
    print(f"Values: {values}")

# Fits files like: cut_output_bg-restricted_IsGlitch_chisq_CDMSliteR3.csv
# Group 1 is the prefix, group 2 the cut name.
_CUT_FILE_RE = re.compile(
    r'(?:(cut_output_bg-|out_bg-))(?:restricted_)(.*?)(?:_CDMSliteR3\.csv)'
)

# Fits files like: out_bg-restricted_allCutsOld_inclPmult.csv
_ALL_CUTS_FILE_RE = re.compile(r'(?:.*?-restricted_)(.*?)(?:\.csv)')

# Files in the input directory that are not cut files.
_NON_CUT_FILES = ('ID_CDMSliteR3.csv', 'README.md')


def _cut_group_name(file_name):
    """Return the HDF5 group name derived from a cut file name, or None.

    The specific ``*_CDMSliteR3.csv`` pattern is tried first and yields
    ``<prefix><cut name>``; otherwise the generic ``*-restricted_*.csv``
    pattern yields just the part between ``-restricted_`` and ``.csv``.
    """
    match = _CUT_FILE_RE.search(file_name)
    if match:
        return match.group(1) + match.group(2)

    all_cuts_match = _ALL_CUTS_FILE_RE.search(file_name)
    if all_cuts_match:
        return all_cuts_match.group(1)

    return None


def create_metadata(output_path):
    """Build an HDF5 file with one group per cut CSV found in ``directory``.

    For every entry of the module-level ``file_names`` (except the ID table
    and the README) a group is created, named after the file. The first
    column of the cut CSV is compared with 1, and the series numbers (second
    column of ``ID_CDMSliteR3.csv``) of all rows that are equal to 1 are
    stored together in a single ``UID`` dataset inside that group. Rows are
    paired with the ID table by position. A group whose file has no row
    equal to 1 is left without a ``UID`` dataset.

    Files whose name matches neither naming pattern are skipped with a
    message instead of aborting the run.

    Args:
        output_path: Path of the HDF5 file to write. An existing file is
            overwritten.

    Raises:
        ValueError: If a cut file has more rows than the ID table, or if two
            files map to the same group name (raised by h5py).
    """
    # Load the ID table before opening the output in 'w' mode, so a missing or
    # unreadable ID file does not leave a truncated HDF5 file behind.
    cdms_ids = pd.read_csv(os.path.join(directory, 'ID_CDMSliteR3.csv'))
    series_number = cdms_ids.iloc[:, 1].to_numpy()

    with h5py.File(output_path, 'w') as f:
        for file_name in file_names:
            if file_name in _NON_CUT_FILES:
                continue

            name = _cut_group_name(file_name)
            if name is None:
                # Without this guard `name` would be undefined (or stale from
                # the previous file) and create_group would fail.
                print(f'Skipping unrecognised file: {file_name}')
                continue

            # Create group for each cut file
            cut_group = f.create_group(name)

            cut_data = pd.read_csv(os.path.join(directory, file_name))
            if cut_data.empty:
                continue

            # Boolean mask of the rows whose first column equals 1.
            flagged = cut_data.iloc[:, 0].to_numpy() == 1
            if len(flagged) > len(series_number):
                raise ValueError(
                    f'{file_name} has {len(flagged)} rows but the ID table '
                    f'only has {len(series_number)}'
                )

            selected = series_number[:len(flagged)][flagged]
            if selected.size == 0:
                continue

            # A dataset name can only be created once per group, so all
            # selected series numbers are written in one call.
            if selected.dtype.kind == 'O':
                # NOTE(review): object dtype presumably means the IDs were
                # parsed as strings -- confirm against the ID file.
                cut_group.create_dataset(
                    'UID', data=selected, dtype=h5py.string_dtype()
                )
            else:
                cut_group.create_dataset('UID', data=selected)

def explore_structure(file_path):
    """Print every group and dataset contained in an HDF5 file.

    The file is opened read-only and each object in it is passed to
    ``print_structure``.

    Args:
        file_path: Path of the HDF5 file to inspect.
    """
    with h5py.File(file_path, mode='r') as h5_file:
        # visititems calls the callback with (name, object) for each member.
        h5_file.visititems(print_structure)

# Location of the metadata HDF5 file. The commented-out assignment is an
# alternative absolute location; the active one is relative to the current
# working directory.
#output_path = '/data3/afisher/soudan_output/metadata.hdf5'
output_path = 'metadata.hdf5'

# Uncomment to (re)build the file; create_metadata opens it in 'w' mode, so any
# existing file at output_path is overwritten.
#create_metadata(output_path)
# Print the groups and datasets currently stored in the file.
explore_structure(output_path)

KeyboardInterrupt: 