In [1]:
import os, glob, h5py
import numpy as np
from tqdm.auto import tqdm

def convert_npy_to_hdf5(root_dir, output_path="flair_dataset.h5"):
    """
    Merge per-subject ``.npy`` arrays found under ``root_dir`` into one HDF5 file.

    Every ``<root_dir>/<subject>/*AxialSlices_padded.npy`` image array is paired
    with the ``*label_sliceLevel.npy`` array of the same subject folder, and all
    subjects are concatenated along the first axis.

    Supported image layouts (2D slices or 3D volumes):
    - 2D: (N, H, W)
    - 3D: (N, C, H, W)

    Datasets written (all of length ``sum(N)``):
    - ``images``: float32, gzip-compressed, one sample per chunk
    - ``labels``: int8, gzip-compressed; for 2-D label arrays only column 0 is kept
    - ``subject_ids``: variable-length UTF-8 strings holding the subject folder
      name of each sample

    Args:
        root_dir: parent directory that contains one folder per subject
        output_path: path of the HDF5 file to create (overwritten if it exists);
            its parent directory is created when missing

    Raises:
        AssertionError: the number of image files and label files differ.
        ValueError: no input files were found, an image/label pair does not
            come from the same subject folder, per-sample shapes differ between
            subjects, a label array does not match its image array in length,
            the dataset is empty, or the per-sample shape is neither 2-D nor 3-D.
    """
    img_files = sorted(glob.glob(os.path.join(root_dir, "*", "*AxialSlices_padded.npy")))
    lab_files = sorted(glob.glob(os.path.join(root_dir, "*", "*label_sliceLevel.npy")))

    assert len(img_files) == len(lab_files), "이미지와 라벨 파일 수가 다릅니다!"
    if not img_files:
        raise ValueError(f"No '*AxialSlices_padded.npy' files found under {root_dir!r}")

    # The two lists are sorted independently, so equal counts alone do not prove
    # that element i of one list belongs to element i of the other (one subject
    # missing its label plus another missing its image would cancel out).
    subj_dirs = []
    for img_f, lab_f in zip(img_files, lab_files):
        img_dir = os.path.dirname(img_f)
        if img_dir != os.path.dirname(lab_f):
            raise ValueError(
                f"Image/label files are not aligned per subject: {img_f!r} vs {lab_f!r}"
            )
        subj_dirs.append(os.path.basename(img_dir))

    # --- Pass 1: count samples and validate shapes (memory-mapped, no full read) ---
    total_samples, data_shape = 0, None
    for img_f, lab_f in zip(img_files, lab_files):
        imgs = np.load(img_f, mmap_mode="r")  # (N,H,W) or (N,C,H,W)
        labs = np.load(lab_f, mmap_mode="r")
        if labs.ndim == 2:
            labs = labs[:, 0]

        if data_shape is None:
            data_shape = imgs.shape[1:]  # (H,W) or (C,H,W)
        elif imgs.shape[1:] != data_shape:
            raise ValueError(
                f"Inconsistent sample shape in {img_f!r}: {imgs.shape[1:]} != {data_shape}"
            )
        if labs.ndim != 1 or labs.shape[0] != imgs.shape[0]:
            raise ValueError(
                f"Label shape {labs.shape} does not match {imgs.shape[0]} samples in {img_f!r}"
            )
        total_samples += imgs.shape[0]

    print(f"[INFO] 총 샘플 수: {total_samples}, 데이터 shape={data_shape}")

    # Validate before opening the output in "w" mode so that a bad input set
    # never truncates an already existing HDF5 file.
    if len(data_shape) not in (2, 3):
        raise ValueError(f"지원하지 않는 데이터 shape: {data_shape}")
    if total_samples == 0:
        raise ValueError(f"All input arrays under {root_dir!r} are empty")

    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # --- Pass 2: write the HDF5 file ---
    with h5py.File(output_path, "w") as f:
        # Image dataset: one chunk per sample, for both the 2D and the 3D layout.
        dset_img = f.create_dataset(
            "images", shape=(total_samples, *data_shape),
            dtype="float32", compression="gzip", chunks=(1, *data_shape)
        )

        # Label dataset: a chunk may not exceed the (fixed) dataset size, so the
        # chunk length is capped for datasets with fewer than 1024 samples.
        dset_lab = f.create_dataset(
            "labels", shape=(total_samples,), dtype="int8",
            compression="gzip", chunks=(min(1024, total_samples),)
        )

        # Subject-id dataset (variable-length strings)
        string_dt = h5py.string_dtype(encoding="utf-8")
        dset_sid = f.create_dataset(
            "subject_ids", shape=(total_samples,), dtype=string_dt
        )

        # Fill the datasets subject by subject.
        offset = 0
        for img_f, lab_f, subj in tqdm(zip(img_files, lab_files, subj_dirs), total=len(img_files)):
            # copy=False avoids a second full copy when the dtype already matches.
            imgs = np.load(img_f).astype("float32", copy=False)
            labs = np.load(lab_f).astype("int8", copy=False)
            if labs.ndim == 2:
                labs = labs[:, 0]

            n_samples = imgs.shape[0]
            if n_samples == 0:
                # Nothing to write for this subject; skip zero-length selections.
                continue

            stop = offset + n_samples
            dset_img[offset:stop] = imgs
            dset_lab[offset:stop] = labs
            dset_sid[offset:stop] = [subj] * n_samples  # subject id for every slice/volume
            offset = stop

    print(f"[완료] 변환된 HDF5 파일 저장: {output_path}")

In [2]:
# Source folder (one sub-folder per subject) and destination file for the
# HDF5 conversion of the "04_flair_preproc_slices_v02" arrays.
root_dir = "/zdisk/users/ext_user_03/01_yschoi/project_01_FVH_detection/01_data/04_flair_preproc_slices_v02"
output_path = "/zdisk/users/ext_user_03/01_yschoi/project_01_FVH_detection/01_data/06_flair_preproc_slices_v02_h5/flair_slice_dataset.h5"

convert_npy_to_hdf5(root_dir=root_dir, output_path=output_path)

[INFO] 총 샘플 수: 60513, 데이터 shape=(672, 672)


  0%|          | 0/2514 [00:00<?, ?it/s]

[완료] 변환된 HDF5 파일 저장: /zdisk/users/ext_user_03/01_yschoi/project_01_FVH_detection/01_data/06_flair_preproc_slices_v02_h5/flair_slice_dataset.h5


In [3]:
import os
import numpy as np
from tqdm.auto import tqdm
import matplotlib.pyplot as plt

# Reduce every subject to a handful of slices and save the result under
# `path_save`, mirroring the folder/file naming of `path_load`:
#   - subjects with at least one slice labelled 1 keep exactly those slices;
#   - all other subjects keep the fixed index range [idx_bt, idx_up].
path_load = "/zdisk/users/ext_user_03/01_yschoi/project_01_FVH_detection/01_data/04_flair_preproc_slices_v02"
path_save = "/zdisk/users/ext_user_03/01_yschoi/project_01_FVH_detection/01_data/04_flair_preproc_slices_v02_compress"

# Inclusive slice-index range kept for subjects without any positive slice.
idx_bt = 12
idx_up = 15

# Only sub-directories are subjects; hidden entries and stray files are skipped.
list_subjects = [
    i for i in os.listdir(path_load)
    if not i.startswith('.') and os.path.isdir(os.path.join(path_load, i))
]
for _subject in tqdm(list_subjects):
    _dir_load = os.path.join(path_load, _subject)
    _fn_img = os.path.join(_dir_load, f"{_subject}_AxialSlices_padded.npy")
    _fn_label = os.path.join(_dir_load, f"{_subject}_label_sliceLevel.npy")

    _img = np.load(_fn_img)
    _label = np.load(_fn_label)

    # Per-slice label vector: for 2-D label arrays column 0 is used, the same
    # convention convert_npy_to_hdf5 applies when it reads these files.
    _label_slice = _label[:, 0] if _label.ndim == 2 else _label

    # Branch on the very mask that is used for selection, so that a subject
    # whose labels are non-zero but never 1 takes the fallback range instead of
    # being saved as empty arrays.
    _mask_pos = _label_slice == 1
    if _mask_pos.any():
        # At least one positive (1) slice: keep only the positive slices.
        _img_new = _img[_mask_pos]
        _label_new = _label[_mask_pos]
    else:
        # No positive slice: keep the fixed index range (idx_up is inclusive).
        _img_new = _img[idx_bt: idx_up+1]
        _label_new = _label[idx_bt: idx_up+1]

    _path_save = os.path.join(path_save, _subject)
    os.makedirs(_path_save, exist_ok=True)

    _path_save_img = os.path.join(_path_save, f"{_subject}_AxialSlices_padded.npy")
    _path_save_label = os.path.join(_path_save, f"{_subject}_label_sliceLevel.npy")
    np.save(_path_save_img, _img_new)
    np.save(_path_save_label, _label_new)
        


  0%|          | 0/2514 [00:00<?, ?it/s]

In [4]:
# Source folder (one sub-folder per subject) and destination file for the
# HDF5 conversion of the "04_flair_preproc_slices_v02_compress" arrays.
root_dir = "/zdisk/users/ext_user_03/01_yschoi/project_01_FVH_detection/01_data/04_flair_preproc_slices_v02_compress"
output_path = "/zdisk/users/ext_user_03/01_yschoi/project_01_FVH_detection/01_data/06_flair_preproc_slices_v02_compress_h5/flair_slice_dataset.h5"

convert_npy_to_hdf5(root_dir=root_dir, output_path=output_path)

[INFO] 총 샘플 수: 10165, 데이터 shape=(672, 672)


  0%|          | 0/2514 [00:00<?, ?it/s]

[완료] 변환된 HDF5 파일 저장: /zdisk/users/ext_user_03/01_yschoi/project_01_FVH_detection/01_data/06_flair_preproc_slices_v02_compress_h5/flair_slice_dataset.h5
