# HDF5 dataset

The original dataset, distributed in HDF5 (`.h5`) format, is available at:
https://zenodo.org/record/3746119#.ZCPs7exBwq-

### Setup

In [3]:
import h5py
import os
import soundfile as sf
import numpy as np
from pathlib import Path

## h5py data

In [4]:
# Directory that holds the .h5 files. Set the H5_DATA environment variable to
# point somewhere else; the fallback is the original cluster path.
H5_DATA = os.environ.get('H5_DATA', '/homedtic/fpapaleo/smc-spring-reverb/dataset')

In [14]:
# Check whether the 'Xtrain' dataset carries any HDF5 attributes (metadata)
train_file = os.path.join(H5_DATA, 'dry_train.h5')
with h5py.File(train_file, 'r') as f:
    attrs = f['Xtrain'].attrs
    print(attrs.keys())
    print(attrs.values())
    # Materialise the view so the actual attribute values are shown
    values = list(attrs.values())
    print(values)

<KeysViewHDF5 []>
ValuesViewHDF5(<Attributes of HDF5 object at 22474360750224>)
[]


In [18]:
# Print the dataset names, shapes and dtypes of every .h5 file

# os.listdir returns entries in arbitrary order; sorting keeps the printed
# report reproducible across runs and machines.
for file_name in sorted(os.listdir(H5_DATA)):
    # Skip anything that is not an HDF5 file
    if not file_name.endswith('.h5'):
        continue

    with h5py.File(os.path.join(H5_DATA, file_name), 'r') as f:
        # Print the list of datasets in the file
        print(f"File: {file_name}")
        print(list(f.keys()))

        # Iterate over all datasets in the file
        for dataset_name in f.keys():
            # Shape and dtype are read from the dataset handle, so the audio
            # itself does not have to be loaded into memory.
            dataset = f[dataset_name]

            # Print information about the dataset
            print(f"Dataset: {dataset_name}")
            print(dataset.shape)
            print(dataset.dtype)
            print()

File: dry_val_test.h5
['Xvalidation']
Dataset: Xvalidation
(64, 32000, 1)
float64

File: wet_val_test.h5
['Yvalidation_0']
Dataset: Yvalidation_0
(64, 32000, 1)
float64

File: wet_train.h5
['Ytrain_0']
Dataset: Ytrain_0
(1122, 32000, 1)
float64

File: dry_train.h5
['Xtrain']
Dataset: Xtrain
(1122, 32000, 1)
float64



In [19]:
import torch
from torch.utils.data import Dataset, DataLoader

In [20]:
# Define a minimal Dataset wrapper around a list of items

class CustomH5Dataset(Dataset):
    """Map-style dataset that serves items straight from an in-memory sequence."""

    def __init__(self, data_list):
        """Keep a reference to ``data_list``; the sequence is not copied."""
        self.data_list = data_list

    def __len__(self):
        """Number of entries in ``data_list``."""
        entries = self.data_list
        return len(entries)

    def __getitem__(self, idx):
        """Return the entry stored at position ``idx`` of ``data_list``."""
        entries = self.data_list
        return entries[idx]

In [21]:
train_data_list = []
val_data_list = []

# os.listdir returns entries in arbitrary order; sorting makes the position of
# each file's tensor inside the lists deterministic across runs.
for file_name in sorted(os.listdir(H5_DATA)):
    if not file_name.endswith('.h5'):
        continue

    with h5py.File(os.path.join(H5_DATA, file_name), 'r') as f:
        for dataset_name in f.keys():
            # Load the whole HDF5 dataset into memory as a single tensor
            data = torch.from_numpy(np.array(f[dataset_name]))

            # Route by file name: 'train' files feed the training list,
            # 'val' files (which also covers 'validation') the validation list.
            if 'train' in file_name:
                train_data_list.append(data)
            elif 'val' in file_name:
                val_data_list.append(data)

# NOTE(review): each list element is one entire HDF5 dataset, so these Datasets
# are indexed per file rather than per audio example - confirm this is intended.
train_dataset = CustomH5Dataset(train_data_list)
val_dataset = CustomH5Dataset(val_data_list)

In [22]:
# Number of dataset items collated into each batch
batch_size = 64

# Shuffle only the training data; validation keeps its stored order
train_dataloader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(dataset=val_dataset, batch_size=batch_size, shuffle=False)

In [26]:
# Peek at the first batch of the training loader, then of the validation loader
for loader in (train_dataloader, val_dataloader):
    for batch_idx, data in enumerate(loader):
        print(f"Batch {batch_idx}:")
        print(f"Data shape: {data.shape}")
        print(f"Data type: {data.dtype}")
        print()

        # Only the first batch of each loader is inspected
        break

Batch 0:
Data shape: torch.Size([2, 1122, 32000, 1])
Data type: torch.float64

Batch 0:
Data shape: torch.Size([2, 64, 32000, 1])
Data type: torch.float64



In [29]:
def get_audio_length(file_path, sample_rate=16000):
    """Sum the audio length of every dataset stored in an HDF5 file.

    For each dataset, the first axis is taken as the number of examples and
    the second axis as the number of samples per example.

    Args:
        file_path: Path of the .h5 file to inspect.
        sample_rate: Sampling rate in Hz used to convert samples to seconds.

    Returns:
        A ``(total_length_seconds, total_length_samples)`` tuple accumulated
        over all datasets in the file.
    """
    total_length_seconds = 0
    total_length_samples = 0

    with h5py.File(file_path, 'r') as f:
        for dataset in f.values():
            # Only the shape is needed, so read it from the dataset handle
            # instead of loading the whole audio array into memory.
            shape = dataset.shape
            dataset_samples = shape[0] * shape[1]
            total_length_seconds += dataset_samples / sample_rate
            total_length_samples += dataset_samples

    return total_length_seconds, total_length_samples

# Report the total audio length of every .h5 file in the data directory.
# The listing is sorted because os.listdir returns entries in arbitrary order.
for file_name in sorted(os.listdir(H5_DATA)):
    file_path = os.path.join(H5_DATA, file_name)

    # Only HDF5 files are measured; anything else in the directory is skipped
    if file_path.endswith('.h5'):
        file_length_seconds, file_length_samples = get_audio_length(file_path)
        print(f'Audio file: {file_name}, Total Length: {file_length_seconds:.2f} seconds, {file_length_samples} samples')



Audio file: dry_val_test.h5, Total Length: 128.00 seconds, 2048000 samples
Audio file: wet_val_test.h5, Total Length: 128.00 seconds, 2048000 samples
Audio file: wet_train.h5, Total Length: 2244.00 seconds, 35904000 samples
Audio file: dry_train.h5, Total Length: 2244.00 seconds, 35904000 samples
