# hdf5 data subset pipeline

Original dataset in h5 format:
https://zenodo.org/record/3746119#.ZCPs7exBwq-

### Setup

In [1]:
import h5py
import os
import soundfile as sf
import numpy as np
from pathlib import Path
import random

from IPython.display import Audio


In [2]:
# Seed both NumPy's global generator and Python's `random` module so the
# random draws made later in the notebook are repeatable on a fresh kernel.
seed = 84  # arbitrary; any integer works

np.random.seed(seed)
random.seed(seed)

In [3]:
# Directory holding the original HDF5 files (H5_DATA) and directory where the
# generated subset files are written (H5_SUBSET).
# The defaults are the paths hard-coded in the original notebook; set the
# SPRING_REVERB_H5_DATA / SPRING_REVERB_H5_SUBSET environment variables to run
# the notebook on another machine without editing this cell.
H5_DATA = os.environ.get('SPRING_REVERB_H5_DATA', '/homedtic/fpapaleo/smc-spring-reverb/dataset')
H5_SUBSET = os.environ.get('SPRING_REVERB_H5_SUBSET', '/homedtic/fpapaleo/smc-spring-reverb/dataset_subset')

### h5py original dataset

In [4]:
# Check whether the 'Xtrain' dataset of dry_train.h5 carries any HDF5 attributes (metadata)
dry_train_path = os.path.join(H5_DATA, 'dry_train.h5')
with h5py.File(dry_train_path, 'r') as f:
    xtrain_attrs = f['Xtrain'].attrs
    print(xtrain_attrs.keys())
    print(xtrain_attrs.values())
    # Materialise the values view so the actual attribute values are shown
    values = list(xtrain_attrs.values())
    print(values)

<KeysViewHDF5 []>
ValuesViewHDF5(<Attributes of HDF5 object at 22744714008752>)
[]


### Utility Functions

In [5]:
# Function to load an h5 file and print information about its datasets
def get_length_info(file_path, sample_rate=16000):
    """Print shape, dtype and running audio length for every dataset in an HDF5 file.

    Each top-level key of the file is treated as a dataset whose first axis
    indexes the audio clips and whose second axis indexes the samples of a
    clip. The totals printed for a dataset are cumulative over the datasets
    visited so far in this file.

    Args:
        file_path: Path of the HDF5 file to inspect.
        sample_rate: Sampling rate in Hz used to convert sample counts to seconds.

    Returns:
        None. The information is written to standard output.
    """
    # Take the displayed name from the argument; the previous version read a
    # global `file_name`, which only existed when the caller's loop variable
    # happened to have that exact name.
    file_label = os.path.basename(file_path)

    total_length_seconds = 0
    total_length_samples = 0

    with h5py.File(file_path, 'r') as f:
        # Iterate over all datasets in the file
        for dataset_key in f.keys():
            dataset = f[dataset_key]
            # Shape and dtype are read from the dataset handle, so the audio
            # itself is never loaded into memory.
            data_shape = dataset.shape
            num_clips = data_shape[0]
            samples_per_clip = data_shape[1]

            dataset_samples = num_clips * samples_per_clip
            total_length_seconds += dataset_samples / sample_rate
            total_length_samples += dataset_samples

            # Print information about the dataset
            print(f'Audio file: {file_label}, Total Length: {total_length_seconds:.2f} seconds, {total_length_samples} samples')
            print(f"Dataset: {dataset_key}")
            print(f"Data shape: {data_shape}")
            print(f"Data type: {dataset.dtype}")
            print()


# Function to pick the samples stored at the given indices of an h5 file
def select_random_samples(file_path, selected_indices):
    """Return the entries at ``selected_indices`` from the first key of an HDF5 file.

    Args:
        file_path: Path of the HDF5 file to read.
        selected_indices: Indices (along the first axis) of the entries to keep.

    Returns:
        The array obtained by indexing the loaded data with ``selected_indices``.
    """
    with h5py.File(file_path, 'r') as h5_file:
        first_key = list(h5_file.keys())[0]
        # Load the whole dataset as an in-memory array before indexing it
        all_samples = np.array(h5_file[first_key])
        return all_samples[selected_indices]

# Function to create subsets from an h5 file
def create_subsets(input_path, file_name, indices, output_path, output_name):
    """Copy the selected entries of an HDF5 file's first dataset into a new HDF5 file.

    The output file contains a single dataset named ``<original key>_subset``.

    Args:
        input_path: Directory containing the source file.
        file_name: Name of the source HDF5 file inside ``input_path``.
        indices: Indices (along the first axis) of the entries to copy.
        output_path: Directory in which the subset file is written; it is
            created if it does not exist yet.
        output_name: Name of the HDF5 file to create (overwritten if present).
    """
    with h5py.File(os.path.join(input_path, file_name), 'r') as f:
        dataset_key = list(f.keys())[0]
        data = np.array(f[dataset_key])

    # `data` is an in-memory array, so the selection can happen after the
    # source file is closed.
    subset = data[indices]

    # Opening a file for writing fails when its directory is missing, so make
    # sure the target directory exists (an empty path means the current directory).
    if output_path:
        os.makedirs(output_path, exist_ok=True)

    with h5py.File(os.path.join(output_path, output_name), 'w') as f:
        f.create_dataset(f'{dataset_key}_subset', data=subset)

# Function to select random indices
def select_indices(data, num_samples):
    """Draw ``num_samples`` distinct random indices without replacement.

    Args:
        data: Either a shape tuple, in which case indices are drawn from
            ``range(data[0])``, or a list of candidate indices to draw from.
        num_samples: Number of indices to draw.

    Returns:
        A list of ``num_samples`` distinct elements, in random order.

    Raises:
        TypeError: If ``data`` is neither a tuple nor a list.
        ValueError: Raised by ``random.sample`` when ``num_samples`` exceeds
            the number of candidates.
    """
    if isinstance(data, tuple):
        return random.sample(range(data[0]), num_samples)
    if isinstance(data, list):
        return random.sample(data, num_samples)
    # Previously any other type fell through and silently returned None,
    # which only surfaced later as a confusing indexing error.
    raise TypeError(
        f"data must be a shape tuple or a list of indices, got {type(data).__name__}"
    )
    
def play_audio_samples(samples, num_samples, sample_rate=16000):
    """Show an audio player for each of the first ``num_samples`` entries of ``samples``.

    Args:
        samples: Sequence of audio clips; each clip is transposed before playback.
        num_samples: Maximum number of clips to show.
        sample_rate: Playback rate in Hz passed to the audio widget.
    """
    for position, clip in enumerate(samples[:num_samples], start=1):
        print(f"Playing sample {position}:")
        display(Audio(clip.T, rate=sample_rate))

### Printing dataset information

In [6]:
# Report length information for every .h5 file found in the H5_DATA directory
for file_name in os.listdir(H5_DATA):
    file_path = os.path.join(H5_DATA, file_name)

    # Anything that is not an .h5 file is ignored
    if not file_path.endswith('.h5'):
        continue

    get_length_info(file_path)

Audio file: dry_val_test.h5, Total Length: 128.00 seconds, 2048000 samples
Dataset: Xvalidation
Data shape: (64, 32000, 1)
Data type: float64

Audio file: wet_val_test.h5, Total Length: 128.00 seconds, 2048000 samples
Dataset: Yvalidation_0
Data shape: (64, 32000, 1)
Data type: float64

Audio file: wet_train.h5, Total Length: 2244.00 seconds, 35904000 samples
Dataset: Ytrain_0
Data shape: (1122, 32000, 1)
Data type: float64

Audio file: dry_train.h5, Total Length: 2244.00 seconds, 35904000 samples
Dataset: Xtrain
Data shape: (1122, 32000, 1)
Data type: float64



### Splitting the dataset into 3 subsets

In [14]:
# Source files to split, keyed by '<dry|wet>_<train|val_test>'
file_names = {
    stem: f'{stem}.h5'
    for stem in ('dry_train', 'wet_train', 'dry_val_test', 'wet_val_test')
}

# How many entries each subset receives
num_train_samples = 17
num_val_samples = 2
num_test_samples = 2

# Train indices are drawn from the size of the first dataset in the dry train file
with h5py.File(os.path.join(H5_DATA, file_names['dry_train']), 'r') as f:
    train_key = list(f.keys())[0]
    train_indices = select_indices(f[train_key].shape, num_train_samples)

# Validation and test indices both come from the dry val/test file;
# test indices are drawn only from entries not already picked for validation
with h5py.File(os.path.join(H5_DATA, file_names['dry_val_test']), 'r') as f:
    data_shape = f[list(f.keys())[0]].shape
    val_indices = select_indices(data_shape, num_val_samples)
    remaining_indices = [i for i in range(data_shape[0]) if i not in val_indices]
    test_indices = select_indices(remaining_indices, num_test_samples)

# Write one dry and one wet subset file per split, using the same indices for both
for subset_name, indices in [('train', train_indices), ('val', val_indices), ('test', test_indices)]:
    # 'val' and 'test' are both read from the combined 'val_test' source files
    source_split = 'train' if subset_name == 'train' else 'val_test'
    for file_type in ['dry', 'wet']:
        input_file = file_names[f'{file_type}_{source_split}']
        output_file = f'{file_type}_{subset_name}_subset.h5'
        create_subsets(H5_DATA, input_file, indices, H5_SUBSET, output_file)

## Split the dataset into 2 subsets

In [None]:
#@title Create a subset with fewer samples keeping the same structure

# Dictionary containing the names of the files to split
file_names = {
    'dry_train': 'dry_train.h5',
    'wet_train': 'wet_train.h5',
    'dry_val_test': 'dry_val_test.h5',
    'wet_val_test': 'wet_val_test.h5',
}

# Number of samples to select from each dataset
num_train_samples = 18
num_val_test_samples = 4

# Select indices for train and validation/test subsets
with h5py.File(os.path.join(H5_DATA, file_names['dry_train']), 'r') as f:
    train_indices = select_indices(f[list(f.keys())[0]].shape, num_train_samples)

with h5py.File(os.path.join(H5_DATA, file_names['dry_val_test']), 'r') as f:
    data_shape = f[list(f.keys())[0]].shape
    test_val_indices = select_indices(data_shape, num_val_test_samples)

# Create train and validation/test files in H5_SUBSET.
# The indices drawn in this cell are used directly: the previous version looped
# over `val_indices` / `test_indices`, which are only defined by the 3-subset
# cell, so it failed on a fresh kernel and never used `test_val_indices`.
for subset_name, indices in [('train', train_indices), ('val_test', test_val_indices)]:
    for file_type in ['dry', 'wet']:
        # The subset name doubles as the suffix of the source-file key
        input_file = file_names[f'{file_type}_{subset_name}']
        output_file = f'{file_type}_{subset_name}_subset.h5'
        create_subsets(H5_DATA, input_file, indices, H5_SUBSET, output_file)

## Play and check

In [18]:
# Number of clips to play from each subset file
num_samples_to_play = 1

# Subset files to check (without extension): a dry/wet pair for each of train, val and test
file_names = [
    f'{file_type}_{split}_subset'
    for split in ('train', 'val', 'test')
    for file_type in ('dry', 'wet')
]

for file_name in file_names:
    subset_path = os.path.join(H5_SUBSET, f'{file_name}.h5')

    # Load the first dataset of the subset file into memory
    with h5py.File(subset_path, 'r') as f:
        dataset_key = list(f.keys())[0]
        data = np.array(f[dataset_key])

    # Play the audio samples
    print(f"{file_name}:")
    play_audio_samples(data, num_samples_to_play)



dry_train_subset:
Playing sample 1:


wet_train_subset:
Playing sample 1:


dry_val_subset:
Playing sample 1:


wet_val_subset:
Playing sample 1:


dry_test_subset:
Playing sample 1:


wet_test_subset:
Playing sample 1:


In [19]:
# Report length information for every .h5 file found in the H5_SUBSET directory
for file_name in os.listdir(H5_SUBSET):
    file_path = os.path.join(H5_SUBSET, file_name)

    # Only .h5 files are inspected; no other filter is applied to the name
    if not file_path.endswith('.h5'):
        continue

    get_length_info(file_path)

Audio file: wet_train_subset.h5, Total Length: 34.00 seconds, 544000 samples
Dataset: Ytrain_0_subset
Data shape: (17, 32000, 1)
Data type: float64

Audio file: wet_val_subset.h5, Total Length: 4.00 seconds, 64000 samples
Dataset: Yvalidation_0_subset
Data shape: (2, 32000, 1)
Data type: float64

Audio file: dry_val_subset.h5, Total Length: 4.00 seconds, 64000 samples
Dataset: Xvalidation_subset
Data shape: (2, 32000, 1)
Data type: float64

Audio file: wet_test_subset.h5, Total Length: 4.00 seconds, 64000 samples
Dataset: Yvalidation_0_subset
Data shape: (2, 32000, 1)
Data type: float64

Audio file: dry_test_subset.h5, Total Length: 4.00 seconds, 64000 samples
Dataset: Xvalidation_subset
Data shape: (2, 32000, 1)
Data type: float64

Audio file: dry_train_subset.h5, Total Length: 34.00 seconds, 544000 samples
Dataset: Xtrain_subset
Data shape: (17, 32000, 1)
Data type: float64

