# hdf5 dataset

Original dataset in h5 format:
https://zenodo.org/record/3746119#.ZCPs7exBwq-

### Setup

In [1]:
import h5py
import os
import soundfile as sf
import numpy as np
from pathlib import Path
import random

from IPython.display import Audio


In [None]:
seed = 84  # You can choose any seed value

random.seed(seed)
np.random.seed(seed)

## h5py data

In [2]:
H5_DATA = '/homedtic/fpapaleo/smc-spring-reverb/dataset'
H5_SUBSET = '/homedtic/fpapaleo/smc-spring-reverb/dataset_subset'

In [3]:
# Verify if the dataset contains any metadata
with h5py.File(os.path.join(H5_DATA,'dry_train.h5'), 'r') as f:
    print(f['Xtrain'].attrs.keys())
    print(f['Xtrain'].attrs.values())
    values = list(f['Xtrain'].attrs.values())
    print(values)

<KeysViewHDF5 []>
ValuesViewHDF5(<Attributes of HDF5 object at 22790911845376>)
[]


In [4]:
# Printing the dataset names and shapes

# Iterate over all files in the H5_DATA directory
for file_name in os.listdir(H5_DATA):
    # Check if the file has the .h5 extension
    if file_name.endswith('.h5'):
        with h5py.File(os.path.join(H5_DATA, file_name), 'r') as f:
            # Print the list of datasets in the file
            print(f"File: {file_name}")
            print(list(f.keys()))

            # Iterate over all datasets in the file
            for dataset_name in f.keys():
                # Load the dataset
                data = np.array(f[dataset_name])

                # Print information about the dataset
                print(f"Dataset: {dataset_name}")
                print(data.shape)
                print(data.dtype)
                print()

File: dry_val_test.h5
['Xvalidation']
Dataset: Xvalidation
(64, 32000, 1)
float64

File: wet_val_test.h5
['Yvalidation_0']
Dataset: Yvalidation_0
(64, 32000, 1)
float64

File: wet_train.h5
['Ytrain_0']
Dataset: Ytrain_0
(1122, 32000, 1)
float64

File: dry_train.h5
['Xtrain']
Dataset: Xtrain
(1122, 32000, 1)
float64



In [29]:
def get_audio_length(file_path, sample_rate=16000):
    total_length_seconds = 0
    total_length_samples = 0
    
    with h5py.File(file_path, 'r') as f:
        # Iterate over all datasets in the file
        for dataset_key in f.keys():
            audio_data = f[dataset_key][:]
            num_samples = audio_data.shape[0]
            audio_length_samples = audio_data.shape[1]
            audio_length_seconds = num_samples * audio_length_samples / sample_rate
            total_length_seconds += audio_length_seconds
            total_length_samples += num_samples * audio_length_samples
            
            # Print information about the dataset
            print(f'Audio file: {file_name}, Total Length: {total_length_seconds:.2f} seconds, {total_length_samples} samples')
            print(f"Dataset: {dataset_key}")
            print(f"Data shape: {audio_data.shape}")
            print(f"Data type: {audio_data.dtype}")
            print()

In [31]:
# Iterate over all files in the H5_DATA directory
for file_name in os.listdir(H5_DATA):
    file_path = os.path.join(H5_DATA, file_name)

    # Check if the file is an .h5 file and if it contains the word 'train' in its name
    if file_path.endswith('.h5'):
        get_audio_length(file_path)
        

Audio file: dry_val_test.h5, Total Length: 128.00 seconds, 2048000 samples
Dataset: Xvalidation
Data shape: (64, 32000, 1)
Data type: float64

Audio file: wet_val_test.h5, Total Length: 128.00 seconds, 2048000 samples
Dataset: Yvalidation_0
Data shape: (64, 32000, 1)
Data type: float64

Audio file: wet_train.h5, Total Length: 2244.00 seconds, 35904000 samples
Dataset: Ytrain_0
Data shape: (1122, 32000, 1)
Data type: float64

Audio file: dry_train.h5, Total Length: 2244.00 seconds, 35904000 samples
Dataset: Xtrain
Data shape: (1122, 32000, 1)
Data type: float64



In [7]:
# Lists to store file paths for dry and wet train datasets
dry_train_files = []
wet_train_files = []

# Iterate over all files in the H5_DATA directory
for file_name in os.listdir(H5_DATA):
    file_path = os.path.join(H5_DATA, file_name)
    if file_path.endswith('.h5'):
        if 'dry_train' in file_name:
            dry_train_files.append(file_path)
        elif 'wet_train' in file_name:
            wet_train_files.append(file_path)

# Function to randomly select samples from an h5 file using a list of indices
def select_random_samples(file_path, selected_indices):
    with h5py.File(file_path, 'r') as f:
        dataset_key = list(f.keys())[0]
        data = np.array(f[dataset_key])

        # Select the samples corresponding to the provided indices
        selected_samples = data[selected_indices]
        
    return selected_samples

# Generate a list of 33 random indices
num_samples = 17
with h5py.File(dry_train_files[0], 'r') as f:
    dataset_key = list(f.keys())[0]
    data = np.array(f[dataset_key])
    selected_indices = random.sample(range(data.shape[0]), num_samples)

# Select the same 33 random samples from the dry and wet train datasets
dry_train_subset = select_random_samples(dry_train_files[0], selected_indices)
wet_train_subset = select_random_samples(wet_train_files[0], selected_indices)

# Save the selected samples to new h5 files
with h5py.File(os.path.join(H5_DATA, 'dry_train_subset.h5'), 'w') as f:
    f.create_dataset('Xtrain_subset', data=dry_train_subset)

with h5py.File(os.path.join(H5_DATA, 'wet_train_subset.h5'), 'w') as f:
    f.create_dataset('Ytrain_subset', data=wet_train_subset)


In [9]:
# Read the audio data from the h5 files
with h5py.File(os.path.join(H5_SUBSET, 'dry_train_subset.h5'), 'r') as f:
    dry_train_subset = np.array(f['Xtrain_subset'])

with h5py.File(os.path.join(H5_SUBSET, 'wet_train_subset.h5'), 'r') as f:
    wet_train_subset = np.array(f['Ytrain_subset'])

# Function to play audio samples
def play_audio_samples(samples, num_samples, sample_rate=16000):
    for i, sample in enumerate(samples[:num_samples]):
        print(f"Playing sample {i+1}:")
        display(Audio(sample.T, rate=sample_rate))

num_samples_to_play = 2

# Play the dry_train_subset audio samples
print("Dry train subset:")
play_audio_samples(dry_train_subset, num_samples_to_play)

# Play the wet_train_subset audio samples
print("Wet train subset:")
play_audio_samples(wet_train_subset, num_samples_to_play)


Dry train subset:
Playing sample 1:


Playing sample 2:


Wet train subset:
Playing sample 1:


Playing sample 2:


In [15]:
# Lists to store file paths for dry and wet val_test datasets
dry_val_test_files = []
wet_val_test_files = []

# Iterate over all files in the H5_DATA directory
for file_name in os.listdir(H5_DATA):
    file_path = os.path.join(H5_DATA, file_name)
    if file_path.endswith('.h5'):
        if 'dry_val_test' in file_name:
            dry_val_test_files.append(file_path)
        elif 'wet_val_test' in file_name:
            wet_val_test_files.append(file_path)

# Generate a list of 2 random indices for validation and 2 random indices for testing
num_val_samples = 2
num_test_samples = 2
with h5py.File(dry_val_test_files[0], 'r') as f:
    dataset_key = list(f.keys())[0]
    data = np.array(f[dataset_key])

    val_indices = random.sample(range(data.shape[0]), num_val_samples)
    test_indices = random.sample([i for i in range(data.shape[0]) if i not in val_indices], num_test_samples)

# Select 2 random samples from the dry and wet val_test datasets for validation
dry_val_subset = select_random_samples(dry_val_test_files[0], val_indices)
wet_val_subset = select_random_samples(wet_val_test_files[0], val_indices)

# Select 2 random samples from the dry and wet val_test datasets for testing
dry_test_subset = select_random_samples(dry_val_test_files[0], test_indices)
wet_test_subset = select_random_samples(wet_val_test_files[0], test_indices)

# Save the selected samples to new h5 files
with h5py.File(os.path.join(H5_DATA, 'dry_val_subset.h5'), 'w') as f:
    f.create_dataset('Xval_subset', data=dry_val_subset)

with h5py.File(os.path.join(H5_DATA, 'wet_val_subset.h5'), 'w') as f:
    f.create_dataset('Yval_subset', data=wet_val_subset)

with h5py.File(os.path.join(H5_DATA, 'dry_test_subset.h5'), 'w') as f:
    f.create_dataset('Xtest_subset', data=dry_test_subset)

with h5py.File(os.path.join(H5_DATA, 'wet_test_subset.h5'), 'w') as f:
    f.create_dataset('Ytest_subset', data=wet_test_subset)


In [14]:
# Printing the dataset names and shapes

# Iterate over all files in the H5_DATA directory
for file_name in os.listdir(H5_SUBSET):
    # Check if the file has the .h5 extension
    if file_name.endswith('.h5'):
        with h5py.File(os.path.join(H5_SUBSET, file_name), 'r') as f:
            # Print the list of datasets in the file
            print(f"File: {file_name}")
            print(list(f.keys()))

            # Iterate over all datasets in the file
            for dataset_name in f.keys():
                # Load the dataset
                data = np.array(f[dataset_name])

                # Print information about the dataset
                print(f"Dataset: {dataset_name}")
                print(data.shape)
                print(data.dtype)
                print()

File: wet_train_subset.h5
['Ytrain_subset']
Dataset: Ytrain_subset
(17, 32000, 1)
float64

File: dry_train_subset.h5
['Xtrain_subset']
Dataset: Xtrain_subset
(17, 32000, 1)
float64

