# Setup

In [1]:
import os
import torchaudio
import numpy as np
import tqdm

In [None]:
# data paths:
# DATA_ROOT may be overridden through the AUDIOSET_ROOT environment variable so the
# notebook is not tied to one machine; the original local path is kept as the fallback.
DATA_ROOT = os.environ.get("AUDIOSET_ROOT", "/Users/eugenekim/Emo-CLIM/dataset/AudioSet")
AUDIO_DIR = "audio_files"     # directory name joined onto DATA_ROOT when building file paths
# audio constants:
SAMPLE_RATE = 16000     # in Hz
CLIP_LENGTH_EXPECT = 10.0      # in seconds

In [None]:
# script options:
# names of the dataset subsets to process, in processing order
data_subsets = [
    "unbalanced_train",
    "balanced_train",
    "eval",
]

# Dataset Verification

In [4]:
# extract data file names:
# Builds a dict mapping each subset name to the names of the regular files found
# directly inside <DATA_ROOT>/<AUDIO_DIR>/<subset>.
# os.listdir() returns entries in arbitrary order, so each listing is sorted to keep
# the iteration order identical across runs and machines.
data_file_names = {}
for subset in data_subsets:
    subset_root = os.path.join(DATA_ROOT, AUDIO_DIR, subset)
    # keep regular files only; anything that is not a file (e.g. a sub-directory) is skipped:
    data_file_names[subset] = sorted(
        name
        for name in os.listdir(subset_root)
        if os.path.isfile(os.path.join(subset_root, name))
    )
    print("Size of {} set: {}".format(subset, len(data_file_names[subset])))

Size of unbalanced_train set: 13268
Size of balanced_train set: 325
Size of eval set: 344


In [5]:
# verify dataset:
# For every subset, checks that file names are unique, that every path ends in ".wav",
# and that every file reports the expected sample rate (SAMPLE_RATE).
# Each failure message names the offending subset/file so it can be located directly.
for subset in data_subsets:
    file_names = data_file_names[subset]

    # check for duplicate file names:
    assert len(set(file_names)) == len(file_names), "Duplicate file names found in {} subset.".format(subset)

    # tqdm takes the total from len() of the list, so it is not passed explicitly:
    for file_name in tqdm.tqdm(file_names, desc="Verifying {} subset...".format(subset)):
        file_path = os.path.join(DATA_ROOT, AUDIO_DIR, subset, file_name)

        # verify file name:
        assert file_path.endswith(".wav"), "File path does not end in '.wav': {}".format(file_path)

        # verify sampling rate:
        metadata = torchaudio.info(file_path)
        assert metadata.sample_rate == SAMPLE_RATE, "Incorrect sampling rate ({} Hz, expected {} Hz): {}".format(
            metadata.sample_rate, SAMPLE_RATE, file_path
        )

Verifying unbalanced_train subset...: 100%|██████████| 13268/13268 [00:00<00:00, 24361.61it/s]
Verifying balanced_train subset...: 100%|██████████| 325/325 [00:00<00:00, 19203.61it/s]
Verifying eval subset...: 100%|██████████| 344/344 [00:00<00:00, 19739.52it/s]


# Dataset Exploration

In [6]:
# count number of audio files with unexpected clip lengths:
# For every subset, reports how many clips differ from CLIP_LENGTH_EXPECT and the
# shortest / longest clip duration (in seconds) that was found.
for subset in data_subsets:
    n_unexpect_files = 0
    min_clip_length = np.inf      # remains inf if the subset contains no files
    max_clip_length = 0.0
    print()
    for file_name in tqdm.tqdm(data_file_names[subset], desc="Examining {} subset...".format(subset)):
        file_path = os.path.join(DATA_ROOT, AUDIO_DIR, subset, file_name)

        # get clip length in seconds, using the sample rate reported for the file itself
        # so the result does not rely on every file matching SAMPLE_RATE:
        metadata = torchaudio.info(file_path)
        length = metadata.num_frames / metadata.sample_rate

        # update statistics:
        if length != CLIP_LENGTH_EXPECT:
            n_unexpect_files += 1
        min_clip_length = min(min_clip_length, length)
        max_clip_length = max(max_clip_length, length)

    print("Number of files with unexpected lengths: {}".format(n_unexpect_files))
    print("Minimum clip length: {}s".format(min_clip_length))
    print("Maximum clip length: {}s".format(max_clip_length))




Examining unbalanced_train subset...: 100%|██████████| 13268/13268 [00:00<00:00, 25611.61it/s]


Number of files with unexpected lengths: 397
Minimim clip length: 3.3436875s
Maximum clip length: 10.0s



Examining balanced_train subset...: 100%|██████████| 325/325 [00:00<00:00, 26621.92it/s]


Number of files with unexpected lengths: 4
Minimim clip length: 9.1798125s
Maximum clip length: 10.0s



Examining eval subset...: 100%|██████████| 344/344 [00:00<00:00, 27440.34it/s]

Number of files with unexpected lengths: 4
Minimim clip length: 9.1395625s
Maximum clip length: 10.0s



