# Check for corrupted audio files in dataset


In [None]:
import torchaudio
import os
import concurrent.futures

# Root directory that is scanned recursively for .wav files.
# The value below is a placeholder: point it at the dataset before running.
i_dir = "path/to/your/dataset"

In [None]:
def check_wav(file_path):
    """Try to load an audio file and report whether the load succeeded.

    Args:
        file_path: Path of the file handed to ``torchaudio.load``.

    Returns:
        A ``(file_path, ok)`` tuple. ``ok`` is ``True`` when the load finished
        without raising and ``False`` when any ``Exception`` was raised.
    """
    try:
        # Only success or failure of the load matters; the audio is discarded.
        torchaudio.load(file_path)
    except Exception:
        # Any failure to load is treated as corruption; the error is dropped.
        ok = False
    else:
        ok = True
    return (file_path, ok)


def find_wavs(directory):
    """Yield the path of every .wav file found under ``directory``.

    The tree is walked recursively with ``os.walk``. The extension is matched
    case-insensitively, so files named ``.WAV`` or ``.Wav`` are included
    instead of being silently skipped.

    Args:
        directory: Root directory to search.

    Yields:
        ``os.path.join(folder, filename)`` for each matching file.
    """
    for foldername, _subfolders, filenames in os.walk(directory):
        for filename in filenames:
            # Lower-case before comparing so upper- or mixed-case extensions
            # are picked up as well.
            if filename.lower().endswith(".wav"):
                yield os.path.join(foldername, filename)


def main(directory):
    """Check every .wav file under ``directory`` and print the ones that fail.

    All paths produced by ``find_wavs`` are submitted to a
    ``ThreadPoolExecutor`` (default worker count), each running ``check_wav``.
    Results are handled in completion order: a file whose check reports
    failure is printed as corrupted, and an exception raised by the task
    itself is printed with its message. A progress line is printed after
    every 5000 completed files.

    Args:
        directory: Root directory to scan.

    Returns:
        None. All findings are reported with ``print``.
    """
    paths = list(find_wavs(directory))

    with concurrent.futures.ThreadPoolExecutor() as pool:
        # Map each future back to its path so failures can be reported by name.
        pending = {}
        for path in paths:
            pending[pool.submit(check_wav, path)] = path

        finished_tasks = concurrent.futures.as_completed(pending)
        for finished, task in enumerate(finished_tasks, start=1):
            file_path = pending[task]
            try:
                outcome = task.result()
            except Exception as exc:
                print(f"{file_path} generated an exception: {exc}")
            else:
                # check_wav returns (path, ok); index 1 is the validity flag.
                ok = outcome[1]
                if not ok:
                    print(f"Corrupted file: {file_path}")

            if finished % 5000 == 0:
                print(f"Processed {finished} files...")

In [None]:
# Run the scan on the configured dataset directory; results are printed.
main(i_dir)