# Working with Audio Files in Python

In [13]:
from __future__ import annotations

from pathlib import Path
from typing import Any, Dict, Iterable, List, Optional, Sequence, Set, Tuple, Union
from pathlib import Path
from typing import Any, Dict, List, Optional
import sox

# Optional imports guarded for notebook robustness: each name is bound to
# None when its package cannot be imported, so later cells can feature-test
# with `if np is not None` / `if plt is not None`.
try:
    import numpy as np  # type: ignore
except Exception:  # pragma: no cover
    np = None  # fallback if not installed

try:
    import matplotlib.pyplot as plt  # type: ignore
except Exception:  # pragma: no cover
    plt = None
else:
    # Styling is purely cosmetic, so it is kept out of the import guard: a
    # matplotlib release that lacks the requested style sheet must not end up
    # disabling plotting altogether. The seaborn styles were renamed with a
    # "seaborn-v0_8-" prefix in matplotlib 3.6; older releases only know the
    # unprefixed name.
    for _style_name in ("seaborn-v0_8-whitegrid", "seaborn-whitegrid"):
        if _style_name in plt.style.available:
            plt.style.use(_style_name)
            break
    plt.rcParams["axes.grid"] = True

# Default set of recognized audio extensions (lowercase, with leading dot)
AUDIO_EXTENSIONS: Set[str] = {".wav", ".mp3", ".flac", ".aac", ".ogg", ".m4a"}

In [14]:
def as_path(path: Union[str, Path]) -> Path:
    """
    Normalize ``path`` into an absolute ``Path``.

    A leading ``~`` is expanded to the user's home directory, then the result
    is made absolute with ``..`` segments and symlinks resolved. Resolution is
    non-strict, so the path does not have to exist; note that ``resolve()``
    may still consult the filesystem to follow symlinks.
    """
    expanded = Path(path).expanduser()
    return expanded.resolve()

In [15]:
def find_audio_files(
    start_path: Union[str, Path],
    extensions: Optional[Iterable[str]] = None,
    recursive: bool = True,
) -> List[Path]:
    """
    Find audio files under a directory.

    Args:
        start_path: Directory to search.
        extensions: Iterable of extensions (e.g., [".wav", ".flac"]).
                    Matching is case-insensitive, surrounding whitespace is
                    ignored, and a missing leading dot is added ("wav" is
                    treated as ".wav"). A single string is accepted as one
                    extension. If None or empty, uses AUDIO_EXTENSIONS.
        recursive: If True, search subdirectories recursively.

    Returns:
        Sorted list of Paths to matching files.

    Raises:
        FileNotFoundError: If start_path does not exist or is not a directory.
    """
    base = as_path(start_path)
    # is_dir() is already False for a missing path, so one check covers both.
    if not base.is_dir():
        raise FileNotFoundError(f"Directory not found: {base}")

    # A bare string would otherwise be iterated character by character and
    # silently match nothing useful.
    if isinstance(extensions, str):
        extensions = [extensions]

    exts = set()
    for ext in extensions or AUDIO_EXTENSIONS:
        ext = ext.strip().lower()
        # Path.suffix always carries the leading dot, so add it when missing.
        if ext and not ext.startswith("."):
            ext = f".{ext}"
        exts.add(ext)

    candidates = base.rglob("*") if recursive else base.iterdir()
    # Test the suffix first: it is a pure string check, whereas is_file()
    # needs a filesystem call for every candidate.
    return sorted(p for p in candidates if p.suffix.lower() in exts and p.is_file())

In [16]:
# Example usage: recursively collect the .wav and .mp3 files under ../audio
# (a relative path, resolved against the current working directory) and print
# one path per line. `audio_files` is reused by the metadata cell further down.
audio_files = find_audio_files("../audio", extensions=[".wav", ".mp3"], recursive=True)
for file in audio_files:
    print(file)

/Users/fra/Documents/Dev-Personal/audio-signal-processing/audio/IR_AKG_BX25_3500ms_48kHz24b.wav
/Users/fra/Documents/Dev-Personal/audio-signal-processing/audio/IR_AKG_BX25_3500ms_dark_48kHz24b.wav
/Users/fra/Documents/Dev-Personal/audio-signal-processing/audio/ir_reference.wav
/Users/fra/Documents/Dev-Personal/audio-signal-processing/audio/log_sweep_tone.wav
/Users/fra/Documents/Dev-Personal/audio-signal-processing/audio/mix.wav
/Users/fra/Documents/Dev-Personal/audio-signal-processing/audio/plk-fm-base.wav
/Users/fra/Documents/Dev-Personal/audio-signal-processing/audio/pred-gcn-gru-tfilm-41-20231201-132556-48k.wav
/Users/fra/Documents/Dev-Personal/audio-signal-processing/audio/synthetic_spring_reverb.wav
/Users/fra/Documents/Dev-Personal/audio-signal-processing/audio/targ-gcn-gru-tfilm-41-20231201-132556-48k.wav
/Users/fra/Documents/Dev-Personal/audio-signal-processing/audio/wet.wav


In [22]:
def _optional_cast(value: Any, cast: Any) -> Any:
    """Return ``cast(value)``, or None when value is missing (None or "")."""
    if value is None or value == "":
        return None
    return cast(value)


def _rms_to_db(rms_amplitude: Optional[float]) -> Optional[float]:
    """Convert a linear RMS amplitude to dB; None when it is missing or <= 0."""
    import math  # local import keeps this cell self-contained

    if rms_amplitude is None or rms_amplitude <= 0:
        return None  # log10 is undefined for zero/negative amplitudes
    return 20.0 * math.log10(rms_amplitude)


def read_metadata_sox(
    paths: List[Path],
    fast_scan: bool = True,  # True = header only via info(); False = also merge stat()
) -> List[Dict[str, Any]]:
    """
    Collect per-file audio metadata using pysox.

    Args:
        paths: Audio files to probe.
        fast_scan: If True, only ``sox.file_info.info()`` is queried. If False,
            ``sox.file_info.stat()`` is also run and the keys ``rms_db``,
            ``mean`` and ``std`` are merged into each record.

    Returns:
        One dict per successfully probed file, in input order, with the keys
        ``path``, ``duration_s``, ``sample_rate``, ``num_channels``,
        ``bits_per_sample``, ``encoding``, ``file_type``, ``num_samples``,
        ``bitrate_bps`` and ``silent`` (plus the stat keys when
        ``fast_scan`` is False and stat() succeeded). Files whose header
        probe fails are skipped with a printed warning instead of raising.
    """
    out: List[Dict[str, Any]] = []

    for p in paths:
        sp = str(p)
        try:
            info = sox.file_info.info(sp)  # single probe for header data
            # info keys: channels, sample_rate, bitdepth, bitrate, duration,
            #            num_samples, encoding, silent

            duration = _optional_cast(info.get("duration"), float)
            meta: Dict[str, Any] = {
                "path": p,
                "duration_s": duration if duration is not None else 0.0,
                "sample_rate": _optional_cast(info.get("sample_rate"), int),
                "num_channels": _optional_cast(info.get("channels"), int),
                "bits_per_sample": _optional_cast(info.get("bitdepth"), int),
                "encoding": info.get("encoding"),
                "file_type": sox.file_info.file_type(sp),
                "num_samples": _optional_cast(info.get("num_samples"), int),
                "bitrate_bps": _optional_cast(info.get("bitrate"), float),
                "silent": _optional_cast(info.get("silent"), bool),
            }

            if not fast_scan:
                try:
                    st = sox.file_info.stat(sp)  # downmix-to-mono stats dict
                    # Keys keep SoX's column padding, e.g. "RMS     amplitude"
                    # and "Mean    amplitude".
                    rms_amp = _optional_cast(st.get("RMS     amplitude"), float)
                    mean_amp = _optional_cast(st.get("Mean    amplitude"), float)

                    # SoX's `stat` effect does not list a dB field, so this
                    # lookup normally yields None; derive the level from the
                    # linear RMS amplitude in that case.
                    rms_db = _optional_cast(st.get("RMS     dB"), float)
                    if rms_db is None:
                        rms_db = _rms_to_db(rms_amp)

                    meta["rms_db"] = rms_db
                    meta["mean"] = mean_amp
                    meta["std"] = rms_amp  # not exactly std, but a useful energy proxy
                except Exception as e:
                    # Best effort: keep the header-only data, but say why the
                    # statistics are missing instead of failing silently.
                    print(f"[WARN] sox stat failed for {p}: {e}")

            out.append(meta)

        except Exception as e:
            # If info() fails, skip the file and show why
            print(f"[WARN] sox info failed for {p}: {e}")
            continue

    return out

In [24]:
from typing import Any, Dict, List

def print_metadata_simple(metadata_list: List[Dict[str, Any]]) -> None:
    """
    Print every metadata record as a block of ``key: value`` lines.

    Each record is introduced by a ``--- File #N ---`` header (N starts at 1).
    Well-known keys are listed first in a fixed order; any other keys follow
    in alphabetical order. A short notice is printed when there is nothing
    to show.
    """
    if not metadata_list:
        print("No metadata to display.")
        return

    # Fixed display order for the keys this notebook is known to produce.
    known_keys = (
        "path",
        "file_type",
        "encoding",
        "duration_s",
        "sample_rate",
        "num_channels",
        "bits_per_sample",
        "num_samples",
        "bitrate_bps",
        "silent",
        "rms_db",
        "mean",
        "std",
    )

    for index, record in enumerate(metadata_list, start=1):
        print(f"\n--- File #{index} ---")

        # Known keys first, skipping those this record does not carry.
        leading = [key for key in known_keys if key in record]
        for key in leading:
            print(f"{key}: {record[key]}")

        # Whatever is left over, alphabetically.
        leftovers = sorted(key for key in record if key not in leading)
        for key in leftovers:
            print(f"{key}: {record[key]}")

In [30]:
def print_metadata_one_line(metadata_list: List[Dict[str, Any]]) -> None:
    """
    Print a compact one-line summary for each metadata record.

    Line format:
        ``<path> | <file_type> | <sample_rate> Hz | <channels> ch | <bits> bit | <duration> s``

    Missing fields are shown as ``None``; a missing duration is shown as
    ``n/a`` instead of raising on the float format. A short notice is printed
    when there is nothing to show.

    Args:
        metadata_list: Records as produced by ``read_metadata_sox``.
    """
    if not metadata_list:
        print("No metadata to display.")
        return

    for m in metadata_list:
        path = m.get("path")
        sr = m.get("sample_rate")
        ch = m.get("num_channels")
        bits = m.get("bits_per_sample")
        dur = m.get("duration_s")
        ftype = m.get("file_type")
        # "{None:.3f}" raises TypeError, so format the duration separately.
        dur_field = f"{dur:.3f} s" if dur is not None else "n/a"
        print(f"{path} | {ftype} | {sr} Hz | {ch} ch | {bits} bit | {dur_field}")

In [31]:
# Probe the files found earlier; fast_scan=True reads header data only and
# skips the sox stat() pass.
metadata = read_metadata_sox(audio_files, fast_scan=True)
# Alternative verbose view (one key per line):
# print_metadata_simple(metadata)
print_metadata_one_line(metadata)

/Users/fra/Documents/Dev-Personal/audio-signal-processing/audio/IR_AKG_BX25_3500ms_48kHz24b.wav | wav | 48000 Hz | 1 ch | 24 bit | 5.608 s
/Users/fra/Documents/Dev-Personal/audio-signal-processing/audio/IR_AKG_BX25_3500ms_dark_48kHz24b.wav | wav | 48000 Hz | 1 ch | 24 bit | 5.541 s
/Users/fra/Documents/Dev-Personal/audio-signal-processing/audio/ir_reference.wav | wav | 48000 Hz | 1 ch | 24 bit | 10.000 s
/Users/fra/Documents/Dev-Personal/audio-signal-processing/audio/log_sweep_tone.wav | wav | 48000 Hz | 1 ch | 24 bit | 5.000 s
/Users/fra/Documents/Dev-Personal/audio-signal-processing/audio/mix.wav | wav | 48000 Hz | 1 ch | 32 bit | 9.537 s
/Users/fra/Documents/Dev-Personal/audio-signal-processing/audio/plk-fm-base.wav | wav | 48000 Hz | 1 ch | 24 bit | 4.000 s
/Users/fra/Documents/Dev-Personal/audio-signal-processing/audio/pred-gcn-gru-tfilm-41-20231201-132556-48k.wav | wav | 48000 Hz | 1 ch | 32 bit | 48.000 s
/Users/fra/Documents/Dev-Personal/audio-signal-processing/audio/synthetic_