In [None]:
import os
# List the entries of the audio dataset folder to confirm the relative path
# resolves from the current working directory.
os.listdir("../../datasets/audio")

In [None]:
"""
Audio Similarity Search with SpeechBrain ECAPA-TDNN + venv FFmpeg
-----------------------------------------------------------------
- Uses SpeechBrain's ECAPA-TDNN speaker embeddings
- Converts any input (m4a/mp3/flac/ogg/...) to 16kHz mono WAV using the FFmpeg
  binary provided by imageio-ffmpeg inside your virtual environment
- Copies matches (or, when COPY_ALL is set, every scored file with a score prefix)
  to an output folder and writes a CSV report
"""

from pathlib import Path
import subprocess, tempfile, shutil, math, warnings
import numpy as np
import pandas as pd
from tqdm import tqdm

import torch
import torchaudio
from torchaudio.transforms import Resample
from speechbrain.pretrained import EncoderClassifier
import librosa
import imageio_ffmpeg as ffmpegio

# =============================
# HARD-CODED PARAMETERS
# =============================
# Reference recording; every scanned file is scored against its embedding.
REFERENCE_PATH = Path("../../datasets/audio/Voice 250810_182638.m4a")
# Folder searched recursively for audio files to score.
SUSPECT_DIR = Path("../../datasets/audio/gallery")
# Destination for copied files; created if missing and must differ from SUSPECT_DIR.
OUT_DIR = Path("../../datasets/audio/out_dir")
# A file counts as a match when its cosine similarity is >= this value.
THRESHOLD      = 0.80
# Where the CSV report is written (relative to the current working directory).
REPORT_CSV     = Path("audio_similarity_report_ecapa.csv")
# 0 => embed the whole clip; > 0 => average embeddings over sliding windows
# of this many seconds (e.g., 3.0).
WINDOW_SEC     = 0.0
# Step between consecutive window starts, in seconds; only used when WINDOW_SEC > 0.
HOP_SEC        = 1.0
# True  => copy every scored file to OUT_DIR, prefixed with its score.
# False => copy only matches, under their original file names.
COPY_ALL       = True
# Files shorter than this many seconds are skipped and do not appear in the report.
MIN_DURATION   = 0.5

# File extensions (lower-case, with leading dot) treated as audio; compared
# against the lower-cased suffix of each candidate file.
AUDIO_EXTS = {".wav", ".mp3", ".m4a", ".flac", ".ogg", ".aac", ".wma", ".opus", ".aiff", ".aif"}
# Sample rate (Hz) that all audio is converted / resampled to before embedding.
TARGET_SR  = 16000

# =============================
# FFmpeg (from your venv)
# =============================
# Path to the FFmpeg executable reported by imageio-ffmpeg; looked up once,
# when this line runs, and reused for every conversion.
FFMPEG_BIN = ffmpegio.get_ffmpeg_exe()

def convert_to_temp_wav_16k_mono(src_path: Path) -> Path:
    """Convert any audio file to a temporary 16 kHz mono WAV using the venv FFmpeg.

    Every call creates its own temporary directory (prefix ``audio_conv_``)
    and writes ``<stem>_16k_mono.wav`` inside it.

    Args:
        src_path: Audio file to convert.

    Returns:
        Path of the converted WAV file inside the new temporary directory.

    Raises:
        subprocess.CalledProcessError: FFmpeg exited with a non-zero status.
        OSError: The FFmpeg executable could not be started.
        In both cases the temporary directory is removed before re-raising.
    """
    import atexit  # local import: only this function needs it

    tmpdir = Path(tempfile.mkdtemp(prefix="audio_conv_"))
    # mkdtemp() never cleans up after itself. Callers may delete the directory
    # as soon as they are done with the WAV; this hook is the safety net that
    # removes whatever is still left when the interpreter exits.
    atexit.register(shutil.rmtree, tmpdir, ignore_errors=True)

    dst = tmpdir / (src_path.stem + "_16k_mono.wav")
    cmd = [
        FFMPEG_BIN, "-y", "-loglevel", "error",
        "-i", str(src_path),
        "-ar", str(TARGET_SR), "-ac", "1",
        str(dst),
    ]
    try:
        subprocess.run(cmd, check=True)
    except (subprocess.CalledProcessError, OSError):
        # Nothing usable was produced; do not leave an orphaned directory
        # behind for every file FFmpeg cannot decode.
        shutil.rmtree(tmpdir, ignore_errors=True)
        raise
    return dst

# =============================
# Helpers
# =============================
def find_audio_files(root: Path):
    """Recursively collect the audio files under *root*, sorted by path.

    A path qualifies when it is a regular file whose lower-cased suffix is
    listed in ``AUDIO_EXTS``.
    """
    hits = []
    for candidate in root.rglob("*"):
        if not candidate.is_file():
            continue
        if candidate.suffix.lower() in AUDIO_EXTS:
            hits.append(candidate)
    hits.sort()
    return hits

def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
    """Return the cosine similarity of two vectors as a Python float.

    Both inputs are cast to float32 first. A small epsilon is added to the
    product of the norms so an all-zero vector yields 0.0 instead of a
    division by zero.
    """
    vec_a = a.astype(np.float32)
    vec_b = b.astype(np.float32)
    norm_product = np.linalg.norm(vec_a) * np.linalg.norm(vec_b)
    return float(np.dot(vec_a, vec_b) / (norm_product + 1e-9))

def load_audio_tensor(wav_path: Path):
    """Load an audio file as a peak-normalized mono tensor at ``TARGET_SR``.

    Multi-channel audio is averaged down to one channel, and audio at any
    other sample rate is resampled to ``TARGET_SR``. The waveform is then
    divided by its absolute peak unless it is entirely silent.

    Returns:
        ``(waveform, TARGET_SR)``; the waveform keeps the channel dimension,
        i.e. it is expected to be shaped [1, T].
    """
    waveform, sample_rate = torchaudio.load(str(wav_path))  # [C, T]

    # Downmix: average the channels but keep the leading channel dimension.
    num_channels = waveform.size(0)
    if num_channels > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    if sample_rate != TARGET_SR:
        resampler = Resample(sample_rate, TARGET_SR)
        waveform = resampler(waveform)

    # Peak-normalize; an all-zero signal is returned untouched.
    peak = waveform.abs().max()
    if float(peak) > 0:
        waveform = waveform / peak.clamp(min=1e-9)

    return waveform, TARGET_SR

def ecapa_embed(classifier: EncoderClassifier, wav_tensor: torch.Tensor) -> np.ndarray:
    """Return one unit-length ECAPA embedding for a waveform tensor [1, T].

    The encoder output is squeezed, moved to the CPU, converted to NumPy and
    divided by its L2 norm (plus a small epsilon to avoid division by zero).
    """
    with torch.no_grad():
        encoded = classifier.encode_batch(wav_tensor)
        vector = encoded.squeeze().cpu().numpy()
    scale = np.linalg.norm(vector) + 1e-9
    return vector / scale

def ecapa_embed_sliding(classifier: EncoderClassifier, wav_tensor: torch.Tensor, sr: int, window_sec: float, hop_sec: float):
    """Embed a waveform as the normalized mean of sliding-window embeddings.

    Windows of ``window_sec`` seconds start every ``hop_sec`` seconds; only
    windows that fit entirely inside the signal are used. Falls back to a
    single whole-clip embedding when the window or hop is not positive, or
    when the signal is shorter than one window.
    """
    samples = wav_tensor.squeeze(0)   # [T]
    win_len = int(window_sec * sr)
    hop_len = int(hop_sec * sr)
    total = samples.numel()

    windowing_possible = win_len > 0 and hop_len > 0 and total >= win_len
    if not windowing_possible:
        return ecapa_embed(classifier, wav_tensor)

    with torch.no_grad():
        window_embs = [
            classifier.encode_batch(samples[begin:begin + win_len].unsqueeze(0))  # [1, win]
            .squeeze()
            .cpu()
            .numpy()
            for begin in range(0, total - win_len + 1, hop_len)
        ]
    if not window_embs:
        return ecapa_embed(classifier, wav_tensor)

    # Average the raw window embeddings first, then scale the mean to unit length.
    mean_emb = np.vstack(window_embs).mean(axis=0)
    return mean_emb / (np.linalg.norm(mean_emb) + 1e-9)

# =============================
# Main
# =============================
def _discard_temp_wav(tmp_wav: Path) -> None:
    """Remove the per-file temp directory that holds a converted WAV."""
    tmp_parent = tmp_wav.parent
    # Only delete directories carrying the converter's own mkdtemp prefix, so
    # a WAV that lives anywhere else can never take its folder down with it.
    if tmp_parent.name.startswith("audio_conv_"):
        shutil.rmtree(tmp_parent, ignore_errors=True)


def run_similarity_search_ecapa():
    """Score every audio file under SUSPECT_DIR against the reference voice.

    Steps:
      1. Validate the configured paths and create OUT_DIR.
      2. Load the SpeechBrain ECAPA-TDNN model and embed REFERENCE_PATH.
      3. For each audio file found under SUSPECT_DIR: convert it to a temp
         16 kHz mono WAV, skip it if shorter than MIN_DURATION, embed it and
         compute the cosine similarity with the reference embedding.
      4. Copy files to OUT_DIR (all of them with a score prefix when COPY_ALL
         is set, otherwise only matches) and write REPORT_CSV.

    Temporary WAV files are deleted as soon as they have been read.

    Returns:
        pandas.DataFrame with columns ``file``, ``filename``, ``similarity``
        and ``is_match``, sorted by similarity (highest first, NaN last).
        Files whose conversion or embedding failed have a NaN similarity.

    Raises:
        FileNotFoundError: REFERENCE_PATH or SUSPECT_DIR does not exist.
        ValueError: OUT_DIR and SUSPECT_DIR point to the same directory.
        subprocess.CalledProcessError: The reference file could not be converted.
    """
    if not REFERENCE_PATH.exists():
        raise FileNotFoundError(f"Reference not found: {REFERENCE_PATH}")
    if not SUSPECT_DIR.exists() or not SUSPECT_DIR.is_dir():
        raise FileNotFoundError(f"Suspect directory not found: {SUSPECT_DIR}")
    # Compare resolved paths so different spellings of the same directory
    # (relative vs. absolute, "..", symlinks) are still detected.
    out_root = OUT_DIR.resolve()
    if out_root == SUSPECT_DIR.resolve():
        raise ValueError("OUT_DIR must be different from SUSPECT_DIR.")
    OUT_DIR.mkdir(parents=True, exist_ok=True)

    # Load model (first run downloads; then cached for offline use)
    print("[*] Loading SpeechBrain ECAPA-TDNN (speechbrain/spkrec-ecapa-voxceleb)...")
    classifier = EncoderClassifier.from_hparams(source="speechbrain/spkrec-ecapa-voxceleb", savedir=None)

    # Reference embedding (convert once)
    print(f"[*] Embedding reference: {REFERENCE_PATH.name}")
    ref_wav_path = convert_to_temp_wav_16k_mono(REFERENCE_PATH)
    try:
        ref_wav, _ = load_audio_tensor(ref_wav_path)
    finally:
        # The waveform is in memory now; the temp WAV is no longer needed.
        _discard_temp_wav(ref_wav_path)
    if WINDOW_SEC > 0:
        ref_emb = ecapa_embed_sliding(classifier, ref_wav, TARGET_SR, WINDOW_SEC, HOP_SEC)
    else:
        ref_emb = ecapa_embed(classifier, ref_wav)

    # Scan suspects. If OUT_DIR is nested inside SUSPECT_DIR, leave out the
    # copies written by earlier runs so they are not scored and copied again.
    files = [
        p for p in find_audio_files(SUSPECT_DIR)
        if out_root not in p.resolve().parents
    ]
    print(f"[*] Found {len(files)} audio files to scan.")
    results, matches = [], []

    for apath in tqdm(files, desc="Scanning"):
        # Convert each file once and reuse
        try:
            tmp_wav = convert_to_temp_wav_16k_mono(apath)
        except subprocess.CalledProcessError as e:
            warnings.warn(f"FFmpeg conversion failed for {apath}: {e}")
            results.append({"file": str(apath), "filename": apath.name, "similarity": float("nan"), "is_match": False})
            continue

        try:
            # Duration filter. A failed probe is reported but not fatal: the
            # file is still passed on to the embedding step.
            try:
                dur = librosa.get_duration(path=tmp_wav)
            except Exception as e:
                warnings.warn(f"Could not determine duration of {apath}: {e}")
            else:
                if dur < MIN_DURATION:
                    continue

            try:
                wav, _ = load_audio_tensor(tmp_wav)
                if WINDOW_SEC > 0:
                    emb = ecapa_embed_sliding(classifier, wav, TARGET_SR, WINDOW_SEC, HOP_SEC)
                else:
                    emb = ecapa_embed(classifier, wav)
                score = cosine_similarity(ref_emb, emb)
                is_match = score >= THRESHOLD
            except Exception as e:
                warnings.warn(f"Embedding failed for {apath}: {e}")
                score, is_match = float("nan"), False
        finally:
            # Runs on every exit path (including the `continue` above) so
            # temp WAVs do not pile up over a long scan.
            _discard_temp_wav(tmp_wav)

        results.append({"file": str(apath), "filename": apath.name, "similarity": score, "is_match": bool(is_match)})

        try:
            if COPY_ALL:
                score_str = "nan" if math.isnan(score) else f"{score:.3f}"
                shutil.copy2(apath, OUT_DIR / f"{score_str}__{apath.name}")
            elif is_match:
                shutil.copy2(apath, OUT_DIR / apath.name)
                matches.append(apath)
        except Exception as e:
            warnings.warn(f"Copy failed {apath} -> {OUT_DIR}: {e}")

    # Report. Fixing the columns keeps the CSV header and the DataFrame
    # schema intact even when no file was scored.
    df = pd.DataFrame(results, columns=["file", "filename", "similarity", "is_match"])
    df.sort_values(by="similarity", ascending=False, inplace=True, na_position="last")
    df.to_csv(REPORT_CSV, index=False)

    print(f"[*] Report written: {REPORT_CSV}")
    if COPY_ALL:
        print(f"[*] Copied all files to: {OUT_DIR}")
    else:
        print(f"[*] Matches copied to: {OUT_DIR} ({len(matches)} files)")
    return df

# Run
# Executes the full scan as soon as this line runs and keeps the report DataFrame.
df_results = run_similarity_search_ecapa()
# Preview the first rows; the report is sorted by similarity, highest first.
df_results.head()
