In [None]:
import os, math, numpy as np, pandas as pd, soundfile as sf, librosa, noisereduce as nr
import torch
import multiprocessing as mp
from multiprocessing import cpu_count
from multiprocessing.dummy import Pool as ThreadPool
from tqdm.auto import tqdm
from tqdm.auto import tqdm
from panns_inference import AudioTagging

# Select the 'spawn' start method for multiprocessing (intended to keep
# CUDA-using worker processes safe); force=True overrides any earlier choice.
mp.set_start_method('spawn', force=True)

# ─────── Hyperparameters, Paths & Subfolder Creation ───────
import os
import pandas as pd

# Input data: raw training audio plus its metadata table.
DATA_ROOT     = '/home/jovyan/Data/birdclef-2025'
AUDIO_DIR     = os.path.join(DATA_ROOT, 'train_audio')
CSV_PATH      = os.path.join(DATA_ROOT, 'train.csv')

# Output roots, one per feature type.
DEN_DIR       = '/home/jovyan/Features/denoised'
MEL_DIR       = '/home/jovyan/Features/mel'
EMB_DIR       = '/home/jovyan/Features/embeddings'
for feature_dir in (DEN_DIR, MEL_DIR, EMB_DIR):
    os.makedirs(feature_dir, exist_ok=True)

# Audio is handled at PANNS_SR and cut into CHUNK_SEC-second pieces.
PANNS_SR      = 32000        # samples per second
CHUNK_SEC     = 10           # chunk duration in seconds
CHUNK_SAMPLES = CHUNK_SEC * PANNS_SR


# Mel-spectrogram settings (deliberately reduced resolution)
N_FFT        = 2048
HOP_LENGTH   = 1024           # larger hop -> fewer time frames
N_MELS       = 64             # fewer mel bins

# Fraction of every class kept when sampling the metadata (1 keeps everything).
SAMPLE_FRAC   = 1

In [None]:
def calculate_mel_spectrogram(wave_np, sr=PANNS_SR,
                              n_fft=N_FFT, hop_length=HOP_LENGTH,
                              n_mels=N_MELS):
    """Compute a mel spectrogram of ``wave_np`` and convert it to decibels.

    Args:
        wave_np: Audio samples handed to ``librosa.feature.melspectrogram``.
        sr: Sample rate of ``wave_np``.
        n_fft: FFT window size.
        hop_length: Hop between successive frames, in samples.
        n_mels: Number of mel bands.

    Returns:
        The result of ``librosa.power_to_db`` applied to the mel spectrogram
        with ``ref=np.max``, i.e. scaled relative to the spectrogram's own
        maximum.
    """
    mel_kwargs = dict(
        y=wave_np,
        sr=sr,
        n_fft=n_fft,
        hop_length=hop_length,
        n_mels=n_mels,
    )
    mel_spec = librosa.feature.melspectrogram(**mel_kwargs)
    mel_db = librosa.power_to_db(mel_spec, ref=np.max)
    return mel_db

In [None]:
# ─── Read & Sample Metadata ───
meta = pd.read_csv(CSV_PATH)
sampled = []
for label, grp in meta.groupby('primary_label'):
    # target = SAMPLE_FRAC of this class, but at least 1, and never more than group size
    n = max(1, int(len(grp) * SAMPLE_FRAC))
    n = min(n, len(grp))
    # fixed random_state keeps the per-class sample reproducible across runs
    sampled.append(grp.sample(n=n, random_state=42))

meta = pd.concat(sampled, ignore_index=True)

# ─── Create matching subfolders in DEN_DIR, MEL_DIR & EMB_DIR ───
subdirs = {os.path.dirname(f) for f in meta['filename']}
# Files that sit directly in the audio root have no subfolder; drop that entry
# so nothing is created for it and the count below only reports real subfolders.
subdirs.discard('')
for sub in subdirs:
    os.makedirs(os.path.join(DEN_DIR, sub), exist_ok=True)
    os.makedirs(os.path.join(MEL_DIR, sub), exist_ok=True)
    os.makedirs(os.path.join(EMB_DIR, sub), exist_ok=True)

print(f"Prepared base dirs and {len(subdirs)} nested subfolders under DEN_DIR, MEL_DIR and EMB_DIR.")

# Sorted list of distinct class labels present after sampling.
labels    = sorted(meta['primary_label'].unique())

# Manifest rows are appended by process_record while the chunks are written.
den_manifest = []
mel_manifest = []
print(f"Processing {len(meta)} files ({SAMPLE_FRAC*100:.0f}% of dataset)")
print(len(labels))

Prepared base dirs and 206 nested subfolders under DEN_DIR and EMB_DIR.
Processing 28564 files (100% of dataset)
206


In [None]:
def process_record(record):
    """Chunk, denoise and featurise one recording, writing the results to disk.

    The file ``AUDIO_DIR/<filename>`` is read as float32, averaged over axis 1
    when it has more than one dimension, and resampled to ``PANNS_SR`` if its
    sample rate differs. It is then cut into ``CHUNK_SAMPLES``-long pieces
    (the last one zero-padded). For every piece the function:

    * runs ``nr.reduce_noise`` (non-stationary) and peak-normalises the result,
    * writes it as OGG/Vorbis to ``DEN_DIR/<primary_label>/<base>_chk<i>.ogg``,
    * writes its mel spectrogram (float16) together with the label to
      ``MEL_DIR/<primary_label>/<base>_chk<i>.npz``,
    * appends one row each to the module-level ``den_manifest`` and
      ``mel_manifest`` lists.

    Args:
        record: Mapping with the keys ``'filename'`` and ``'primary_label'``.

    Returns:
        ``True`` once every chunk of the recording has been written.
    """
    fname         = record['filename']
    primary_label = record['primary_label']
    src_fp        = os.path.join(AUDIO_DIR, fname)

    # load + mono
    y, sr = sf.read(src_fp, dtype='float32')
    if y.ndim > 1:
        y = y.mean(axis=1)

    # resample if needed
    if sr != PANNS_SR:
        y = librosa.resample(y, orig_sr=sr, target_sr=PANNS_SR)
        sr = PANNS_SR

    base      = os.path.splitext(os.path.basename(fname))[0]
    n_chunks  = math.ceil(len(y) / CHUNK_SAMPLES)

    # Outputs are grouped by primary_label. Create those folders here instead of
    # relying on another cell having prepared them from the filename's folder,
    # which is not guaranteed to equal the label.
    den_out_dir = os.path.join(DEN_DIR, primary_label)
    mel_out_dir = os.path.join(MEL_DIR, primary_label)
    os.makedirs(den_out_dir, exist_ok=True)
    os.makedirs(mel_out_dir, exist_ok=True)

    for ci in range(n_chunks):
        seg = y[ci*CHUNK_SAMPLES:(ci+1)*CHUNK_SAMPLES]
        if len(seg) < CHUNK_SAMPLES:
            # zero-pad the trailing chunk so every chunk has the same length
            seg = np.pad(seg, (0, CHUNK_SAMPLES - len(seg)), mode='constant')

        # denoise
        den = nr.reduce_noise(y=seg, sr=sr, stationary=False)
        # peak-normalise; the epsilon avoids dividing by zero on an all-zero chunk
        den = den / (np.max(np.abs(den)) + 1e-9)

        chunk_id = f"{base}_chk{ci}"

        # ── WRITE DENOISED OGG ───────────────────────────────────────
        rel_audio_path = f"/{primary_label}/{chunk_id}.ogg"
        out_audio = os.path.join(den_out_dir, chunk_id + '.ogg')
        sf.write(out_audio, den, sr, format='OGG', subtype='VORBIS')
        den_manifest.append({
            'chunk_id':     chunk_id,
            'audio_path':   rel_audio_path,
            'primary_label': primary_label
        })

        # ── COMPUTE + WRITE MEL ──────────────────────────────────────
        mel = calculate_mel_spectrogram(den, sr)
        rel_mel_path = f"/{primary_label}/{chunk_id}.npz"
        out_mel = os.path.join(mel_out_dir, chunk_id + '.npz')
        np.savez_compressed(out_mel,
                            mel=mel.astype(np.float16),
                            primary_label=primary_label)
        mel_manifest.append({
            'chunk_id':      chunk_id,
            'mel_path':      rel_mel_path,
            'primary_label': primary_label
        })

    return True

In [None]:
# Fan the per-file work out over a thread pool. process_record writes its own
# files and manifest rows, so the results are only drained here to advance the
# progress bar.
records = meta[['filename', 'primary_label']].to_dict('records')
with ThreadPool(os.cpu_count()) as pool:
    finished = pool.imap_unordered(process_record, records)
    for _ in tqdm(finished, total=len(records), desc="Denoise & MEL by species"):
        pass

Denoise & MEL by species:   0%|          | 0/28564 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
# Persist the collected manifest rows as manifest.csv next to each feature set.
for manifest_root, manifest_rows in ((DEN_DIR, den_manifest), (MEL_DIR, mel_manifest)):
    manifest_csv = os.path.join(manifest_root, 'manifest.csv')
    pd.DataFrame(manifest_rows).to_csv(manifest_csv, index=False)

In [None]:
device      = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
panns_model = AudioTagging(checkpoint_path=None, device=device)
panns_model.model.eval()

# 1) Gather all denoised .ogg chunk paths under DEN_DIR
#    (anything else in the tree, e.g. manifest.csv, is skipped)
den_paths = []
for root, _, files in os.walk(DEN_DIR):
    for fname in files:
        if fname.endswith('.ogg'):
            den_paths.append(os.path.join(root, fname))
den_paths.sort()  # deterministic processing order

BATCH_SIZE  = 512
TARGET_SR = PANNS_SR
emb_manifest = []
for i in tqdm(range(0, len(den_paths), BATCH_SIZE), desc="Phase 2: embed"):
    batch = den_paths[i : i + BATCH_SIZE]
    # Use a batch-local name for the labels so the notebook-level `labels`
    # (the sorted class list) is not overwritten by this loop.
    waves, rels, batch_labels = [], [], []

    # Load & optionally resample
    for full_path in batch:
        y, sr = sf.read(full_path, dtype='float32')
        if sr != TARGET_SR:
            y = librosa.resample(y, orig_sr=sr, target_sr=TARGET_SR)
        waves.append(y)

        rel = os.path.relpath(full_path, DEN_DIR)  # e.g. "smbani/XC461360_chk0.ogg"
        rels.append(rel)
        batch_labels.append(rel.split(os.sep)[0])  # primary_label = first path component

    # Stack into one [B, samples] NumPy batch; every clip must have the same length.
    waves_np = np.stack(waves)

    # PANNs inference. The panns_inference README calls inference() with a
    # NumPy batch, so the array is passed as-is instead of being turned into a
    # tensor and moved to `device` here.
    # NOTE(review): inference() is presumed to do the device transfer itself — confirm.
    with torch.no_grad():
        _, emb_out = panns_model.inference(waves_np)

    # Convert to NumPy array if needed
    if isinstance(emb_out, torch.Tensor):
        embs_np = emb_out.cpu().numpy()
    else:
        embs_np = emb_out  # already a NumPy array

    # Save embeddings + record manifest
    for rel, emb_arr, lbl in zip(rels, embs_np, batch_labels):
        subdir, fname = os.path.split(rel)
        chunk_id      = os.path.splitext(fname)[0]
        emb_name      = f"{chunk_id}_emb.npz"
        out_dir       = os.path.join(EMB_DIR, subdir)
        os.makedirs(out_dir, exist_ok=True)
        out_path      = os.path.join(out_dir, emb_name)

        np.savez_compressed(
            out_path,
            embedding=emb_arr.astype(np.float32),
            primary_label=lbl
        )

        emb_manifest.append({
            'chunk_id':      chunk_id,
            'emb_path':      f"/{subdir}/{emb_name}",
            'primary_label': lbl
        })

pd.DataFrame(emb_manifest).to_csv(os.path.join(EMB_DIR, 'manifest.csv'), index=False)


Checkpoint path: /home/jovyan/panns_data/Cnn14_mAP=0.431.pth
Using CPU.


Phase 2: embed:   0%|          | 0/6 [00:00<?, ?it/s]

In [13]:
import shutil

# Bundle everything under /home/jovyan/Features into /home/jovyan/features.zip;
# the call is left as the cell's last expression so the archive path is displayed.
shutil.make_archive(
    base_name='/home/jovyan/features',
    format='zip',
    root_dir='/home/jovyan/Features',
)

'/home/jovyan/features.zip'