In [22]:
import os, math, numpy as np, pandas as pd, soundfile as sf, librosa, noisereduce as nr
import torch
import multiprocessing as mp
from multiprocessing import cpu_count
from multiprocessing.dummy import Pool as ThreadPool
from tqdm.auto import tqdm
from tqdm.auto import tqdm
from panns_inference import AudioTagging

# Select the 'spawn' start method for multiprocessing (needed for CUDA-safe workers).
# force=True allows this cell to be re-run after a start method was already chosen.
mp.set_start_method(method='spawn', force=True)

# ─────── Configuration: sampling, chunking, paths & output folders ───────
import os
import pandas as pd

# Sampling fraction (e.g. 0.1 → 10% of files)
SAMPLE_FRAC   = 0.1

# Audio is resampled to PANNS_SR and cut into CHUNK_SEC-second chunks.
PANNS_SR      = 32000       # target SR
CHUNK_SEC     = 10          # seconds per chunk
CHUNK_SAMPLES = CHUNK_SEC * PANNS_SR

# Input locations
DATA_ROOT     = '/home/jovyan/Data/birdclef-2025'
CSV_PATH      = os.path.join(DATA_ROOT, 'train.csv')
AUDIO_DIR     = os.path.join(DATA_ROOT, 'train_audio')

# Output locations, created up front so later cells can write into them
DEN_DIR       = '/home/jovyan/Features/denoised'
EMB_DIR       = '/home/jovyan/Features/embeddings'
for _out_dir in (DEN_DIR, EMB_DIR):
    os.makedirs(_out_dir, exist_ok=True)

In [23]:
# ─── Read & Sample Metadata ───
meta_full = pd.read_csv(CSV_PATH)

# Build the label→index map from the FULL metadata, before sampling, so that a
# species keeps the same index whatever SAMPLE_FRAC / random_state is used.
# (A map built from the sample would renumber classes whenever the sample
# changes, making label values saved by different runs incompatible.)
labels    = sorted(meta_full['primary_label'].unique())
label2idx = {l: i for i, l in enumerate(labels)}

# Same rows as sampling in place: identical frame, fraction and seed.
meta = meta_full.sample(frac=SAMPLE_FRAC, random_state=42).reset_index(drop=True)
meta['label_idx'] = meta['primary_label'].map(label2idx)

# ─── Create matching subfolders in DEN_DIR & EMB_DIR ───
# A filename without a folder part has an empty dirname; it belongs to the base
# dirs (which already exist), so it is left out of the set and of the count.
subdirs = {d for d in map(os.path.dirname, meta['filename']) if d}
for sub in sorted(subdirs):
    os.makedirs(os.path.join(DEN_DIR, sub), exist_ok=True)
    os.makedirs(os.path.join(EMB_DIR, sub), exist_ok=True)

print(f"Prepared base dirs and {len(subdirs)} nested subfolders under DEN_DIR and EMB_DIR.")
print(f"Processing {len(meta)} files ({SAMPLE_FRAC*100:.0f}% of dataset)")

Prepared base dirs and 178 nested subfolders under DEN_DIR and EMB_DIR.
Processing 2856 files (10% of dataset)


In [24]:
def process_denoise(row):
    """Load one recording, resample it, and save denoised fixed-length chunks.

    Parameters
    ----------
    row : mapping
        Must provide 'filename' (audio path relative to AUDIO_DIR) and
        'label_idx' (value convertible to int).

    Returns
    -------
    bool
        Always True once every chunk of the recording has been written.

    Notes
    -----
    Chunk ``i`` is written to ``DEN_DIR/<filename without extension>_chk<i>.npz``
    with two entries: 'waveform' (float32, CHUNK_SAMPLES long) and 'label'.
    The last chunk is zero-padded to CHUNK_SAMPLES before denoising.
    A recording with zero samples produces no chunk files.
    """
    fname = row['filename']
    lbl   = int(row['label_idx'])
    stem  = os.path.splitext(fname)[0]

    # Make sure the destination folder exists so the function does not depend
    # on another cell having created it; exist_ok tolerates concurrent callers.
    os.makedirs(os.path.dirname(os.path.join(DEN_DIR, stem)), exist_ok=True)

    # 1) load + mono (multi-channel input is averaged over axis 1)
    y, sr = sf.read(os.path.join(AUDIO_DIR, fname), dtype='float32')
    if y.ndim > 1:
        y = y.mean(axis=1)

    # 2) resample to the rate used for all chunks
    y32 = librosa.resample(y, orig_sr=sr, target_sr=PANNS_SR)

    # 3) split into CHUNK_SAMPLES-long chunks
    n_chunks = math.ceil(len(y32) / CHUNK_SAMPLES)
    for ci in range(n_chunks):
        seg = y32[ci * CHUNK_SAMPLES:(ci + 1) * CHUNK_SAMPLES]
        if len(seg) < CHUNK_SAMPLES:
            # only the final chunk can be short; pad it with trailing zeros
            seg = np.pad(seg, (0, CHUNK_SAMPLES - len(seg)), mode='constant')

        # 4) denoise
        den = nr.reduce_noise(y=seg, sr=PANNS_SR,
                              prop_decrease=0.9,
                              stationary=False)

        # 5) save
        outpath = os.path.join(DEN_DIR, f'{stem}_chk{ci}.npz')
        np.savez_compressed(outpath,
                            waveform=den.astype(np.float32),
                            label=lbl)
    return True



In [18]:
# Phase 1: denoise every sampled recording on a pool of threads (one per CPU core).
records   = meta.to_dict('records')
n_workers = mp.cpu_count()

with ThreadPool(n_workers) as pool:
    finished = pool.imap_unordered(process_denoise, records)
    progress = tqdm(finished, total=len(records), desc="Phase 1 (threads)")
    # Results are not needed; iterating just drives the work and the progress bar.
    for _done in progress:
        continue

Phase 1 (threads):   0%|          | 0/2856 [00:00<?, ?it/s]

In [25]:
# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

panns_model = AudioTagging(checkpoint_path=None, device=device)
panns_model.model.eval()

# 1) Collect every .npz file found anywhere under DEN_DIR, in sorted order
den_paths = sorted(
    os.path.join(dirpath, name)
    for dirpath, _subdirs, names in os.walk(DEN_DIR)
    for name in names
    if name.endswith('.npz')
)

# Number of chunks sent through the model per forward pass.
# NOTE(review): each chunk is CHUNK_SAMPLES float32 values, so a batch this
# large is memory-hungry — lower it if inference runs out of memory.
BATCH_SIZE = 1024

for i in tqdm(range(0, len(den_paths), BATCH_SIZE), desc="Phase 2: embed"):
    batch_paths = den_paths[i:i+BATCH_SIZE]
    # Named `batch_labels` rather than `labels` so the module-level list of
    # label names built when the metadata was loaded is not overwritten.
    waves, batch_labels, rels = [], [], []

    # 1) Load waveforms & labels
    for full_path in batch_paths:
        # np.load on an .npz returns a lazy archive object that keeps the file
        # open; the context manager closes it once both arrays have been read.
        with np.load(full_path) as data:
            waves.append(data['waveform'])
            batch_labels.append(int(data['label']))
        # record relative path to mirror subfolders
        rels.append(os.path.relpath(full_path, DEN_DIR))

    # 2) Stack and send to the selected device
    waves_t = torch.from_numpy(np.stack(waves)).to(device)  # [B, CHUNK_SAMPLES]

    # 3) Run inference (the second returned value is kept as the embedding)
    with torch.no_grad():
        _, emb_out = panns_model.inference(waves_t)

    # 4) Convert to NumPy
    if isinstance(emb_out, torch.Tensor):
        embs_np = emb_out.cpu().numpy()
    elif isinstance(emb_out, np.ndarray):
        embs_np = emb_out
    else:
        raise TypeError(f"Unexpected embedding type: {type(emb_out)}")

    # 5) Save each embedding + label, mirroring the DEN_DIR layout under EMB_DIR
    for rel, emb_arr, lbl in zip(rels, embs_np, batch_labels):
        subdir, fname = os.path.split(rel)         # ('smbani', 'XC461360_chk0.npz')
        base          = os.path.splitext(fname)[0] + '_emb.npz'
        out_dir       = os.path.join(EMB_DIR, subdir)
        os.makedirs(out_dir, exist_ok=True)
        out_path      = os.path.join(out_dir, base)
        np.savez_compressed(out_path,
                            embedding=emb_arr.astype(np.float32),
                            label=lbl)


Checkpoint path: /home/jovyan/panns_data/Cnn14_mAP=0.431.pth


  checkpoint = torch.load(checkpoint_path, map_location=self.device)


Using CPU.


Phase 2: embed:   0%|          | 0/11 [00:00<?, ?it/s]

In [26]:
# Sanity check: every denoised chunk should have exactly one embedding file.
print("Denoised chunks:", len(den_paths))

emb_count = 0
for _root, _dirs, files in os.walk(EMB_DIR):
    # booleans sum as 0/1, so this adds the number of matching files in this folder
    emb_count += sum(f.endswith('_emb.npz') for f in files)

print("Embedded chunks:", emb_count)

Denoised chunks: 11236
Embedded chunks: 11236


In [27]:
import shutil

# Bundle everything under /home/jovyan/Features into /home/jovyan/features.zip.
archive_base  = '/home/jovyan/features'    # make_archive appends the '.zip' suffix
features_root = '/home/jovyan/Features'
# Left as the last expression so the notebook displays the archive path.
shutil.make_archive(archive_base, 'zip', features_root)

'/home/jovyan/features.zip'