In [1]:
import os
import math
import numpy as np
import pandas as pd
import soundfile as sf
import librosa
import noisereduce as nr
import torch
from multiprocessing.dummy import Pool as ThreadPool
from tqdm.auto import tqdm
from scipy.signal import butter, filtfilt
from panns_inference import AudioTagging

DATA_ROOT        = '/home/jovyan/Data/birdclef-2025'
AUDIO_DIR        = os.path.join(DATA_ROOT, 'train_audio')
CSV_PATH         = os.path.join(DATA_ROOT, 'train.csv')

DEN_DIR          = '/home/jovyan/Features/denoised'
MEL_DIR          = '/home/jovyan/Features/mel'
EMB_DIR          = '/home/jovyan/Features/embeddings'
MEL_AUG_DIR      = '/home/jovyan/Features/mel_aug'

for d in (DEN_DIR, MEL_DIR, EMB_DIR, MEL_AUG_DIR):
    os.makedirs(d, exist_ok=True)

PANNS_SR         = 32000
CHUNK_SEC        = 10
CHUNK_SAMPLES    = PANNS_SR * CHUNK_SEC

N_FFT            = 2048
HOP_LENGTH       = 1024
N_MELS           = 64

SAMPLE_FRAC      = 1.0

# PANNs & augmentation params
BIRD_CLASS_IDXS  = [
    14,22,27,28,33,34,35,37,40,
    72,73,80,84,
    *range(98,107),108,
    *range(111,122),
    *range(126,133),
    137,361,442,503
]
THRESH           = 0.5

# band‑pass augmentation constants
LOWCUT           = 2000     # Hz
HIGHCUT          = 8000     # Hz
BAND_ALPHA       = 2.0      # boost factor
WINDOW_SEC       = 1.0

PROP_DECREASE    = 0.9
STATIONARY_NOISE = False

In [2]:
def calculate_mel_spectrogram(wave, sr=PANNS_SR):
    m = librosa.feature.melspectrogram(
        y=wave, sr=sr,
        n_fft=N_FFT, hop_length=HOP_LENGTH,
        n_mels=N_MELS
    )
    return librosa.power_to_db(m, ref=np.max)

def butter_bandpass(lowcut, highcut, fs, order=4):
    nyq = 0.5 * fs
    low = lowcut / nyq
    high = highcut / nyq
    b, a = butter(order, [low, high], btype='band')
    return b, a

def augment_waveform_with_filter(wave, mask, sr,
                                 lowcut=LOWCUT, highcut=HIGHCUT,
                                 alpha=BAND_ALPHA):
    """
    For each 1s window where mask==True, add a boosted
    band‑pass filtered copy of wave in [lowcut,highcut].
    """
    b, a = butter_bandpass(lowcut, highcut, sr, order=4)
    wave_band = filtfilt(b, a, wave)
    wave_aug  = wave.copy()
    win_len   = int(WINDOW_SEC * sr)
    for i, m in enumerate(mask):
        if m:
            start = i * win_len
            end   = min((i+1) * win_len, len(wave))
            wave_aug[start:end] = wave[start:end] + alpha * wave_band[start:end]
    peak = np.max(np.abs(wave_aug))
    if peak > 1.0:
        wave_aug /= peak
    return wave_aug

def get_per_second_embeddings_and_mask(wave, model, device):
    """
    Returns per-second embeddings and a bird_presence mask.
    """
    win_len   = PANNS_SR
    n_windows = math.ceil(len(wave) / win_len)
    embs, mask = [], []
    for w in range(n_windows):
        seg = wave[w*win_len:(w+1)*win_len]
        if len(seg) < win_len:
            seg = np.pad(seg, (0, win_len-len(seg)), mode='constant')
        inp = torch.from_numpy(seg).unsqueeze(0).to(device)
        with torch.no_grad():
            clipwise, emb = model.inference(inp)
        probs  = clipwise.squeeze(0).cpu().numpy() if torch.is_tensor(clipwise) else np.squeeze(clipwise,0)
        emb_np = emb.squeeze(0).cpu().numpy()      if torch.is_tensor(emb)      else np.squeeze(emb,0)
        embs.append(emb_np)
        score = probs[BIRD_CLASS_IDXS].max()
        present = score > THRESH or int(np.argmax(probs)) in BIRD_CLASS_IDXS
        mask.append(present)
    return np.stack(embs), np.array(mask, dtype=bool)

In [4]:
# ─── READ & SAMPLE METADATA ───────────────────────────────────────────────────
meta = pd.read_csv(CSV_PATH)
sampled = []
for label, grp in meta.groupby('primary_label'):
    n = max(1, int(len(grp) * SAMPLE_FRAC))
    sampled.append(grp.sample(n=n, random_state=42))
meta = pd.concat(sampled, ignore_index=True)

# ─── CREATE SUBFOLDERS ────────────────────────────────────────────────────────
subdirs = {os.path.dirname(f) for f in meta['filename']}
for sub in subdirs:
    if sub:
        for base in (DEN_DIR, MEL_DIR, EMB_DIR, MEL_AUG_DIR):
            os.makedirs(os.path.join(base, sub), exist_ok=True)

# ─── PHASE 1: DENOISE & MEL ────────────────────────────────────────────────────
den_manifest = []
mel_manifest = []

In [None]:
def process_phase1(record):
    fname = record['filename']; label = record['primary_label']
    src_fp = os.path.join(AUDIO_DIR, fname)
    y, sr = sf.read(src_fp, dtype='float32')
    if y.ndim>1: y=y.mean(1)
    if sr!=PANNS_SR:
        y = librosa.resample(y, orig_sr=sr, target_sr=PANNS_SR); sr=PANNS_SR

    base = os.path.splitext(os.path.basename(fname))[0]
    n_chunks = math.ceil(len(y)/CHUNK_SAMPLES)
    for ci in range(n_chunks):
        seg = y[ci*CHUNK_SAMPLES:(ci+1)*CHUNK_SAMPLES]
        if len(seg)<CHUNK_SAMPLES:
            seg = np.pad(seg,(0,CHUNK_SAMPLES-len(seg)),'constant')
        # denoise
        den = nr.reduce_noise(y=seg, sr=sr, stationary=False, prop_decrease=PROP_DECREASE)
        den /= (np.max(np.abs(den))+1e-9)

        chunk_id = f"{base}_chk{ci}"
        # save denoised audio
        rel_audio = f"/{label}/{chunk_id}.ogg"
        sf.write(os.path.join(DEN_DIR,label,chunk_id+'.ogg'), den, sr, format='OGG', subtype='VORBIS')
        den_manifest.append({'chunk_id':chunk_id,'audio_path':rel_audio,'primary_label':label})
        # save mel
        mel = calculate_mel_spectrogram(den, sr)
        rel_mel = f"/{label}/{chunk_id}.npz"
        np.savez_compressed(os.path.join(MEL_DIR,label,chunk_id+'.npz'),
                            mel=mel.astype(np.float16), primary_label=label)
        mel_manifest.append({'chunk_id':chunk_id,'mel_path':rel_mel,'primary_label':label})
    return True

In [None]:
records = meta[['filename','primary_label']].to_dict('records')
with ThreadPool(os.cpu_count()) as pool:
    list(tqdm(pool.imap_unordered(process_phase1, records),
              total=len(records), desc="Phase 1: denoise & mel"))


Denoise & MEL by species:   0%|          | 0/28564 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
pd.DataFrame(den_manifest).to_csv(os.path.join(DEN_DIR,'manifest.csv'), index=False)
pd.DataFrame(mel_manifest).to_csv(os.path.join(MEL_DIR,'manifest.csv'), index=False)

In [5]:
device      = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
panns_model = AudioTagging(checkpoint_path=None, device=device)
panns_model.model.eval()

emb_manifest     = []
mel_aug_manifest = []

# Gather denoised audio paths
den_paths = []
for root, _, files in os.walk(DEN_DIR):
    for fname in files:
        if fname.endswith('.ogg'):
            den_paths.append(os.path.join(root, fname))
den_paths.sort()

Checkpoint path: /home/jovyan/panns_data/Cnn14_mAP=0.431.pth
Using CPU.


In [6]:
def process_phase2(full_path):
    # 1) load denoised audio
    y, sr = sf.read(full_path, dtype='float32')
    if sr != PANNS_SR:
        y = librosa.resample(y, orig_sr=sr, target_sr=PANNS_SR)
        sr = PANNS_SR

    # 2) per‑second embeddings & mask
    embs, mask = get_per_second_embeddings_and_mask(y, panns_model, device)

    # 3) save embeddings
    rel = os.path.relpath(full_path, DEN_DIR)
    subdir, fname = os.path.split(rel)
    chunk_id = os.path.splitext(fname)[0]
    out_emb_dir = os.path.join(EMB_DIR, subdir)
    os.makedirs(out_emb_dir, exist_ok=True)
    emb_path = os.path.join(out_emb_dir, f"{chunk_id}_emb.npz")
    np.savez_compressed(emb_path,
                        embedding=embs.astype(np.float32),
                        mask=mask,
                        primary_label=subdir)
    emb_manifest.append({
        'chunk_id':      chunk_id,
        'emb_path':      f"/{subdir}/{chunk_id}_emb.npz",
        'primary_label': subdir
    })

    # 4) filter‑based augmentation
    y_aug = augment_waveform_with_filter(y, mask, sr)

    # 5) re‑denoise
    y_aug_dn = nr.reduce_noise(
        y=y_aug, sr=sr,
        stationary=STATIONARY_NOISE,
        prop_decrease=PROP_DECREASE
    )

    # 6) compute augmented MEL
    mel_aug = calculate_mel_spectrogram(y_aug_dn, sr)

    # 7) save augmented MEL
    out_mel_dir = os.path.join(MEL_AUG_DIR, subdir)
    os.makedirs(out_mel_dir, exist_ok=True)
    mel_aug_path = os.path.join(out_mel_dir, f"{chunk_id}.npz")
    np.savez_compressed(mel_aug_path,
                        mel=mel_aug.astype(np.float16),
                        primary_label=subdir)
    mel_aug_manifest.append({
        'chunk_id':      chunk_id,
        'mel_aug_path':  f"/{subdir}/{chunk_id}.npz",
        'primary_label': subdir
    })

    return True

In [None]:
with ThreadPool(os.cpu_count()) as pool:
    list(tqdm(pool.imap_unordered(process_phase2, den_paths),
              total=len(den_paths),
              desc="Phase 2: embed & augment"))

# ─── Write manifests ─────────────────────────────────────────────────────────
pd.DataFrame(emb_manifest).to_csv(
    os.path.join(EMB_DIR, 'manifest.csv'),
    index=False
)
pd.DataFrame(mel_aug_manifest).to_csv(
    os.path.join(MEL_AUG_DIR, 'manifest.csv'),
    index=False
)

Phase 2: embed & augment:   0%|          | 0/115357 [00:00<?, ?it/s]

In [13]:
import shutil

# will create /home/jovyan/features.zip containing everything under /home/jovyan/Features
shutil.make_archive('/home/jovyan/features', 'zip', '/home/jovyan/Features')

'/home/jovyan/features.zip'

In [11]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split

# ── CONFIGURE PATHS ───────────────────────────────────────────────────────────
BASE_FEAT    = '/home/jovyan/Features'
DEN_MANIFEST = os.path.join(BASE_FEAT, 'denoised',   'manifest.csv')
MEL_MANIFEST = os.path.join(BASE_FEAT, 'mel',        'manifest.csv')
EMB_MANIFEST = os.path.join(BASE_FEAT, 'embeddings', 'manifest.csv')

OUT_DIR      = BASE_FEAT  # where we'll write manifest_train/test/reserve.csv

# ── 1) LOAD & MERGE ALL FEATURE MANIFESTS ─────────────────────────────────────
den_df = pd.read_csv(DEN_MANIFEST)[['chunk_id','audio_path','primary_label']]
mel_df = pd.read_csv(MEL_MANIFEST)[['chunk_id','mel_path']]
emb_df = pd.read_csv(EMB_MANIFEST)[['chunk_id','emb_path']]

df = (
    den_df
    .merge(mel_df, on='chunk_id')
    .merge(emb_df, on='chunk_id')
)

In [12]:
df['file_id'] = df['chunk_id'].str.rsplit('_chk', n=1).str[0]

# ── 3) BUILD file‑level DataFrame for stratified splitting ─────────────────────
files_df = df[['file_id','primary_label']].drop_duplicates().reset_index(drop=True)
file_ids = files_df['file_id']
file_labels = files_df['primary_label']

In [13]:
train_files, temp_files, train_lbls, temp_lbls = train_test_split(
    file_ids, file_labels,
    train_size=0.6,
    stratify=file_labels,
    random_state=42
)

# From temp (40%), 25%→test (10% total), 75%→reserve (30% total)
try:
    test_files, reserve_files, _, _ = train_test_split(
        temp_files, temp_lbls,
        train_size=0.25,
        stratify=temp_lbls,
        random_state=42
    )
except ValueError:
    # fallback if some file_ids have only one sample
    test_files, reserve_files, _, _ = train_test_split(
        temp_files, temp_lbls,
        train_size=0.25,
        shuffle=True,
        random_state=42
    )

split_map = {}
for fid in train_files:   split_map[fid] = 'train'
for fid in test_files:    split_map[fid] = 'test'
for fid in reserve_files: split_map[fid] = 'reserve'

In [14]:
df['split'] = df['file_id'].map(split_map)

# ── 6) WRITE OUT chunk‑level manifests for each split ─────────────────────────
for split in ['train','test','reserve']:
    out_df = df[df['split']==split].drop(columns=['file_id','split'])
    out_fp = os.path.join(OUT_DIR, f'manifest_{split}.csv')
    out_df.to_csv(out_fp, index=False)
    print(f"Wrote {len(out_df)} rows to {out_fp}")

Wrote 69676 rows to /home/jovyan/Features/manifest_train.csv
Wrote 11474 rows to /home/jovyan/Features/manifest_test.csv
Wrote 34603 rows to /home/jovyan/Features/manifest_reserve.csv
