In [7]:
import os
import math
import numpy as np
import pandas as pd
import soundfile as sf
import librosa
import noisereduce as nr
import torch
from multiprocessing.dummy import Pool as ThreadPool
from tqdm.auto import tqdm
from scipy.signal import butter, filtfilt
from panns_inference import AudioTagging

# ── Input data ────────────────────────────────────────────────────────────────
DATA_ROOT = '/home/jovyan/Data/birdclef-2025'
AUDIO_DIR = os.path.join(DATA_ROOT, 'train_audio')
CSV_PATH = os.path.join(DATA_ROOT, 'train.csv')

# ── Output feature folders (created below if they do not exist yet) ──────────
DEN_DIR = '/home/jovyan/Features/denoised'
MEL_DIR = '/home/jovyan/Features/mel'
EMB_DIR = '/home/jovyan/Features/embeddings'
MEL_AUG_DIR = '/home/jovyan/Features/mel_aug'

for d in (DEN_DIR, MEL_DIR, EMB_DIR, MEL_AUG_DIR):
    os.makedirs(d, exist_ok=True)

# ── Audio chunking ────────────────────────────────────────────────────────────
PANNS_SR = 32000                          # working sample rate (Hz)
CHUNK_SEC = 10                            # length of one saved chunk (s)
CHUNK_SAMPLES = PANNS_SR * CHUNK_SEC      # samples per chunk

# ── Mel spectrogram ───────────────────────────────────────────────────────────
N_FFT = 2048
HOP_LENGTH = 1024
N_MELS = 64

# Fraction of each species' recordings kept when sampling the metadata
SAMPLE_FRAC = 1.0

# ── PANNs bird-presence detection ─────────────────────────────────────────────
# Indices into the model's clipwise output that count as "bird" classes.
BIRD_CLASS_IDXS = (
    [14, 22, 27, 28, 33, 34, 35, 37, 40, 72, 73, 80, 84]
    + list(range(98, 107)) + [108]
    + list(range(111, 122))
    + list(range(126, 133))
    + [137, 361, 442, 503]
)
THRESH = 0.5                              # probability above which a window is flagged

# ── Band-pass augmentation ────────────────────────────────────────────────────
LOWCUT = 2000                             # Hz
HIGHCUT = 8000                            # Hz
BAND_ALPHA = 2.0                          # boost factor for the filtered copy
WINDOW_SEC = 1.0                          # length of one mask window (s)
WIN_SAMPLES = int(WINDOW_SEC * PANNS_SR)  # samples per mask window

# ── Noise reduction ───────────────────────────────────────────────────────────
PROP_DECREASE = 0.9
STATIONARY_NOISE = False

# ── Inference ─────────────────────────────────────────────────────────────────
CHUNKS_PER_BATCH = 64                     # denoised files loaded per inference batch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [2]:
def calculate_mel_spectrogram(wave, sr=PANNS_SR):
    """
    Compute the mel spectrogram of `wave` in decibels.

    Uses the notebook-wide N_FFT, HOP_LENGTH and N_MELS settings; the dB
    conversion is referenced to the spectrogram's own maximum (ref=np.max).
    """
    mel_params = dict(n_fft=N_FFT, hop_length=HOP_LENGTH, n_mels=N_MELS)
    power_mel = librosa.feature.melspectrogram(y=wave, sr=sr, **mel_params)
    mel_db = librosa.power_to_db(power_mel, ref=np.max)
    return mel_db

def butter_bandpass(lowcut, highcut, fs, order=4):
    """
    Design a Butterworth band-pass filter.

    `lowcut` and `highcut` are given in the same unit as the sampling rate
    `fs`; they are normalised by the Nyquist frequency (fs / 2) before being
    handed to `scipy.signal.butter`.

    Returns the (b, a) transfer-function coefficients.
    """
    nyq = 0.5 * fs
    band_edges = [lowcut / nyq, highcut / nyq]
    numerator, denominator = butter(order, band_edges, btype='band')
    return numerator, denominator

def augment_waveform_with_filter(wave, mask, sr,
                                 lowcut=LOWCUT, highcut=HIGHCUT,
                                 alpha=BAND_ALPHA):
    """
    Boost the [lowcut, highcut] band inside the windows flagged by `mask`.

    For each WINDOW_SEC-long window i where mask[i] is truthy, the samples of
    that window are replaced by `wave + alpha * bandpassed(wave)`.  The result
    is rescaled to a peak of 1.0 if the boost pushed it above 1.0.

    Parameters
    ----------
    wave : 1-D numpy array of samples (not modified; a copy is returned).
    mask : sequence of booleans, one per window.
    sr : sampling rate of `wave` in Hz.
    lowcut, highcut : band edges in Hz.
    alpha : gain applied to the band-passed copy before it is added.

    Returns
    -------
    numpy array with the same shape and dtype as `wave`.
    """
    wave_aug = wave.copy()
    if wave_aug.size == 0:
        # Nothing to filter or normalise; np.max would fail on an empty array.
        return wave_aug

    flagged = np.flatnonzero(np.asarray(mask, dtype=bool))
    if flagged.size:
        # The zero-phase filter over the whole signal is the expensive step,
        # so only run it when at least one window actually needs the boost.
        b, a = butter_bandpass(lowcut, highcut, sr, order=4)
        wave_band = filtfilt(b, a, wave)
        win_len = int(WINDOW_SEC * sr)
        for i in flagged:
            start = int(i) * win_len
            if start >= len(wave):
                # flagged indices are ascending: later windows are out of range too
                break
            end = min(start + win_len, len(wave))
            wave_aug[start:end] = wave[start:end] + alpha * wave_band[start:end]

    peak = np.max(np.abs(wave_aug))
    if peak > 1.0:
        wave_aug /= peak
    return wave_aug

def get_per_second_embeddings_and_mask(wave, model, device):
    """
    Run `model.inference` on consecutive PANNS_SR-sample windows of `wave`.

    The final window is zero-padded to full length.  A window is flagged as
    containing a bird when its highest probability among BIRD_CLASS_IDXS
    exceeds THRESH, or when its top-scoring class is one of BIRD_CLASS_IDXS.

    Returns a tuple (embeddings stacked along axis 0, boolean mask with one
    entry per window).
    """
    def _drop_batch_axis(out):
        # The model output may be a tensor or an ndarray; handle both.
        if torch.is_tensor(out):
            return out.squeeze(0).cpu().numpy()
        return np.squeeze(out, 0)

    window = PANNS_SR
    embeddings, bird_flags = [], []
    for start in range(0, len(wave), window):
        chunk = wave[start:start + window]
        shortfall = window - len(chunk)
        if shortfall > 0:
            chunk = np.pad(chunk, (0, shortfall), mode='constant')

        batch = torch.from_numpy(chunk).unsqueeze(0).to(device)
        with torch.no_grad():
            clipwise, emb = model.inference(batch)

        probs = _drop_batch_axis(clipwise)
        embeddings.append(_drop_batch_axis(emb))

        if probs[BIRD_CLASS_IDXS].max() > THRESH:
            bird_flags.append(True)
        else:
            bird_flags.append(int(np.argmax(probs)) in BIRD_CLASS_IDXS)

    return np.stack(embeddings), np.array(bird_flags, dtype=bool)

In [4]:
# ─── READ & SAMPLE METADATA ───────────────────────────────────────────────────
# Keep SAMPLE_FRAC of each species' rows (always at least one row per species).
meta = pd.read_csv(CSV_PATH)
sampled = [
    grp.sample(n=max(1, int(len(grp) * SAMPLE_FRAC)), random_state=42)
    for _, grp in meta.groupby('primary_label')
]
meta = pd.concat(sampled, ignore_index=True)

# ─── CREATE SUBFOLDERS ────────────────────────────────────────────────────────
# Mirror every non-empty directory component of `filename` under each output root.
subdirs = {os.path.dirname(f) for f in meta['filename']}
for sub in filter(None, subdirs):
    for base in (DEN_DIR, MEL_DIR, EMB_DIR, MEL_AUG_DIR):
        os.makedirs(os.path.join(base, sub), exist_ok=True)

# ─── PHASE 1: DENOISE & MEL ────────────────────────────────────────────────────
# Filled by process_phase1 (one row per saved chunk).
den_manifest, mel_manifest = [], []

In [None]:
def process_phase1(record):
    """
    Denoise one recording in CHUNK_SEC-second chunks and save audio + mel.

    `record` is a dict with keys 'filename' (path relative to AUDIO_DIR) and
    'primary_label'.  The recording is down-mixed to mono, resampled to
    PANNS_SR if necessary, and split into CHUNK_SAMPLES-long chunks (the last
    one zero-padded).  Each chunk is noise-reduced, peak-normalised and
    written to DEN_DIR/<label>/<chunk_id>.ogg; its dB mel spectrogram goes to
    MEL_DIR/<label>/<chunk_id>.npz.  One row per chunk is appended to the
    global `den_manifest` and `mel_manifest` lists.

    Returns True once every chunk has been written.
    """
    fname = record['filename']
    label = record['primary_label']

    y, sr = sf.read(os.path.join(AUDIO_DIR, fname), dtype='float32')
    if y.ndim > 1:
        # average the channels down to mono
        y = y.mean(1)
    if sr != PANNS_SR:
        y = librosa.resample(y, orig_sr=sr, target_sr=PANNS_SR)
        sr = PANNS_SR

    base = os.path.splitext(os.path.basename(fname))[0]
    n_chunks = math.ceil(len(y) / CHUNK_SAMPLES)
    for ci in range(n_chunks):
        seg = y[ci * CHUNK_SAMPLES:(ci + 1) * CHUNK_SAMPLES]
        if len(seg) < CHUNK_SAMPLES:
            seg = np.pad(seg, (0, CHUNK_SAMPLES - len(seg)), 'constant')

        # Denoise with the same notebook-wide settings that process_part2 uses
        # (previously `stationary` was hard-coded here).
        den = nr.reduce_noise(y=seg, sr=sr,
                              stationary=STATIONARY_NOISE,
                              prop_decrease=PROP_DECREASE)
        # peak-normalise; the epsilon guards against an all-zero chunk
        den /= (np.max(np.abs(den)) + 1e-9)

        chunk_id = f"{base}_chk{ci}"

        # save denoised audio
        rel_audio = f"/{label}/{chunk_id}.ogg"
        sf.write(os.path.join(DEN_DIR, label, chunk_id + '.ogg'),
                 den, sr, format='OGG', subtype='VORBIS')
        den_manifest.append({'chunk_id': chunk_id,
                             'audio_path': rel_audio,
                             'primary_label': label})

        # save mel (stored as float16)
        mel = calculate_mel_spectrogram(den, sr)
        rel_mel = f"/{label}/{chunk_id}.npz"
        np.savez_compressed(os.path.join(MEL_DIR, label, chunk_id + '.npz'),
                            mel=mel.astype(np.float16), primary_label=label)
        mel_manifest.append({'chunk_id': chunk_id,
                             'mel_path': rel_mel,
                             'primary_label': label})
    return True

In [None]:
records = meta[['filename','primary_label']].to_dict('records')

# Start from empty manifests: process_phase1 appends to these module-level
# lists, so re-running this cell (e.g. after an interrupted run) would
# otherwise leave duplicate rows that multiply in the later manifest merge.
den_manifest, mel_manifest = [], []

# Threads (not processes) so that the workers can append to the shared lists.
with ThreadPool(os.cpu_count()) as pool:
    list(tqdm(pool.imap_unordered(process_phase1, records),
              total=len(records), desc="Phase 1: denoise & mel"))


Denoise & MEL by species:   0%|          | 0/28564 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
# Persist the phase-1 manifests next to the features they describe.
for manifest_rows, target_dir in ((den_manifest, DEN_DIR), (mel_manifest, MEL_DIR)):
    manifest_csv = os.path.join(target_dir, 'manifest.csv')
    pd.DataFrame(manifest_rows).to_csv(manifest_csv, index=False)

In [10]:
# Load the PANNs tagger (default checkpoint) and put it in evaluation mode.
panns = AudioTagging(checkpoint_path=None, device=device)
panns.model.eval()

# Filled by the embedding loop and by process_part2 respectively.
emb_manifest, mel_aug_manifest = [], []

# Gather denoised audio paths
den_paths = []
for root, _dirs, files in os.walk(DEN_DIR):
    den_paths.extend(os.path.join(root, f) for f in files if f.endswith('.ogg'))
den_paths.sort()

Checkpoint path: /home/jovyan/panns_data/Cnn14_mAP=0.431.pth
Using CPU.


In [12]:
# Start from an empty manifest so that re-running this cell does not append a
# second copy of every row (duplicates would multiply in the later merge).
emb_manifest = []

for i in tqdm(range(0, len(den_paths), CHUNKS_PER_BATCH), desc="Phase1: multi‑chunk batches"):
    batch_files = den_paths[i:i+CHUNKS_PER_BATCH]
    all_segs, mapping = [], []

    # 1) Load each file and cut it into zero-padded WIN_SAMPLES-long windows
    for fp in batch_files:
        y, sr = sf.read(fp, dtype='float32')
        if sr != PANNS_SR:
            y = librosa.resample(y, orig_sr=sr, target_sr=PANNS_SR)
        for w in range(math.ceil(len(y) / WIN_SAMPLES)):
            seg = y[w*WIN_SAMPLES:(w+1)*WIN_SAMPLES]
            if len(seg) < WIN_SAMPLES:
                seg = np.pad(seg, (0, WIN_SAMPLES-len(seg)), mode='constant')
            all_segs.append(seg)
            mapping.append((fp, w))   # remember which file/window each row belongs to

    # 2) Batch-infer all windows of this batch at once
    segs_t = torch.from_numpy(np.stack(all_segs)).to(device)
    with torch.no_grad():
        clipwise_all, emb_all = panns.inference(segs_t)

    # the model output may be a tensor or an ndarray; normalise to ndarray
    probs = clipwise_all.cpu().numpy() if torch.is_tensor(clipwise_all) else clipwise_all
    embs = emb_all.cpu().numpy() if torch.is_tensor(emb_all) else emb_all

    # 3) Scatter the per-window rows back to their source files (window order kept)
    by_file = {}
    for (fp, _w), p_arr, e_arr in zip(mapping, probs, embs):
        slot = by_file.setdefault(fp, {'embs': [], 'probs': []})
        slot['embs'].append(e_arr)
        slot['probs'].append(p_arr)

    # 4) Save per-file embeddings + bird-presence mask
    for fp, data in by_file.items():
        embs_np  = np.stack(data['embs'])
        probs_np = np.stack(data['probs'])
        # A window is flagged when a bird class exceeds THRESH or is the top class.
        mask     = (probs_np[:, BIRD_CLASS_IDXS].max(axis=1) > THRESH) \
                   | np.isin(probs_np.argmax(axis=1), BIRD_CLASS_IDXS)

        # The sub-directory below DEN_DIR is reused as the primary label.
        rel      = os.path.relpath(fp, DEN_DIR)
        sub, fn  = os.path.split(rel)
        cid      = os.path.splitext(fn)[0]
        out_dir  = os.path.join(EMB_DIR, sub)
        os.makedirs(out_dir, exist_ok=True)
        out_path = os.path.join(out_dir, f"{cid}_emb.npz")

        np.savez_compressed(
            out_path,
            embedding=embs_np.astype(np.float32),
            mask=mask,
            primary_label=sub
        )

        emb_manifest.append({
            'chunk_id':      cid,
            'emb_path':      f"/{sub}/{cid}_emb.npz",
            'primary_label': sub
        })

pd.DataFrame(emb_manifest).to_csv(
    os.path.join(EMB_DIR, 'manifest.csv'),
    index=False
)

Phase1: multi‑chunk batches:   0%|          | 0/1803 [00:00<?, ?it/s]

In [14]:
# Map each denoised audio path to the bird-presence mask stored with its embedding.
EMB_SUFFIX = '_emb.npz'

mask_map = {}
for root, _dirs, files in os.walk(EMB_DIR):
    for f in files:
        if not f.endswith(EMB_SUFFIX):
            continue
        emb_path = os.path.join(root, f)
        # Close each archive as soon as the mask is read; tens of thousands of
        # files are visited here and np.load keeps the file open otherwise.
        with np.load(emb_path) as data:
            mask = data['mask']
        sub = os.path.dirname(os.path.relpath(emb_path, EMB_DIR))
        # Strip only the trailing suffix (str.replace would also hit an
        # occurrence in the middle of the name).
        chunk_id = f[:-len(EMB_SUFFIX)]
        den_path = os.path.join(DEN_DIR, sub, chunk_id + '.ogg')
        mask_map[den_path] = mask

In [15]:
# Only files that have a stored mask are processed in phase 2.
den_paths = list(mask_map)

def process_part2(full_path):
    """
    Build the augmented mel spectrogram for one denoised chunk.

    Loads the audio at `full_path` (resampling to PANNS_SR if needed), boosts
    the band-pass content in the windows flagged by `mask_map[full_path]`,
    runs noise reduction again, computes the dB mel spectrogram and stores it
    as float16 under MEL_AUG_DIR, mirroring the sub-directory below DEN_DIR.
    A row is appended to the global `mel_aug_manifest`.

    Returns True when the file has been written.
    """
    # Load the denoised audio
    audio, rate = sf.read(full_path, dtype='float32')
    if rate != PANNS_SR:
        audio = librosa.resample(audio, orig_sr=rate, target_sr=PANNS_SR)
        rate = PANNS_SR

    # Boost flagged windows, then denoise once more
    window_mask = mask_map[full_path]
    boosted = augment_waveform_with_filter(audio, window_mask, rate)
    boosted_dn = nr.reduce_noise(
        y=boosted,
        sr=rate,
        stationary=STATIONARY_NOISE,
        prop_decrease=PROP_DECREASE,
    )

    mel_aug = calculate_mel_spectrogram(boosted_dn, rate)

    # The sub-directory below DEN_DIR doubles as the primary label
    sub, fname = os.path.split(os.path.relpath(full_path, DEN_DIR))
    chunk_id = os.path.splitext(fname)[0]
    target_dir = os.path.join(MEL_AUG_DIR, sub)
    os.makedirs(target_dir, exist_ok=True)

    np.savez_compressed(
        os.path.join(target_dir, chunk_id + '.npz'),
        mel=mel_aug.astype(np.float16),
        primary_label=sub,
    )

    manifest_row = {
        'chunk_id': chunk_id,
        'mel_aug_path': f"/{sub}/{chunk_id}.npz",
        'primary_label': sub,
    }
    mel_aug_manifest.append(manifest_row)
    return True

In [19]:

# process_part2 appends to this module-level list; reset it so that running
# the cell again does not write every row to the manifest twice.
mel_aug_manifest = []

with ThreadPool(os.cpu_count()) as pool:
    list(tqdm(pool.imap_unordered(process_part2, den_paths),
              total=len(den_paths),
              desc="Phase 2: augment+denoise+mel"))

pd.DataFrame(mel_aug_manifest).to_csv(
    os.path.join(MEL_AUG_DIR, 'manifest.csv'),
    index=False
)

Phase 2: augment+denoise+mel:   0%|          | 0/115357 [00:00<?, ?it/s]

In [31]:
import shutil

# Zip everything under /home/jovyan/Features into /home/jovyan/features.zip
# (make_archive appends the ".zip" extension and returns the archive path).
shutil.make_archive(
    base_name='/home/jovyan/features',
    format='zip',
    root_dir='/home/jovyan/Features',
)

'/home/jovyan/features.zip'

In [24]:
import os
import numpy as np
import pandas as pd

# ── CONFIG ─────────────────────────────────────────────────────────────────────
BASE_FEAT    = '/home/jovyan/Features'
DEN_MAN      = os.path.join(BASE_FEAT, 'denoised',   'manifest.csv')
MEL_MAN      = os.path.join(BASE_FEAT, 'mel',        'manifest.csv')
EMB_MAN      = os.path.join(BASE_FEAT, 'embeddings', 'manifest.csv')
AUG_MAN      = os.path.join(BASE_FEAT, 'mel_aug',    'manifest.csv')
OUT_DIR      = BASE_FEAT

# ── 1) LOAD & MERGE ──────────────────────────────────────────────────────────
# chunk_id is built from the recording's base name only, so the same chunk_id
# can exist under more than one species.  Joining on chunk_id alone would
# cross-join those rows (and any duplicated manifest rows), pairing a chunk
# with another species' feature paths.  Join on (chunk_id, primary_label).
KEY_COLS = ['chunk_id', 'primary_label']


def _load_manifest(path, value_col):
    """Read one manifest, keeping the join keys plus `value_col`, one row per key."""
    manifest = pd.read_csv(path, dtype={'chunk_id': str, 'primary_label': str})
    return manifest[KEY_COLS + [value_col]].drop_duplicates(subset=KEY_COLS)


den_df = _load_manifest(DEN_MAN, 'audio_path')
mel_df = _load_manifest(MEL_MAN, 'mel_path')
emb_df = _load_manifest(EMB_MAN, 'emb_path')
aug_df = _load_manifest(AUG_MAN, 'mel_aug_path')

# validate='one_to_one' makes pandas raise if a key is still duplicated.
df = (den_df
      .merge(mel_df, on=KEY_COLS, validate='one_to_one')
      .merge(emb_df, on=KEY_COLS, validate='one_to_one')
      .merge(aug_df, on=KEY_COLS, validate='one_to_one'))

# Keep the column order the downstream manifests have always used.
df = df[['chunk_id', 'audio_path', 'primary_label',
         'mel_path', 'emb_path', 'mel_aug_path']]


In [25]:
# Split by recording (not by chunk) so that chunks of one recording never end
# up in different splits.
df['recording_id'] = df['chunk_id'].str.split('_chk').str[0]
unique_recs = df[['recording_id','primary_label']].drop_duplicates()

# First pick one recording per species so every species is present in train.
rng = np.random.default_rng(42)
seed_per_species = unique_recs.groupby('primary_label')['recording_id'] \
                              .apply(lambda ids: rng.choice(ids.values, 1)[0]) \
                              .tolist()
seed_set = set(seed_per_species)

# The same recording_id can appear under more than one species, so
# de-duplicate the ids; otherwise n_total over-counts and sampling "without
# replacement" can still draw the same id twice.
all_recs = unique_recs['recording_id'].drop_duplicates().tolist()
remaining = [r for r in all_recs if r not in seed_set]

# Top train up to 70 % of all recordings.
n_total = len(all_recs)
n_train_target = int(round(0.70 * n_total))
n_additional = max(0, n_train_target - len(seed_set))

addl_train = rng.choice(remaining, size=n_additional, replace=False).tolist()
train_recs = seed_set | set(addl_train)

# 10 % of all recordings go to test; whatever is left becomes validation.
remaining_after_train = [r for r in all_recs if r not in train_recs]
n_test = min(int(round(0.10 * n_total)), len(remaining_after_train))
test_recs = set(rng.choice(remaining_after_train, size=n_test, replace=False).tolist())

val_recs = set(r for r in all_recs if r not in train_recs and r not in test_recs)

# The three splits must partition the recordings.
assert train_recs.isdisjoint(test_recs)
assert train_recs.isdisjoint(val_recs)
assert test_recs.isdisjoint(val_recs)
assert len(train_recs) + len(test_recs) + len(val_recs) == n_total

df_train = df[df['recording_id'].isin(train_recs)].reset_index(drop=True)
df_test  = df[df['recording_id'].isin(test_recs)].reset_index(drop=True)
df_val   = df[df['recording_id'].isin(val_recs)].reset_index(drop=True)

In [27]:
# Write one manifest per split (without the helper recording_id column) and
# report how many chunks and species each one holds.
split_names = ('train', 'test', 'val')
split_frames = (df_train, df_test, df_val)
for name, split_df in zip(split_names, split_frames):
    out = split_df.drop(columns=['recording_id'])
    manifest_csv = os.path.join(OUT_DIR, f'manifest_{name}.csv')
    out.to_csv(manifest_csv, index=False)
    print(f"{name.capitalize()}: {len(out)} chunks, {out['primary_label'].nunique()} species")

print(f"\nRecordings → total {n_total}, train {len(train_recs)}, test {len(test_recs)}, val {len(val_recs)}")

Train: 82976 chunks, 206 species
Test: 11022 chunks, 173 species
Val: 22733 chunks, 181 species

Recordings → total 28564, train 19990, test 2856, val 5706


In [29]:
train_df = pd.read_csv(os.path.join(OUT_DIR, 'manifest_train.csv'))
train_df['recording_id'] = train_df['chunk_id'].str.split('_chk').str[0]

# group recordings by species
groups = (
    train_df[['recording_id','primary_label']]
    .drop_duplicates()
    .groupby('primary_label')['recording_id']
    .apply(list)
    .to_dict()
)

# target = median number of recordings per species
target = int(np.median([len(v) for v in groups.values()]))

# Species below the target get extra recordings drawn with replacement.
# `oversampled_recs` keeps the plain recording ids; `oversampled_keys`
# additionally remembers which species each pick was made for.
oversampled_recs = []
oversampled_keys = []
for sp, recs in groups.items():
    picks = list(recs)
    n_extra = max(0, target - len(recs))
    if n_extra:
        picks.extend(rng.choice(recs, size=n_extra, replace=True).tolist())
    oversampled_recs.extend(picks)
    oversampled_keys.extend((sp, rec) for rec in picks)

# assemble oversampled train manifest
# One groupby pass instead of filtering the whole frame once per recording.
# Keying by (species, recording) also stops a recording id shared by two
# species from pulling the other species' chunks into the sample.
chunks_by_key = {
    key: grp
    for key, grp in train_df.groupby(['primary_label', 'recording_id'], sort=False)
}
frames = [chunks_by_key[key] for key in oversampled_keys]
oversampled_df = pd.concat(frames, ignore_index=True)
oversampled_df = oversampled_df.drop(columns=['recording_id'])

out_path = os.path.join(OUT_DIR, 'manifest_train_oversampled.csv')
oversampled_df.to_csv(out_path, index=False)
print(f"Oversampled train manifest: {len(oversampled_df)} chunks")

Oversampled train manifest: 108451 chunks


### Note: `manifest_train_oversampled.csv` was renamed manually after this step

In [30]:
# NOTE(review): `rec2sp` and `species_counts` were not defined in any visible
# cell, so this cell failed on a fresh kernel.  They are rebuilt here from
# `unique_recs` (recording_id -> primary_label) -- confirm this matches the
# original definitions.
rec2sp = unique_recs.set_index('recording_id')['primary_label']
species_counts = rec2sp.value_counts()


def _species_tally(recs):
    """Count, per species, the recordings listed in `recs` (repeats included)."""
    return rec2sp.loc[list(recs)].value_counts()


# Tally each split once instead of re-scanning it for every species.
train_tally = _species_tally(oversampled_recs)
test_tally = _species_tally(test_recs)
val_tally = _species_tally(val_recs)

print("\nRecordings per species (counts):")
for sp, cnt in species_counts.items():
    train_count = train_tally.get(sp, 0)
    test_count  = test_tally.get(sp, 0)
    val_count   = val_tally.get(sp, 0)
    print(f"  {sp}: total={cnt}, "
          f"train={train_count}, "
          f"test={test_count}, "
          f"val={val_count}")


Recordings per species (counts):
  grekis: total=990, train=695, test=95, val=201
  compau: total=808, train=561, test=77, val=170
  trokin: total=787, train=531, test=86, val=170
  roahaw: total=709, train=486, test=71, val=152
  banana: total=610, train=427, test=53, val=130
  whtdov: total=572, train=407, test=57, val=109
  socfly1: total=543, train=368, test=50, val=126
  yeofly1: total=525, train=372, test=55, val=98
  bobfly1: total=514, train=368, test=47, val=101
  wbwwre1: total=499, train=360, test=44, val=95
  soulap1: total=487, train=330, test=50, val=107
  sobtyr1: total=478, train=350, test=38, val=92
  trsowl: total=470, train=335, test=49, val=87
  laufal1: total=467, train=319, test=48, val=100
  strcuc1: total=431, train=296, test=46, val=89
  bbwduc: total=424, train=287, test=45, val=92
  saffin: total=419, train=289, test=49, val=82
  amekes: total=409, train=273, test=41, val=95
  tropar: total=397, train=282, test=42, val=73
  compot1: total=383, train=260, tes