# Embedding pipeline
Bioacoustics Model Zoo: https://github.com/kitzeslab/bioacoustics-model-zoo

In [1]:
# Enable IPython's autoreload extension; mode 2 re-imports all modules
# before each cell runs, so edits to imported project code are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import numpy as np
import pandas as pd

# Scratch area and embedding model whose parquet output this notebook reads.
scratch_dir = "~/scratch/birdclef/data/2025"
model_name = "HawkEars"
# Assembled piecewise; the trailing empty component keeps the final "/".
embed_dir = "/".join(
    (scratch_dir, "train_audio-infer-soundscape", model_name, "parts", "embed", "")
)


def load_metadata(input_path: str) -> pd.DataFrame:
    """Read the ``file`` column from parquet and label each row with a species.

    The species label is the path component that follows ``train_audio/`` in
    ``file``. Labels that appear on fewer than two rows are removed, because a
    train/test split requires every y label to have at least 2 samples.

    Args:
        input_path: Location handed to ``pd.read_parquet``.

    Returns:
        A frame with the columns ``file`` and ``species_name`` and a freshly
        reset index.
    """

    def _species_of(path):
        # ".../train_audio/<species>/<clip>" -> "<species>"
        after_marker = path.split("train_audio/")[1]
        return after_marker.split("/")[0]

    meta = pd.read_parquet(input_path, columns=["file"])
    meta["species_name"] = meta["file"].apply(_species_of)

    # Keep only the labels that occur on at least two rows.
    rows_per_species = meta["species_name"].value_counts()
    keep = rows_per_species.loc[rows_per_species.ge(2)].index
    has_enough_rows = meta["species_name"].isin(keep)
    return meta.loc[has_enough_rows].reset_index(drop=True)


# Full embedding table, plus the file/species metadata view of the same data.
df = pd.read_parquet(embed_dir)
meta_df = load_metadata(embed_dir)
# One call, three outputs: first five rows, (rows, columns), metadata preview.
display(df.head(), df.shape, meta_df.head())

Unnamed: 0,file,start_time,end_time,0,1,2,3,4,5,6,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,/storage/coda1/p-dsgt_clef2025/0/shared/birdcl...,0.0,3.0,0.0,0.0,0.039179,0.0,0.106437,0.0,0.0,...,0.00557,0.005109,0.0,0.0,0.0,0.0,0.0,0.071435,0.0,0.0
1,/storage/coda1/p-dsgt_clef2025/0/shared/birdcl...,1.0,4.0,0.0,0.0,0.044338,0.0,0.102583,0.0,0.0,...,0.014123,0.0,0.0,0.0,0.0,0.0,0.0,0.049859,0.0,0.0
2,/storage/coda1/p-dsgt_clef2025/0/shared/birdcl...,2.0,5.0,0.0,0.0,0.060322,0.0,0.076364,0.0,0.0,...,0.037997,0.046863,0.0,0.0,0.0,0.0,0.0,0.040215,0.0,0.0
3,/storage/coda1/p-dsgt_clef2025/0/shared/birdcl...,3.0,6.0,0.0,0.0,0.015319,0.0,0.069734,0.0,0.0,...,0.016078,0.071976,0.002354,0.0,0.0,0.0,0.0,0.043678,0.0,0.0
4,/storage/coda1/p-dsgt_clef2025/0/shared/birdcl...,4.0,7.0,0.0,0.0,0.03409,0.0,0.077159,0.0,0.0,...,0.013994,0.056684,0.0,0.0,0.0,0.0,0.0,0.064824,0.0,0.0


(939116, 2051)

Unnamed: 0,file,species_name
0,/storage/coda1/p-dsgt_clef2025/0/shared/birdcl...,1139490
1,/storage/coda1/p-dsgt_clef2025/0/shared/birdcl...,1139490
2,/storage/coda1/p-dsgt_clef2025/0/shared/birdcl...,1139490
3,/storage/coda1/p-dsgt_clef2025/0/shared/birdcl...,1139490
4,/storage/coda1/p-dsgt_clef2025/0/shared/birdcl...,1139490


: 

In [5]:
# Take the first embedding row and show the value of its `file` field.
row = df.iloc[0]
row.loc["file"]

'/storage/coda1/p-dsgt_clef2025/0/shared/birdclef/raw/birdclef-2025/train_audio/1139490/CSA36385.ogg'

In [None]:
def get_species_names(file_path_list: list) -> list:
    """Return the sorted, de-duplicated species labels found in the given paths.

    Every path must contain ``train_audio/``; the label is the path component
    that follows that marker. A path without the marker raises ``IndexError``.
    """
    unique_species = {
        path.split("train_audio/")[1].split("/")[0] for path in file_path_list
    }
    return sorted(unique_species)


# Distinct species labels present in the embedding table.
species_list = get_species_names(df["file"])
display(len(species_list))
species_list[:5]

In [None]:
# Label every embedding row with the species folder taken from its path.
embed_df = df.copy()
embed_df["species_name"] = df["file"].apply(
    lambda path: path.split("train_audio/")[1].split("/")[0]
)
# Number of rows per species, largest first; the tail shows the rarest ones.
species_count = (
    embed_df.groupby("species_name")
    .size()
    .reset_index(name="count")
    .sort_values(by="count", ascending=False)
)
species_count.tail(10)

In [None]:
# A train/test split requires every y label to have at least 2 samples,
# so drop the species from embed_df that have fewer than 2 rows.
species_count = embed_df["species_name"].value_counts()
valid_species = species_count.loc[species_count.ge(2)].index
embed_df = (
    embed_df.loc[embed_df["species_name"].isin(valid_species)]
    .reset_index(drop=True)
)
# Filtered shape next to the original shape, for comparison.
(embed_df.shape, df.shape)

In [None]:
from pathlib import Path

raw_root = Path("~/p-dsgt_clef2025-0/shared/birdclef/raw/birdclef-2025").expanduser()
! ls {raw_root}
soundscape_root = raw_root / "train_audio"
! ls {soundscape_root} | head
soundscapes = sorted(soundscape_root.glob("**/*.ogg"))
display(len(soundscapes))

In [None]:
# get_species_names() splits strings, so convert the entries to str first.
soundscapes = list(map(str, soundscapes))
soundscape_list = get_species_names(soundscapes)
display(len(soundscape_list))
soundscape_list[:5]

In [None]:
# Print each species found on disk that is absent from the embedding table.
for name in filter(lambda candidate: candidate not in species_list, soundscape_list):
    print(name)