# Embedding pipeline
Bioacoustics Model Zoo: https://github.com/kitzeslab/bioacoustics-model-zoo

In [1]:
# Enable IPython's autoreload extension; mode 2 reloads all imported modules
# before each cell executes, so edits to local .py files are picked up live.
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd

# Root of the scratch data area; the leading "~" is left unexpanded in this string.
scratch_dir = "~/scratch/birdclef/data/2025"
# Model name; it forms one path component of the embedding directory below.
model_name = "RanaSierraeCNN"
# Location of the embedding parquet parts that this cell passes to pd.read_parquet.
embed_dir = f"{scratch_dir}/train_audio-infer-soundscape/{model_name}/parts/embed/"


def preprocess_data(input_path: str, min_samples: int = 2) -> pd.DataFrame:
    """Load embeddings from parquet and pack them into one array per row.

    Reads the parquet data at ``input_path``, derives a ``species_name`` label
    from the path component that follows ``train_audio/`` in each ``file``
    value, drops species with fewer than ``min_samples`` rows, and collapses
    every non-metadata column into a single ``embeddings`` column.

    Args:
        input_path: Path accepted by ``pd.read_parquet``. The data must have a
            ``file`` column. ``start_time`` and ``end_time`` are treated as
            metadata; every other column is treated as an embedding dimension.
        min_samples: Minimum number of rows a species needs in order to be
            kept. The default of 2 exists because a train/test split requires
            each label to have at least 2 samples.

    Returns:
        A DataFrame with the columns ``species_name`` and ``embeddings`` (one
        1-D NumPy array per row) and a fresh 0..n-1 index. The frame is empty
        when no species reaches ``min_samples``.

    Raises:
        IndexError: If a ``file`` value does not contain ``train_audio/``.
    """
    df = pd.read_parquet(input_path)
    # The species label is the directory name directly below "train_audio/".
    df["species_name"] = df["file"].apply(
        lambda x: x.split("train_audio/")[1].split("/")[0]
    )
    # Remove species that have fewer than `min_samples` rows.
    species_count = df["species_name"].value_counts()
    valid_species = species_count[species_count >= min_samples].index
    filtered_df = df[df["species_name"].isin(valid_species)].reset_index(drop=True)
    # Everything that is not metadata is one embedding dimension.
    non_embed_cols = {"file", "start_time", "end_time", "species_name"}
    embed_cols = [col for col in filtered_df.columns if col not in non_embed_cols]
    # Build the narrow result directly instead of first adding a temporary
    # column to the wide frame; each row of the 2-D array becomes one cell.
    df_embs = filtered_df[["species_name"]].copy()
    df_embs["embeddings"] = list(filtered_df[embed_cols].to_numpy())
    print(f"DataFrame shape: {df_embs.shape}")
    # Every row vector has len(embed_cols) entries, so report that rather than
    # indexing row 0, which raises IndexError when the filtered frame is empty.
    print(f"Embedding size: {len(embed_cols)}")
    return df_embs


# Raw table: one row per audio window, with metadata and one column per dimension.
df = pd.read_parquet(embed_dir)
# Compact table: species label plus a single packed embedding vector per row.
embed_df = preprocess_data(embed_dir)
# Preview the first five rows of both tables (head() defaults to 5 rows).
display(df.head(), embed_df.head())

DataFrame shape: (967390, 2)
Embedding size: 512


Unnamed: 0,file,start_time,end_time,0,1,2,3,4,5,6,...,502,503,504,505,506,507,508,509,510,511
0,/storage/coda1/p-dsgt_clef2025/0/shared/birdcl...,0.0,2.0,5.419207,0.87129,1.777295,0.004625,0.003101,0.051039,0.0,...,0.001239,2.434101,0.0,0.003629,0.163328,3.506454,0.103979,1.377844,1.749998,0.10152
1,/storage/coda1/p-dsgt_clef2025/0/shared/birdcl...,1.0,3.0,4.556544,0.077743,2.052582,0.0,0.033188,0.340124,0.0,...,0.0,1.869135,0.0,0.0,0.069754,1.763988,0.010189,1.924023,0.221941,0.196073
2,/storage/coda1/p-dsgt_clef2025/0/shared/birdcl...,2.0,4.0,4.653011,0.055394,2.093789,0.004328,0.02384,0.378915,0.0,...,0.0,2.020643,0.0,0.0,0.065655,1.979608,0.010868,1.752458,0.37124,0.235253
3,/storage/coda1/p-dsgt_clef2025/0/shared/birdcl...,3.0,5.0,4.258378,0.0,1.601861,0.0011,0.013942,0.359071,0.0,...,0.0,2.089144,0.0,0.0,0.039062,2.68579,0.015338,1.498782,0.157769,0.233748
4,/storage/coda1/p-dsgt_clef2025/0/shared/birdcl...,4.0,6.0,4.549884,0.001267,2.192139,0.002076,0.029677,0.40983,0.0,...,0.0,2.028283,0.0,0.0,0.048678,1.704479,0.018217,1.774186,0.158651,0.203963


Unnamed: 0,species_name,embeddings
0,1139490,"[5.4192066, 0.87128985, 1.7772945, 0.004624543..."
1,1139490,"[4.556544, 0.07774321, 2.0525815, 0.0, 0.03318..."
2,1139490,"[4.653011, 0.05539445, 2.0937886, 0.0043283175..."
3,1139490,"[4.258378, 0.0, 1.601861, 0.0010996396, 0.0139..."
4,1139490,"[4.5498843, 0.0012667002, 2.1921391, 0.0020756..."


In [3]:
def get_species_names(file_path_list: list) -> list:
    """Return the sorted, de-duplicated species labels found in the given paths.

    The label of a path is the component that directly follows
    ``train_audio/``. Labels are strings, so the ordering is lexicographic
    (``"1139490"`` sorts before ``"126247"``).

    Raises:
        IndexError: If a path does not contain ``train_audio/``.
    """
    unique_names = {
        path.split("train_audio/")[1].split("/")[0] for path in file_path_list
    }
    return sorted(unique_names)


# Distinct species labels present in the embedding table, as a sorted list.
species_list = get_species_names(df["file"].tolist())
display(len(species_list))
# Peek at the first few labels.
species_list[:5]

206

['1139490', '1192948', '1194042', '126247', '1346504']

In [4]:
# Copy of the raw table with a species_name column taken from the path
# component that follows "train_audio/".
embed_df = df.assign(
    species_name=df["file"].map(
        lambda path: path.split("train_audio/")[1].split("/")[0]
    )
)
# Number of rows per species, most frequent first.
species_count = (
    embed_df.groupby("species_name")
    .size()
    .reset_index(name="count")
    .sort_values(by="count", ascending=False)
)
# The tail of the sorted table shows the rarest species.
species_count.tail(10)

Unnamed: 0,species_name,count
57,81930,42
52,67082,41
48,66016,24
35,548639,22
24,42113,20
32,523060,19
23,42087,18
58,868458,17
11,21116,10
9,1564122,3


In [5]:
# A train/test split requires every y label to have at least 2 samples,
# so drop the species in embed_df that have fewer than 2 rows.
species_count = embed_df["species_name"].value_counts()
valid_species = species_count.loc[species_count.ge(2)].index
keep_mask = embed_df["species_name"].isin(valid_species)
embed_df = embed_df.loc[keep_mask].reset_index(drop=True)
# Compare the filtered shape against the raw table.
embed_df.shape, df.shape

((967390, 516), (967390, 515))

In [6]:
from pathlib import Path

# Root of the raw competition data; expanduser() resolves the leading "~".
raw_root = Path("~/p-dsgt_clef2025-0/shared/birdclef/raw/birdclef-2025").expanduser()
# Shell escape: list the top-level contents of the raw data directory.
! ls {raw_root}
# NOTE(review): despite the name, this points at train_audio, not at
# train_soundscapes. Its sub-folder names are what get_species_names extracts.
soundscape_root = raw_root / "train_audio"
# Shell escape: show the first few entries below train_audio.
! ls {soundscape_root} | head
# Recursively collect every .ogg file below train_audio, sorted by path.
soundscapes = sorted(soundscape_root.glob("**/*.ogg"))
display(len(soundscapes))

recording_location.txt	taxonomy.csv	  train.csv    train_soundscapes
sample_submission.csv	test_soundscapes  train_audio
1139490
1192948
1194042
126247
1346504
134933
135045
1462711
1462737
1564122


28564

In [7]:
# get_species_names splits on plain strings, so convert the Path objects first.
soundscapes = list(map(str, soundscapes))
# Sorted, distinct species labels found in the raw audio tree.
soundscape_list = get_species_names(soundscapes)
display(len(soundscape_list))
soundscape_list[:5]

206

['1139490', '1192948', '1194042', '126247', '1346504']

In [8]:
# Print every species found in the raw audio tree that has no embeddings.
# No output means the embedding table covers all species.
for name in soundscape_list:
    if name in species_list:
        continue
    print(name)