# Embedding pipeline
Bioacoustics Model Zoo: https://github.com/kitzeslab/bioacoustics-model-zoo

In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd

# Locations of the precomputed embeddings on the scratch filesystem.
scratch_dir = "~/scratch/birdclef/data/2025"
model_name = "BirdNET"
# Directory holding the embedding parquet parts for `model_name`
# (the trailing empty segment keeps the final "/").
embed_dir = "/".join(
    [
        scratch_dir,
        "subset-train_audio-infer-soundscape-cpu",
        model_name,
        "parts",
        "embed",
        "",
    ]
)


def preprocess_data(input_path: str, min_samples: int = 2) -> pd.DataFrame:
    """Load embedding parquet data and reduce it to (species_name, embeddings).

    The species code is taken from each row's ``file`` path: it is the path
    component immediately following ``train_audio/``. Species with fewer than
    ``min_samples`` rows are dropped, and the per-dimension embedding columns
    are packed into a single list-valued ``embeddings`` column.

    Args:
        input_path: Path passed straight to ``pd.read_parquet``.
        min_samples: Minimum number of rows a species must have to be kept.
            Defaults to 2 because a train/test split requires every label to
            have at least 2 samples.

    Returns:
        A DataFrame with exactly two columns, ``species_name`` and
        ``embeddings`` (a list of values per row), re-indexed from 0.

    Raises:
        IndexError: If a ``file`` value does not contain ``train_audio/``.
    """
    df = pd.read_parquet(input_path)
    # the species code is the directory directly below "train_audio/"
    df["species_name"] = df["file"].apply(
        lambda x: x.split("train_audio/")[1].split("/")[0]
    )
    # train/test split requires each y label to have at least 2 samples,
    # so remove species with fewer than `min_samples` rows
    species_count = df["species_name"].value_counts()
    valid_species = species_count[species_count >= min_samples].index
    filtered_df = df[df["species_name"].isin(valid_species)].reset_index(drop=True)
    # Embedding dimensions are the columns whose names are integers ("0", "1", ...).
    # Select them by name rather than by counting columns, so that any extra
    # metadata column in the parquet cannot shift or break the selection.
    embed_cols = sorted((col for col in df.columns if str(col).isdigit()), key=int)
    # pack the per-dimension columns into one list per row
    filtered_df["embeddings"] = filtered_df[embed_cols].to_numpy().tolist()
    df_embs = filtered_df[["species_name", "embeddings"]].copy()
    print(f"DataFrame shape: {df_embs.shape}")
    # report the size from the column list so an empty result does not raise
    print(f"Embedding size: {len(embed_cols)}")
    return df_embs


# Raw embedding table (kept for inspection) and the compact
# (species_name, embeddings) frame built from the same parquet directory.
df = pd.read_parquet(embed_dir)
embed_df = preprocess_data(embed_dir)
# preview the first rows of both frames
display(df.head(5), embed_df.head(5))

DataFrame shape: (55199, 2)
Embedding size: 1024


Unnamed: 0,file,start_time,end_time,0,1,2,3,4,5,6,...,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023
0,/storage/coda1/p-dsgt_clef2025/0/shared/birdcl...,0.0,3.0,0.124203,1.563936,0.0,0.0,0.410534,0.315204,0.375027,...,0.452822,0.191502,0.340442,1.007927,0.258186,1.311302,0.0,0.005877,0.232589,0.000318
1,/storage/coda1/p-dsgt_clef2025/0/shared/birdcl...,1.0,4.0,0.513276,2.377574,0.094876,0.0,0.351363,0.10967,0.576605,...,0.569245,0.0,0.707366,1.300623,0.151832,1.946136,0.047785,0.021347,0.449789,0.0
2,/storage/coda1/p-dsgt_clef2025/0/shared/birdcl...,2.0,5.0,0.708885,2.282259,0.229707,0.00414,0.562097,0.128851,0.521898,...,0.96056,0.0,0.955411,1.824189,0.446797,1.538955,0.238716,0.0,0.760179,0.0
3,/storage/coda1/p-dsgt_clef2025/0/shared/birdcl...,3.0,6.0,0.993977,2.557167,0.302402,0.083385,0.199752,0.0,0.310794,...,1.094692,0.029768,0.300263,1.351355,0.005148,0.579277,0.052899,0.070271,0.754705,0.0
4,/storage/coda1/p-dsgt_clef2025/0/shared/birdcl...,4.0,7.0,0.70658,2.473345,0.320889,0.0,0.790768,0.0,0.317302,...,0.666101,0.0,0.322575,1.099243,0.168204,0.664485,0.0,0.0,0.724873,0.0


Unnamed: 0,species_name,embeddings
0,amakin1,"[0.12420318275690079, 1.56393563747406, 0.0, 0..."
1,amakin1,"[0.5132761001586914, 2.3775739669799805, 0.094..."
2,amakin1,"[0.7088851928710938, 2.282258987426758, 0.2297..."
3,amakin1,"[0.9939765930175781, 2.557166814804077, 0.3024..."
4,amakin1,"[0.706580400466919, 2.4733448028564453, 0.3208..."


In [3]:
def get_species_names(file_path_list: list) -> list:
    """Return the sorted, de-duplicated species codes found in the given paths.

    Each path must contain ``train_audio/``; the species code is the path
    component that immediately follows it.
    """
    unique_species = {
        path.split("train_audio/")[1].split("/")[0] for path in file_path_list
    }
    return sorted(unique_species)


# species present in the embedding table, derived from the `file` column
species_list = get_species_names(df["file"])
display(len(species_list))
species_list[:5]

10

['amakin1', 'amekes', 'ampkin1', 'anhing', 'babwar']

In [4]:
# work on a copy of the raw table and tag every row with its species code
embed_df = df.copy()
embed_df["species_name"] = df["file"].apply(
    lambda path: path.split("train_audio/")[1].split("/")[0]
)
# number of rows per species, ordered from most to least frequent
species_count = (
    embed_df.groupby("species_name")
    .size()
    .reset_index(name="count")
    .sort_values(by="count", ascending=False)
)
species_count.tail(10)

Unnamed: 0,species_name,count
6,banana,18784
1,amekes,9471
8,bbwduc,8226
4,babwar,6040
7,baymac,5656
9,bicwre1,2682
3,anhing,2029
0,amakin1,1341
2,ampkin1,547
5,bafibi1,423


In [5]:
# train/test split requires each y label to have at least 2 samples,
# so drop every species in embed_df that has fewer than 2 rows
species_count = embed_df["species_name"].value_counts()
valid_species = species_count.loc[species_count >= 2].index
keep_mask = embed_df["species_name"].isin(valid_species)
embed_df = embed_df.loc[keep_mask].reset_index(drop=True)
embed_df.shape, df.shape

((55199, 1028), (55199, 1027))

In [6]:
from pathlib import Path

# Root of the raw BirdCLEF 2025 data, with "~" expanded to the home directory.
raw_root = Path("~/p-dsgt_clef2025-0/shared/birdclef/raw/birdclef-2025").expanduser()
# list the top-level contents of the raw dataset directory
! ls {raw_root}
# NOTE(review): despite the name, this points at train_audio rather than a
# soundscape directory -- confirm the naming is intentional.
soundscape_root = raw_root / "train_audio"
# peek at the first entries under train_audio
! ls {soundscape_root} | head
# recursively collect every .ogg file under train_audio, sorted by path
soundscapes = sorted(soundscape_root.glob("**/*.ogg"))
display(len(soundscapes))

recording_location.txt	taxonomy.csv	  train.csv    train_soundscapes
sample_submission.csv	test_soundscapes  train_audio
1139490
1192948
1194042
126247
1346504
134933
135045
1462711
1462737
1564122


28564

In [7]:
# get_species_names splits strings, so convert the collected paths to str first
soundscapes = list(map(str, soundscapes))
soundscape_list = get_species_names(soundscapes)
display(len(soundscape_list))
soundscape_list[:5]

206

['1139490', '1192948', '1194042', '126247', '1346504']

In [8]:
# print every name found under train_audio that is absent from the embeddings
for name in soundscape_list:
    if name in species_list:
        continue
    print(name)

1139490
1192948
1194042
126247
1346504
134933
135045
1462711
1462737
1564122
21038
21116
21211
22333
22973
22976
24272
24292
24322
41663
41778
41970
42007
42087
42113
46010
47067
476537
476538
48124
50186
517119
523060
528041
52884
548639
555086
555142
566513
64862
65336
65344
65349
65373
65419
65448
65547
65962
66016
66531
66578
66893
67082
67252
714022
715170
787625
81930
868458
963335
bkcdon
bkmtou1
blbgra1
blbwre1
blcant4
blchaw1
blcjay1
blctit1
blhpar1
blkvul
bobfly1
bobher1
brtpar1
bubcur1
bubwre1
bucmot3
bugtan
butsal1
cargra1
cattyr
chbant1
chfmac1
cinbec1
cocher1
cocwoo1
colara1
colcha1
compau
compot1
cotfly1
crbtan1
crcwoo1
crebob1
cregua1
creoro1
eardov1
fotfly
gohman1
grasal4
grbhaw1
greani1
greegr
greibi1
grekis
grepot1
gretin1
grnkin
grysee1
gybmar
gycwor1
labter1
laufal1
leagre
linwoo1
littin1
mastit1
neocor
norscr1
olipic1
orcpar
palhor2
paltan1
pavpig2
piepuf1
pirfly1
piwtyr1
plbwoo1
plctan1
plukit1
purgal2
ragmac1
rebbla1
recwoo1
rinkin1
roahaw
rosspo1
royfly1
rtlhum
