How closely can our mel2vec embeddings fit the pseudo-labels generated by Perch? Hopefully the answer is "closely"...

In [49]:
import polars as pl
from pathlib import Path

# Root directories for this project, resolved under the current user's home.
shared_root = Path("~/shared/birdclef").expanduser()
scratch_root = Path("~/scratch/birdclef").expanduser()

# Lazily scan the Perch soundscape embeddings; data is only read on `.collect()`.
perch_embed_path = f"{shared_root}/2025/infer-soundscape/Perch/parts/embed"
perch = pl.scan_parquet(perch_embed_path)

# Show the row count first, then the resolved schema.
perch_row_count = perch.select(pl.len()).collect()
display(perch_row_count)
display(perch.collect_schema())

len
u32
116712


Schema([('file', String),
        ('start_time', Float64),
        ('end_time', Float64),
        ('0', Float64),
        ('1', Float64),
        ('2', Float64),
        ('3', Float64),
        ('4', Float64),
        ('5', Float64),
        ('6', Float64),
        ('7', Float64),
        ('8', Float64),
        ('9', Float64),
        ('10', Float64),
        ('11', Float64),
        ('12', Float64),
        ('13', Float64),
        ('14', Float64),
        ('15', Float64),
        ('16', Float64),
        ('17', Float64),
        ('18', Float64),
        ('19', Float64),
        ('20', Float64),
        ('21', Float64),
        ('22', Float64),
        ('23', Float64),
        ('24', Float64),
        ('25', Float64),
        ('26', Float64),
        ('27', Float64),
        ('28', Float64),
        ('29', Float64),
        ('30', Float64),
        ('31', Float64),
        ('32', Float64),
        ('33', Float64),
        ('34', Float64),
        ('35', Float64),
        ('36', Float

In [None]:
# Pack every column after the three metadata columns (file, start_time,
# end_time) into a single list-typed "embedding" column.
#
# The column names are taken from `collect_schema()` instead of
# `LazyFrame.columns`: resolving `.columns` on a LazyFrame makes polars emit a
# warning because it has to resolve the schema implicitly.
embedding_columns = perch.collect_schema().names()[3:]
perch_vec = perch.select(
    "file",
    "start_time",
    "end_time",
    pl.concat_list(embedding_columns).alias("embedding"),
)

# Peek at a single row to confirm the shape of the result.
perch_vec.first().collect()

  pl.concat_list(perch.columns[3:]).alias("embedding")


file,start_time,end_time,embedding
str,f64,f64,list[f64]
"""/storage/coda1/p-dsgt_clef2025…",0.0,5.0,"[-0.022291, 0.091991, … -0.005284]"


In [26]:
# Lazily scan the MFCC soundscape dataset, which is what we want to join the
# Perch embeddings against.
mfcc_data_path = f"{scratch_root}/2025/mfcc-soundscape/data"
mfcc = pl.scan_parquet(mfcc_data_path)
mfcc.collect_schema()

Schema([('index', Int64),
        ('file', String),
        ('timestamp', Float64),
        ('mfcc', List(Float32)),
        ('part', Int64)])

In [27]:
# Materialise just the first row to see what the values look like.
mfcc_first_row = mfcc.first()
mfcc_first_row.collect()

index,file,timestamp,mfcc,part
i64,str,f64,list[f32],i64
0,"""/storage/coda1/p-dsgt_clef2025…",0.125,"[0.000127, 0.000109, … 0.000012]",0


In [39]:
# we'll join the two datasets on start time with a udf
def get_start_time(timestamp, interval=5) -> int:
    """Return the start of the ``interval``-wide window that contains ``timestamp``.

    Windows are half-open, ``[start, start + interval)``: a timestamp that
    sits exactly on a boundary belongs to the window starting there.

    The start is computed with floor division, so it works for arbitrarily
    long recordings (the previous implementation scanned a fixed
    ``range(0, 100, interval)`` and returned -1 for anything past it).

    Args:
        timestamp: Offset into the recording, in the same unit as ``interval``.
        interval: Window width; must be positive. Defaults to 5.

    Returns:
        The window start, or -1 when ``timestamp`` is negative, NaN or
        infinite.

    Raises:
        ValueError: If ``interval`` is not positive.
    """
    if interval <= 0:
        raise ValueError(f"interval must be positive, got {interval}")
    # Phrased as a positive range test so NaN (for which every comparison is
    # False) is rejected together with negative and infinite values.
    if not (0 <= timestamp < float("inf")):
        return -1
    return int(timestamp // interval) * interval


# Keep only rows whose `part` is below 10 and tag every frame with the start
# of the window it falls in, computed row by row through the Python UDF.
start_time_expr = pl.col("timestamp").map_elements(
    get_start_time, return_dtype=pl.Int64
)
mfcc_sub = mfcc.filter(pl.col("part") < 10).with_columns(
    start_time_expr.alias("start_time")
)
mfcc_sub.head(10).collect()

index,file,timestamp,mfcc,part,start_time
i64,str,f64,list[f32],i64,i64
0,"""/storage/coda1/p-dsgt_clef2025…",0.125,"[0.000127, 0.000109, … 0.000012]",0,0
1,"""/storage/coda1/p-dsgt_clef2025…",0.25,"[0.000093, 0.000074, … 0.000004]",0,0
2,"""/storage/coda1/p-dsgt_clef2025…",0.375,"[0.000099, 0.000083, … 0.000009]",0,0
3,"""/storage/coda1/p-dsgt_clef2025…",0.5,"[0.000117, 0.000102, … -0.000003]",0,0
4,"""/storage/coda1/p-dsgt_clef2025…",0.625,"[0.000103, 0.000088, … 0.000006]",0,0
5,"""/storage/coda1/p-dsgt_clef2025…",0.75,"[0.000105, 0.000083, … 0.000005]",0,0
6,"""/storage/coda1/p-dsgt_clef2025…",0.875,"[0.000117, 0.000099, … 0.000015]",0,0
7,"""/storage/coda1/p-dsgt_clef2025…",1.0,"[0.0001, 0.000083, … 0.000001]",0,0
8,"""/storage/coda1/p-dsgt_clef2025…",1.125,"[0.000088, 0.000068, … -0.000004]",0,0
9,"""/storage/coda1/p-dsgt_clef2025…",1.25,"[0.000093, 0.000075, … 0.000008]",0,0


In [43]:
# descriptive statistics of mfcc as one feature vector
import numpy as np


def descriptive_stats(group: pl.DataFrame) -> pl.DataFrame:
    """Collapse one group of MFCC frames into a single summary row.

    Args:
        group: Frame with "file", "start_time" and "mfcc" columns. The "file"
            and "start_time" values are read from the first row only.

    Returns:
        A one-row frame with "file", "start_time" and "embedding", where
        "embedding" is the per-coefficient mean followed by the
        per-coefficient standard deviation (numpy's default ``std``).
    """
    # Stack the per-row MFCC vectors into a 2-D array: one row per frame.
    frames = np.stack(group.get_column("mfcc").to_numpy())
    file_name = group.get_column("file").to_numpy()[0]
    window_start = group.get_column("start_time").to_numpy()[0]
    # Mean first, then std, concatenated into one flat feature vector.
    summary = frames.mean(axis=0).tolist() + frames.std(axis=0).tolist()
    return pl.DataFrame(
        {
            "file": file_name,
            "start_time": window_start,
            "embedding": [summary],
        }
    )


# Only the first 20 rows are collected here, so this exercises the
# aggregation on a small sample rather than on the whole dataset.
mfcc_preview = mfcc_sub.head(20).collect()
mfcc_groups = mfcc_preview.group_by("file", "start_time")
mfcc_stats = mfcc_groups.map_groups(descriptive_stats)
mfcc_stats.head()

file,start_time,embedding
str,i64,list[f64]
"""/storage/coda1/p-dsgt_clef2025…",0,"[0.000101, 0.000083, … 0.000006]"


In [52]:
# let's also calculate the word vectors
from gensim.models import KeyedVectors
import faiss

# tokenizer: a flat L2 faiss index built over the saved centroids
centroids_path = f"{scratch_root}/2025/mel2vec/tokenizer/centroids.npy"
centroids = np.load(centroids_path)
index = faiss.IndexFlatL2(centroids.shape[1])
index.add(centroids)

# word vectors: the run is selected by the hyperparameters encoded in its path
prefix = "tokenizer=tokenizer/vector_size=256/window=80/ns_exponent=0.75/sample=0.0001/epochs=100"
word_vectors_path = (
    f"{scratch_root}/2025/mel2vec/word2vec/{prefix}/word2vec.wordvectors"
)
word_vectors = KeyedVectors.load(word_vectors_path)

# Show the first few vocabulary keys.
display(word_vectors.index_to_key[:10])


def mfcc_to_wv(mfcc: list) -> list:
    """Map one MFCC frame to the word vector of its nearest tokenizer centroid.

    Relies on the notebook-level ``index`` (faiss index over the centroids)
    and ``word_vectors`` (gensim ``KeyedVectors``).

    Args:
        mfcc: A single MFCC feature vector (any 1-D sequence of numbers).

    Returns:
        The word vector for the closest centroid, as a plain Python list.
    """
    # faiss works on float32 matrices. Coerce explicitly so that a plain
    # Python list or float64 input behaves the same as float32 input instead
    # of depending on whatever dtype numpy happens to infer.
    X = np.ascontiguousarray(mfcc, dtype=np.float32).reshape(1, -1)
    _, indices = index.search(X, 1)  # get the closest centroid
    # NOTE(review): the centroid id is used directly as the vocabulary key;
    # confirm every centroid id is present in `word_vectors`.
    return word_vectors[indices[0][0]].tolist()


# Try the MFCC -> word-vector lookup on the first five rows.
word_vector_expr = pl.col("mfcc").map_elements(
    mfcc_to_wv, return_dtype=pl.List(pl.Float64)
)
mfcc_sub.head(5).with_columns(word_vector_expr.alias("word_vector")).collect()

[6122, 13688, 1185, 9798, 4637, 10836, 6358, 9107, 10603, 12453]

index,file,timestamp,mfcc,part,start_time,word_vector
i64,str,f64,list[f32],i64,i64,list[f64]
0,"""/storage/coda1/p-dsgt_clef2025…",0.125,"[0.000127, 0.000109, … 0.000012]",0,0,"[0.430293, -0.118337, … -0.197529]"
1,"""/storage/coda1/p-dsgt_clef2025…",0.25,"[0.000093, 0.000074, … 0.000004]",0,0,"[0.487408, -0.016666, … 0.088069]"
2,"""/storage/coda1/p-dsgt_clef2025…",0.375,"[0.000099, 0.000083, … 0.000009]",0,0,"[0.430293, -0.118337, … -0.197529]"
3,"""/storage/coda1/p-dsgt_clef2025…",0.5,"[0.000117, 0.000102, … -0.000003]",0,0,"[0.430293, -0.118337, … -0.197529]"
4,"""/storage/coda1/p-dsgt_clef2025…",0.625,"[0.000103, 0.000088, … 0.000006]",0,0,"[0.430293, -0.118337, … -0.197529]"
