In [1]:
import os

import numpy as np

In [2]:
def load_embedding(folder, model) -> list[np.ndarray]:
    """Load the precomputed embeddings of every audio stored in ``folder``.

    Parameters
    ----------
    folder : str
        Directory holding the embedding file(s) produced by ``model``.
    model : str
        Name of the pretrained model: ``"birdnet"`` or ``"yamnet"``.
        ``"perch"`` is recognised but not implemented yet.

    Returns
    -------
    list[np.ndarray]
        One array per audio, each holding one embedding row per frame.

    Raises
    ------
    NotImplementedError
        If ``model`` is ``"perch"`` (loader not written yet).
    ValueError
        If ``model`` is not a known model name.
    """
    if model == "birdnet":
        # One text file per audio. Each line is
        #   start_time \t end_time \t embedding
        # where embedding is a comma-separated list of floats.
        # Frames are 3s long (possibly configurable).
        embeddings = []
        # Sorted so the audio order is stable across filesystems.
        for filename in sorted(os.listdir(folder)):
            with open(os.path.join(folder, filename), "r") as f:
                # Skip blank lines (e.g. an empty line at the end of the file),
                # which have no third tab-separated field to parse.
                rows = [line for line in f if line.strip()]
            # The first two fields are start and end time; the third is the embedding.
            embedding = np.array(
                [list(map(float, row.split("\t")[2].split(","))) for row in rows]
            )
            embeddings.append(embedding)
        return embeddings

    if model == "yamnet":
        # A single numpy array for the whole dataset, shaped
        # (n_audios=64, max_n_frames=148, embedding_dim=1024).
        # The padding value is 0, the frames are every 0.48 (probably).
        # os.listdir order is arbitrary, so sort before taking the first file.
        filename = sorted(os.listdir(folder))[0]
        embedding = np.load(os.path.join(folder, filename))
        # Split along the first axis: one (n_frames, embedding_dim) array per audio.
        return list(embedding)

    if model == "perch":
        # Fail loudly instead of silently returning None, which would break
        # the declared list[np.ndarray] return contract.
        raise NotImplementedError("Loading perch embeddings is not implemented yet")

    raise ValueError(f"Unknown pretrained model {model}")

In [3]:
# Sanity check: load the yamnet embeddings of one dataset and show
# how many audios there are and the shape of the first one.
yamnet_granollers_folder = "data/embeddings/yamnet/granollers"
l1 = load_embedding(yamnet_granollers_folder, "yamnet")
(len(l1), l1[0].shape)

(64, (148, 1024))

In [4]:
# Sanity check: load the birdnet embeddings of the same dataset and show
# how many audios there are and the shape of the first one.
birdnet_granollers_folder = "data/embeddings/birdnet/granollers"
l2 = load_embedding(birdnet_granollers_folder, "birdnet")
(len(l2), l2[0].shape)

(64, (8, 1024))

In [7]:
embeddings_folder = "data/embeddings"

# "perch" is left out: load_embedding has no loader for it yet.
# NOTE(review): `emebddings_models` is misspelled; the name is kept because
# other cells may reference it.
emebddings_models = ["birdnet", "yamnet"]
datasets = ["granollers", "sabadell", "sons_al_balco_2020", "sons_al_balco_2021"]

# Every (model, dataset) combination, model-major, in the order listed above.
model_dataset_pairs = [
    (model, dataset)
    for model in emebddings_models
    for dataset in datasets
]

# Embeddings are stored under <embeddings_folder>/<model>/<dataset>.
embeddings = {}
for model, dataset in model_dataset_pairs:
    folder = os.path.join(embeddings_folder, model, dataset)
    embedding = load_embedding(folder, model)
    embeddings[model, dataset] = embedding

In [10]:
# Show the (model, dataset) keys stored in `embeddings`.
embeddings.keys()

dict_keys([('birdnet', 'granollers'), ('birdnet', 'sabadell'), ('birdnet', 'sons_al_balco_2020'), ('birdnet', 'sons_al_balco_2021'), ('yamnet', 'granollers'), ('yamnet', 'sabadell'), ('yamnet', 'sons_al_balco_2020'), ('yamnet', 'sons_al_balco_2021')])

In [None]:
from utils import create_frame_labels

# Both annotation files live in the same folder.
labels_folder = os.path.join("data", "sons_al_balco_labels")
labels_path_2020 = os.path.join(labels_folder, "SAB-AudioTagging-2020.json")
labels_path_2021 = os.path.join(labels_folder, "SAB-AudioTagging-2021.json")

# Per the original note, each result is a dictionary
# {filename: {n_frame: list of labels}}.
labels_2020 = create_frame_labels(labels_filename=labels_path_2020)
labels_2021 = create_frame_labels(labels_filename=labels_path_2021)

labels_2020

Let's plot different embeddings (with different dimensionality reduction methods)

In [None]:
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

# t-SNE is stochastic, and PCA may pick a randomized SVD solver, so both get
# a fixed seed: Restart & Run All then reproduces the same projection.
SEED = 42

# Flatten the per-audio arrays into one (total_frames, embedding_dim) matrix.
# Concatenating along the frame axis gives the same rows, in the same order,
# as stacking and reshaping, without hard-coding the embedding dimension.
# NOTE(review): the yamnet arrays are reportedly zero-padded up to the max
# number of frames; those padding rows are still included here — confirm
# whether they should be masked out before plotting.
frames_yamnet_sob_2020 = np.concatenate(embeddings[("yamnet", "sons_al_balco_2020")], axis=0)
frames_yamnet_sob_2021 = np.concatenate(embeddings[("yamnet", "sons_al_balco_2021")], axis=0)

# for tsne, first reduce n_dimension to 50 using PCA (suggested by sklearn)
# Each fit_transform call refits the estimator, so the two years are
# projected independently of each other.
pca = PCA(n_components=50, random_state=SEED)
pca_yamnet_sob_2020 = pca.fit_transform(frames_yamnet_sob_2020)
pca_yamnet_sob_2021 = pca.fit_transform(frames_yamnet_sob_2021)

tsne = TSNE(n_components=2, random_state=SEED)
bidim_sob_2020 = tsne.fit_transform(pca_yamnet_sob_2020)
bidim_sob_2021 = tsne.fit_transform(pca_yamnet_sob_2021)

bidim_sob_2020.shape, bidim_sob_2021.shape