In [1]:
%load_ext autoreload
%autoreload 2
%load_ext lab_black

from birdclef.utils import get_spark

# Obtain the Spark handle from the project helper; later cells use it
# through `spark.read` to load the CSV-style text files and the parquet data.
spark = get_spark()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/01/02 04:53:19 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/01/02 04:53:20 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [19]:
from pyspark.sql import functions as F
from pathlib import Path
import numpy as np


@F.udf(returnType="struct<species: string, id: string>")
def parse_meta_path(path: str) -> dict:
    """Extract the species and recording id from an embedding file path.

    The last two path components are read as ``<species>/<file name>``; the
    recording id is the portion of the file name before its first dot
    (file names look like ``{ID}.birdnet.embeddings.txt``).

    Args:
        path: Path of the embedding text file.

    Returns:
        A dict with ``species`` and ``id`` keys, matching the struct
        declared as the UDF's return type.
    """
    species_dir, file_name = Path(path).parts[-2:]
    recording_id, _, _ = file_name.partition(".")
    return {"species": species_dir, "id": recording_id}


@F.udf(returnType="array<float>")
def parse_emb_text(emb_text: str) -> list:
    """Parse a comma-separated embedding string into a list of floats.

    Args:
        emb_text: Comma-separated numbers, or ``None`` when the column value
            is null.

    Returns:
        The parsed values as a plain Python list (the form the declared
        ``array<float>`` return type is built from), or ``None`` when
        ``emb_text`` is ``None``.
    """
    # A null column value reaches the UDF as None, and np.fromstring raises
    # on it; propagate the null instead of failing the whole job.
    if emb_text is None:
        return None
    return np.fromstring(emb_text, dtype=float, sep=",").tolist()


# Read every tab-separated embedding text file (one sub-directory level below
# the embeddings folder), derive `species` and `id` from each row's source
# file path, and parse the comma-separated embedding text into an array.
df = (
    spark.read.csv(
        "../data/intermediate/birdclef-2022/birdnet/embeddings/*/*.txt",
        schema="start_sec FLOAT, end_sec FLOAT, emb_text STRING",
        sep="\t",
    )
    .withColumn("path", parse_meta_path(F.input_file_name()))
    .select(
        F.col("start_sec"),
        F.col("end_sec"),
        # Expand the parsed struct into its `species` and `id` columns.
        F.col("path.*"),
        parse_emb_text(F.col("emb_text")).alias("emb"),
    )
)
df.show(truncate=80)

+---------+-------+-------+--------+--------------------------------------------------------------------------------+
|start_sec|end_sec|species|      id|                                                                             emb|
+---------+-------+-------+--------+--------------------------------------------------------------------------------+
|      0.0|    3.0| akepa1|XC147002|[1.8758332, 1.4380125, 0.50097305, 0.44348738, 0.5634202, 0.6807248, 0.734888...|
|      3.0|    6.0| akepa1|XC147002|[1.5250791, 0.70023537, 1.3296438, 0.5937032, 0.72436666, 1.0444742, 0.859849...|
|      6.0|    9.0| akepa1|XC147002|[1.5759941, 1.5047598, 1.0167071, 0.15785557, 0.49401703, 0.5429743, 1.005159...|
|      9.0|   12.0| akepa1|XC147002|[1.8822666, 0.84685045, 1.1614047, 0.5988824, 0.526604, 0.95542836, 0.5423438...|
|     12.0|   15.0| akepa1|XC147002|[1.2048845, 0.9359792, 0.593625, 0.41160223, 0.54027706, 0.29067588, 0.577040...|
|     15.0|   18.0| akepa1|XC147002|[1.4317489, 1.183536

In [21]:
# Load the processed embeddings stored as parquet and preview the first rows,
# truncating wide cells (the embedding arrays) to 80 characters.
# NOTE(review): presumably this dataset was produced from the same text
# embeddings parsed in the previous cell -- confirm against the writer job.
processed_df = spark.read.parquet("../data/processed/birdclef-2022/birdnet-embeddings")
processed_df.show(truncate=80)

+---------+-------+-------+--------+--------------------------------------------------------------------------------+
|start_sec|end_sec|species|      id|                                                                             emb|
+---------+-------+-------+--------+--------------------------------------------------------------------------------+
|     78.0|   81.0| akiapo|XC124705|[1.0485483, 0.6709058, 1.0497909, 0.6976401, 0.99132556, 0.35563138, 0.711254...|
|    240.0|  243.0| moudov|XC316787|[1.5284686, 1.3127537, 1.6092612, 1.170875, 1.3254058, 1.2639927, 0.96800447,...|
|     15.0|   18.0| akepa1|XC147002|[1.4317489, 1.1835364, 1.059159, 0.7358842, 0.9712982, 0.3256574, 0.47709683,...|
|     39.0|   42.0| moudov|XC268215|[0.40608868, 0.6424663, 0.83099174, 0.58940667, 0.68857175, 1.78612, 0.379576...|
|     66.0|   69.0| moudov|XC268209|[0.9816236, 0.8503089, 0.72542715, 0.6000835, 1.2116809, 0.8153942, 0.7872277...|
|     15.0|   18.0| akiapo|XC124705|[0.98117536, 0.75242