In [4]:
from pyspark.sql import functions as F, Window
from birdclef.utils import get_spark

In [5]:
spark_path = (
    "/home/nzhon/data/processed/birdclef-2022/birdnet-embeddings-with-neighbors/v1"
)
spark = get_spark(memory="2g")
df = spark.read.parquet(spark_path)
df.count()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/03/26 17:25:15 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/03/26 17:25:16 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


                                                                                

276748

In [6]:
# figure out the top 3 species
top_species = (
    df.groupBy("primary_label")
    .count()
    .orderBy("count")
    .select("primary_label")
    .limit(3)
)
top_species.show()

                                                                                

+-------------+
|primary_label|
+-------------+
|       coopet|
|       puaioh|
|       bubsan|
+-------------+



3

In [7]:
# take 10 rows from the first 3 species ordered by name

subset = (
    df
    # keep the top 3 species
    .join(top_species, on="primary_label", how="inner")
    .withColumn(
        "rank", F.row_number().over(Window.partitionBy("primary_label").orderBy("id"))
    )
    .where("rank < 10")
)

In [9]:
subset.show()
subset.count()

                                                                                

+-------------+------+-------------------+---------+-------+----------+-------------+-------------------+--------------------+--------------------+---------------+--------------------+--------------------+----+
|primary_label|    id|           filename|start_sec|end_sec|confidence|birdnet_label|birdnet_common_name|                 emb|    secondary_labels|           type|           neighbors|           distances|rank|
+-------------+------+-------------------+---------+-------+----------+-------------+-------------------+--------------------+--------------------+---------------+--------------------+--------------------+----+
|       bubsan| 30677|bubsan/XC435876.ogg|      0.0|    3.0|    0.1129|       laugul|      Laughing Gull|[1.0587984, 1.633...|                  []|['flight call']|[30677, 30678, 30...|[0.0, 0.0, 0.0, 5...|   1|
|       bubsan| 30678|bubsan/XC435876.ogg|      0.0|    3.0|    0.2012|       blkter|         Black Tern|[1.0587984, 1.633...|                  []|['flight 

26

In [11]:
subset.repartition(1).show()
subset.repartition(1).count()
subset.repartition(1).write.parquet(
    "/home/nzhon/birdclef-2023/tests/workflows/data/cluster_plot_test_data",
    compression="snappy",
)

                                                                                

+-------------+------+-------------------+---------+-------+----------+-------------+-------------------+--------------------+--------------------+---------------+--------------------+--------------------+----+
|primary_label|    id|           filename|start_sec|end_sec|confidence|birdnet_label|birdnet_common_name|                 emb|    secondary_labels|           type|           neighbors|           distances|rank|
+-------------+------+-------------------+---------+-------+----------+-------------+-------------------+--------------------+--------------------+---------------+--------------------+--------------------+----+
|       bubsan| 30677|bubsan/XC435876.ogg|      0.0|    3.0|    0.1129|       laugul|      Laughing Gull|[1.0587984, 1.633...|                  []|['flight call']|[30677, 30678, 30...|[0.0, 0.0, 0.0, 5...|   1|
|       bubsan| 30678|bubsan/XC435876.ogg|      0.0|    3.0|    0.2012|       blkter|         Black Tern|[1.0587984, 1.633...|                  []|['flight 

                                                                                