In [16]:
%load_ext autoreload
%autoreload 2
%load_ext lab_black

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
The lab_black extension is already loaded. To reload it, use:
  %reload_ext lab_black


In [17]:
from pyspark.sql import functions as F, Window
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from umap import UMAP
from sklearn.decomposition import PCA

In [18]:
from birdclef.utils import get_spark

spark = get_spark(memory="2g")

In [19]:
df = spark.read.parquet(
    "../data/processed/birdclef-2022/birdnet-embeddings-with-neighbors/v1"
)
df.printSchema()
df.show(n=1, vertical=True)

root
 |-- id: integer (nullable = true)
 |-- filename: string (nullable = true)
 |-- start_sec: double (nullable = true)
 |-- end_sec: double (nullable = true)
 |-- confidence: double (nullable = true)
 |-- birdnet_label: string (nullable = true)
 |-- birdnet_common_name: string (nullable = true)
 |-- emb: array (nullable = true)
 |    |-- element: float (containsNull = true)
 |-- primary_label: string (nullable = true)
 |-- secondary_labels: string (nullable = true)
 |-- type: string (nullable = true)
 |-- neighbors: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- distances: array (nullable = true)
 |    |-- element: double (containsNull = true)

-RECORD 0-----------------------------------
 id                  | 239173               
 filename            | skylar/XC380395.ogg  
 start_sec           | 396.0                
 end_sec             | 399.0                
 confidence          | 0.1293               
 birdnet_label       | skylar               
 b

* https://ebird.org/species/brnowl
* https://ebird.org/species/mallar3

In [20]:
df.groupBy("primary_label").count().orderBy(F.desc("count")).show(n=10)

+-------------+-----+
|primary_label|count|
+-------------+-----+
|       normoc|25068|
|       houspa|18173|
|       skylar|12849|
|       norcar|12216|
|       wesmea| 9479|
|       brnowl| 8186|
|      mallar3| 7489|
|       houfin| 6616|
|       commyn| 6524|
|       dunlin| 6069|
+-------------+-----+
only showing top 10 rows



In [27]:
subset_species = ["normoc", "houspa", "skylar"]
subset = df.where(F.col("primary_label").isin(subset_species))
# take segment from the track that has the highest confidence
subset = (
    subset.withColumn(
        "rank",
        F.row_number().over(
            Window.partitionBy("filename").orderBy(F.desc("confidence"))
        ),
    )
    .where("rank = 1")
    .drop("rank", "id", "neighbors", "distances")
    .orderBy("filename", "start_sec")
    .withColumn("id", F.row_number().over(Window.orderBy("filename", "start_sec")))
)
subset.groupBy("primary_label", "birdnet_label").count().orderBy(F.desc("count")).show(
    n=10
)

+-------------+-------------+-----+
|primary_label|birdnet_label|count|
+-------------+-------------+-----+
|       skylar|       skylar|  370|
|       houspa|       houspa|  320|
|       normoc|       normoc|  106|
|       houspa|       eutspa|   20|
|       skylar|      orisky1|   20|
|       normoc|       tromoc|   16|
|       normoc|       crithr|   16|
|       houspa|      spaspa1|   11|
|       skylar|      tawpip1|   11|
|       normoc|       cubthr|   10|
+-------------+-------------+-----+
only showing top 10 rows



In [28]:
from pynndescent import NNDescent

# recompute the ids and neighbors
X = np.stack(subset.select("emb").toPandas().emb)
index = NNDescent(X, verbose=True)
neighbors, distances = index.query(X, k=20)
query_df = pd.DataFrame(
    dict(
        id=subset.select("id").toPandas().id,
        neighbors=neighbors.tolist(),
        distances=distances.tolist(),
    )
)

subset = subset.join(spark.createDataFrame(query_df), on="id")

Sun Jan 15 22:08:45 2023 Building RP forest with 11 trees
Sun Jan 15 22:08:47 2023 NN descent for 10 iterations
	 1  /  10
	 2  /  10
	 3  /  10
	 4  /  10
	Stopping threshold met -- exiting after 4 iterations
Sun Jan 15 22:08:57 2023 Worst tree score: 0.83783784
Sun Jan 15 22:08:57 2023 Mean tree score: 0.85882197
Sun Jan 15 22:08:57 2023 Best tree score: 0.87363039
Sun Jan 15 22:08:59 2023 Forward diversification reduced edges from 41070 to 6612
Sun Jan 15 22:09:01 2023 Reverse diversification reduced edges from 6612 to 6612
Sun Jan 15 22:09:03 2023 Degree pruning reduced edges from 7454 to 7454
Sun Jan 15 22:09:03 2023 Resorting data and graph based on tree order
Sun Jan 15 22:09:03 2023 Building and compiling search function


In [29]:
subset.toPandas().to_parquet("../data/processed/2022-01-15-assessment.parquet")

In [30]:
pd.read_parquet("../data/processed/2022-01-15-assessment.parquet").head()

Unnamed: 0,id,filename,start_sec,end_sec,confidence,birdnet_label,birdnet_common_name,emb,primary_label,secondary_labels,type,neighbors,distances
0,1,houspa/XC121738.ogg,51.0,54.0,0.7677,eutspa,Eurasian Tree Sparrow,"[1.4410583, 1.5985851, 1.1828262, 0.55653006, ...",houspa,[],['call'],"[0, 57, 292, 24, 27, 269, 199, 445, 111, 196, ...","[0.0, 7.545195579528809, 7.966801643371582, 8...."
1,2,houspa/XC124181.ogg,0.0,3.0,0.9652,houspa,House Sparrow,"[2.0463464, 1.2236276, 0.70286447, 1.3113645, ...",houspa,[],['call'],"[1, 237, 244, 255, 83, 264, 198, 240, 31, 322,...","[0.0, 5.581757068634033, 6.375831604003906, 6...."
2,3,houspa/XC131300.ogg,18.0,21.0,0.8707,eutspa,Eurasian Tree Sparrow,"[1.5261635, 1.939797, 1.4180027, 0.9176945, 1....",houspa,[],"['call', 'male', 'song']","[2, 441, 263, 401, 269, 323, 304, 31, 46, 192,...","[0.0, 7.3610758781433105, 7.4461188316345215, ..."
3,4,houspa/XC133262.ogg,48.0,51.0,0.8861,whcspa,White-crowned Sparrow,"[0.7386549, 1.3903569, 1.2590201, 1.2009335, 0...",houspa,[],"['begging call', 'juvenile']","[3, 1127, 528, 1102, 79, 1155, 1208, 642, 21, ...","[0.0, 8.889424324035645, 9.17027759552002, 9.2..."
4,5,houspa/XC139186.ogg,0.0,3.0,0.738,houspa,House Sparrow,"[1.0389303, 1.9639286, 0.81923, 1.1145979, 1.2...",houspa,[],"['male', 'song']","[4, 269, 305, 82, 346, 150, 192, 460, 367, 149...","[0.0, 5.821669101715088, 5.945547580718994, 5...."
