In [1]:
%load_ext autoreload
%autoreload 2
%load_ext lab_black

# birdclef-2023 exploration

The purpose of this notebook is to explore the data and the task of the [BirdCLEF 2023](https://www.kaggle.com/c/birdclef-2023) competition.

In [28]:
from birdclef.utils import get_spark
from pyspark.sql import functions as F

# `get_spark` is a project helper (birdclef.utils); presumably it returns the
# SparkSession used by every cell below — TODO confirm its configuration there.
spark = get_spark()

In [4]:
# Root of the raw 2023 competition data, relative to the working directory.
birdclef_root = "../data/raw/birdclef-2023"
# List the top-level files and folders found under that root.
! ls {birdclef_root}

eBird_Taxonomy_v2021.csv  test_soundscapes  train_metadata.csv
sample_submission.csv	  train_audio


## sample submission

In [6]:
# Read the sample submission; `header=True` takes the column names from the
# first row of the CSV.
sample_submission_df = spark.read.csv(
    f"{birdclef_root}/sample_submission.csv", header=True
)
# Preview only: limit on the Spark side so at most five rows are collected to
# the driver, instead of converting the whole DataFrame to pandas first.
sample_submission_df.limit(5).toPandas()

                                                                                

23/03/19 23:22:26 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


Unnamed: 0,row_id,abethr1,abhori1,abythr1,afbfly1,afdfly1,afecuc1,affeag1,afgfly1,afghor1,...,yebsto1,yeccan1,yefcan,yelbis1,yenspu1,yertin1,yesbar1,yespet1,yetgre1,yewgre1
0,soundscape_29201_5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,soundscape_29201_10,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,soundscape_29201_15,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## ebird taxonomy

In [8]:
# Read the eBird taxonomy table; `header=True` takes the column names from the
# first row of the CSV.
ebird_taxonomy_df = spark.read.csv(
    f"{birdclef_root}/eBird_Taxonomy_v2021.csv", header=True
)
ebird_taxonomy_df.printSchema()
# Preview only: limit on the Spark side so at most five rows are collected to
# the driver, instead of converting the whole DataFrame to pandas first.
ebird_taxonomy_df.limit(5).toPandas()

root
 |-- TAXON_ORDER: string (nullable = true)
 |-- CATEGORY: string (nullable = true)
 |-- SPECIES_CODE: string (nullable = true)
 |-- PRIMARY_COM_NAME: string (nullable = true)
 |-- SCI_NAME: string (nullable = true)
 |-- ORDER1: string (nullable = true)
 |-- FAMILY: string (nullable = true)
 |-- SPECIES_GROUP: string (nullable = true)
 |-- REPORT_AS: string (nullable = true)



Unnamed: 0,TAXON_ORDER,CATEGORY,SPECIES_CODE,PRIMARY_COM_NAME,SCI_NAME,ORDER1,FAMILY,SPECIES_GROUP,REPORT_AS
0,1,species,ostric2,Common Ostrich,Struthio camelus,Struthioniformes,Struthionidae (Ostriches),Ostriches,
1,6,species,ostric3,Somali Ostrich,Struthio molybdophanes,Struthioniformes,Struthionidae (Ostriches),,
2,7,slash,y00934,Common/Somali Ostrich,Struthio camelus/molybdophanes,Struthioniformes,Struthionidae (Ostriches),,
3,8,species,grerhe1,Greater Rhea,Rhea americana,Rheiformes,Rheidae (Rheas),Rheas,
4,14,species,lesrhe2,Lesser Rhea,Rhea pennata,Rheiformes,Rheidae (Rheas),,


## train metadata

In [9]:
# Read the 2023 training metadata; `header=True` takes the column names from
# the first row of the CSV.
train_metadata_df = spark.read.csv(f"{birdclef_root}/train_metadata.csv", header=True)
train_metadata_df.printSchema()
# Preview only: limit on the Spark side so at most five rows are collected to
# the driver, instead of converting the whole DataFrame to pandas first.
train_metadata_df.limit(5).toPandas()

root
 |-- primary_label: string (nullable = true)
 |-- secondary_labels: string (nullable = true)
 |-- type: string (nullable = true)
 |-- latitude: string (nullable = true)
 |-- longitude: string (nullable = true)
 |-- scientific_name: string (nullable = true)
 |-- common_name: string (nullable = true)
 |-- author: string (nullable = true)
 |-- license: string (nullable = true)
 |-- rating: string (nullable = true)
 |-- url: string (nullable = true)
 |-- filename: string (nullable = true)



Unnamed: 0,primary_label,secondary_labels,type,latitude,longitude,scientific_name,common_name,author,license,rating,url,filename
0,abethr1,[],['song'],4.3906,38.2788,Turdus tephronotus,African Bare-eyed Thrush,Rolf A. de By,Creative Commons Attribution-NonCommercial-Sha...,4.0,https://www.xeno-canto.org/128013,abethr1/XC128013.ogg
1,abethr1,[],['call'],-2.9524,38.2921,Turdus tephronotus,African Bare-eyed Thrush,James Bradley,Creative Commons Attribution-NonCommercial-Sha...,3.5,https://www.xeno-canto.org/363501,abethr1/XC363501.ogg
2,abethr1,[],['song'],-2.9524,38.2921,Turdus tephronotus,African Bare-eyed Thrush,James Bradley,Creative Commons Attribution-NonCommercial-Sha...,3.5,https://www.xeno-canto.org/363502,abethr1/XC363502.ogg
3,abethr1,[],['song'],-2.9524,38.2921,Turdus tephronotus,African Bare-eyed Thrush,James Bradley,Creative Commons Attribution-NonCommercial-Sha...,5.0,https://www.xeno-canto.org/363503,abethr1/XC363503.ogg
4,abethr1,[],"['call', 'song']",-2.9524,38.2921,Turdus tephronotus,African Bare-eyed Thrush,James Bradley,Creative Commons Attribution-NonCommercial-Sha...,4.5,https://www.xeno-canto.org/363504,abethr1/XC363504.ogg


In [15]:
# total number of rows in the 2023 training metadata
train_metadata_df.count()

16941

### is there overlap between 2022 and 2023?

The overlap is very small: only ~500 of the ~17k tracks in the 2023 training set also appear in the 2022 training set (~15k tracks).

In [20]:
# Load last year's training metadata so the two training sets can be compared.
train_metadata_2022_df = spark.read.option("header", True).csv(
    "../data/raw/birdclef-2022/train_metadata.csv"
)

# Rows are matched on the `url` column: keep the 2023 rows whose url also
# occurs in the 2022 metadata.
overlap = train_metadata_df.join(train_metadata_2022_df.select("url"), "url", "inner")

# (number of shared rows, number of rows in the 2022 metadata)
(overlap.count(), train_metadata_2022_df.count())

(490, 14852)

In [19]:
# Which primary labels account for the shared rows, most frequent first.
(
    overlap.groupBy("primary_label")
    .count()
    .orderBy(F.col("count").desc())
    .show()
)

+-------------+-----+
|primary_label|count|
+-------------+-----+
|       comsan|  296|
|       categr|  120|
|       yefcan|   67|
|      eaywag1|    2|
|       barswa|    2|
|      thrnig1|    2|
|      cohmar1|    1|
+-------------+-----+



### a deeper look into the set of species for the 2023 competition

There are 264 species. Every secondary label that appears in the training metadata is also one of the 264 primary labels.

In [23]:
# The first submission column is `row_id`; every remaining column is a species code.
species = sample_submission_df.columns[1:]
len(species)

264

In [24]:
# number of distinct values in the `primary_label` column of the training metadata
train_metadata_df.select("primary_label").distinct().count()

264

In [49]:
import ast


@F.udf(returnType="array<string>")
def parse_labels(label_str: str):
    """Parse a stringified Python list of labels into a real list.

    The label columns are read from CSV as plain text holding a Python list
    literal (e.g. ``"['rbsrob1']"`` or ``"[]"``). The text is parsed with
    ``ast.literal_eval``, which only accepts literals and never executes code.

    Args:
        label_str: The textual list literal, or ``None`` for a null cell.

    Returns:
        The parsed value (a list of strings for well-formed input), or
        ``None`` when the input is ``None`` or blank, so that a missing cell
        stays null instead of failing the whole job.

    Raises:
        ValueError, SyntaxError: If the text is non-blank but is not a valid
            Python literal.
    """
    # Null cells arrive as None, and literal_eval rejects both None and
    # empty/whitespace-only strings, so handle them before parsing.
    if label_str is None or not label_str.strip():
        return None
    return ast.literal_eval(label_str)


# Turn the textual `secondary_labels` column into a real array<string> column.
labels = train_metadata_df.select(
    F.col("primary_label"),
    parse_labels(F.col("secondary_labels")).alias("secondary_labels"),
)
labels.show(10)
labels.printSchema()

# One output row per entry of each `secondary_labels` array, so the individual
# secondary labels can be checked against the valid set.
secondary = labels.select(
    F.explode(F.col("secondary_labels")).alias("secondary_label")
)
secondary.show(10)

# How many times each secondary label occurs.
secondary.groupBy("secondary_label").agg(F.count("*").alias("count")).show()

+-------------+----------------+
|primary_label|secondary_labels|
+-------------+----------------+
|      abethr1|              []|
|      abethr1|              []|
|      abethr1|              []|
|      abethr1|              []|
|      abethr1|              []|
|      abethr1|       [rbsrob1]|
|      abethr1|              []|
|      abethr1|              []|
|      abethr1|              []|
|      abethr1|              []|
+-------------+----------------+
only showing top 10 rows

root
 |-- primary_label: string (nullable = true)
 |-- secondary_labels: array (nullable = true)
 |    |-- element: string (containsNull = true)

+---------------+
|secondary_label|
+---------------+
|        rbsrob1|
|        eswdov1|
|         helgui|
|         rindov|
|        combul2|
|         rindov|
|        blbpuf2|
|        fotdro5|
|        reedov1|
|        fotdro5|
+---------------+
only showing top 10 rows

+---------------+-----+
|secondary_label|count|
+---------------+-----+
|        ccbeat1

In [53]:
# Anti-join: keep only the secondary labels that never occur as a primary label.
# An empty table means every secondary label is also a primary label.
(
    secondary.selectExpr("secondary_label AS label")
    .distinct()
    .join(
        labels.selectExpr("primary_label AS label").distinct(),
        "label",
        "left_anti",
    )
    .show()
)

+-----+
|label|
+-----+
+-----+



### what's the overlap between the competition species and the birdnet model?

In [54]:
from pathlib import Path
import json

# The BirdNET checkpoint lists its class labels one per line; a separate JSON
# file maps each of those labels to a species code.
labels_path = (
    "../vendor/BirdNET-Analyzer/checkpoints/V2.2/BirdNET_GLOBAL_3K_V2.2_Labels.txt"
)
mapping_path = "../vendor/BirdNET-Analyzer/eBird_taxonomy_codes_2021E.json"

# Decode explicitly as UTF-8 rather than relying on the platform's default
# encoding (assumes the vendored files are UTF-8 — confirm if decoding fails).
# The list gets its own name so it does not overwrite the `labels` DataFrame
# built from the training metadata earlier in this notebook.
birdnet_labels = Path(labels_path).read_text(encoding="utf-8").splitlines()
mapping = json.loads(Path(mapping_path).read_text(encoding="utf-8"))

# Translate every BirdNET label into its species code and peek at the first ten.
mapped_labels = [mapping[label] for label in birdnet_labels]
mapped_labels[:10]

['rufwar1',
 'yebwar1',
 'watgua1',
 'spchon1',
 'lesred1',
 'comred',
 'hoared',
 'yertho1',
 'tastho1',
 'brotho1']

In [58]:
# Competition species: every sample-submission column after the leading `row_id`.
species = sample_submission_df.columns[1:]

# Fraction of the competition species whose code also appears among the
# codes mapped from the BirdNET labels.
len(set(species).intersection(mapped_labels)) / len(species)

0.4696969696969697