# audio track lengths for birdclef-2023

Let's take a look at how long these tracks are, and see if we should split them up.

In [2]:
# Notebook setup: enable IPython extensions, import helpers, and create the
# Spark handle that the cells below read data through.

# autoreload mode 2: re-import all modules before each cell execution, so
# edits to local packages are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# NOTE(review): lab_black is a third-party extension (presumably auto-formats
# cells with black); it must be installed in the kernel environment.
%load_ext lab_black

from birdclef.utils import get_spark
from pyspark.sql import functions as F

# Project helper; presumably returns a configured SparkSession (it is used
# below via `spark.read.parquet`) — confirm its settings in birdclef.utils.
spark = get_spark()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/05/01 06:45:08 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Load the per-track duration table (one row per training audio file, with
# `filename` and `duration` columns) and preview the first few rows.
train_durations_uri = (
    "gs://birdclef-2023/data/processed/birdclef-2023/train_durations_v2.parquet"
)
df = spark.read.parquet(train_durations_uri)
df.show(5)

                                                                                

+--------------------+------------------+
|            filename|          duration|
+--------------------+------------------+
|abethr1/XC128013.ogg| 45.60979591836735|
|abethr1/XC363501.ogg|18.677596371882085|
|abethr1/XC363502.ogg| 38.76575963718821|
|abethr1/XC363503.ogg|29.257188208616782|
|abethr1/XC363504.ogg| 42.34453514739229|
+--------------------+------------------+
only showing top 5 rows



In [4]:
# Descriptive statistics (count, mean, stddev, min, quartiles, max) for every
# column of the duration table.
duration_summary = df.summary()
duration_summary.show()

[Stage 2:>                                                          (0 + 1) / 1]

+-------+--------------------+------------------+
|summary|            filename|          duration|
+-------+--------------------+------------------+
|  count|               16941|             16941|
|   mean|                null| 40.88558617402471|
| stddev|                null| 69.61028594350307|
|    min|abethr1/XC128013.ogg|0.5480272108843537|
|    25%|                null|12.773877551020409|
|    50%|                null| 24.60734693877551|
|    75%|                null| 45.87106575963719|
|    max|yewgre1/XC753190.ogg|2373.5280272108844|
+-------+--------------------+------------------+



                                                                                

In [5]:
# List the longest tracks first to see how extreme the upper tail is.
longest_first = df.sort(F.col("duration").desc())
longest_first.show()

+--------------------+------------------+
|            filename|          duration|
+--------------------+------------------+
| grecor/XC629875.ogg|2373.5280272108844|
| wlwwar/XC475384.ogg|2043.1673469387756|
| grecor/XC627838.ogg|1896.5280272108844|
|thrnig1/XC660166.ogg|1787.8465759637188|
|thrnig1/XC660170.ogg|1422.9159183673469|
|combuz1/XC579931.ogg|1391.5951020408163|
|thrnig1/XC725851.ogg|1380.0220408163266|
| greegr/XC558126.ogg|1255.0560090702947|
|thrnig1/XC667666.ogg|1144.0320181405896|
| lawgol/XC661823.ogg| 915.8160090702947|
|thrnig1/XC372879.ogg| 906.1090249433106|
|eubeat1/XC392195.ogg| 873.2996371882086|
|eubeat1/XC392182.ogg| 873.2212244897959|
|eubeat1/XC392184.ogg| 873.2212244897959|
|eubeat1/XC392188.ogg| 873.0645351473922|
|eubeat1/XC392183.ogg| 872.9338775510204|
|eubeat1/XC392185.ogg| 872.8293877551021|
|eubeat1/XC392191.ogg| 872.8293877551021|
|eubeat1/XC392193.ogg| 872.7771882086167|
|eubeat1/XC392192.ogg| 872.7771882086167|
+--------------------+------------------+

In [6]:
# Keep only the tracks at or above the 60*3 cutoff (three minutes, assuming
# `duration` is in seconds — TODO confirm) and report how many there are.
long_track_condition = "duration >= 60*3"
res = df.filter(long_track_condition)
print(res.count())

411


In [7]:
from pathlib import Path
import shutil

output_path = "../data/processed/birdclef-2023/train_embeddings"


def _belongs_to_track(entry_name, track_name):
    """Return True when `entry_name` is an output of exactly `track_name`.

    The glob below is a prefix match, so the pattern for track ``XC12801``
    also yields entries of ``XC128013``. An entry is accepted only if it
    equals the track name or continues with a non-digit character (an
    extension, ``_``, ``-``, ...), i.e. the numeric id is not longer.
    """
    if not entry_name.startswith(track_name):
        return False
    remainder = entry_name[len(track_name) :]
    # `remainder[:1]` is "" on an exact match, and "".isdigit() is False.
    return not remainder[:1].isdigit()


# we need to restart from scratch...
# DESTRUCTIVE: permanently deletes every previously generated output under
# `output_path` that belongs to one of the long tracks selected in `res`.
n_files = 0
n_dirs = 0
for row in res.collect():
    # e.g. "grecor/XC629875.ogg" -> "grecor/XC629875"
    stem = row.filename.replace(".ogg", "")
    track_name = Path(stem).name
    # Snapshot the matches before deleting anything: removing entries while
    # the lazy glob generator is still walking the tree is fragile.
    matches = sorted(Path(output_path).glob(f"**/{stem}*"))
    for path in matches:
        if not _belongs_to_track(path.name, track_name):
            # Prefix collision with a different, longer track id: keep it.
            continue
        if not path.exists():
            # Already removed together with a parent directory.
            continue
        if path.is_file():
            path.unlink()
            n_files += 1
        else:
            shutil.rmtree(path)
            n_dirs += 1
print(f"Removed {n_files} files and {n_dirs} directories")

Removed 684 files and 14 directories


In [11]:
# Total audio and number of files per species. The species code is the first
# path component of `filename` (e.g. "abethr1/XC128013.ogg" -> "abethr1").
# Note that the aggregated sum reuses the column name `duration`, so the final
# filter applies to the per-species total, not to individual tracks.
species_code = F.split(F.col("filename"), "/")[0]
per_species = (
    df.withColumn("species", species_code)
    .groupBy("species")
    .agg(
        F.sum("duration").alias("duration"),
        F.count("*").alias("n_files"),
    )
)
ranked_species = per_species.orderBy(F.col("duration").asc())
ranked_species.filter("duration > 3*100").show()

+-------+------------------+-------+
|species|          duration|n_files|
+-------+------------------+-------+
|refwar2|304.09233560090706|      9|
|refbar2| 305.0696598639456|     12|
|bltapa1| 313.6170975056689|     11|
|blwlap1|320.21691609977324|     15|
|malkin1| 324.5280725623584|     15|
|brcale1|325.31981859410433|     10|
|whbcan1|333.95074829931974|     10|
|macshr1| 334.3459410430839|      6|
|brrwhe3| 336.1178684807257|      8|
|easmog1| 343.5940589569161|     15|
|spfbar1| 347.6136507936508|     13|
|mcptit1|350.91909297052155|      8|
|yespet1| 360.6465306122449|     15|
|palfly2| 364.1446258503401|     14|
|whbwea1| 366.0772335600907|     19|
|huncis1| 366.7497505668935|     16|
|yebduc1| 371.2188662131519|     17|
|blhher1|386.93877551020404|     16|
|hipbab1| 395.6754648526078|      8|
|whihel1| 399.2384126984127|     14|
+-------+------------------+-------+
only showing top 20 rows

