Preparing Dataset

In [23]:
from birdclef.utils import get_spark
import pyspark.sql.functions as F

In [24]:
spark = get_spark()

In [25]:
df = spark.read.parquet("../data/processed/birdclef-2023/consolidated_v3")
df.printSchema()
df.count()
df.show(n=5, vertical=True)

root
 |-- species: string (nullable = true)
 |-- track_stem: string (nullable = true)
 |-- track_type: string (nullable = true)
 |-- track_name: string (nullable = true)
 |-- embedding: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- prediction_vec: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- predictions: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- rank: long (nullable = true)
 |    |    |-- index: long (nullable = true)
 |    |    |-- label: string (nullable = true)
 |    |    |-- mapped_label: string (nullable = true)
 |    |    |-- probability: double (nullable = true)
 |-- start_time: long (nullable = true)
 |-- energy: double (nullable = true)

-RECORD 0------------------------------
 species        | grecor               
 track_stem     | XC629875_part013     
 track_type     | source1              
 track_name     | grecor/XC629875_p... 
 embedding      | [0.75528019666671..

In [26]:
df.groupBy("species").count().orderBy(F.desc("count")).show(20)
df.groupBy("species").count().orderBy(F.asc("count")).show(20)
print(len(set(list(df.select("species").collect()))))

+-------+------+
|species| count|
+-------+------+
|thrnig1|104440|
| wlwwar| 57260|
|eubeat1| 52810|
| hoopoe| 42995|
|combuz1| 42650|
|cohmar1| 35210|
| barswa| 33355|
|eaywag1| 29165|
| comsan| 27180|
|combul2| 24855|
| woosan| 23710|
|rbsrob1| 19910|
|blakit1| 19880|
| litegr| 16710|
| grecor| 16030|
|somgre1| 15280|
|gnbcam2| 13945|
|rerswa1| 13660|
|colsun2| 12545|
|carcha1| 12440|
+-------+------+
only showing top 20 rows

+-------+-----+
|species|count|
+-------+-----+
|afpkin1|   15|
|whctur2|   20|
|whhsaw1|   20|
|golher1|   25|
|rehblu1|   40|
|lotlap1|   40|
|brtcha1|   50|
|dotbar1|   60|
|lotcor1|   60|
|crefra2|   70|
|brcwea1|   75|
|yebsto1|   90|
|fatwid1|  155|
|gobsta5|  160|
|palpri1|  170|
|darter3|  170|
|witswa1|  195|
|sacibi2|  200|
|rufcha2|  225|
|joygre1|  255|
+-------+-----+
only showing top 20 rows

264


In [27]:
from pyspark.sql import Window, functions as F

# keep the track_type for each
highest_energy_channel = (
    df
    # get the track stem without the part
    .withColumn("original_track_stem", F.split(F.col("track_stem"), "_").getItem(0))
    .where("track_type != 'original'")
    # get the track type that has the most energy
    .withColumn(
        "rank",
        F.rank().over(
            Window.partitionBy("original_track_stem").orderBy(F.desc("energy"))
        ),
    )
    # keep the first row
    .where(F.col("rank") == 1)
    # drop the rank column
    .select("species", "track_stem", "track_type")
    .distinct()
)

highest_energy_channel.show()

+-------+----------------+----------+
|species|      track_stem|track_type|
+-------+----------------+----------+
| reccor|        XC312724|   source0|
| litegr|        XC333916|   source2|
|afrjac1|        XC585200|   source0|
|yertin1|        XC634144|   source1|
| hoopoe|        XC240325|   source3|
|cohmar1|        XC294118|   source3|
|combul2|        XC320088|   source3|
|colsun2|        XC433381|   source2|
|yebgre1|        XC131632|   source3|
|slbgre1|        XC284897|   source0|
| litegr|        XC333970|   source3|
|combul2|XC440328_part001|   source0|
|grewoo2|        XC479374|   source3|
| gargan|        XC547205|   source2|
| greegr|        XC674495|   source3|
|rbsrob1|        XC756771|   source0|
|wookin1|        XC116740|   source0|
| yefcan|        XC206825|   source0|
|combul2|        XC307654|   source0|
|btweye2|        XC338487|   source0|
+-------+----------------+----------+
only showing top 20 rows



In [28]:
from pyspark.sql.functions import concat

# get the highest predictions by exploding the values

exploded_embeddings = (
    df
    # join against the highest energy channel
    .join(
        highest_energy_channel,
        on=["species", "track_stem", "track_type"],
        how="inner",
    )
    # explode the embeddings, these are ordered by confidence
    .withColumn("predictions", F.explode("predictions")).select(
        concat(df.track_name, df.start_time, df.track_type).alias("track_id"),
        "species",
        "track_stem",
        "track_type",
        "start_time",
        "track_name",
        "embedding",
        "predictions.*",
    )
    # simplifying assumption: we assume the prediction with the highest confidence is the true label
    .where("rank = 0 and probability > 0.4")
)
exploded_embeddings.drop("embedding").show()

+--------------------+-------+----------------+----------+----------+--------------------+----+-----+--------------------+------------+-------------------+
|            track_id|species|      track_stem|track_type|start_time|          track_name|rank|index|               label|mapped_label|        probability|
+--------------------+-------+----------------+----------+----------+--------------------+----+-----+--------------------+------------+-------------------+
|thrnig1/XC736751_...|thrnig1|        XC736751|   source3|       144|thrnig1/XC736751_...|   0|   26|Acrocephalus arun...|     grrwar1| 0.5790408849716187|
|blacuc1/XC420239_...|blacuc1|        XC420239|   source3|       105|blacuc1/XC420239_...|   0|  883|Cuculus clamosus_...|     blacuc1| 0.9967339038848877|
|thrnig1/XC567040_...|thrnig1|XC567040_part003|   source3|        39|thrnig1/XC567040_...|   0| 1686|Luscinia luscinia...|     thrnig1| 0.7975045442581177|
|yewgre1/XC470576_...|yewgre1|        XC470576|   source3|      

In [29]:
from collections import Counter

species = [s["species"] for s in exploded_embeddings.select("species").collect()]

cnts = Counter(species)
for c in cnts:
    if cnts[c] == 1:
        species.remove(c)

species = set(species)
print(len(species))

253


In [30]:
rare_species = (
    df
    # join against the highest energy channel
    .join(
        highest_energy_channel,
        on=["species", "track_stem", "track_type"],
        how="inner",
    )
    # explode the embeddings, these are ordered by confidence
    .withColumn("predictions", F.explode("predictions")).select(
        concat(df.track_name, df.start_time, df.track_type).alias("track_id"),
        "species",
        "track_stem",
        "track_type",
        "start_time",
        "track_name",
        "embedding",
        "predictions.*",
    )
    .filter(~df.species.isin(species))
    # simplifying assumption: we assume the prediction with the highest confidence is the true label
    .where("rank = 0 and probability > 0.05")
)
rare_species.drop("embedding").show()
print(sorted(set(([s["species"] for s in rare_species.select("species").collect()]))))


+--------------------+-------+----------+----------+----------+--------------------+----+-----+--------------------+------------+--------------------+
|            track_id|species|track_stem|track_type|start_time|          track_name|rank|index|               label|mapped_label|         probability|
+--------------------+-------+----------+----------+----------+--------------------+----+-----+--------------------+------------+--------------------+
|blksaw1/XC292874_...|blksaw1|  XC292874|   source0|         0|blksaw1/XC292874_...|   0| 2935|Sturnus unicolor_...|     sposta1|0.060267090797424316|
|blksaw1/XC516764_...|blksaw1|  XC516764|   source2|         3|blksaw1/XC516764_...|   0| 2338|Phylloscopus sibi...|      woowar| 0.05070174112915993|
|blksaw1/XC236860_...|blksaw1|  XC236860|   source2|         3|blksaw1/XC236860_...|   0| 3005|Tauraco porphyreo...|     puctur2|0.052125513553619385|
|blksaw1/XC516846_...|blksaw1|  XC516846|   source1|         6|blksaw1/XC516846_...|   0|  123

In [31]:
exploded_embeddings = exploded_embeddings.union(rare_species)

exploded_embeddings.drop("embedding").show()

print(len(sorted(set(([s["species"] for s in exploded_embeddings.select("species").collect()])))))

+--------------------+-------+----------------+----------+----------+--------------------+----+-----+--------------------+------------+-------------------+
|            track_id|species|      track_stem|track_type|start_time|          track_name|rank|index|               label|mapped_label|        probability|
+--------------------+-------+----------------+----------+----------+--------------------+----+-----+--------------------+------------+-------------------+
|thrnig1/XC736751_...|thrnig1|        XC736751|   source3|       144|thrnig1/XC736751_...|   0|   26|Acrocephalus arun...|     grrwar1| 0.5790408849716187|
|blacuc1/XC420239_...|blacuc1|        XC420239|   source3|       105|blacuc1/XC420239_...|   0|  883|Cuculus clamosus_...|     blacuc1| 0.9967339038848877|
|thrnig1/XC567040_...|thrnig1|XC567040_part003|   source3|        39|thrnig1/XC567040_...|   0| 1686|Luscinia luscinia...|     thrnig1| 0.7975045442581177|
|yewgre1/XC470576_...|yewgre1|        XC470576|   source3|      

In [32]:
# percentage of clips that have a prediction of any kind
positive = exploded_embeddings.count()
total = df.count()
positive / total * 100

6.135578799859867

In [66]:
from pyspark.sql.functions import rand

length = len(exploded_embeddings.select("embedding").take(1)[0]["embedding"])

# explode_preds_bird_calls = exploded_embeddings.drop("probability")
explode_preds_bird_calls = exploded_embeddings.dropDuplicates(["track_id"])
data = explode_preds_bird_calls.select(
    ["track_id", "species"]
    + [
        explode_preds_bird_calls.embedding[i].alias("embedding" + str(i))
        for i in range(length)
    ]
)
data = data.orderBy(rand())
data = data.toPandas()
print(len(data))
data.head()

73551


Unnamed: 0,track_id,species,embedding0,embedding1,embedding2,embedding3,embedding4,embedding5,embedding6,embedding7,...,embedding310,embedding311,embedding312,embedding313,embedding314,embedding315,embedding316,embedding317,embedding318,embedding319
0,tafpri1/XC700880_source0.mp357source0,tafpri1,1.046905,1.548031,0.315757,0.684051,1.148972,0.572682,1.134159,1.361273,...,1.237676,1.691442,1.06055,0.667959,0.351222,1.243348,1.484354,0.494428,0.99716,0.601789
1,helgui/XC320438_part001_source0.mp384source0,helgui,1.932394,1.415659,1.282802,0.386913,1.05275,0.798684,0.531243,1.28052,...,1.723733,1.448739,0.084623,1.677057,0.209935,1.160101,0.79523,1.230336,1.644855,1.258054
2,gargan/XC553738_source0.mp375source0,gargan,1.49413,1.236415,0.29406,0.324693,0.757699,0.868039,0.810581,0.527687,...,1.360156,0.767236,0.639994,1.654666,0.181551,0.690551,1.324964,0.440684,1.319565,0.406765
3,afpwag1/XC117150_source0.mp315source0,afpwag1,1.032822,1.74062,1.559377,0.707599,0.933662,0.630903,0.284304,1.148938,...,1.253774,1.436459,0.556143,0.93253,1.021008,0.992772,0.747181,1.341511,0.833515,1.377689
4,thrnig1/XC477571_source3.mp342source3,thrnig1,0.963897,1.891311,0.465698,0.28875,2.147638,1.822108,0.426492,0.311288,...,1.235781,0.720326,0.847631,1.392451,1.513188,1.072907,1.090941,0.038482,1.629774,0.943001


In [34]:
print(len(data))
print(df.count())

73551
1198860


base classifiers

In [36]:
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, average_precision_score


train_x, test_x, train_y, test_y = train_test_split(
    data.loc[:, (data.columns != "species") & (data.columns != "track_id")],
    data["species"],
    test_size=0.33,
    stratify=data["species"]
)

# train_x = pd.concat([train_x, temp.loc[:, (temp.columns != "species") & (temp.columns != "track_id")]])
# train_y = pd.concat([train_y, temp["species"]])

def eval(truth, preds):
    # print("Average Precision (Kaggle)", average_precision_score(truth, preds))
    print("Accuracy:", accuracy_score(truth, preds))
    print(
        "Precision:",
        # precision_score(truth, preds, average=None),
        precision_score(truth, preds, average="macro"),
    )
    print(
        "Recall:",
        # recall_score(truth, preds, average=None),
        recall_score(truth, preds, average="macro"),
    )
    print(
        "F1 Score:",
        # f1_score(truth, preds, average=None),
        f1_score(truth, preds, average="macro"),
    )

In [39]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from skopt import BayesSearchCV
from skopt.space import Real

logs = BayesSearchCV(
    LogisticRegression(max_iter=2000, solver='newton-cholesky'),
    {
        'C': Real(1e-6, 1e+6, prior='log-uniform'),
    },
    n_iter=16,
    scoring="precision_macro",
    n_jobs=-1,
    n_points=6,
    cv=zip([np.arange(0, int(len(train_x)*0.67))], [np.arange(int(len(train_x)*0.67) + 1, len(train_x))]),
    verbose=4
)

logs.fit(train_x, train_y)
print(logs.best_params_)
preds = logs.predict(test_x)
eval(test_y, preds)

# params = {
#     "logistic_regression__C": [0.001, 0.01, 0.1, 1, 10, 100, 1000],
# }

# pipe = Pipeline([
#     ('pca', PCA(n_components=0.8)),
#     ('logistic_regression', LogisticRegression(max_iter=4000, solver='newton-cholesky'))
# ])

# search = GridSearchCV(
#     pipe, 
#     param_grid=params, scoring="precision_macro", 
#     n_jobs=4, 
#     cv=zip([np.arange(0, int(len(train_x)*0.67))], [np.arange(int(len(train_x)*0.67) + 1, len(train_x))]), 
#     verbose=4
# )
# search.fit(train_x, train_y)
# print(search.best_params_)
# preds = search.predict(test_x)

# eval(test_y, preds)

Fitting 1 folds for each of 6 candidates, totalling 6 fits
Fitting 1 folds for each of 6 candidates, totalling 6 fits
Fitting 1 folds for each of 4 candidates, totalling 4 fits
OrderedDict([('C', 0.6730934693522141)])
Accuracy: 0.9217617007251153


  _warn_prf(average, modifier, msg_start, len(result))


Precision: 0.8474130875280332
Recall: 0.7319330308865075
F1 Score: 0.7710300145051235


Negative Samples

In [43]:
with open("../data/models/birdnet-analyzer-pruned/labels.txt") as f:
    labels = [x.strip() for x in f.readlines()]
import json
with open('../data/models/birdnet-analyzer-pruned/eBird_taxonomy_codes_2021E.json') as json_file:
    mapped = json.load(json_file)

noise = [(i, x) for i,x in enumerate(labels) if "human" in x.lower() or len(x.split(" ")) < 2]
index = [x[0] for x in noise]
print(noise)
print(index)

[(1022, 'Dog_Dog'), (1136, 'Engine_Engine'), (1141, 'Environmental_Environmental'), (1219, 'Fireworks_Fireworks'), (1352, 'Gun_Gun'), (1449, 'Human non-vocal_Human non-vocal'), (1450, 'Human vocal_Human vocal'), (1451, 'Human whistle_Human whistle'), (1997, 'Noise_Noise'), (2812, 'Siren_Siren')]
[1022, 1136, 1141, 1219, 1352, 1449, 1450, 1451, 1997, 2812]


In [44]:
label_df = spark.createDataFrame(
    [
        {
            "label": label,
            "mapped_label": mapped_label,
            "index": i,
        }
        for i, (label, mapped_label) in enumerate(zip(labels, mapped))
    ]
)
label_df.show(n=5)

+-----+--------------------+--------------------+
|index|               label|        mapped_label|
+-----+--------------------+--------------------+
|    0|Abroscopus albogu...|             ostric2|
|    1|Abroscopus superc...|Struthio camelus_...|
|    2|Aburria aburri_Wa...|             ostric3|
|    3|Acanthagenys rufo...|Struthio molybdop...|
|    4|Acanthis cabaret_...|             grerhe1|
+-----+--------------------+--------------------+
only showing top 5 rows



In [58]:
temp = (
    df
    # explode the predictions with their indices
    .select(
        concat(df.track_name, df.start_time, df.track_type).alias("track_id"),
        "species",
        "embedding",
        F.posexplode("prediction_vec").alias("index", "logit"),
    )
    # join with the labels, in case we want to use it for anything
    .join(label_df, on="index", how="inner")
    # now only keep human vocals
    .where("index in (1022, 1136, 1141, 1219, 1352, 1449, 1450, 1451, 1997, 2812)")
    # and convert the logit to a probability via sigmoid
    .withColumn("probability", F.expr("1/(1+exp(-logit))"))
)

temp.show(n=5)

+-----+--------------------+-------+--------------------+-------------------+-----------+------------+--------------------+
|index|            track_id|species|           embedding|              logit|      label|mapped_label|         probability|
+-----+--------------------+-------+--------------------+-------------------+-----------+------------+--------------------+
| 2812|grecor/XC629875_p...| grecor|[0.75528019666671...|-11.079278945922852|Siren_Siren|     stther2|1.542849591141611...|
| 2812|grecor/XC629875_p...| grecor|[0.69153714179992...| -5.854541301727295|Siren_Siren|     stther2|0.002858654982902...|
| 2812|wlwwar/XC475384_p...| wlwwar|[0.54444074630737...| -10.34199047088623|Siren_Siren|     stther2|3.224902588057650...|
| 2812|grecor/XC629875_p...| grecor|[1.35060644149780...|-12.626036643981934|Siren_Siren|     stther2|3.285341581213456...|
| 2812|grecor/XC629875_p...| grecor|[1.33309292793273...| -11.77197551727295|Siren_Siren|     stther2|7.717784360253755E-6|
+-----+-

In [59]:
highest = (
    temp
    .withColumn("row", F.row_number().over(
        Window.partitionBy("label").orderBy(F.desc("probability"))
    ))
    .filter(F.col("row") <= 100)
    .drop("row")
)
#highest.orderBy(F.asc("probability")).show(n=5)

In [60]:
highest = highest.withColumn("label", F.regexp_replace("label", "^.*$", "no_call"))
highest.show(n=5)

+-----+--------------------+-------+--------------------+-----------------+-------+--------------------+------------------+
|index|            track_id|species|           embedding|            logit|  label|        mapped_label|       probability|
+-----+--------------------+-------+--------------------+-----------------+-------+--------------------+------------------+
| 1451|eswdov1/XC373715_...|eswdov1|[1.15534889698028...|5.881035327911377|no_call|Leptotrygon verag...|0.9972158797134549|
| 1451|scrcha1/XC700642_...|scrcha1|[1.49181497097015...| 5.36312198638916|no_call|Leptotrygon verag...|0.9953356057026981|
| 1451|scrcha1/XC700642_...|scrcha1|[1.20502555370330...|4.819712162017822|no_call|Leptotrygon verag...|0.9919954799469767|
| 1451|scrcha1/XC700642_...|scrcha1|[1.51749086380004...| 4.75782585144043|no_call|Leptotrygon verag...|0.9914888096436446|
| 1451|scrcha1/XC700641_...|scrcha1|[0.83194106817245...|4.519919395446777|no_call|Leptotrygon verag...|  0.98922741102306|
+-----+-

In [67]:
highest = highest.dropDuplicates(["track_id"])
extra_no_call = highest.select(
    ["track_id", highest.label.alias("species")]
    + [
        highest.embedding[i].alias("embedding" + str(i))
        for i in range(length)
    ]
)
extra_no_call = extra_no_call.orderBy(rand())
extra_no_call = extra_no_call.toPandas()
extra_no_call.head()

Unnamed: 0,track_id,species,embedding0,embedding1,embedding2,embedding3,embedding4,embedding5,embedding6,embedding7,...,embedding310,embedding311,embedding312,embedding313,embedding314,embedding315,embedding316,embedding317,embedding318,embedding319
0,litegr/XC332727_part002_source2.mp339source2,no_call,0.453603,1.407206,0.9637,0.328084,0.821067,0.84984,0.58675,0.78933,...,0.4038,0.936048,1.23184,1.154702,0.808597,0.814607,0.769992,1.468771,0.685195,0.461054
1,cohmar1/XC748722_source2.mp3105source2,no_call,0.997039,1.166095,0.624511,0.74663,0.682889,0.3537,0.330337,0.395813,...,0.264442,0.771598,0.463666,1.139696,0.820406,0.764856,0.863483,1.05013,1.058804,1.495795
2,afpfly1/XC664562_part001_source2.mp342source2,no_call,0.858408,1.033582,0.828035,0.170326,0.731404,1.05453,0.17603,0.092698,...,0.406676,0.703045,0.922793,0.678016,0.550343,1.104876,1.068773,1.196381,1.391785,1.253406
3,hadibi1/XC414768_source2.mp3117source2,no_call,0.681253,1.311305,0.482825,0.498515,0.896551,1.192011,1.040744,1.194243,...,1.826901,0.567492,0.773738,1.010281,0.632601,0.816928,0.58661,0.648143,0.894551,0.757499
4,greegr/XC257308_source0.mp39source0,no_call,0.447025,1.110702,0.510107,0.574779,0.896424,1.844411,0.400486,0.33628,...,0.691336,1.172437,0.862326,0.6997,0.672782,0.885479,0.497434,0.977077,0.711216,0.540575


In [68]:
import pandas as pd
combined = pd.concat([data, extra_no_call])

In [115]:
train_x, test_x, train_y, test_y = train_test_split(
    combined.loc[:, (combined.columns != "species") & (combined.columns != "track_id")],
    combined["species"],
    test_size=0.33,
    stratify=combined["species"]
)

new_logs = LogisticRegression(max_iter=2000, solver='newton-cholesky', C=0.6730934693522141)
new_logs.fit(train_x, train_y)
preds = new_logs.predict(test_x)
eval(test_y, preds)

In [116]:
neg_logs = BayesSearchCV(
    LogisticRegression(max_iter=2000, solver='newton-cholesky'),
    {
        'C': Real(1e-6, 1e+6, prior='log-uniform'),
    },
    n_iter=16,
    scoring="precision_macro",
    n_jobs=-1,
    n_points=6,
    cv=zip([np.arange(0, int(len(train_x)*0.67))], [np.arange(int(len(train_x)*0.67) + 1, len(train_x))]),
    verbose=4
)

neg_logs.fit(train_x, train_y)
print(neg_logs.best_params_)
preds = neg_logs.predict(test_x)
eval(test_y, preds)

Fitting 1 folds for each of 6 candidates, totalling 6 fits
Fitting 1 folds for each of 6 candidates, totalling 6 fits
Fitting 1 folds for each of 4 candidates, totalling 4 fits
OrderedDict([('C', 1.8349964826774303)])
Accuracy: 0.9188056787210674


  _warn_prf(average, modifier, msg_start, len(result))


Precision: 0.8430991621430692
Recall: 0.7435823725313221
F1 Score: 0.7774946681397493


-----------------------------------------------------------------------------------

Saving models to disk

In [117]:
import pickle

# pickle.dump(
#     log, open("../../data/models/baseline/logistic_reg_2.pkl", "wb"), protocol=2
# )
pickle.dump(
    new_logs, open("../data/logistic_negative_old.pkl", "wb"), protocol=3
)
pickle.dump(
    neg_logs, open("../data/logistic_negative_new.pkl", "wb"), protocol=3
)