In [7]:
%load_ext autoreload
%autoreload 2
%load_ext lab_black

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
The lab_black extension is already loaded. To reload it, use:
  %reload_ext lab_black


## Binary Classifier (call/no call)

In [8]:
from birdclef.utils import get_spark

spark = get_spark(cores=4, memory="10g")
df = spark.read.parquet(
    "../data/processed/birdclef-2023/train_embeddings/consolidated_v3"
)
df.printSchema()

root
 |-- species: string (nullable = true)
 |-- track_stem: string (nullable = true)
 |-- track_type: string (nullable = true)
 |-- track_name: string (nullable = true)
 |-- embedding: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- prediction_vec: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- predictions: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- rank: long (nullable = true)
 |    |    |-- index: long (nullable = true)
 |    |    |-- label: string (nullable = true)
 |    |    |-- mapped_label: string (nullable = true)
 |    |    |-- probability: double (nullable = true)
 |-- start_time: long (nullable = true)
 |-- energy: double (nullable = true)



### 1. Data Preprocessing

#### 1.1 Positive labels

In [9]:
from pyspark.sql import Window, functions as F

# keep the track_type for the highest energy
highest_energy_channel = (
    df
    # get the track stem without the part
    .withColumn("original_track_stem", F.split(F.col("track_stem"), "_").getItem(0))
    .where("track_type != 'original'")
    # get the track type that has the most energy
    .withColumn(
        "rank",
        F.rank().over(
            Window.partitionBy("original_track_stem").orderBy(F.desc("energy"))
        ),
    )
    # keep the first row
    .where(F.col("rank") == 1)
    # drop the rank column
    .select("species", "track_stem", "track_type")
    .distinct()
)

# get the highest predictions by exploding the values
exploded_embeddings = (
    df
    # join against the highest energy channel
    .join(
        highest_energy_channel,
        on=["species", "track_stem", "track_type"],
        how="inner",
    )
    # explode the embeddings, these are ordered by confidence
    .withColumn("predictions", F.explode("predictions")).select(
        "species",
        "track_stem",
        "track_type",
        "start_time",
        "track_name",
        "embedding",
        "predictions.*",
    )
    # simplifying assumption: we assume the prediction with the highest confidence is the true label
    .where("rank = 0")
).cache()

exploded_embeddings.drop("embedding").show(n=5)



+-------+----------+----------+----------+--------------------+----+-----+--------------------+------------+--------------------+
|species|track_stem|track_type|start_time|          track_name|rank|index|               label|mapped_label|         probability|
+-------+----------+----------+----------+--------------------+----+-----+--------------------+------------+--------------------+
|abythr1|  XC233199|   source0|         0|abythr1/XC233199_...|   0|  639|Chloropsis hardwi...|     orblea1|0.002208352088928...|
|abythr1|  XC233199|   source0|        57|abythr1/XC233199_...|   0| 1151|Erpornis zanthole...|     whbyuh1|0.025502817705273628|
|abythr1|  XC233199|   source0|        27|abythr1/XC233199_...|   0| 3164|Turdus abyssinicu...|     abythr1|0.024902962148189545|
|abythr1|  XC233199|   source0|        30|abythr1/XC233199_...|   0|  639|Chloropsis hardwi...|     orblea1|0.012038093991577625|
|abythr1|  XC233199|   source0|        21|abythr1/XC233199_...|   0| 3185|Turdus leucomela

                                                                                

In [10]:
# quick count of the number of species
counts = (
    exploded_embeddings.groupBy("species")
    .agg(F.count("*").alias("n"))
    .orderBy(F.desc("n"))
)
counts.show(n=5)
counts.orderBy("n").show(n=5)

                                                                                

+-------+-----+
|species|    n|
+-------+-----+
|thrnig1|12987|
| wlwwar| 9249|
|combuz1| 7173|
| hoopoe| 6731|
| barswa| 6191|
+-------+-----+
only showing top 5 rows





+-------+---+
|species|  n|
+-------+---+
|afpkin1|  3|
|whhsaw1|  4|
|whctur2|  4|
|golher1|  5|
|lotlap1|  8|
+-------+---+
only showing top 5 rows



                                                                                

In [11]:
# Prepared DF
rarity_min_count = 100
rare_species_count = (
    exploded_embeddings.groupBy("species")
    .agg(F.count("*").alias("n"))
    .where(f"n < {rarity_min_count}")
)
rare_species_count.show(n=5)

# if there are a lot of examples, we can use a higher threshold
common_species = exploded_embeddings.where("probability > 0.4").join(
    rare_species_count.select("species"), on="species", how="left_anti"
)
# these ones are less common so we use a lower threshold so we have at least one
# example for each species
rare_species = exploded_embeddings.where("probability > 0.1").join(
    rare_species_count.select("species"), on="species", how="inner"
)
prepared = common_species.union(rare_species).select(
    "species", "probability", "embedding"
)
prepared.show(n=5)
prepared.count()

                                                                                

+-------+---+
|species|  n|
+-------+---+
|purgre2| 60|
|bubwar2| 90|
|rehwea1| 69|
|kvbsun1| 80|
|equaka1| 63|
+-------+---+
only showing top 5 rows



                                                                                

+-------+------------------+--------------------+
|species|       probability|           embedding|
+-------+------------------+--------------------+
|afghor1|0.9965255856513977|[0.57833033800125...|
|afghor1| 0.511886715888977|[1.00166213512420...|
|afghor1|0.9984956979751587|[0.88829582929611...|
|afghor1|0.9988522529602051|[1.26016914844512...|
|afghor1|0.9997662901878357|[1.16302716732025...|
+-------+------------------+--------------------+
only showing top 5 rows



                                                                                

74126

In [12]:
# lets check that we have the right number of classes, and how many examples we are working with
prepared_counts = (
    prepared.groupBy("species").agg(F.count("*").alias("n")).orderBy(F.desc("n"))
)
print(f"number of species {prepared_counts.count()}")

prepared_counts.show(n=5)
prepared_counts.orderBy("n").show(n=5)

                                                                                

number of species 263


                                                                                

+-------+----+
|species|   n|
+-------+----+
|thrnig1|3833|
| hoopoe|3822|
|eubeat1|3116|
| wlwwar|2687|
| barswa|2603|
+-------+----+
only showing top 5 rows



                                                                                

+-------+---+
|species|  n|
+-------+---+
|afpkin1|  2|
|whhsaw1|  2|
|rehblu1|  2|
|golher1|  3|
|lotlap1|  3|
+-------+---+
only showing top 5 rows



#### 1.1 Negative labels

In [13]:
# Negative calls
noise_indices = [
    (1022, "Dog_Dog"),
    (1136, "Engine_Engine"),
    (1141, "Environmental_Environmental"),
    (1219, "Fireworks_Fireworks"),
    (1352, "Gun_Gun"),
    (1449, "Human non-vocal_Human non-vocal"),
    (1450, "Human vocal_Human vocal"),
    (1451, "Human whistle_Human whistle"),
    (1997, "Noise_Noise"),
    (2812, "Siren_Siren"),
]
noise_indices = [i[0] for i in noise_indices]

# Craete negative samples DF
negative_samples = (
    df
    # explode the predictions with their indices
    .select(
        "track_name",
        "start_time",
        "embedding",
        F.posexplode("prediction_vec").alias("index", "logit"),
    )
    .where(F.col("index").isin(noise_indices))
    .withColumn("probability", F.expr("1/(1+exp(-logit))"))
    .where("probability > 0.4")
    .select("track_name", "start_time", F.lit("no_call").alias("species"), "embedding")
).cache()
negative_samples.show()

[Stage 158:>                                                        (0 + 1) / 1]

+--------------------+----------+-------+--------------------+
|          track_name|start_time|species|           embedding|
+--------------------+----------+-------+--------------------+
|hoopoe/XC318438_p...|        66|no_call|[0.47851136326789...|
|litegr/XC332323_p...|       117|no_call|[0.38947930932044...|
|litegr/XC332323_p...|        12|no_call|[0.43516066670417...|
|litegr/XC332323_p...|        12|no_call|[0.43516066670417...|
|litegr/XC332727_p...|        78|no_call|[0.35861393809318...|
|litegr/XC331548_p...|        90|no_call|[0.51571053266525...|
|eubeat1/XC699304_...|         9|no_call|[0.13348712027072...|
|combuz1/XC579930_...|        33|no_call|[0.22507806122303...|
|barswa/XC361240_p...|         0|no_call|[1.25061154365539...|
|combuz1/XC579874_...|        30|no_call|[0.03499644994735...|
|spmthr1/XC602624_...|       123|no_call|[1.07436048984527...|
|blaplo1/XC736812_...|        72|no_call|[0.12970589101314...|
|egygoo/XC109700_s...|        72|no_call|[0.26487210392

                                                                                

In [14]:
from pyspark.sql.functions import lit

# Get the exploded examples
# Only keep examples less than 0.1
# Every example in the new query is a no-call
# Union with the negative_samples

exploded_negatives = exploded_embeddings.where(exploded_embeddings.probability < 0.1)

# Define fractions for each stratum
fractions = (
    exploded_negatives.select("track_stem")
    .distinct()
    .withColumn("fraction", lit(0.6))
    .rdd.collectAsMap()
)

# Perform stratified sampling
exploded_negative_sub = exploded_negatives.stat.sampleBy(
    "track_stem", fractions, seed=42
)

# Select columns from exploded DF that match negative_samples
exploded_negative_select = exploded_negative_sub.select(negative_samples.columns)

# Perform union operation
negatives = negative_samples.union(exploded_negative_select)

# Set all values in species column to "no_call"
negatives = negatives.withColumn("species", lit("no_call"))

# Check unique values
unique_vals = negatives.select("species").distinct()
unique_values_list = unique_vals.rdd.flatMap(lambda x: x).collect()

print(f"Unique values in species column: {len(unique_values_list)}")
print(f"Number of rows: {negatives.count()}")
negatives.show(n=5)

                                                                                

Unique values in species column: 1




Number of rows: 58978
+--------------------+----------+-------+--------------------+
|          track_name|start_time|species|           embedding|
+--------------------+----------+-------+--------------------+
|hoopoe/XC318438_p...|        66|no_call|[0.47851136326789...|
|litegr/XC332323_p...|       117|no_call|[0.38947930932044...|
|litegr/XC332323_p...|        12|no_call|[0.43516066670417...|
|litegr/XC332323_p...|        12|no_call|[0.43516066670417...|
|litegr/XC332727_p...|        78|no_call|[0.35861393809318...|
+--------------------+----------+-------+--------------------+
only showing top 5 rows



                                                                                

In [15]:
negatives_sub = negatives.select("species", "embedding")
positives_sub = prepared.select("species", "embedding")
positives_sub = positives_sub.withColumn("species", lit("call"))
binary_df = negatives_sub.union(positives_sub)
binary_df.show()

[Stage 212:>                                                        (0 + 1) / 1]

+-------+--------------------+
|species|           embedding|
+-------+--------------------+
|no_call|[0.47851136326789...|
|no_call|[0.38947930932044...|
|no_call|[0.43516066670417...|
|no_call|[0.43516066670417...|
|no_call|[0.35861393809318...|
|no_call|[0.51571053266525...|
|no_call|[0.13348712027072...|
|no_call|[0.22507806122303...|
|no_call|[1.25061154365539...|
|no_call|[0.03499644994735...|
|no_call|[1.07436048984527...|
|no_call|[0.12970589101314...|
|no_call|[0.26487210392951...|
|no_call|[0.57878917455673...|
|no_call|[0.91182446479797...|
|no_call|[1.38340187072753...|
|no_call|[0.75772899389266...|
|no_call|[0.45265802741050...|
|no_call|[1.00851356983184...|
|no_call|[0.55738186836242...|
+-------+--------------------+
only showing top 20 rows



                                                                                

In [16]:
# Check the number of samples for each label
# counts = binary_df.groupBy("species").agg(F.count("*").alias("n")).orderBy(F.desc("n"))
# counts.show(n=5)

In [17]:
# Write out res to a parquet file, using 1 or 2 partitions.
# Use the processed/birdnet-2023 folder and make a new dataset under there.
binary_df.repartition(2).write.mode("overwrite").parquet(
    "../data/processed/birdclef-2023/call_no_call"
)

                                                                                

In [18]:
# Data for model training
data = binary_df.toPandas()
data.head()

                                                                                

Unnamed: 0,species,embedding
0,no_call,"[0.47851136326789856, 1.983388900756836, 1.091..."
1,no_call,"[0.38947930932044983, 1.2773804664611816, 0.54..."
2,no_call,"[0.43516066670417786, 1.339859962463379, 1.467..."
3,no_call,"[0.43516066670417786, 1.339859962463379, 1.467..."
4,no_call,"[0.3586139380931854, 1.1902474164962769, 0.520..."


In [19]:
import time
import numpy as np
from tqdm import tqdm
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    make_scorer,
    classification_report,
)

# Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(
    np.stack(data["embedding"]),
    data["species"],
    test_size=0.33,
    stratify=data["species"],
)

# Create a label encoder object
le = LabelEncoder()

# Fit and transform the target with label encoder
y_train_enc = le.fit_transform(y_train)
y_test_enc = le.transform(y_test)

# Data shape
print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)

(89179, 320) (43925, 320)
(89179,) (43925,)


In [20]:
def model_eval(truth, preds):
    print("Accuracy:", round(accuracy_score(truth, preds), 3))
    print("Precision:", round(precision_score(truth, preds, average="macro"), 3))
    print("Recall:", round(recall_score(truth, preds, average="macro"), 3))
    print("F1 Score:", round(f1_score(truth, preds, average="macro"), 3))

In [21]:
from skopt.space import Real
from skopt import BayesSearchCV
from sklearn.utils import class_weight
from sklearn.linear_model import LogisticRegression

weights = class_weight.compute_sample_weight(class_weight="balanced", y=y_train_enc)

# LogisticRegression
lgr_search = BayesSearchCV(
    LogisticRegression(),
    {
        "penalty": ("l2",),
        "C": (0.01, 1.1, "log-uniform"),
    },
    n_iter=10,
    scoring="f1",
    verbose=4,
    cv=3,
    n_points=1,
    n_jobs=-1,
)
%time lgr_search.fit(X_train, y_train_enc, sample_weight=weights)

Fitting 3 folds for each of 1 candidates, totalling 3 fits
[CV 2/3] END ..C=0.1184718346633756, penalty=l2;, score=0.842 total time=   3.4s
[CV 3/3] END ..C=0.1184718346633756, penalty=l2;, score=0.846 total time=   5.3s
[CV 1/3] END ..C=0.1184718346633756, penalty=l2;, score=0.845 total time=   5.2s
Fitting 3 folds for each of 1 candidates, totalling 3 fits
[CV 3/3] END .C=0.28694322665484145, penalty=l2;, score=0.846 total time=   3.8s
[CV 2/3] END .C=0.28694322665484145, penalty=l2;, score=0.842 total time=   5.2s
[CV 1/3] END .C=0.28694322665484145, penalty=l2;, score=0.845 total time=   5.1s
Fitting 3 folds for each of 1 candidates, totalling 3 fits
[CV 2/3] END ..C=0.5594759301857682, penalty=l2;, score=0.842 total time=   4.7s
[CV 1/3] END ..C=0.5594759301857682, penalty=l2;, score=0.845 total time=   4.8s
[CV 3/3] END ..C=0.5594759301857682, penalty=l2;, score=0.846 total time=   5.5s
Fitting 3 folds for each of 1 candidates, totalling 3 fits
[CV 2/3] END .C=0.01359034277069561

In [22]:
# Evaluate model
model_eval(y_test_enc, lgr_search.predict(X_test))

Accuracy: 0.865
Precision: 0.863
Recall: 0.863
F1 Score: 0.863


In [23]:
import pickle
from pathlib import Path

# Write model to pickle file
pickle.dump(
    lgr_search.best_estimator_,
    Path("../data/models/baseline/logistic_binary.pkl").open("wb"),
)

# Label Encoder to pickle file
pickle.dump(
    le,
    Path("../data/models/baseline/logistic_binary_label_encoder.pkl").open("wb"),
)

In [16]:
from xgboost import XGBClassifier

weights = class_weight.compute_sample_weight(class_weight="balanced", y=y_train_enc)

# XGBoost model
xgb_search = BayesSearchCV(
    XGBClassifier(tree_method="gpu_hist"),
    {
        "eta": (0.01, 1.0, "log-uniform"),
        "max_depth": (1, 30, "uniform"),
        "gamma": (0, 1, "uniform"),
        "min_child_weight": (1, 10, "uniform"),
    },
    n_iter=10,
    scoring="f1",
    verbose=4,
    cv=3,
    n_points=1,
    n_jobs=1,
)
%time xgb_search.fit(X_train, y_train_enc, sample_weight=weights)

Fitting 3 folds for each of 1 candidates, totalling 3 fits
[CV 1/3] END eta=0.025534761227908934, gamma=1, max_depth=11, min_child_weight=7;, score=0.853 total time=  18.3s
[CV 2/3] END eta=0.025534761227908934, gamma=1, max_depth=11, min_child_weight=7;, score=0.849 total time=  12.6s
[CV 3/3] END eta=0.025534761227908934, gamma=1, max_depth=11, min_child_weight=7;, score=0.841 total time=  13.4s
Fitting 3 folds for each of 1 candidates, totalling 3 fits
[CV 1/3] END eta=0.14968195367854237, gamma=0, max_depth=22, min_child_weight=7;, score=0.863 total time=  11.7s
[CV 2/3] END eta=0.14968195367854237, gamma=0, max_depth=22, min_child_weight=7;, score=0.861 total time=  11.5s
[CV 3/3] END eta=0.14968195367854237, gamma=0, max_depth=22, min_child_weight=7;, score=0.854 total time=  11.5s
Fitting 3 folds for each of 1 candidates, totalling 3 fits
[CV 1/3] END eta=0.04004897299285126, gamma=1, max_depth=14, min_child_weight=6;, score=0.856 total time=  17.1s
[CV 2/3] END eta=0.0400489729

In [17]:
# Evaluate model
model_eval(y_test_enc, xgb_search.predict(X_test))

Accuracy: 0.884
Precision: 0.884
Recall: 0.88
F1 Score: 0.881


### Adding data from Background Noise

In [5]:
from birdclef.utils import get_spark

spark = get_spark(cores=4, memory="10g")
back_noise_df = spark.read.parquet("../data/processed/birdclef-2023/background_noise")
back_noise_df.printSchema()

root
 |-- row_id: string (nullable = true)
 |-- emb: array (nullable = true)
 |    |-- element: float (containsNull = true)
 |-- call: double (nullable = true)
 |-- no_call: double (nullable = true)



In [6]:
back_noise_df.show()

+--------------------+--------------------+--------+--------+
|              row_id|                 emb|    call| no_call|
+--------------------+--------------------+--------+--------+
|124cda45-4161-46c...|[0.92088675, 0.57...|0.210665|0.789335|
|06838e58-d47a-4ab...|[0.84365565, 0.68...|0.093527|0.906473|
|33dcdd62-7e78-442...|[1.2669431, 0.982...|0.286168|0.713832|
|024278df-d8b6-4ae...|[1.3872962, 0.953...|0.193856|0.806144|
|1a456b0f-fc3d-4d5...|[0.6706297, 0.336...|0.051108|0.948892|
|22fbb7ec-8f14-4c5...|[0.55484676, 0.32...|0.128191|0.871809|
|20b7cbad-6730-4a2...|[0.7614067, 0.875...|0.023886|0.976114|
|0365f3b1-2cde-4a7...|[1.0505773, 0.663...|0.203857|0.796143|
|2552eb28-5f6c-43e...|[1.1077887, 0.733...|0.407528|0.592472|
|0de0dd85-3de1-4e1...|[0.97304064, 0.64...|0.517476|0.482524|
|2eef222c-ac93-4da...|[0.8036644, 0.763...|0.148227|0.851773|
|0c9b98cc-388a-4ba...|[0.8301988, 0.777...|0.132573|0.867427|
|271f6b9b-0c1f-4d8...|[1.2728326, 1.223...|0.348467|0.651533|
|2d98464

                                                                                