In [2]:
%load_ext autoreload
%autoreload 2
%load_ext lab_black

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
The lab_black extension is already loaded. To reload it, use:
  %reload_ext lab_black


In [3]:
from birdclef.utils import get_spark
import os

os.environ["SPARK_LOCAL_DIRS"] = "../data/tmp/spark"

spark = get_spark(cores=16, memory="20g")
df = spark.read.parquet(
    "../data/processed/birdclef-2023/train_embeddings/consolidated_v3"
)
df.printSchema()

preds = spark.read.parquet("../data/processed/birdclef-2023/consolidated_v3_with_preds")
preds.printSchema()

root
 |-- species: string (nullable = true)
 |-- track_stem: string (nullable = true)
 |-- track_type: string (nullable = true)
 |-- track_name: string (nullable = true)
 |-- embedding: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- prediction_vec: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- predictions: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- rank: long (nullable = true)
 |    |    |-- index: long (nullable = true)
 |    |    |-- label: string (nullable = true)
 |    |    |-- mapped_label: string (nullable = true)
 |    |    |-- probability: double (nullable = true)
 |-- start_time: long (nullable = true)
 |-- energy: double (nullable = true)

root
 |-- track_name: string (nullable = true)
 |-- start_time: long (nullable = true)
 |-- prediction: string (nullable = true)
 |-- probability: double (nullable = true)



In [4]:
from pyspark.sql import Window, functions as F


preds.groupBy("prediction").count().orderBy(F.desc("count")).show(truncate=False, n=5)
preds.groupBy("prediction").count().orderBy("count").show(truncate=False, n=5)

counts = (
    preds.groupBy("track_name", "prediction")
    .count()
    .orderBy("track_name", F.desc("count"))
)
counts.show(n=5)

# choose the track name that has the most predictions with the main class
most_representative_source = (
    counts.withColumn("species", F.split("track_name", "/").getItem(0))
    .where("track_name like '%source%'")
    .withColumn(
        "track_stem", F.split(F.split("track_name", "/").getItem(1), "_").getItem(0)
    )
    .where("species = prediction")
    .withColumn(
        "rank",
        F.row_number().over(Window.partitionBy("track_stem").orderBy(F.desc("count"))),
    )
    .where("rank = 1")
    .select("track_name", "prediction", "count")
)
most_representative_source.show(truncate=False, n=5)

most_representative_embeddings = preds.join(
    most_representative_source.select("track_name", "prediction"),
    on=["track_name", "prediction"],
)
most_representative_embeddings.show(truncate=False, n=5)

representative_counts = (
    most_representative_embeddings.groupBy("prediction").count().orderBy("count")
)
print(representative_counts.count())
representative_counts.show(n=5)
representative_counts.orderBy(F.desc("count")).show(n=5)

+----------+------+
|prediction|count |
+----------+------+
|no_call   |128895|
|thrnig1   |127335|
|combuz1   |69723 |
|wlwwar    |49152 |
|barswa    |46486 |
+----------+------+
only showing top 5 rows

+----------+-----+
|prediction|count|
+----------+-----+
|lotcor1   |3    |
|whhsaw1   |3    |
|rostur1   |4    |
|brtcha1   |4    |
|afpkin1   |4    |
+----------+-----+
only showing top 5 rows

+--------------------+----------+-----+
|          track_name|prediction|count|
+--------------------+----------+-----+
|abethr1/XC128013.mp3|   abethr1|   11|
|abethr1/XC128013.mp3|   rbsrob1|    2|
|abethr1/XC128013.mp3|   gnbcam2|    1|
|abethr1/XC128013.mp3|   klacuc1|    1|
|abethr1/XC128013.mp3|   reccuc1|    1|
+--------------------+----------+-----+
only showing top 5 rows

+---------------------------+----------+-----+
|track_name                 |prediction|count|
+---------------------------+----------+-----+
|hoopoe/XC108366_source2.mp3|hoopoe    |10   |
|categr/XC108820_source3.m

In [5]:
embeddings_representative = (
    df
    # join against the most representative source, and only keep the embeddings
    # that are actually labeled by the previous model
    .join(
        most_representative_embeddings.select("track_name", "start_time"),
        on=["track_name", "start_time"],
        how="inner",
    ).select("track_name", "start_time", "species", "embedding")
).cache()
embeddings_representative.show(truncate=False, n=5)

+----------------------------+----------+-------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [6]:
noise_indices = [
    (1022, "Dog_Dog"),
    (1136, "Engine_Engine"),
    (1141, "Environmental_Environmental"),
    (1219, "Fireworks_Fireworks"),
    (1352, "Gun_Gun"),
    (1449, "Human non-vocal_Human non-vocal"),
    (1450, "Human vocal_Human vocal"),
    (1451, "Human whistle_Human whistle"),
    (1997, "Noise_Noise"),
    (2812, "Siren_Siren"),
]
noise_indices = [i[0] for i in noise_indices]
negative_samples = (
    df
    # explode the predictions with their indices
    .select(
        "track_name",
        "start_time",
        "embedding",
        F.posexplode("prediction_vec").alias("index", "logit"),
    )
    .where(F.col("index").isin(noise_indices))
    .withColumn("probability", F.expr("1/(1+exp(-logit))"))
    # keep the top 250 of each class
    .withColumn(
        "rank",
        F.row_number().over(Window.partitionBy("index").orderBy(F.desc("probability"))),
    )
    .where("rank <= 250")
    .select("track_name", "start_time", F.lit("no_call").alias("species"), "embedding")
).cache()
negative_samples.show()

+--------------------+----------+-------+--------------------+
|          track_name|start_time|species|           embedding|
+--------------------+----------+-------+--------------------+
|wlwwar/XC237179_p...|        90|no_call|[0.74351072311401...|
|moccha1/XC382190_...|        27|no_call|[0.56722432374954...|
|somgre1/XC476125_...|        84|no_call|[0.65573865175247...|
|ratcis1/XC307172_...|        30|no_call|[0.50343912839889...|
|yertin1/XC633722_...|         3|no_call|[0.80654186010360...|
|augbuz1/XC493837_...|         6|no_call|[0.82505774497985...|
|cohmar1/XC749488_...|        60|no_call|[1.11857354640960...|
|thrnig1/XC494901_...|        45|no_call|[0.46692180633544...|
|egygoo/XC613342_s...|        63|no_call|[0.93724751472473...|
|ratcis1/XC307172_...|         9|no_call|[0.78493213653564...|
|litegr/XC332727_p...|        93|no_call|[0.41296926140785...|
|thrnig1/XC467330_...|        36|no_call|[0.56190323829650...|
|colsun2/XC188947_...|         6|no_call|[0.61920136213

In [7]:
# anything that has counts under 50 is underrepresented
species_counts = (
    preds.groupBy("prediction").count().orderBy("count").where("count < 50")
)
underrepresented_samples = (
    df.select("track_name", "start_time", "embedding")
    .join(
        preds.select(
            "track_name", "start_time", F.col("prediction").alias("species")
        ).join(species_counts.selectExpr("prediction as species"), on="species"),
        on=["track_name", "start_time"],
    )
    .select("track_name", "start_time", "species", "embedding")
).cache()

base classifiers

In [8]:
train_pdf = (
    embeddings_representative.union(negative_samples)
    .union(underrepresented_samples)
    .distinct()
    .select("species", "embedding")
)

In [9]:
train_pdf = train_pdf.toPandas()

In [13]:
train_pdf.shape

(142047, 2)

In [10]:
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
)


def model_eval(truth, preds):
    print("Accuracy:", accuracy_score(truth, preds))
    print(
        "Precision:",
        precision_score(truth, preds, average="macro"),
    )
    print(
        "Recall:",
        recall_score(truth, preds, average="macro"),
    )
    print(
        "F1 Score:",
        f1_score(truth, preds, average="macro"),
    )


train_x, test_x, train_y, test_y = train_test_split(  # set the
    np.stack(train_pdf.embedding.values),
    train_pdf.species,
    test_size=0.33,
    stratify=train_pdf.species,
)

In [11]:
# train a model without doing any tuning and see how that fares
# from sklearn.svm import SVC

# clf = SVC(probability=True, class_weight="balanced")
# clf.fit(train_x, train_y)
# preds = clf.predict(test_x)
# model_eval(test_y, preds)
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier

le = LabelEncoder()
le.fit(train_y)
# xgbc = XGBClassifier(verbose=2, tree_method="gpu_hist", gpu_id=1)
# xgbc.fit(train_x, le.fit_transform(train_y), verbose=True)
# preds = xgbc.predict(test_x)
# model_eval(le.transform(test_y), preds)

In [16]:
import pickle
from pathlib import Path

pickle.dump(
    xgbc,
    Path("../data/models/baseline/xgbc-base.pkl").open("wb"),
)

In [12]:
from skopt import BayesSearchCV
from sklearn.svm import SVC
import pandas as pd

# cv_model = BayesSearchCV(
#     SVC(probability=True, class_weight="balanced"),
#     {
#         "C": (1e-6, 1e6, "log-uniform"),
#         "gamma": (1e-6, 1e1, "log-uniform"),
#         "kernel": ["rbf"],
#     },
#     n_iter=64,
#     scoring="precision_macro",
#     verbose=4,
#     n_points=8,
#     n_jobs=-1,
# )

# cv_model.fit(train_x, train_y)
# print(cv_model.best_params_)
# model_eval(test_y, cv_model.predict(test_x))
# pd.DataFrame(cv_model.cv_results_)

cv_model = BayesSearchCV(
    XGBClassifier(tree_method="gpu_hist"),
    {
        "max_depth": (1, 10, "uniform"),
        "eta": (0.01, 0.5, "uniform"),
        "subsample": (0.5, 1, "uniform"),
        "colsample_bytree": (0.5, 1, "uniform"),
    },
    n_iter=16,
    scoring="precision_macro",
    verbose=4,
    cv=zip(
        [np.arange(0, int(len(train_x) * 0.67))],
        [np.arange(int(len(train_x) * 0.67) + 1, len(train_x))],
    ),
    n_points=4,
    n_jobs=-1,
)
cv_model.fit(train_x, le.transform(train_y))
print(cv_model.best_params_)
model_eval(le.transform(test_y), cv_model.predict(test_x))

Fitting 1 folds for each of 4 candidates, totalling 4 fits
Fitting 1 folds for each of 4 candidates, totalling 4 fits
Fitting 1 folds for each of 4 candidates, totalling 4 fits
Fitting 1 folds for each of 4 candidates, totalling 4 fits
OrderedDict([('colsample_bytree', 1), ('eta', 0.10710027714412612), ('max_depth', 5), ('subsample', 1)])
Accuracy: 0.8668615069545184
Precision: 0.8830148742672373
Recall: 0.652715574060801
F1 Score: 0.7297065917782715


  _warn_prf(average, modifier, msg_start, len(result))


Saving models to disk

In [15]:
import pickle
from pathlib import Path

pickle.dump(
    cv_model.best_estimator_,
    Path("../data/models/baseline/xgbc-best-opt-v1.pkl").open("wb"),
)