In [None]:
# Reload imported modules automatically before each cell executes, so edits to
# project code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Auto-format code cells with black (JupyterLab extension).
%load_ext lab_black

## Binary Classifier (call/no call)

In [None]:
from birdclef.utils import get_spark

# Location of the consolidated BirdCLEF 2023 training embeddings (parquet).
EMBEDDINGS_PATH = "../data/processed/birdclef-2023/train_embeddings/consolidated_v3"

# Obtain a Spark session from the project helper (4 cores / 10g requested),
# read the embeddings and print the schema for reference.
spark = get_spark(cores=4, memory="10g")
df = spark.read.parquet(EMBEDDINGS_PATH)
df.printSchema()

### 1. Data Preprocessing

#### 1.1 Positive labels

In [None]:
from pyspark.sql import Window, functions as F

# --- Step 1: pick the highest-energy channel of every recording -------------
# Rows are ranked by descending energy within each original recording.
energy_window = Window.partitionBy("original_track_stem").orderBy(F.desc("energy"))

# The original track stem is everything before the first "_" in `track_stem`;
# rows belonging to the unseparated 'original' track type are excluded.
non_original_tracks = df.withColumn(
    "original_track_stem", F.split(F.col("track_stem"), "_").getItem(0)
).where("track_type != 'original'")

# Keep the (species, track_stem, track_type) of the top-ranked rows.
# NOTE: rank() keeps every row tied for first place, hence the distinct().
highest_energy_channel = (
    non_original_tracks.withColumn("rank", F.rank().over(energy_window))
    .where(F.col("rank") == 1)
    .select("species", "track_stem", "track_type")
    .distinct()
)

# --- Step 2: one row per window with its top prediction ---------------------
# Restrict the embeddings to the selected channels.
selected_channel_rows = df.join(
    highest_energy_channel,
    on=["species", "track_stem", "track_type"],
    how="inner",
)

# Explode the `predictions` array and flatten the struct fields into columns.
# Simplifying assumption: the prediction with rank 0 (highest confidence) is
# treated as the true label, so only that one is kept.
exploded_embeddings = (
    selected_channel_rows.withColumn("predictions", F.explode("predictions"))
    .select(
        "species",
        "track_stem",
        "track_type",
        "start_time",
        "track_name",
        "embedding",
        "predictions.*",
    )
    .where("rank = 0")
    .cache()
)

exploded_embeddings.drop("embedding").show(n=5)

In [None]:
# Number of retained windows per species, most frequent first.
rows_per_species = F.count("*").alias("n")
counts = (
    exploded_embeddings.groupBy("species")
    .agg(rows_per_species)
    .orderBy(F.col("n").desc())
)
# Show the five most common and the five least common species.
counts.show(n=5)
counts.orderBy(F.col("n").asc()).show(n=5)

In [None]:
# Prepared DF
# Species with fewer than `rarity_min_count` windows are considered rare.
rarity_min_count = 100
species_sizes = exploded_embeddings.groupBy("species").agg(F.count("*").alias("n"))
rare_species_count = species_sizes.where(f"n < {rarity_min_count}")
rare_species_count.show(n=5)

# Only the species names are needed to split the data into common / rare.
rare_species_names = rare_species_count.select("species")

# Common species have plenty of examples, so a stricter probability threshold
# is used; the anti-join removes every species listed as rare.
common_species = exploded_embeddings.where("probability > 0.4").join(
    rare_species_names, on="species", how="left_anti"
)
# Rare species use a looser threshold so that more of their few examples are kept.
rare_species = exploded_embeddings.where("probability > 0.1").join(
    rare_species_names, on="species", how="inner"
)

# Both halves share the same column order, so a positional union is safe here.
prepared = common_species.union(rare_species).select(
    "species", "probability", "embedding"
)
prepared.show(n=5)
prepared.count()

In [None]:
# Sanity check: how many classes survived the filtering, and how many examples
# each of them has.
examples_per_species = F.count("*").alias("n")
prepared_counts = (
    prepared.groupBy("species")
    .agg(examples_per_species)
    .orderBy(F.col("n").desc())
)
print(f"number of species {prepared_counts.count()}")

# Largest and smallest classes.
prepared_counts.show(n=5)
prepared_counts.orderBy(F.col("n").asc()).show(n=5)

#### 1.2 Negative labels

In [None]:
# Negative calls
# Positions inside `prediction_vec` that are treated as background noise, kept
# together with a human-readable label.
# NOTE(review): the labels are assumed to match the classifier's label list at
# these positions -- confirm against the model's label file.
noise_classes = [
    (1022, "Dog_Dog"),
    (1136, "Engine_Engine"),
    (1141, "Environmental_Environmental"),
    (1219, "Fireworks_Fireworks"),
    (1352, "Gun_Gun"),
    (1449, "Human non-vocal_Human non-vocal"),
    (1450, "Human vocal_Human vocal"),
    (1451, "Human whistle_Human whistle"),
    (1997, "Noise_Noise"),
    (2812, "Siren_Siren"),
]
# Only the positions are needed for filtering; the labelled list above is kept
# under its own name instead of being overwritten.
noise_indices = [index for index, _label in noise_classes]

# Create the negative samples DF: every window whose sigmoid probability for
# at least one noise class exceeds 0.4 is labelled "no_call".
negative_samples = (
    df
    # explode the predictions with their indices
    .select(
        "track_name",
        "start_time",
        "embedding",
        F.posexplode("prediction_vec").alias("index", "logit"),
    )
    .where(F.col("index").isin(noise_indices))
    # convert the raw logit into a probability
    .withColumn("probability", F.expr("1/(1+exp(-logit))"))
    .where("probability > 0.4")
    .select("track_name", "start_time", F.lit("no_call").alias("species"), "embedding")
    # a window that triggers several noise classes is emitted once per class;
    # drop the identical copies so they neither inflate the no_call class nor
    # end up on both sides of the train/test split
    .distinct()
).cache()
negative_samples.show()

In [None]:
from pyspark.sql.functions import lit

# Build the complete "no_call" set:
#   1. take the top-ranked predictions whose probability is below 0.1,
#   2. subsample them per track_stem,
#   3. append them to the noise-class negatives and label everything "no_call".

# Windows where even the best prediction is very unconfident.
exploded_negatives = exploded_embeddings.where(F.col("probability") < 0.1)

# Every track_stem (stratum) is sampled with the same fraction.
fractions = {
    row["track_stem"]: 0.6
    for row in exploded_negatives.select("track_stem").distinct().collect()
}

# Stratified sampling by track_stem, seeded for repeatability.
exploded_negative_sub = exploded_negatives.sampleBy("track_stem", fractions, seed=42)

# Project onto the columns of negative_samples (same order) so that the
# positional union below lines up.
exploded_negative_select = exploded_negative_sub.select(*negative_samples.columns)

# Combine both sources; the sampled rows still carry their bird species, so the
# label column is overwritten for every row.
negatives = negative_samples.union(exploded_negative_select).withColumn(
    "species", F.lit("no_call")
)

# Sanity check: the species column must hold a single distinct value.
unique_vals = negatives.select("species").distinct()
unique_values_list = [row["species"] for row in unique_vals.collect()]

print(f"Unique values in species column: {len(unique_values_list)}")
print(f"Number of rows: {negatives.count()}")
negatives.show(n=5)

In [None]:
# Assemble the binary dataset: only the label and the embedding are needed.
negatives_sub = negatives.select("species", "embedding")
# Every prepared (bird) example collapses into the single positive label "call".
positives_sub = prepared.select(F.lit("call").alias("species"), "embedding")
# Column order matches on both sides, so a positional union is fine.
binary_df = negatives_sub.union(positives_sub)
binary_df.show()

In [None]:
# Check the number of samples for each label
# counts = binary_df.groupBy("species").agg(F.count("*").alias("n")).orderBy(F.desc("n"))
# counts.show(n=5)

In [None]:
# Data for model training
# toPandas() collects the whole Spark DataFrame into driver memory as a pandas
# DataFrame, which is what the scikit-learn / XGBoost code below consumes.
data = binary_df.toPandas()
data.head()

In [None]:
import time
import numpy as np
from tqdm import tqdm
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    make_scorer,
    classification_report,
)

# Train/Test Split
# Hold out 33% of the rows for testing, stratified on the label so both splits
# keep the same call / no_call ratio. The embeddings are stacked into a single
# 2-D array (one row per example). random_state is fixed (same value as the
# Spark sampling seed) so the split, and every metric computed from it, is
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    np.stack(data["embedding"]),
    data["species"],
    test_size=0.33,
    stratify=data["species"],
    random_state=42,
)

# Create a label encoder object
le = LabelEncoder()

# Fit the encoder on the training labels only and reuse the mapping for the
# test labels. LabelEncoder assigns codes in sorted label order, so
# "call" -> 0 and "no_call" -> 1.
y_train_enc = le.fit_transform(y_train)
y_test_enc = le.transform(y_test)

# Data shape
print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)

In [None]:
def model_eval(truth, preds):
    """Print accuracy plus macro-averaged precision, recall and F1.

    Args:
        truth: ground-truth labels.
        preds: predicted labels, aligned with ``truth``.

    Returns:
        None. The four scores are printed, each rounded to three decimals.
    """
    macro_metrics = (
        ("Precision:", precision_score),
        ("Recall:", recall_score),
        ("F1 Score:", f1_score),
    )
    # accuracy takes no averaging argument, so it is handled separately
    print("Accuracy:", round(accuracy_score(truth, preds), 3))
    for label, metric in macro_metrics:
        print(label, round(metric(truth, preds, average="macro"), 3))

In [None]:
from skopt.space import Real
from skopt import BayesSearchCV
from sklearn.utils import class_weight
from sklearn.linear_model import LogisticRegression

weights = class_weight.compute_sample_weight(class_weight="balanced", y=y_train_enc)

# LogisticRegression
lgr_search = BayesSearchCV(
    LogisticRegression(),
    {
        "penalty": ("l2",),
        "C": (0.01, 1.1, "log-uniform"),
    },
    n_iter=10,
    scoring="f1",
    verbose=4,
    cv=3,
    n_points=1,
    n_jobs=-1,
)
%time lgr_search.fit(X_train, y_train_enc, sample_weight=weights)

In [None]:
# Score the tuned logistic regression on the held-out test split.
lgr_test_preds = lgr_search.predict(X_test)
model_eval(y_test_enc, lgr_test_preds)

In [None]:
from xgboost import XGBClassifier

weights = class_weight.compute_sample_weight(class_weight="balanced", y=y_train_enc)

# XGBoost model
xgb_search = BayesSearchCV(
    XGBClassifier(tree_method="gpu_hist"),
    {
        "eta": (0.01, 1.0, "log-uniform"),
        "max_depth": (1, 30, "uniform"),
        "gamma": (0, 1, "uniform"),
        "min_child_weight": (1, 10, "uniform"),
    },
    n_iter=10,
    scoring="f1",
    verbose=4,
    cv=3,
    n_points=1,
    n_jobs=1,
)
%time xgb_search.fit(X_train, y_train_enc, sample_weight=weights)

In [None]:
# Score the tuned XGBoost classifier on the held-out test split.
xgb_test_preds = xgb_search.predict(X_test)
model_eval(y_test_enc, xgb_test_preds)