In [1]:
%load_ext autoreload
%autoreload 2
%load_ext lab_black

In [1]:
from birdclef.utils import get_spark
from pyspark.sql import SparkSession
import os
import sys

os.environ["PYSPARK_PYTHON"] = sys.executable
os.environ["PYSPARK_DRIVER_PYTHON"] = sys.executable

spark = (
    SparkSession.builder.config("spark.driver.memory", "20g")
    .config("spark.driver.cores", 8)
    .config("spark.sql.execution.arrow.pyspark.enabled", "true")
    .config("spark.driver.maxResultSize", "10g")
    .getOrCreate()
)
df = spark.read.parquet(
    "../data/processed/birdclef-2023/train_embeddings/consolidated_v3"
)
df.printSchema()

root
 |-- species: string (nullable = true)
 |-- track_stem: string (nullable = true)
 |-- track_type: string (nullable = true)
 |-- track_name: string (nullable = true)
 |-- embedding: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- prediction_vec: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- predictions: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- rank: long (nullable = true)
 |    |    |-- index: long (nullable = true)
 |    |    |-- label: string (nullable = true)
 |    |    |-- mapped_label: string (nullable = true)
 |    |    |-- probability: double (nullable = true)
 |-- start_time: long (nullable = true)
 |-- energy: double (nullable = true)



In [2]:
from pyspark.sql import Window, functions as F

# keep the track_type for the highest energy
highest_energy_channel = (
    df
    # get the track stem without the part
    .withColumn("original_track_stem", F.split(F.col("track_stem"), "_").getItem(0))
    .where("track_type != 'original'")
    # get the track type that has the most energy
    .withColumn(
        "rank",
        F.rank().over(
            Window.partitionBy("original_track_stem").orderBy(F.desc("energy"))
        ),
    )
    # keep the first row
    .where(F.col("rank") == 1)
    # drop the rank column
    .select("species", "track_stem", "track_type")
    .distinct()
)

# get the highest predictions by exploding the values
highest_energy_tracks = (
    df
    # join against the highest energy channel
    .join(
        highest_energy_channel,
        on=["species", "track_stem", "track_type"],
        how="inner",
    )
    # explode the embeddings, these are ordered by confidence
    .select(
        "species",
        "track_stem",
        "track_type",
        "start_time",
        "track_name",
        "prediction_vec",
    )
).cache()

In [3]:
highest_energy_tracks.show(n=5)

+-------+----------+----------+----------+--------------------+--------------------+
|species|track_stem|track_type|start_time|          track_name|      prediction_vec|
+-------+----------+----------+----------+--------------------+--------------------+
|abythr1|  XC233199|   source0|        15|abythr1/XC233199_...|[-13.661950111389...|
|abythr1|  XC233199|   source0|         6|abythr1/XC233199_...|[-11.474321365356...|
|abythr1|  XC233199|   source0|        18|abythr1/XC233199_...|[-9.2847023010253...|
|abythr1|  XC233199|   source0|         9|abythr1/XC233199_...|[-8.0626249313354...|
|abythr1|  XC233199|   source0|        54|abythr1/XC233199_...|[-10.417481422424...|
+-------+----------+----------+----------+--------------------+--------------------+
only showing top 5 rows



In [4]:
labels = highest_energy_tracks.select("species").toPandas()
len(set(list(labels)))

1

In [5]:
len(set([x[0] for x in list(labels.to_numpy())]))

264

In [6]:
prediction_vec = highest_energy_tracks.select("prediction_vec").toPandas()

In [9]:
prediction_vec.head()

Unnamed: 0,prediction_vec
0,"[-13.66195011138916, -8.719298362731934, -17.7..."
1,"[-11.474321365356445, -6.547040939331055, -15...."
2,"[-9.28470230102539, -9.034313201904297, -18.45..."
3,"[-8.06262493133545, -5.185299873352051, -15.40..."
4,"[-10.417481422424316, -7.542620658874512, -19...."


In [8]:
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    average_precision_score
)


def model_eval(truth, preds):
    print("Accuracy:", accuracy_score(truth, preds))
    print(
        "Precision:",
        precision_score(truth, preds, average="macro"),
    )
    print(
        "Recall:",
        recall_score(truth, preds, average="macro"),
    )
    print(
        "F1 Score:",
        f1_score(truth, preds, average="macro"),
    )


In [14]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
le.fit_transform(labels.species)

array([  2,   2,   2, ..., 260, 260, 260])

In [None]:
train_x, test_x, train_y, test_y = train_test_split(
    np.stack(prediction_vec["prediction_vec"]),
    labels["species"],
    test_size=0.33,
    stratify=labels["species"],
)

In [15]:
from xgboost import XGBClassifier

clf = XGBClassifier(tree_method="gpu_hist")
clf.fit(train_x, le.transform(train_y))
model_eval(le.transform(test_y), clf.predict(train_x))

XGBoostError: [17:43:03] C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-07593ffd91cd9da33-1\xgboost\xgboost-ci-windows\src\tree\updater_gpu_hist.cu:802: Exception in gpu_hist: [17:43:03] C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-07593ffd91cd9da33-1\xgboost\xgboost-ci-windows\src\data\../common/device_helpers.cuh:431: Memory allocation error on worker 0: bad allocation: cudaErrorMemoryAllocation: out of memory
- Free memory: 728970036
- Requested memory: 2147483648



In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
model_eval(le.transform(test_y), clf.predict(test_x))
print("Average Precision Score", average_precision_score(mlb.fit_transform(test_y), clf.predict_proba(test_x)))