In [1]:
%load_ext autoreload
%autoreload 2
%load_ext lab_black

In [2]:
from birdclef.utils import get_spark
from pyspark.sql import Window, functions as F
import os

# Scratch directory handed to Spark through the environment. It is assigned
# before get_spark() runs (presumably so the new session picks it up — verify).
os.environ["SPARK_LOCAL_DIRS"] = "../data/tmp/spark"

spark = get_spark(memory="5g")

# Window spanning every row that shares the same primary label.
same_primary_label = Window.partitionBy("primary_label")

postprocessed = spark.read.parquet(
    "../data/processed/birdclef-2023/train_postprocessed/v3"
)
df = (
    postprocessed
    # The first entry of the metadata species array is used as the primary label.
    .withColumn("primary_label", F.col("metadata_species").getItem(0))
    # Metadata species followed by the predicted species, as a single array.
    .withColumn(
        "species",
        F.concat(F.col("metadata_species"), F.col("predicted_species")),
    )
    # Number of rows that carry the same primary label.
    .withColumn("species_count", F.count("*").over(same_primary_label))
)
# To drop primary labels that occur on a single row only, enable:
# df = df.where("species_count > 1")
df.printSchema()
df.show(n=5)

root
 |-- track_stem: string (nullable = true)
 |-- start_time: long (nullable = true)
 |-- metadata_species: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- predicted_species: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- predicted_species_prob: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- embedding: array (nullable = true)
 |    |-- element: float (containsNull = true)
 |-- next_embedding: array (nullable = true)
 |    |-- element: float (containsNull = true)
 |-- track_embedding: array (nullable = true)
 |    |-- element: float (containsNull = true)
 |-- primary_label: string (nullable = true)
 |-- species: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- species_count: long (nullable = false)

+----------+----------+----------------+--------------------+----------------------+--------------------+--------------------+--------------------+-------------+------------

In [3]:
import pickle
import numpy as np
from xgboost import XGBClassifier
from skopt import BayesSearchCV
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_sample_weight
from skmultilearn.model_selection import iterative_train_test_split
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
)
from sklearn.metrics import average_precision_score


def model_eval(truth, preds, average="macro", zero_division=0):
    """Print accuracy, precision, recall and F1 for a set of predictions.

    Parameters
    ----------
    truth : array-like
        Ground-truth labels; passed unchanged to the scikit-learn metrics.
    preds : array-like
        Predicted labels in the same layout as ``truth``.
    average : str, default "macro"
        Averaging mode forwarded to precision, recall and F1.
    zero_division : {0, 1, "warn"}, default 0
        Value used when a metric is undefined (e.g. a label that is never
        predicted). ``0`` yields the same numbers as scikit-learn's default
        ``"warn"`` but without emitting one warning per metric.

    Returns
    -------
    None
        The scores are only printed.
    """
    print("Accuracy:", accuracy_score(truth, preds))

    # Same labels and order as before: precision, recall, then F1.
    averaged_metrics = (
        ("Precision:", precision_score),
        ("Recall:", recall_score),
        ("F1 Score:", f1_score),
    )
    for label, metric in averaged_metrics:
        print(
            label,
            metric(truth, preds, average=average, zero_division=zero_division),
        )

Features: embeddings of the current and the next token

In [38]:
# Keep the label columns plus the row embedding and the `next_embedding`.
pair_columns = ["predicted_species", "primary_label", "embedding", "next_embedding"]
parsed = df.select(*pair_columns)
parsed.show(n=5)

+--------------------+-------------+--------------------+--------------------+
|   predicted_species|primary_label|           embedding|      next_embedding|
+--------------------+-------------+--------------------+--------------------+
|  [abythr1, wbrcha2]|      abythr1|[1.4855427, 0.454...|[0.9370399, 0.233...|
|[combul2, grbcam1...|      abythr1|[1.6610856, 0.870...|[1.4390839, 1.616...|
|           [fotdro5]|      abythr1|[0.9759565, 0.753...|[1.0162606, 1.178...|
|[abythr1, abythr1...|      abythr1|[1.5794071, 0.886...|[1.0814278, 0.572...|
|           [abythr1]|      abythr1|[0.9370399, 0.233...|[0.9370399, 0.233...|
+--------------------+-------------+--------------------+--------------------+
only showing top 5 rows



In [39]:
# Collect the selected Spark columns into a local pandas DataFrame.
data = parsed.toPandas()
data.head(5)

Unnamed: 0,predicted_species,primary_label,embedding,next_embedding
0,"[abythr1, wbrcha2]",abythr1,"[1.4855427, 0.4541664, 1.0472355, 1.3523579, 1...","[0.9370399, 0.2331794, 0.5724607, 1.3511919, 0..."
1,"[combul2, grbcam1, combuz1]",abythr1,"[1.6610856, 0.87099504, 0.29712436, 0.7613897,...","[1.4390839, 1.6169457, 1.8661345, 0.8268714, 1..."
2,[fotdro5],abythr1,"[0.9759565, 0.75392014, 0.6601531, 0.51182395,...","[1.0162606, 1.1786319, 0.6472011, 0.5133413, 0..."
3,"[abythr1, abythr1, thrnig1, thrnig1]",abythr1,"[1.5794071, 0.8864207, 1.2426404, 1.2989092, 0...","[1.0814278, 0.5720312, 1.1402651, 1.4965066, 0..."
4,[abythr1],abythr1,"[0.9370399, 0.2331794, 0.5724607, 1.3511919, 0...","[0.9370399, 0.2331794, 0.5724607, 1.3511919, 0..."


In [49]:
from sklearn.preprocessing import MultiLabelBinarizer

# Unfitted binarizer; nothing in this cell fits it. A later cell re-creates
# `mlb` and fits it before `split_x_y` is called.
mlb = MultiLabelBinarizer()
# The exploratory code that used to follow here (binarizing
# `predicted_species` and concatenating the stacked `embedding` and
# `next_embedding` arrays along axis 1) is what `split_x_y` now does.

In [11]:
def split_x_y(data, binarizer=None):
    """Build the feature matrix and the multi-label target matrix.

    Parameters
    ----------
    data : object with ``embedding``, ``next_embedding`` and
        ``predicted_species`` attributes (e.g. a pandas DataFrame)
        Each attribute is an iterable with one entry per row.
    binarizer : object with a ``transform`` method, optional
        Fitted label encoder. When omitted, the notebook-level ``mlb`` is
        used, which must already be fitted at call time.

    Returns
    -------
    tuple
        ``(x, y)`` where ``x`` is the row-wise concatenation of the stacked
        ``embedding`` and ``next_embedding`` arrays and ``y`` is
        ``binarizer.transform(data.predicted_species)``.
    """
    # Fall back to the global so existing `split_x_y(frame)` calls still work.
    encoder = mlb if binarizer is None else binarizer
    x = np.concatenate(
        (np.stack(data.embedding), np.stack(data.next_embedding)), axis=1
    )
    y = encoder.transform(data.predicted_species)
    return x, y

In [10]:
from sklearn.utils.class_weight import compute_sample_weight

# Fixed seed: without it the stratified split (and therefore the weights and
# every metric reported below) changes on each run of the notebook.
train, test = train_test_split(
    data, test_size=0.2, stratify=data.primary_label, random_state=42
)

print(train.head(5))

# One weight per training row, balancing the primary-label frequencies.
weights = compute_sample_weight(
    class_weight="balanced",
    y=train.primary_label,
)
print(weights.shape)

# Fit the label space on the full frame so train and test share the same
# label columns; `split_x_y` reads this global `mlb`.
mlb = MultiLabelBinarizer()
mlb.fit(data.predicted_species)
train_x, train_y = split_x_y(train)
test_x, test_y = split_x_y(test)

                                 predicted_species primary_label  \
56324                  [klacuc1, klacuc1, afecuc1]       didcuc1   
48523                  [carcha1, carcha1, carcha1]       carcha1   
43264             [comsan, comsan, comsan, comsan]        comsan   
53116                                    [colsun2]       cibwar1   
56325  [meypar1, meypar1, wlwwar, blbpuf2, wlwwar]       meypar1   

                                               embedding  \
56324  [0.9282538, 0.85244304, 0.64371747, 0.9679206,...   
48523  [1.2264075, 0.91729265, 0.659605, 0.4417752, 1...   
43264  [1.7824721, 1.4432603, 0.51925224, 0.6429393, ...   
53116  [0.76846176, 0.91573334, 0.8185377, 1.0414224,...   
56325  [2.4270287, 1.4127231, 1.0112065, 0.6295299, 0...   

                                          next_embedding  \
56324  [0.73853236, 1.1838487, 1.358802, 0.8135739, 0...   
48523  [1.1825973, 1.2704992, 0.3688372, 1.012795, 0....   
43264  [1.2682344, 1.4572191, 0.32428205, 0.975989

In [57]:
# Baseline: XGBoost with default hyper-parameters on the GPU histogram method.
clf = XGBClassifier(tree_method="gpu_hist")
clf.fit(train_x, train_y, verbose=True)

# Hard 0/1 predictions feed the accuracy / precision / recall / F1 report.
preds = clf.predict(test_x)
model_eval(test_y, preds)

# Average precision is computed from the predicted probabilities instead.
test_proba = clf.predict_proba(test_x)
print(average_precision_score(test_y, test_proba))

Accuracy: 0.2870823430728366


  _warn_prf(average, modifier, msg_start, len(result))


Precision: 0.6341810776088794


  _warn_prf(average, modifier, msg_start, len(result))


Recall: 0.20198595718608378


  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


F1 Score: 0.2917415148921085




0.48481025107652603




In [None]:
# Bayesian search over three XGBoost hyper-parameters, scored by macro F1.
search = BayesSearchCV(
    XGBClassifier(tree_method="gpu_hist"),
    {
        "max_depth": (3, 15, "uniform"),
        "gamma": (0.0, 1.0, "uniform"),
        "min_child_weight": (1, 10, "uniform"),
    },
    n_iter=8,
    scoring="f1_macro",
    verbose=4,
    cv=2,
    n_points=1,
    n_jobs=-1,
)
# Sample weights are passed to fit() rather than to the constructor: the
# constructor's `fit_params` is only stored and is not forwarded to the
# estimator with scikit-learn >= 0.21, so the weights were silently ignored.
search.fit(train_x, train_y, sample_weight=weights)

Features: all three embeddings (current, next, and track)

In [4]:
# Collect every column this time (no column subset is selected first); the
# feature builder below additionally reads `track_embedding` and `species`.
data = df.toPandas()
data.head(5)

Unnamed: 0,track_stem,start_time,metadata_species,predicted_species,predicted_species_prob,embedding,next_embedding,track_embedding,primary_label,species,species_count
0,XC125862,40,[abythr1],"[abythr1, wbrcha2]","[0.7282392344720093, 0.5216352416664949]","[1.4855427, 0.4541664, 1.0472355, 1.3523579, 1...","[0.9370399, 0.2331794, 0.5724607, 1.3511919, 0...","[1.0989836, 0.511663, 0.79813445, 1.251782, 0....",abythr1,"[abythr1, abythr1, wbrcha2]",142
1,XC227239,50,[abythr1],"[combul2, grbcam1, combuz1]","[0.8421137751981587, 0.8241570000064344, 0.754...","[1.6610856, 0.87099504, 0.29712436, 0.7613897,...","[1.4390839, 1.6169457, 1.8661345, 0.8268714, 1...","[1.5228174, 1.2024852, 0.9217374, 0.66376007, ...",abythr1,"[abythr1, combul2, grbcam1, combuz1]",142
2,XC509507,20,[abythr1],[fotdro5],[0.6930028164341019],"[0.9759565, 0.75392014, 0.6601531, 0.51182395,...","[1.0162606, 1.1786319, 0.6472011, 0.5133413, 0...","[0.99444264, 1.1462014, 0.775025, 0.4755587, 1...",abythr1,"[abythr1, fotdro5]",142
3,XC620997,10,[abythr1],"[abythr1, abythr1, thrnig1, thrnig1]","[0.9680242396307245, 0.8786584090490244, 0.553...","[1.5794071, 0.8864207, 1.2426404, 1.2989092, 0...","[1.0814278, 0.5720312, 1.1402651, 1.4965066, 0...","[1.3417553, 0.7599494, 1.1932622, 0.980361, 0....",abythr1,"[abythr1, abythr1, abythr1, thrnig1, thrnig1]",142
4,XC125862,50,[abythr1],[abythr1],[0.8696589410631123],"[0.9370399, 0.2331794, 0.5724607, 1.3511919, 0...","[0.9370399, 0.2331794, 0.5724607, 1.3511919, 0...","[1.0989836, 0.511663, 0.79813445, 1.251782, 0....",abythr1,"[abythr1, abythr1]",142


In [5]:
from sklearn.preprocessing import MultiLabelBinarizer


def split_x_y(data, binarizer=None):
    """Build the three-embedding feature matrix and the multi-label targets.

    Parameters
    ----------
    data : object with ``embedding``, ``next_embedding``, ``track_embedding``
        and ``predicted_species`` attributes (e.g. a pandas DataFrame)
        Each attribute is an iterable with one entry per row.
    binarizer : object with a ``transform`` method, optional
        Fitted label encoder. When omitted, the notebook-level ``mlb`` is
        used, which must already be fitted at call time.

    Returns
    -------
    tuple
        ``(x, y)`` where ``x`` is the row-wise concatenation of the stacked
        ``embedding``, ``next_embedding`` and ``track_embedding`` arrays (in
        that order) and ``y`` is
        ``binarizer.transform(data.predicted_species)``.
    """
    # Fall back to the global so existing `split_x_y(frame)` calls still work.
    encoder = mlb if binarizer is None else binarizer
    x = np.concatenate(
        (
            np.stack(data.embedding),
            np.stack(data.next_embedding),
            np.stack(data.track_embedding),
        ),
        axis=1,
    )
    y = encoder.transform(data.predicted_species)
    return x, y


# Fixed seed: without it the stratified split (and therefore every metric
# reported below) changes on each run of the notebook.
train, test = train_test_split(
    data, test_size=0.2, stratify=data.primary_label, random_state=42
)

print(train.head(5))

# class_weight=None means no class re-balancing is requested here.
weights = compute_sample_weight(
    class_weight=None,
    y=train.primary_label,
)
print(weights.shape)

# The label space is fitted on `species` (metadata + predicted species),
# while `split_x_y` transforms `predicted_species` only. A species that
# appears only in the metadata therefore gets an all-zero label column.
# NOTE(review): confirm this asymmetry is intended.
mlb = MultiLabelBinarizer()
mlb.fit(data.species)
train_x, train_y = split_x_y(train)
test_x, test_y = split_x_y(test)

      track_stem  start_time metadata_species  \
20816   XC471862          60         [hoopoe]   
47602   XC399994          80        [bswdov1]   
60128   XC675673          30        [eaywag1]   
60359   XC334142           0        [eaywag1]   
40963   XC322373          10         [yefcan]   

                        predicted_species  \
20816                            [hoopoe]   
47602                  [bswdov1, bswdov1]   
60128         [eaywag1, eaywag1, eaywag1]   
60359                            [barswa]   
40963  [yefcan, eaywag1, yefcan, eaywag1]   

                                  predicted_species_prob  \
20816                               [0.5947268731195006]   
47602           [0.9891283636639615, 0.9747976973291222]   
60128  [0.942331024526395, 0.9197274155762972, 0.5563...   
60359                               [0.8097300783595898]   
40963  [0.9109804922063218, 0.9103243629862736, 0.875...   

                                               embedding  \
20816  [1.950

In [62]:
# Same default-parameter XGBoost baseline, now trained on the features
# produced by the three-embedding `split_x_y`.
clf = XGBClassifier(tree_method="gpu_hist")
clf.fit(train_x, train_y, verbose=True)

# Thresholded predictions for the accuracy / precision / recall / F1 report.
preds = clf.predict(test_x)
model_eval(test_y, preds)

# Probabilities for the average-precision score.
test_proba = clf.predict_proba(test_x)
print(average_precision_score(test_y, test_proba))

Accuracy: 0.3048371312735915


  _warn_prf(average, modifier, msg_start, len(result))


Precision: 0.6907287964057018


  _warn_prf(average, modifier, msg_start, len(result))


Recall: 0.3036790183529477


  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


F1 Score: 0.40340066136931235




0.5434118400483244




In [20]:
# Number of distinct labels the binarizer was fitted on, i.e. the number of
# columns in the label matrices returned by `mlb.transform`.
len(mlb.classes_)

264

In [7]:
# Bayesian search over three XGBoost hyper-parameters, scored by macro F1.
search = BayesSearchCV(
    XGBClassifier(tree_method="hist"),
    {
        "max_depth": (3, 15, "uniform"),
        "gamma": (0.0, 1.0, "uniform"),
        "min_child_weight": (1, 10, "uniform"),
    },
    n_iter=10,
    scoring="f1_macro",
    verbose=4,
    cv=2,
    n_points=1,
    n_jobs=-1,
)
# Sample weights are passed to fit() rather than to the constructor: the
# constructor's `fit_params` is only stored and is not forwarded to the
# estimator with scikit-learn >= 0.21, so the weights were silently ignored.
search.fit(train_x, train_y, sample_weight=weights)

# Evaluate the refitted best estimator on the held-out split.
model_eval(test_y, search.predict(test_x))
print(average_precision_score(test_y, search.predict_proba(test_x)))

Fitting 2 folds for each of 1 candidates, totalling 2 fits
Fitting 2 folds for each of 1 candidates, totalling 2 fits
Fitting 2 folds for each of 1 candidates, totalling 2 fits
Fitting 2 folds for each of 1 candidates, totalling 2 fits
