In [1]:
%load_ext autoreload
%autoreload 2
%load_ext lab_black

In [2]:
from birdclef.utils import get_spark
from pyspark.sql import Window, functions as F
import os

os.environ["SPARK_LOCAL_DIRS"] = "../data/tmp/spark"

spark = get_spark(memory="5g")
df = spark.read.parquet("../data/processed/birdclef-2023/train_postprocessed/v3")
df = (
    df.withColumn("primary_label", F.col("metadata_species")[0])
    .withColumn("species", F.concat("metadata_species", "predicted_species"))
    .withColumn("species_count", F.count("*").over(Window.partitionBy("primary_label")))
)
# df = df.where("species_count > 1")
df.printSchema()
df.show(n=5)

root
 |-- track_stem: string (nullable = true)
 |-- start_time: long (nullable = true)
 |-- metadata_species: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- predicted_species: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- predicted_species_prob: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- embedding: array (nullable = true)
 |    |-- element: float (containsNull = true)
 |-- next_embedding: array (nullable = true)
 |    |-- element: float (containsNull = true)
 |-- track_embedding: array (nullable = true)
 |    |-- element: float (containsNull = true)
 |-- primary_label: string (nullable = true)
 |-- species: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- species_count: long (nullable = false)

+----------+----------+----------------+--------------------+----------------------+--------------------+--------------------+--------------------+-------------+------------

In [3]:
import pickle
import numpy as np
from xgboost import XGBClassifier
from skopt import BayesSearchCV
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_sample_weight
from skmultilearn.model_selection import iterative_train_test_split
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
)
from sklearn.metrics import average_precision_score


def model_eval(truth, preds):
    print("Accuracy:", accuracy_score(truth, preds))
    print(
        "Precision:",
        precision_score(truth, preds, average="macro"),
    )
    print(
        "Recall:",
        recall_score(truth, preds, average="macro"),
    )
    print(
        "F1 Score:",
        f1_score(truth, preds, average="macro"),
    )

current and next token

In [38]:
parsed = df.select("predicted_species", "primary_label", "embedding", "next_embedding")
parsed.show(n=5)

+--------------------+-------------+--------------------+--------------------+
|   predicted_species|primary_label|           embedding|      next_embedding|
+--------------------+-------------+--------------------+--------------------+
|  [abythr1, wbrcha2]|      abythr1|[1.4855427, 0.454...|[0.9370399, 0.233...|
|[combul2, grbcam1...|      abythr1|[1.6610856, 0.870...|[1.4390839, 1.616...|
|           [fotdro5]|      abythr1|[0.9759565, 0.753...|[1.0162606, 1.178...|
|[abythr1, abythr1...|      abythr1|[1.5794071, 0.886...|[1.0814278, 0.572...|
|           [abythr1]|      abythr1|[0.9370399, 0.233...|[0.9370399, 0.233...|
+--------------------+-------------+--------------------+--------------------+
only showing top 5 rows



In [39]:
data = parsed.toPandas()
data.head(n=5)

Unnamed: 0,predicted_species,primary_label,embedding,next_embedding
0,"[abythr1, wbrcha2]",abythr1,"[1.4855427, 0.4541664, 1.0472355, 1.3523579, 1...","[0.9370399, 0.2331794, 0.5724607, 1.3511919, 0..."
1,"[combul2, grbcam1, combuz1]",abythr1,"[1.6610856, 0.87099504, 0.29712436, 0.7613897,...","[1.4390839, 1.6169457, 1.8661345, 0.8268714, 1..."
2,[fotdro5],abythr1,"[0.9759565, 0.75392014, 0.6601531, 0.51182395,...","[1.0162606, 1.1786319, 0.6472011, 0.5133413, 0..."
3,"[abythr1, abythr1, thrnig1, thrnig1]",abythr1,"[1.5794071, 0.8864207, 1.2426404, 1.2989092, 0...","[1.0814278, 0.5720312, 1.1402651, 1.4965066, 0..."
4,[abythr1],abythr1,"[0.9370399, 0.2331794, 0.5724607, 1.3511919, 0...","[0.9370399, 0.2331794, 0.5724607, 1.3511919, 0..."


In [49]:
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
# labels = mlb.fit_transform(data["predicted_species"])
# print(labels.shape)

# embeddings = np.stack(data["embedding"])
# next_emb = np.stack(data["next_embedding"])
# print(embeddings.shape, next_emb.shape)
# emb = np.concatenate((embeddings, next_emb), axis=1)
# print(emb.shape)

In [55]:
def split_x_y(data):
    x = np.concatenate(
        (np.stack(data.embedding), np.stack(data.next_embedding)), axis=1
    )
    y = mlb.transform(data.predicted_species)
    return x, y

In [56]:
from sklearn.utils.class_weight import compute_sample_weight

train, test = train_test_split(data, test_size=0.2, stratify=data.primary_label)

print(train.head(5))

weights = compute_sample_weight(
    class_weight="balanced",
    y=train.primary_label,
)
print(weights.shape)

mlb = MultiLabelBinarizer()
mlb.fit(train.predicted_species)
train_x, train_y = split_x_y(train)
test_x, test_y = split_x_y(test)

                 predicted_species primary_label  \
46627           [trobou1, trobou1]       gobbun1   
9298      [wlwwar, wlwwar, wlwwar]        wlwwar   
16062     [gargan, gargan, woosan]        gargan   
54580  [eaywag1, eaywag1, combuz1]       eaywag1   
31676           [blakit1, blakit1]       blakit1   

                                               embedding  \
46627  [0.99324703, 1.2544072, 1.1864733, 0.76876074,...   
9298   [1.3639158, 1.1384009, 0.30133072, 1.1934186, ...   
16062  [0.9726482, 0.80500895, 0.6744415, 0.58976775,...   
54580  [0.37639582, 0.8841811, 0.8896645, 0.40079167,...   
31676  [0.7950919, 0.6024793, 0.11996893, 0.8943327, ...   

                                          next_embedding  
46627  [1.1276547, 1.0443239, 1.4978375, 0.6715452, 0...  
9298   [1.2790996, 0.8213889, 0.13639079, 0.62267214,...  
16062  [0.7132024, 0.9949344, 0.6074874, 0.43088412, ...  
54580  [0.918205, 0.7151461, 0.29273865, 0.5959397, 0...  
31676  [0.9469747, 0.51665765, 

In [57]:
clf = XGBClassifier(tree_method="gpu_hist")
clf.fit(train_x, train_y, verbose=True)
preds = clf.predict(test_x)
model_eval(test_y, preds)
print(average_precision_score(test_y, clf.predict_proba(test_x)))

Accuracy: 0.2870823430728366


  _warn_prf(average, modifier, msg_start, len(result))


Precision: 0.6341810776088794


  _warn_prf(average, modifier, msg_start, len(result))


Recall: 0.20198595718608378


  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


F1 Score: 0.2917415148921085




0.48481025107652603




In [None]:
search = BayesSearchCV(
    XGBClassifier(tree_method="gpu_hist"),
    {
        "max_depth": (3, 15, "uniform"),
        "gamma": (0.0, 1.0, "uniform"),
        "min_child_weight": (1, 10, "uniform"),
    },
    fit_params={"sample_weight": weights},
    n_iter=8,
    scoring="f1_macro",
    verbose=4,
    cv=2,
    n_points=1,
    n_jobs=-1,
)
search.fit(train_x, train_y)

all three

In [4]:
parsed = df.select(
    "predicted_species",
    "primary_label",
    "embedding",
    "next_embedding",
    "track_embedding",
)
parsed.show(n=5)
data = parsed.toPandas()
data.head(n=5)

+--------------------+-------------+--------------------+--------------------+--------------------+
|   predicted_species|primary_label|           embedding|      next_embedding|     track_embedding|
+--------------------+-------------+--------------------+--------------------+--------------------+
|[categr, cohmar1,...|       categr|[1.6699009, 1.923...|[1.6699009, 1.923...|[1.6699009, 1.923...|
|    [egygoo, egygoo]|       egygoo|[1.1547908, 1.127...|[1.4386853, 1.303...|[1.5258964, 1.228...|
|[bawhor2, bawhor2...|      bawhor2|[0.4971428, 2.136...|[0.99904233, 1.15...|[0.768064, 1.3247...|
|  [cibwar1, cibwar1]|      cibwar1|[0.75827354, 1.25...|[0.80793333, 1.24...|[0.78310347, 1.25...|
|           [eubeat1]|      eubeat1|[1.7087168, 1.229...|[2.0123289, 0.626...|[1.7032318, 1.002...|
+--------------------+-------------+--------------------+--------------------+--------------------+
only showing top 5 rows



Unnamed: 0,predicted_species,primary_label,embedding,next_embedding,track_embedding
0,"[categr, cohmar1, sichor1]",categr,"[1.6699009, 1.9233328, 0.37807903, 1.2282478, ...","[1.6699009, 1.9233328, 0.37807903, 1.2282478, ...","[1.6699009, 1.9233328, 0.37807903, 1.2282478, ..."
1,"[egygoo, egygoo]",egygoo,"[1.1547908, 1.1271043, 0.36353567, 0.8874313, ...","[1.4386853, 1.303846, 0.23873292, 1.1468445, 1...","[1.5258964, 1.2280338, 0.4696629, 1.0480278, 1..."
2,"[bawhor2, bawhor2, bawhor2, bawhor2, grecor]",bawhor2,"[0.4971428, 2.1368887, 0.69026494, 1.0474387, ...","[0.99904233, 1.1514673, 0.62358767, 1.1789998,...","[0.768064, 1.324733, 0.79627794, 1.0657245, 0...."
3,"[cibwar1, cibwar1]",cibwar1,"[0.75827354, 1.2599143, 1.2712938, 1.2929367, ...","[0.80793333, 1.2478226, 1.50593, 1.0412738, 0....","[0.78310347, 1.2538685, 1.3886118, 1.1671052, ..."
4,[eubeat1],eubeat1,"[1.7087168, 1.2295375, 0.5481695, 0.48683575, ...","[2.0123289, 0.6269798, 0.7979782, 0.42369375, ...","[1.7032318, 1.0029125, 0.46927804, 0.48526886,..."


In [5]:
from sklearn.preprocessing import MultiLabelBinarizer


def split_x_y(data):
    x = np.concatenate(
        (
            np.stack(data.embedding),
            np.stack(data.next_embedding),
            np.stack(data.track_embedding),
        ),
        axis=1,
    )
    y = mlb.transform(data.predicted_species)
    return x, y


train, test = train_test_split(data, test_size=0.2, stratify=data.primary_label)

print(train.head(5))

weights = compute_sample_weight(
    class_weight="balanced",
    y=train.primary_label,
)
print(weights.shape)

mlb = MultiLabelBinarizer()
mlb.fit(data.predicted_species)
train_x, train_y = split_x_y(train)
test_x, test_y = split_x_y(test)

               predicted_species primary_label  \
73358          [yebsto1, barswa]       yebsto1   
42649                  [cohmar1]        litegr   
38661  [litegr, barswa, yertin1]        litegr   
41084           [gargan, gargan]        gargan   
31211           [wlwwar, hoopoe]       tafpri1   

                                               embedding  \
73358  [0.80422944, 0.68525517, 0.17662634, 0.493083,...   
42649  [0.8279805, 0.92492956, 1.0016271, 0.44183776,...   
38661  [1.3842217, 1.0944815, 1.3357898, 0.9954681, 1...   
41084  [1.2485094, 1.2730036, 0.4728993, 1.344677, 0....   
31211  [1.9700922, 1.7753143, 0.3254336, 0.455853, 1....   

                                          next_embedding  \
73358  [0.80422944, 0.68525517, 0.17662634, 0.493083,...   
42649  [0.66272324, 0.82441515, 0.62881213, 0.3760345...   
38661  [1.4363765, 1.5221066, 1.1466409, 0.44075704, ...   
41084  [1.2485094, 1.2730036, 0.4728993, 1.344677, 0....   
31211  [1.9700922, 1.7753143, 0.325433

In [62]:
clf = XGBClassifier(tree_method="gpu_hist")
clf.fit(train_x, train_y, verbose=True)
preds = clf.predict(test_x)
model_eval(test_y, preds)
print(average_precision_score(test_y, clf.predict_proba(test_x)))

Accuracy: 0.3048371312735915


  _warn_prf(average, modifier, msg_start, len(result))


Precision: 0.6907287964057018


  _warn_prf(average, modifier, msg_start, len(result))


Recall: 0.3036790183529477


  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


F1 Score: 0.40340066136931235




0.5434118400483244




In [8]:
search = BayesSearchCV(
    XGBClassifier(tree_method="gpu_hist", objective="multi:softprob"),
    {
        "max_depth": (3, 15, "uniform"),
        "gamma": (0.0, 1.0, "uniform"),
        "min_child_weight": (1, 10, "uniform"),
    },
    fit_params={"sample_weight": weights},
    n_iter=8,
    scoring="f1_macro",
    verbose=4,
    cv=2,
    n_points=1,
    n_jobs=-1,
)
search.fit(train_x, train_y)
model_eval(test_y, search.predict(test_x))
print(average_precision_score(test_y, search.predict_proba(test_x)))

Fitting 2 folds for each of 1 candidates, totalling 2 fits
