In [2]:
# Notebook-wide IPython extensions.
# autoreload in mode 2 re-imports edited modules before each cell executes,
# so changes to project code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# lab_black: presumably auto-formats cells with black on execution -- confirm
# against the installed extension.
%load_ext lab_black

In [3]:
from birdclef.utils import get_spark
import os

# Set SPARK_LOCAL_DIRS before the session is created so the value is already in
# the environment when get_spark runs (presumably used as Spark scratch space).
os.environ["SPARK_LOCAL_DIRS"] = "../data/tmp/spark"

# get_spark is a project helper; the cores/memory arguments presumably size the
# local session -- confirm in birdclef.utils.
spark = get_spark(cores=16, memory="2g")
# Post-processed training set: per the printed schema, each row holds a list of
# species labels and an embedding stored as an array of doubles.
df = spark.read.parquet("../data/processed/birdclef-2023/train_postprocessed/v1")
df.printSchema()
df.show(n=5)

# Convert the whole Spark dataframe to pandas; later cells use `data`.
data = df.toPandas()
data.shape

root
 |-- species: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- embedding: array (nullable = true)
 |    |-- element: double (containsNull = true)

+---------+--------------------+
|  species|           embedding|
+---------+--------------------+
|[gobsta5]|[0.61802852153778...|
|[chespa1]|[0.61116945743560...|
|[golher1]|[0.90223264694213...|
|[marsto1]|[0.61373400688171...|
|[gobwea1]|[0.62747323513031...|
+---------+--------------------+
only showing top 5 rows



(239569, 2)

In [4]:
from sklearn.preprocessing import MultiLabelBinarizer
import numpy as np

# Multi-hot encode the per-row species lists: one column per distinct species.
mlb = MultiLabelBinarizer()
labels = mlb.fit_transform(data["species"])
print(labels.shape)

# Stack the per-row embedding arrays into a single 2-D array (rows x features).
embeddings = np.stack(data["embedding"])
print(embeddings.shape)

(239569, 265)
(239569, 320)


In [5]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
)


def model_eval(truth, preds):
    """Print accuracy plus macro-averaged precision, recall and F1.

    Args:
        truth: ground-truth labels, in whatever form the sklearn metric
            functions accept.
        preds: predicted labels, same form as ``truth``.

    Returns:
        None. The four scores are written to stdout, one per line.
    """
    print("Accuracy:", accuracy_score(truth, preds))

    # The remaining metrics share one call signature, so iterate over them
    # in the order they are reported.
    macro_metrics = (
        ("Precision:", precision_score),
        ("Recall:", recall_score),
        ("F1 Score:", f1_score),
    )
    for heading, metric_fn in macro_metrics:
        print(heading, metric_fn(truth, preds, average="macro"))


# Hold out 30% of the rows for evaluation. The seed is fixed so the split, and
# therefore every metric reported below, is reproducible across kernel restarts.
train_x, test_x, train_y, test_y = train_test_split(
    embeddings, labels, test_size=0.3, random_state=42
)

In [6]:
from xgboost import XGBClassifier
from sklearn.metrics import average_precision_score
from sklearn.utils import class_weight

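# NOTE(review): archived baseline snippet. `y=test_y` looks wrong here because
# the fit below uses train_x/train_y; the live search cell further down
# computes the weights from train_y instead.
# weights = class_weight.compute_sample_weight(
#     class_weight='balanced',
#     y=test_y
# )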

# clf = XGBClassifier(tree_method="gpu_hist")
# %time clf.fit(train_x, train_y, sample_weight=weights)

# CPU times: total: 7min 20s
# Wall time: 6min 53s

In [7]:
# preds = clf.predict(test_x)
# print(preds.shape)
# print(test_y.shape)
# model_eval(test_y, preds)
# print(average_precision_score(test_y, clf.predict_proba(test_x)))

# (47914, 265)
# (47914, 265)
# Accuracy: 0.7581708895103727
# Precision: 0.9899858454212693
# Recall: 0.7119283478314029
# F1 Score: 0.8243448902217967
# 0.9465624875416011

In [8]:
from skopt import BayesSearchCV

weights = class_weight.compute_sample_weight(class_weight="balanced", y=train_y)

search = BayesSearchCV(
    XGBClassifier(tree_method="gpu_hist"),
    {
        "eta": (0.01, 1.0, "log-uniform"),
        "max_depth": (1, 30, "uniform"),
        "gamma": (0, 1, "uniform"),
        "min_child_weight": (1, 10, "uniform"),
    },
    n_iter=10,
    scoring="f1_macro",
    verbose=4,
    cv=3,
    n_points=1,
    n_jobs=1,
)
%time search.fit(train_x, train_y, sample_weight=weights)

Fitting 3 folds for each of 1 candidates, totalling 3 fits


: 

: 

In [None]:
# Hard-label metrics on the held-out split.
hard_preds = search.predict(test_x)
model_eval(test_y, hard_preds)

# Average precision is computed from the predicted probabilities.
soft_preds = search.predict_proba(test_x)
print(average_precision_score(test_y, soft_preds))

Accuracy: 0.7677714238009767
Precision: 0.9866059859940541
Recall: 0.7278978653841502
F1 Score: 0.8345197010695892
0.9432796210139014


In [None]:
# Show the hyper-parameters of the best candidate found by the search.
# NOTE(review): the saved output below has no 'eta' key although the search
# space defines one -- the output may come from an earlier run; re-run to confirm.
search.best_params_

OrderedDict([('gamma', 0), ('max_depth', 6), ('min_child_weight', 5)])

In [None]:
import pickle
from pathlib import Path

# Persist the tuned model. The original dumped `clf`, which is only defined in
# commented-out code and so raises NameError on a fresh kernel. The model that
# is actually fitted and evaluated above is the refit best estimator of `search`.
model_path = Path("../data/models/baseline/xgbc-postprocess-best-acm-v1.pkl")
model_path.parent.mkdir(parents=True, exist_ok=True)

# The context manager flushes and closes the file handle even if pickling fails.
with model_path.open("wb") as model_file:
    pickle.dump(search.best_estimator_, model_file)