In [1]:
%load_ext autoreload
%autoreload 2
%load_ext lab_black

In [2]:
from birdclef.utils import get_spark
import os

os.environ["SPARK_LOCAL_DIRS"] = "../data/tmp/spark"

spark = get_spark(cores=16, memory="5g")
df = spark.read.parquet("../data/processed/birdclef-2023/train_postprocessed/v1")
df.printSchema()
df.show(n=5)

root
 |-- species: string (nullable = true)
 |-- track_stem: string (nullable = true)
 |-- track_type: string (nullable = true)
 |-- track_name: string (nullable = true)
 |-- embedding: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- prediction_vec: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- predictions: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- rank: long (nullable = true)
 |    |    |-- index: long (nullable = true)
 |    |    |-- label: string (nullable = true)
 |    |    |-- mapped_label: string (nullable = true)
 |    |    |-- probability: double (nullable = true)
 |-- start_time: long (nullable = true)
 |-- energy: double (nullable = true)

+-------+----------------+----------+--------------------+--------------------+--------------------+--------------------+----------+-------------------+
|species|      track_stem|track_type|          track_name|           embedding|     

In [3]:
data = df.toPandas()
data.head(n=5)

Unnamed: 0,species,embedding
0,[gobsta5],"[0.6180285215377808, 1.0199153423309326, 0.333..."
1,[chespa1],"[0.6111694574356079, 0.9573187828063965, 0.464..."
2,[golher1],"[0.9022326469421387, 0.9815549850463867, 0.468..."
3,[marsto1],"[0.6137340068817139, 1.4362423419952393, 0.376..."
4,[gobwea1],"[0.6274732351303101, 0.6725507974624634, 0.169..."


In [4]:
from sklearn.preprocessing import MultiLabelBinarizer
import numpy as np

mlb = MultiLabelBinarizer()
labels = mlb.fit_transform(data["species"])
print(labels.shape)

embeddings = np.stack(data["embedding"])
print(embeddings.shape)

(239569, 265)
(239569, 320)


In [5]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
)

train_x, test_x, train_y, test_y = train_test_split(embeddings, labels, test_size=0.2)


def model_eval(truth, preds):
    print("Accuracy:", accuracy_score(truth, preds))
    print(
        "Precision:",
        precision_score(truth, preds, average="macro"),
    )
    print(
        "Recall:",
        recall_score(truth, preds, average="macro"),
    )
    print(
        "F1 Score:",
        f1_score(truth, preds, average="macro"),
    )

In [18]:
from xgboost import XGBClassifier
from sklearn.multiclass import OneVsRestClassifier

clf = OneVsRestClassifier(XGBClassifier(tree_method="gpu_hist"), verbose=10, n_jobs=-1)
clf.fit(train_x, train_y)
preds = clf.predict(test_x)
print(preds.shape)
print(test_y.shape)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:  4.0min
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:  4.6min
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:  6.3min
[Parallel(n_jobs=-1)]: Done  48 tasks      | elapsed:  7.7min
[Parallel(n_jobs=-1)]: Done  61 tasks      | elapsed:  9.4min
[Parallel(n_jobs=-1)]: Done  74 tasks      | elapsed: 10.9min
[Parallel(n_jobs=-1)]: Done  89 tasks      | elapsed: 12.8min
[Parallel(n_jobs=-1)]: Done 104 tasks      | elapsed: 14.3min
[Parallel(n_jobs=-1)]: Done 121 tasks      | elapsed: 16.5min
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed: 18.3min
[Parallel(n_jobs=-1)]: Done 157 tasks      | elapsed: 20.1min
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed: 22.9min
[Parallel(n_jobs=-1)]: Done 197 tasks      | elapsed: 2

(47914, 265)
(47914, 265)


In [19]:
model_eval(test_y, preds)

Accuracy: 0.7617815252327086
Precision: 0.9904813713294939
Recall: 0.7177069995024414
F1 Score: 0.8285109262140783


In [6]:
from sklearn.metrics import average_precision_score

# print(average_precision_score(test_y, clf.predict_proba(test_x)))

In [23]:
import pickle
from pathlib import Path

pickle.dump(
    clf,
    Path("../data/models/baseline/one-v-rest-xgbc-base.pkl").open("wb"),
)

In [35]:
xgb_clf = XGBClassifier(tree_method="gpu_hist", verbosity=3)
xgb_clf.fit(train_x, train_y, verbose=True)
preds = xgb_clf.predict(test_x)
model_eval(test_y, preds)
print(average_precision_score(test_y, xgb_clf.predict_proba(test_x)))

[14:38:51] AllReduce: 0.06697s, 1 calls @ 66970us

[14:38:51] MakeCuts: 0.069203s, 1 calls @ 69203us

[14:38:51] DEBUG: C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-07593ffd91cd9da33-1\xgboost\xgboost-ci-windows\src\tree\updater_gpu_hist.cu:755: [GPU Hist]: Configure
[14:38:52] InitCompressedData: 0.005695s, 1 calls @ 5695us

[14:54:24] Configure: 0.002267s, 1 calls @ 2267us

[14:54:24] EvalOneIter: 0.001502s, 100 calls @ 1502us

[14:54:24] GetGradient: 1.25392s, 100 calls @ 1253919us

[14:54:24] PredictRaw: 0.040488s, 100 calls @ 40488us

[14:54:24] UpdateOneIter: 929.382s, 100 calls @ 929382217us

[14:54:24] BoostNewTrees: 928.078s, 100 calls @ 928078194us

[14:54:24] CommitModel: 0.000986s, 100 calls @ 986us

[14:54:24] Peak memory usage: 1097MiB
[14:54:24] Number of allocations: 1335393
[14:54:24] InitData: 0.029063s, 26500 calls @ 29063us

[14:54:24] InitDataOnce: 0.024826s, 1 calls @ 24826us

[14:54:24] Update: 908.911s, 26500 calls @ 908910629us

[14:54:24

In [43]:
from skopt import BayesSearchCV

search = BayesSearchCV(
    XGBClassifier(tree_method="gpu_hist"),
    {
        "max_depth": (1, 30, "uniform"),
        "gamma": (0, 1, "uniform"),
        "min_child_weight": (1, 10, "uniform"),
    },
    n_iter=8,
    scoring="precision_macro",
    verbose=4,
    cv=zip(
        [np.arange(0, int(len(train_x) * 0.67))],
        [np.arange(int(len(train_x) * 0.67) + 1, len(train_x))],
    ),
    n_points=1,
    n_jobs=-1,
)
search.fit(train_x, train_y)

Fitting 1 folds for each of 1 candidates, totalling 1 fits
Fitting 1 folds for each of 1 candidates, totalling 1 fits
Fitting 1 folds for each of 1 candidates, totalling 1 fits
Fitting 1 folds for each of 1 candidates, totalling 1 fits
Fitting 1 folds for each of 1 candidates, totalling 1 fits
Fitting 1 folds for each of 1 candidates, totalling 1 fits
Fitting 1 folds for each of 1 candidates, totalling 1 fits
Fitting 1 folds for each of 1 candidates, totalling 1 fits


In [44]:
model_eval(test_y, search.predict(test_x))
print(average_precision_score(test_y, search.predict_proba(test_x)))

Accuracy: 0.7677714238009767
Precision: 0.9866059859940541
Recall: 0.7278978653841502
F1 Score: 0.8345197010695892
0.9432796210139014


In [46]:
search.best_params_

OrderedDict([('gamma', 0), ('max_depth', 6), ('min_child_weight', 5)])

In [7]:
from sklearn.utils.class_weight import compute_sample_weight


labels_expanded = []
for x in data["species"]:
    for j in x:
        labels_expanded.append(j)
weights = compute_sample_weight(
    class_weight="balanced",
    y=[x[0] for x in mlb.inverse_transform(train_y)],
)
print(weights.shape)

(191655,)


In [9]:
from xgboost import XGBClassifier

xgbc = XGBClassifier(tree_method="gpu_hist")
xgbc.fit(train_x, train_y, sample_weight=weights, verbose=True)
model_eval(test_y, xgbc.predict(test_x))
print(
    "Average Precision: ", average_precision_score(test_y, xgbc.predict_proba(test_x))
)

Accuracy: 0.7555829193972534
Precision: 0.9882951533790961
Recall: 0.712574707471642
F1 Score: 0.8245915807290878
Average Precision:  0.9427516362441193


In [8]:
from skopt import BayesSearchCV
from xgboost import XGBClassifier

search = BayesSearchCV(
    XGBClassifier(tree_method="gpu_hist"),
    {
        "max_depth": (3, 15, "uniform"),
        "gamma": (0.0, 1.0, "uniform"),
        "min_child_weight": (1, 10, "uniform"),
    },
    fit_params={"sample_weight": weights},
    n_iter=8,
    scoring="f1_macro",
    verbose=4,
    cv=2,
    n_points=1,
    n_jobs=-1,
)
search.fit(train_x, train_y)

Fitting 2 folds for each of 1 candidates, totalling 2 fits
Fitting 2 folds for each of 1 candidates, totalling 2 fits
Fitting 2 folds for each of 1 candidates, totalling 2 fits
Fitting 2 folds for each of 1 candidates, totalling 2 fits
Fitting 2 folds for each of 1 candidates, totalling 2 fits
Fitting 2 folds for each of 1 candidates, totalling 2 fits
Fitting 2 folds for each of 1 candidates, totalling 2 fits
Fitting 2 folds for each of 1 candidates, totalling 2 fits


In [9]:
print(search.best_params_)
model_eval(test_y, search.predict(test_x))
print(average_precision_score(test_y, search.predict_proba(test_x)))

OrderedDict([('gamma', 0.21678970864911834), ('max_depth', 6), ('min_child_weight', 4)])
Accuracy: 0.7654756438619192
Precision: 0.987203693248337
Recall: 0.7244783548864888
F1 Score: 0.832269913980735
0.9421163147336618


In [13]:
import pickle
from pathlib import Path

pickle.dump(
    search.best_estimator_,
    Path("../data/models/baseline/xgbc-sample-weight-opt.pkl").open("wb"),
)

In [14]:
import pickle
from pathlib import Path

pickle.dump(
    mlb,
    Path("../data/models/baseline/multilabel-binarizer.pkl").open("wb"),
)

In [15]:
mlb.classes_

array(['abethr1', 'abhori1', 'abythr1', 'afbfly1', 'afdfly1', 'afecuc1',
       'affeag1', 'afgfly1', 'afghor1', 'afmdov1', 'afpfly1', 'afpkin1',
       'afpwag1', 'afrgos1', 'afrgrp1', 'afrjac1', 'afrthr1', 'amesun2',
       'augbuz1', 'bagwea1', 'barswa', 'bawhor2', 'bawman1', 'bcbeat1',
       'beasun2', 'bkctch1', 'bkfruw1', 'blacra1', 'blacuc1', 'blakit1',
       'blaplo1', 'blbpuf2', 'blcapa2', 'blfbus1', 'blhgon1', 'blhher1',
       'blksaw1', 'blnmou1', 'blnwea1', 'bltapa1', 'bltbar1', 'bltori1',
       'blwlap1', 'brcale1', 'brcsta1', 'brctch1', 'brcwea1', 'brican1',
       'brobab1', 'broman1', 'brosun1', 'brrwhe3', 'brtcha1', 'brubru1',
       'brwwar1', 'bswdov1', 'btweye2', 'bubwar2', 'butapa1', 'cabgre1',
       'carcha1', 'carwoo1', 'categr', 'ccbeat1', 'chespa1', 'chewea1',
       'chibat1', 'chtapa3', 'chucis1', 'cibwar1', 'cohmar1', 'colsun2',
       'combul2', 'combuz1', 'comsan', 'crefra2', 'crheag1', 'crohor1',
       'darbar1', 'darter3', 'didcuc1', 'dotbar1', 'du

In [16]:
search.best_estimator_.classes_

array([0, 1])