In [1]:
%load_ext autoreload
%autoreload 2
%load_ext lab_black

In [2]:
from birdclef.utils import get_spark
from pyspark.sql import SparkSession
import os
import sys

os.environ["PYSPARK_PYTHON"] = sys.executable
os.environ["PYSPARK_DRIVER_PYTHON"] = sys.executable

spark = (
    SparkSession.builder.config("spark.driver.memory", "20g")
    .config("spark.driver.cores", 8)
    .config("spark.sql.execution.arrow.pyspark.enabled", "true")
    .config("spark.driver.maxResultSize", "10g")
    .getOrCreate()
)
df = spark.read.parquet("../data/processed/birdclef-2023/train_postprocessed/v7")
df.printSchema()

root
 |-- track_stem: string (nullable = true)
 |-- track_type: string (nullable = true)
 |-- start_time: long (nullable = true)
 |-- species: string (nullable = true)
 |-- embedding: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- prediction_vec: array (nullable = true)
 |    |-- element: float (containsNull = true)



In [3]:
data = df.select("species", "prediction_vec").toPandas()
data.head(5)

Unnamed: 0,species,prediction_vec
0,afmdov1,"[-14.449501, -12.099122, -15.694858, -15.91962..."
1,afecuc1,"[-11.292007, -7.078901, -10.191986, -11.164602..."
2,hoopoe,"[-13.212698, -10.587885, -14.680091, -12.89165..."
3,no_call,"[-12.035065, -11.438778, -9.2674675, -10.81824..."
4,yertin1,"[-11.899866, -10.146761, -13.387293, -9.727022..."


In [4]:
len(set(data.species))

264

In [5]:
import numpy as np

pred_vec = np.stack(data.prediction_vec)

In [6]:
import math


def sigmoid(x):
    return 1 / (1 + np.exp(-x))


pred_vec_sig = sigmoid(pred_vec)
pred_vec_sig[:5]

array([[5.3047046e-07, 5.5643654e-06, 1.5268975e-07, ..., 1.2264467e-05,
        8.1818898e-06, 1.6978345e-05],
       [1.2472056e-05, 8.4198936e-04, 3.7467991e-05, ..., 2.0654597e-06,
        4.5453929e-05, 1.7708689e-07],
       [1.8272475e-06, 2.5219069e-05, 4.2122812e-07, ..., 9.2062101e-06,
        2.3898406e-06, 6.0677999e-06],
       [5.9324652e-06, 1.0769541e-05, 9.4438488e-05, ..., 9.8574410e-06,
        8.9855246e-05, 9.6929343e-06],
       [6.7912674e-06, 3.9201321e-05, 1.5345177e-06, ..., 2.2636792e-05,
        3.2216660e-05, 6.1912538e-06]], dtype=float32)

In [8]:
from sklearn.feature_selection import f_classif, SelectKBest

pred_vec_new = SelectKBest(f_classif, k=1000).fit_transform(pred_vec_sig, data.species)
pred_vec_new.shape

(255372, 1000)

In [7]:
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    average_precision_score,
)


def model_eval(truth, preds):
    print("Accuracy:", accuracy_score(truth, preds))
    print(
        "Precision:",
        precision_score(truth, preds, average="macro"),
    )
    print(
        "Recall:",
        recall_score(truth, preds, average="macro"),
    )
    print(
        "F1 Score:",
        f1_score(truth, preds, average="macro"),
    )

In [8]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
le.fit_transform(data.species)

array([  9,   5, 123, ...,  89,  97, 152])

In [11]:
train_x, test_x, train_y, test_y = train_test_split(
    pred_vec_new,
    data["species"],
    test_size=0.33,
    stratify=data["species"],
)

In [12]:
from xgboost import XGBClassifier

clf = XGBClassifier(tree_method="gpu_hist")
clf.fit(train_x, le.transform(train_y))
model_eval(le.transform(test_y), clf.predict(test_x))

Accuracy: 0.553747938248312
Precision: 0.03821604205847956
Recall: 0.06348431288693118
F1 Score: 0.04594312716659158


  _warn_prf(average, modifier, msg_start, len(result))


In [28]:
model_eval(le.transform(test_y), test_x, clf)

Accuracy: 0.5610931140460171
Precision: 0.0505582552546785
Recall: 0.07936385959742892
F1 Score: 0.06035850970024102


  _warn_prf(average, modifier, msg_start, len(result))


ValueError: multiclass format is not supported

In [9]:
from sklearn.preprocessing import LabelBinarizer

mlb = LabelBinarizer()
mlb.fit(data.species)
# model_eval(le.transform(test_y), clf.predict(test_x))
print(
    "Average Precision Score",
    average_precision_score(mlb.transform(test_y), clf.predict_proba(test_x)),
)

NameError: name 'test_y' is not defined

In [13]:
from sklearn.feature_selection import chi2, SelectKBest

pred_vec_new = SelectKBest(chi2, k=1000).fit_transform(pred_vec_sig, data.species)
pred_vec_new.shape

(255372, 1000)

In [14]:
train_x, test_x, train_y, test_y = train_test_split(
    pred_vec_new,
    data["species"],
    test_size=0.33,
    stratify=data["species"],
)

clf = XGBClassifier(tree_method="gpu_hist")
clf.fit(train_x, le.transform(train_y))
model_eval(le.transform(test_y), clf.predict(test_x))

Accuracy: 0.5933573030507991
Precision: 0.06951132317610352
Recall: 0.10706627485772997
F1 Score: 0.08283287978070282


  _warn_prf(average, modifier, msg_start, len(result))


In [21]:
average_precision_score(mlb.transform(test_y), clf.predict_proba(test_x))

0.08747211758095778

In [10]:
from sklearn.decomposition import PCA

pca = PCA(n_components=1000)
pred_vec_pca = pca.fit_transform(pred_vec, data.species)

In [11]:
train_x, test_x, train_y, test_y = train_test_split(
    pred_vec_pca,
    data["species"],
    test_size=0.33,
    stratify=data["species"],
)

In [14]:
from xgboost import XGBClassifier

clf = XGBClassifier(tree_method="gpu_hist")
clf.fit(train_x, le.transform(train_y))
model_eval(le.transform(test_y), clf.predict(test_x))

Accuracy: 0.5428666358145551
Precision: 0.05341081765516573
Recall: 0.08413594267825758
F1 Score: 0.060460626993904906


  _warn_prf(average, modifier, msg_start, len(result))


Naives Bayes

In [11]:
from sklearn.model_selection import train_test_split

train_x, test_x, train_y, test_y = train_test_split(
    pred_vec_sig,
    data["species"],
    test_size=0.33,
    stratify=data["species"],
)

In [23]:
from sklearn.naive_bayes import ComplementNB, BernoulliNB, MultinomialNB

cnf = ComplementNB()
cnf.fit(train_x, train_y)
model_eval(test_y, cnf.predict(test_x))

Accuracy: 0.7730471206673549


  _warn_prf(average, modifier, msg_start, len(result))


Precision: 0.4208924562448272
Recall: 0.4104981716785569
F1 Score: 0.3919914724797036


In [30]:
cnf.predict_proba(test_x)

array([[0.00378404, 0.00377104, 0.00378256, ..., 0.00378371, 0.00378986,
        0.00377662],
       [0.00377983, 0.00375872, 0.0037772 , ..., 0.00377954, 0.00378001,
        0.00376541],
       [0.00378698, 0.00378572, 0.00378767, ..., 0.00378687, 0.00378701,
        0.00378607],
       ...,
       [0.00377032, 0.00375809, 0.0037689 , ..., 0.00377004, 0.00377018,
        0.00376333],
       [0.0036751 , 0.00365163, 0.00367265, ..., 0.00367496, 0.00367521,
        0.00366138],
       [0.00374872, 0.00372108, 0.00374491, ..., 0.00374843, 0.00374855,
        0.00372983]])

In [27]:
from sklearn.naive_bayes import ComplementNB, BernoulliNB, MultinomialNB

bnb = MultinomialNB()
bnb.fit(train_x, train_y)
model_eval(test_y, bnb.predict(test_x))

Accuracy: 0.5722829375956712


  _warn_prf(average, modifier, msg_start, len(result))


Precision: 0.14413474207782342
Recall: 0.039349876757526485
F1 Score: 0.0503143168943167


In [12]:
from skopt import BayesSearchCV
from skopt.space import Real
from sklearn.naive_bayes import ComplementNB, BernoulliNB, MultinomialNB


def scorer(estimator, X, y):
    return average_precision_score(y, estimator.predict_proba(X), average="macro")


search = BayesSearchCV(
    ComplementNB(),
    {
        "alpha": Real(0.0, 1.0, "uniform"),
    },
    n_iter=32,
    scoring="f1_macro",
    n_jobs=4,
    n_points=1,
    cv=2,
    verbose=1,
)
search.fit(train_x, train_y)
model_eval(test_y, search.predict(test_x))

Fitting 2 folds for each of 1 candidates, totalling 2 fits




Fitting 2 folds for each of 1 candidates, totalling 2 fits




Fitting 2 folds for each of 1 candidates, totalling 2 fits




Fitting 2 folds for each of 1 candidates, totalling 2 fits




Fitting 2 folds for each of 1 candidates, totalling 2 fits




Fitting 2 folds for each of 1 candidates, totalling 2 fits




Fitting 2 folds for each of 1 candidates, totalling 2 fits




Fitting 2 folds for each of 1 candidates, totalling 2 fits




Fitting 2 folds for each of 1 candidates, totalling 2 fits




Fitting 2 folds for each of 1 candidates, totalling 2 fits




Fitting 2 folds for each of 1 candidates, totalling 2 fits




Fitting 2 folds for each of 1 candidates, totalling 2 fits




Fitting 2 folds for each of 1 candidates, totalling 2 fits




Fitting 2 folds for each of 1 candidates, totalling 2 fits




Fitting 2 folds for each of 1 candidates, totalling 2 fits




Fitting 2 folds for each of 1 candidates, totalling 2 fits




Fitting 2 folds for each of 1 candidates, totalling 2 fits




Fitting 2 folds for each of 1 candidates, totalling 2 fits




Fitting 2 folds for each of 1 candidates, totalling 2 fits




Fitting 2 folds for each of 1 candidates, totalling 2 fits




Fitting 2 folds for each of 1 candidates, totalling 2 fits




Fitting 2 folds for each of 1 candidates, totalling 2 fits




Fitting 2 folds for each of 1 candidates, totalling 2 fits




Fitting 2 folds for each of 1 candidates, totalling 2 fits




Fitting 2 folds for each of 1 candidates, totalling 2 fits




Fitting 2 folds for each of 1 candidates, totalling 2 fits




Fitting 2 folds for each of 1 candidates, totalling 2 fits




Fitting 2 folds for each of 1 candidates, totalling 2 fits




Fitting 2 folds for each of 1 candidates, totalling 2 fits




Fitting 2 folds for each of 1 candidates, totalling 2 fits




Fitting 2 folds for each of 1 candidates, totalling 2 fits




Fitting 2 folds for each of 1 candidates, totalling 2 fits




Accuracy: 0.7744591980824226


  _warn_prf(average, modifier, msg_start, len(result))


Precision: 0.40755965097118646
Recall: 0.4078544452103385
F1 Score: 0.3873062731030318


In [15]:
search.best_estimator_.classes_

array(['abethr1', 'abhori1', 'abythr1', 'afbfly1', 'afdfly1', 'afecuc1',
       'affeag1', 'afgfly1', 'afghor1', 'afmdov1', 'afpfly1', 'afpkin1',
       'afpwag1', 'afrgos1', 'afrgrp1', 'afrjac1', 'afrthr1', 'amesun2',
       'augbuz1', 'bagwea1', 'barswa', 'bawhor2', 'bawman1', 'bcbeat1',
       'beasun2', 'bkctch1', 'bkfruw1', 'blacra1', 'blacuc1', 'blakit1',
       'blaplo1', 'blbpuf2', 'blcapa2', 'blfbus1', 'blhgon1', 'blhher1',
       'blksaw1', 'blnmou1', 'blnwea1', 'bltapa1', 'bltbar1', 'bltori1',
       'blwlap1', 'brcale1', 'brcsta1', 'brctch1', 'brcwea1', 'brican1',
       'brobab1', 'broman1', 'brosun1', 'brrwhe3', 'brtcha1', 'brubru1',
       'brwwar1', 'bswdov1', 'btweye2', 'bubwar2', 'butapa1', 'cabgre1',
       'carcha1', 'carwoo1', 'categr', 'ccbeat1', 'chespa1', 'chewea1',
       'chibat1', 'chtapa3', 'chucis1', 'cibwar1', 'cohmar1', 'colsun2',
       'combul2', 'combuz1', 'comsan', 'crefra2', 'crheag1', 'crohor1',
       'darbar1', 'darter3', 'didcuc1', 'dotbar1', 'du

In [14]:
import pickle
from pathlib import Path

pickle.dump(
    search.best_estimator_,
    Path("../data/models/baseline_v2/nzh-complement-nb.pkl").open("wb"),
)

In [24]:
import pickle
from pathlib import Path

clf = pickle.loads(Path("../data/models/baseline_v2/nzh-current-next-token.pkl").read_bytes())

In [25]:
clf.get_xgb_params()

{'objective': 'binary:logistic',
 'base_score': None,
 'booster': None,
 'colsample_bylevel': None,
 'colsample_bynode': None,
 'colsample_bytree': None,
 'eval_metric': None,
 'gamma': 0.0,
 'gpu_id': None,
 'grow_policy': None,
 'interaction_constraints': None,
 'learning_rate': None,
 'max_bin': None,
 'max_cat_threshold': None,
 'max_cat_to_onehot': None,
 'max_delta_step': None,
 'max_depth': 3,
 'max_leaves': None,
 'min_child_weight': 1,
 'monotone_constraints': None,
 'n_jobs': None,
 'num_parallel_tree': None,
 'predictor': None,
 'random_state': None,
 'reg_alpha': None,
 'reg_lambda': None,
 'sampling_method': None,
 'scale_pos_weight': None,
 'subsample': None,
 'tree_method': 'gpu_hist',
 'validate_parameters': None,
 'verbosity': None}