# XGBoost classifier

In [31]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [32]:
import pandas as pd

scratch_dir = "~/scratch/birdclef/data/2025"
model_name = "Perch"
embed_dir = (
    f"{scratch_dir}/subset-train_audio-infer-soundscape-cpu/{model_name}/parts/embed/"
)
df = pd.read_parquet(embed_dir)
df.head(5)

Unnamed: 0,file,start_time,end_time,0,1,2,3,4,5,6,...,1270,1271,1272,1273,1274,1275,1276,1277,1278,1279
0,/storage/coda1/p-dsgt_clef2025/0/shared/birdcl...,0.0,5.0,-0.026628,0.070334,0.262309,-0.048873,0.00822,-0.054829,-0.007992,...,0.138983,0.288452,0.428865,-0.055781,-0.054962,0.003641,0.322033,0.114651,-0.078176,0.097951
1,/storage/coda1/p-dsgt_clef2025/0/shared/birdcl...,0.0,5.0,0.095628,0.004034,0.026977,0.188966,0.159863,-0.117913,-0.035654,...,0.082373,0.011245,-0.095996,-0.15137,-0.003978,0.041113,0.131503,-0.08533,0.143421,-0.008003
2,/storage/coda1/p-dsgt_clef2025/0/shared/birdcl...,5.0,10.0,0.112114,-0.001911,0.003879,0.130287,-0.056202,-0.110515,-0.066042,...,0.095946,0.023502,-0.082662,-0.116258,0.018548,0.036823,0.11074,0.064803,0.093969,0.035622
3,/storage/coda1/p-dsgt_clef2025/0/shared/birdcl...,10.0,15.0,0.099123,-0.030861,0.049289,0.051599,-0.000955,-0.112892,-0.046274,...,-0.004677,-0.014064,0.031421,-0.060198,0.072975,0.034785,0.024503,0.081893,0.117334,0.083845
4,/storage/coda1/p-dsgt_clef2025/0/shared/birdcl...,15.0,20.0,0.06368,0.019854,-0.034153,0.041198,-0.058784,-0.0851,-0.024099,...,0.080744,0.020897,-0.074799,-0.100872,0.023476,0.093092,0.01536,-0.030749,0.213515,0.119746


In [33]:
display(df.shape)

for file in df["file"].iloc[:5]:
    print(file)

(55, 1283)

/storage/coda1/p-dsgt_clef2025/0/shared/birdclef/raw/birdclef-2025/train_audio/amakin1/XC113758.ogg
/storage/coda1/p-dsgt_clef2025/0/shared/birdclef/raw/birdclef-2025/train_audio/amekes/XC772375.ogg
/storage/coda1/p-dsgt_clef2025/0/shared/birdclef/raw/birdclef-2025/train_audio/amekes/XC772375.ogg
/storage/coda1/p-dsgt_clef2025/0/shared/birdclef/raw/birdclef-2025/train_audio/amekes/XC772375.ogg
/storage/coda1/p-dsgt_clef2025/0/shared/birdclef/raw/birdclef-2025/train_audio/amekes/XC772375.ogg


In [34]:
# concatenate all embeddings into a single DataFrame
df["species_name"] = df["file"].apply(
    lambda x: x.split("train_audio/")[1].split("/")[0]
)
embed_cols = list(map(str, range(1280)))
df["embeddings"] = df[embed_cols].values.tolist()
df_embs = df[["species_name", "embeddings"]].copy()
display(df_embs.head(5))
print(f"DF shape: {df_embs.shape}")
print(f"Embed size: {len(df_embs['embeddings'].iloc[0])}")

Unnamed: 0,species_name,embeddings
0,amakin1,"[-0.026628008112311363, 0.07033359259366989, 0..."
1,amekes,"[0.09562845528125763, 0.004033610224723816, 0...."
2,amekes,"[0.11211416870355606, -0.0019105728715658188, ..."
3,amekes,"[0.09912332147359848, -0.030860736966133118, 0..."
4,amekes,"[0.0636802390217781, 0.01985364407300949, -0.0..."


DF shape: (55, 2)
Embed size: 1280


In [35]:
import time
import numpy as np
from tqdm import tqdm
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    make_scorer,
    classification_report,
)

# Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(
    np.stack(df_embs["embeddings"]),
    df_embs["species_name"],
    test_size=0.33,
    stratify=df_embs["species_name"],
)

# Data shape
print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)

ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.

### Learner class

In [23]:
class Learner:
    def __init__(self, pipe, params):
        self.pipe = pipe
        self.params = params
        self.clf = None
        self.scores = None
        self.average = "macro"
        self.search_name = None
        self.class_report = None
        self.dataset_name = None
        self.learning_curve = {}
        self.validation_curve = {}
        self.cv = StratifiedKFold(n_splits=5, shuffle=True)
        self.name = str(self.pipe["model"].__class__.__name__)

    def fit_gridsearch(self, search_func, X_train, y_train, verbose=False):
        """
        Method to train the model using a search algorithm.

        search_func: GridSearchCV, RandomizedSearchCV from sklearn.
        X_train: training features dataset.
        y_train: training labels dataset.
        verbose: int() Controls the verbosity: the higher, the more messages (1, 2, or 3).
        """
        np.random.seed(42)

        # train learner
        self.clf = search_func(
            self.pipe,
            self.params,
            scoring={
                "accuracy": make_scorer(accuracy_score),
                "precision": make_scorer(precision_score, average=self.average),
                "recall": make_scorer(recall_score, average=self.average),
                "f1": make_scorer(f1_score, average=self.average),
            },
            refit="f1",
            cv=self.cv,
            verbose=verbose,
            n_jobs=-1,
        )
        # fit the model
        self.clf.fit(X_train, y_train)
        self.search_name = str(self.clf.__class__.__name__)

    def get_scores(self, X_train, X_test, y_train, y_test, average=None):
        """
        Method to get model scores.

        X_train: training features dataset.
        X_test: test features dataset.
        y_train: training labels dataset.
        y_test: test labels dataset.
        """
        if self.search_name == "Benchmark":
            best_estimator = self.clf
        else:
            best_estimator = self.clf.best_estimator_

        np.random.seed(42)
        # score on training data
        start_time = time.time()
        best_estimator.fit(X_train, y_train)
        end_time = time.time()
        wall_clock_fit = end_time - start_time
        train_score = best_estimator.score(X_train, y_train)

        # score on test data
        start_time = time.time()
        y_pred = best_estimator.predict(X_test)
        end_time = time.time()
        wall_clock_pred = end_time - start_time
        test_score = best_estimator.score(X_test, y_test)

        # metrics
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average=average)
        recall = recall_score(y_test, y_pred, average=average)
        f1 = f1_score(y_test, y_pred, average=average)

        # classification report
        self.class_report = classification_report(y_test, y_pred)

        self.scores = {
            "train_score": round(train_score, 3),
            "test_score": round(test_score, 3),
            "accuracy": round(accuracy, 3),
            "precision": round(precision, 3),
            "recall": round(recall, 3),
            "f1": round(f1, 3),
            "wall_clock_fit": wall_clock_fit,
            "wall_clock_pred": wall_clock_pred,
        }

    # evaluate Learner class
    def evaluate_learner(self):
        """
        Print model scores
        """
        print(f"{'#################################' * 2}")
        print(f"{self.search_name}:\t  {self.name}")
        print(f"Train score:     {round(self.scores['train_score'], 3)}")
        print(f"Test score:      {round(self.scores['test_score'], 3)}")
        print(f"Accuracy score:  {round(self.scores['accuracy'], 3)}")
        print(f"Precision score: {round(self.scores['precision'], 3)}")
        print(f"Recall score:    {round(self.scores['recall'], 3)}")
        print(f"F1 score:        {round(self.scores['f1'], 3)}")
        print(f"Wall Clock Fit:  {round(self.scores['wall_clock_fit'], 3)}")
        print(f"Wall Clock Pred: {round(self.scores['wall_clock_pred'], 3)}")
        # classification report
        print(f"\nClassification report:\n{self.class_report}")

        # best score and best params
        print(f"Best score: {round(self.clf.best_score_, 3)}")
        print("Best params:")
        for param in self.clf.best_params_.items():
            print(f"\t{param}")
        print()

### XGBoost model

In [24]:
from sklearn.preprocessing import LabelEncoder

# create a label encoder object
le = LabelEncoder()

# fit and transform the target with label encoder
y_train_enc = le.fit_transform(y_train)
y_test_enc = le.transform(y_test)

In [None]:
from xgboost import XGBClassifier

# XGBoost pipeline
xgb_pipe = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("model", XGBClassifier(seed=42)),
    ]
)
# GridSearchCV params
xgb_param_grid = {
    "model__objective": ["multi:softmax"],
    "model__n_estimators": [400],  # np.arange(50, 200, 50),
    "model__max_depth": [
        3,
        4,
        5,
    ],  # np.arange(3, 10),
    # "model__min_child_weight": np.arange(1, 6),
    # "model__gamma": np.linspace(0, 0.6, 5),
    # "model__subsample": np.linspace(0.5, 1.0, 6),
    # "model__colsample_bytree": np.linspace(0.5, 1.0, 6),
}


# init learners
xgb = Learner(pipe=xgb_pipe, params=xgb_param_grid)

In [26]:
# fit model
xgb.fit_gridsearch(RandomizedSearchCV, X_train, y_train_enc, verbose=2)

Fitting 5 folds for each of 3 candidates, totalling 15 fits




[CV] END model__max_depth=3, model__n_estimators=400, model__objective=multi:softmax; total time=   0.0s
[CV] END model__max_depth=3, model__n_estimators=400, model__objective=multi:softmax; total time=   1.8s
[CV] END model__max_depth=4, model__n_estimators=400, model__objective=multi:softmax; total time=   1.7s
[CV] END model__max_depth=4, model__n_estimators=400, model__objective=multi:softmax; total time=   1.8s[CV] END model__max_depth=3, model__n_estimators=400, model__objective=multi:softmax; total time=   1.8s

[CV] END model__max_depth=3, model__n_estimators=400, model__objective=multi:softmax; total time=   1.8s
[CV] END model__max_depth=3, model__n_estimators=400, model__objective=multi:softmax; total time=   1.8s
[CV] END model__max_depth=4, model__n_estimators=400, model__objective=multi:softmax; total time=   0.0s
[CV] END model__max_depth=5, model__n_estimators=400, model__objective=multi:softmax; total time=   0.0s


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
3 fits failed out of a total of 15.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
3 fits failed with the following error:
Traceback (most recent call last):
  File "/storage/home/hcoda1/9/mgustineli3/clef/birdclef-2025/.venv/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/storage/home/hcoda1/9/mgustineli3/clef/birdclef-2025/.venv/lib/python3.10/site-packages/sklearn/base.py", line 1389, in wrapper
    return fit_m

[CV] END model__max_depth=4, model__n_estimators=400, model__objective=multi:softmax; total time=   0.4s
[CV] END model__max_depth=5, model__n_estimators=400, model__objective=multi:softmax; total time=   0.4s
[CV] END model__max_depth=4, model__n_estimators=400, model__objective=multi:softmax; total time=   0.5s
[CV] END model__max_depth=5, model__n_estimators=400, model__objective=multi:softmax; total time=   0.5s
[CV] END model__max_depth=5, model__n_estimators=400, model__objective=multi:softmax; total time=   0.5s
[CV] END model__max_depth=5, model__n_estimators=400, model__objective=multi:softmax; total time=   0.5s


In [None]:
# get model scores
# xgb.get_scores(X_train, X_test, y_train_enc, y_test_enc, average="macro")

In [27]:
from pathlib import Path

raw_root = Path("~/p-dsgt_clef2025-0/shared/birdclef/raw/birdclef-2025").expanduser()
! ls {raw_root}
soundscape_root = raw_root / "train_audio"
! ls {soundscape_root} | head
soundscapes = sorted(soundscape_root.glob("**/*.ogg"))
display(len(soundscapes))

recording_location.txt	taxonomy.csv	  train.csv    train_soundscapes
sample_submission.csv	test_soundscapes  train_audio
1139490
1192948
1194042
126247
1346504
134933
135045
1462711
1462737
1564122


28564

In [28]:
soundscapes[:100]

[PosixPath('/storage/home/hcoda1/9/mgustineli3/p-dsgt_clef2025-0/shared/birdclef/raw/birdclef-2025/train_audio/1139490/CSA36385.ogg'),
 PosixPath('/storage/home/hcoda1/9/mgustineli3/p-dsgt_clef2025-0/shared/birdclef/raw/birdclef-2025/train_audio/1139490/CSA36389.ogg'),
 PosixPath('/storage/home/hcoda1/9/mgustineli3/p-dsgt_clef2025-0/shared/birdclef/raw/birdclef-2025/train_audio/1192948/CSA36358.ogg'),
 PosixPath('/storage/home/hcoda1/9/mgustineli3/p-dsgt_clef2025-0/shared/birdclef/raw/birdclef-2025/train_audio/1192948/CSA36366.ogg'),
 PosixPath('/storage/home/hcoda1/9/mgustineli3/p-dsgt_clef2025-0/shared/birdclef/raw/birdclef-2025/train_audio/1192948/CSA36373.ogg'),
 PosixPath('/storage/home/hcoda1/9/mgustineli3/p-dsgt_clef2025-0/shared/birdclef/raw/birdclef-2025/train_audio/1192948/CSA36388.ogg'),
 PosixPath('/storage/home/hcoda1/9/mgustineli3/p-dsgt_clef2025-0/shared/birdclef/raw/birdclef-2025/train_audio/1194042/CSA18783.ogg'),
 PosixPath('/storage/home/hcoda1/9/mgustineli3/p-dsgt_c

In [29]:
# Get species directories whose names start with a letter
species_dirs = sorted(
    [d for d in soundscape_root.iterdir() if d.is_dir() and d.name[0].isalpha()]
)

# Take top 10 species in alphabetical order
selected_species = species_dirs[:10]
selected_species_names = {d.name for d in selected_species}
print(f"[Subset] Selected species: {sorted(selected_species_names)}")

# Gather audio files only from the selected species
audio_files = sorted([p for d in selected_species for p in d.rglob("*.ogg")])

[Subset] Selected species: ['amakin1', 'amekes', 'ampkin1', 'anhing', 'babwar', 'bafibi1', 'banana', 'baymac', 'bbwduc', 'bicwre1']


In [30]:
len(species_dirs)

146