# PyTorch Lightning model

In [1]:
# Enable IPython's autoreload extension; mode 2 reloads imported modules
# before each cell executes, so edits to project code are picked up live.
%load_ext autoreload
%autoreload 2

In [3]:
import pandas as pd

# Root of the 2025 BirdCLEF data on scratch storage.
# NOTE(review): the leading "~" is stored unexpanded; this relies on the
# reader that consumes the path resolving it — confirm, or expand explicitly.
scratch_dir = "~/scratch/birdclef/data/2025"
# Name of the embedding model; it selects the sub-directory that is read.
model_name = "Perch"
# Location handed to `pd.read_parquet`; presumably a directory of parquet parts.
embed_dir = (
    f"{scratch_dir}/subset-train_audio-infer-soundscape-cpu/{model_name}/parts/embed/"
)


def preprocess_data(input_path: str, embed_dim: int = 1280) -> pd.DataFrame:
    """Load embeddings from parquet and reduce them to (species_name, embeddings).

    The species label is the path segment that directly follows
    ``train_audio/`` in the ``file`` column. Species with fewer than two rows
    are dropped because a stratified train/test split needs at least two
    samples per class.

    Args:
        input_path: Parquet location readable by ``pd.read_parquet``. The data
            must provide a ``file`` column and embedding columns named
            ``"0"`` .. ``str(embed_dim - 1)``.
        embed_dim: Number of embedding columns to collect per row.

    Returns:
        A DataFrame with a ``species_name`` column and an ``embeddings``
        column holding one list of ``embed_dim`` values per row.

    Raises:
        IndexError: If a ``file`` value does not contain ``train_audio/``.
        KeyError: If the ``file`` column or an embedding column is missing.
    """
    df = pd.read_parquet(input_path)
    # the species is the directory name right after "train_audio/"
    df["species_name"] = df["file"].apply(
        lambda path: path.split("train_audio/")[1].split("/")[0]
    )
    # a stratified train/test split requires at least 2 samples per label,
    # so remove species with fewer than 2 samples
    species_count = df["species_name"].value_counts()
    valid_species = species_count[species_count >= 2].index
    filtered_df = df[df["species_name"].isin(valid_species)].reset_index(drop=True)
    # collapse the per-dimension columns into a single list-valued column
    embed_cols = [str(i) for i in range(embed_dim)]
    df_embs = pd.DataFrame(
        {
            "species_name": filtered_df["species_name"],
            "embeddings": filtered_df[embed_cols].to_numpy().tolist(),
        }
    )
    print(f"DataFrame shape: {df_embs.shape}")
    # report the width from the column list so an empty result does not raise
    print(f"Embedding size: {len(embed_cols)}")
    return df_embs


# Load the embeddings from `embed_dir` and preview the first five rows.
df = preprocess_data(embed_dir)
df.head(5)

DataFrame shape: (11037, 2)
Embedding size: 1280


Unnamed: 0,species_name,embeddings
0,amakin1,"[-0.026628008112311363, 0.07033359259366989, 0..."
1,amekes,"[0.09562845528125763, 0.004033610224723816, 0...."
2,amekes,"[0.11211416870355606, -0.0019105728715658188, ..."
3,amekes,"[0.09912332147359848, -0.030860736966133118, 0..."
4,amekes,"[0.0636802390217781, 0.01985364407300949, -0.0..."


In [None]:
import numpy as np
from sklearn.model_selection import train_test_split


def perform_train_test_split(
    df: pd.DataFrame, test_size: float = 0.2, random_state: int = 42
) -> tuple:
    """Split embeddings and labels into stratified train and test sets.

    Args:
        df: DataFrame with an ``embeddings`` column of equal-length sequences
            and a ``species_name`` label column.
        test_size: Share of the rows placed in the test set.
        random_state: Seed for the shuffle, so the same split is produced on
            every run.

    Returns:
        ``(X_train, X_test, y_train, y_test)``; the ``X`` values are stacked
        embedding arrays and the ``y`` values are the matching label Series.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        np.stack(df["embeddings"]),
        df["species_name"],
        test_size=test_size,
        # this argument used to be accepted but never forwarded, which made
        # every execution produce a different split
        random_state=random_state,
        # keep the species proportions the same in both partitions
        stratify=df["species_name"],
    )

    # data shape
    print(f"X_train, X_test shape: {X_train.shape, X_test.shape}")
    print(f"y_train, y_test shape: {y_train.shape, y_test.shape}")
    return X_train, X_test, y_train, y_test


# Split once with the function defaults (20% held out, stratified by species).
X_train, X_test, y_train, y_test = perform_train_test_split(df)

X_train, X_test shape: ((8829, 1280), (2208, 1280))
y_train, y_test shape: ((8829,), (2208,))


In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger


class BirdDataset(Dataset):
    """In-memory dataset pairing embedding vectors with integer class ids."""

    def __init__(self, X, y, label_to_idx):
        """Store ``X`` as float32 and map every label in ``y`` to its index."""
        self.X = torch.tensor(X, dtype=torch.float32)
        class_ids = []
        for label in y:
            class_ids.append(label_to_idx[label])
        self.y = torch.tensor(class_ids, dtype=torch.long)

    def __len__(self):
        """Return the number of stored samples."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the ``(features, class id)`` pair stored at ``idx``."""
        features = self.X[idx]
        target = self.y[idx]
        return features, target


class BirdDataModule(pl.LightningDataModule):
    """Lightning data module serving pre-split train and validation arrays."""

    def __init__(self, X_train, X_test, y_train, y_test, label_to_idx, batch_size=64):
        """Keep the split arrays, the label mapping and the batch size."""
        super().__init__()
        self.X_train, self.y_train = X_train, y_train
        self.X_test, self.y_test = X_test, y_test
        self.label_to_idx = label_to_idx
        self.batch_size = batch_size

    def setup(self, stage=None):
        """Build the train and validation datasets; ``stage`` is not used."""
        mapping = self.label_to_idx
        self.train_dataset = BirdDataset(self.X_train, self.y_train, mapping)
        # the held-out test split doubles as the validation set
        self.val_dataset = BirdDataset(self.X_test, self.y_test, mapping)

    def train_dataloader(self):
        """Return a shuffling loader over the training dataset."""
        loader_kwargs = {"batch_size": self.batch_size, "shuffle": True}
        return DataLoader(self.train_dataset, **loader_kwargs)

    def val_dataloader(self):
        """Return a non-shuffling loader over the validation dataset."""
        loader_kwargs = {"batch_size": self.batch_size}
        return DataLoader(self.val_dataset, **loader_kwargs)


class LinearClassifier(pl.LightningModule):
    """Classifier head trained on fixed-size embeddings.

    Despite the name, the network has one hidden layer:
    ``Linear(input_dim, 512) -> ReLU -> Linear(512, num_classes)``.

    Args:
        input_dim: Size of each input embedding.
        num_classes: Number of logits produced per sample.
        lr: Learning rate handed to the Adam optimizer.
    """

    def __init__(self, input_dim, num_classes, lr=1e-3):
        super().__init__()
        # record the init arguments in checkpoints so the model can be rebuilt
        # by `load_from_checkpoint` without passing them again
        self.save_hyperparameters()
        self.model = nn.Sequential(
            nn.Linear(input_dim, 512),
            nn.ReLU(),
            nn.Linear(512, num_classes),
        )
        self.loss_fn = nn.CrossEntropyLoss()
        self.lr = lr

    def forward(self, x):
        """Return the class logits for a batch of embeddings."""
        return self.model(x)

    def _shared_step(self, batch, stage):
        """Compute loss and accuracy for one batch and log them as ``{stage}_*``."""
        x, y = batch
        logits = self(x)
        loss = self.loss_fn(logits, y)
        acc = (logits.argmax(dim=1) == y).float().mean()
        self.log(f"{stage}_loss", loss, prog_bar=True)
        self.log(f"{stage}_acc", acc, prog_bar=True)
        return loss

    def training_step(self, batch, batch_idx):
        """Log ``train_loss``/``train_acc`` and return the loss to optimize."""
        return self._shared_step(batch, "train")

    def validation_step(self, batch, batch_idx):
        """Log ``val_loss``/``val_acc``; nothing is returned."""
        self._shared_step(batch, "val")

    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters using ``self.lr``."""
        return torch.optim.Adam(self.parameters(), lr=self.lr)

In [None]:
from pytorch_lightning import Trainer, seed_everything

# seed Python, NumPy and PyTorch before the model is built so that weight
# initialization and batch shuffling repeat across runs
seed_everything(42, workers=True)

# create label index mapping (sorted so each species keeps the same index)
unique_labels = sorted(df["species_name"].unique())
label_to_idx = {label: i for i, label in enumerate(unique_labels)}
num_classes = len(label_to_idx)

# instantiate DataModule
data_module = BirdDataModule(X_train, X_test, y_train, y_test, label_to_idx)

# instantiate model; the input width comes from the data, not a repeated constant
model = LinearClassifier(input_dim=X_train.shape[1], num_classes=num_classes)

# logger and callbacks
logger = TensorBoardLogger("tb_logs", name="linear_classifier")

# keep only the checkpoint with the lowest validation loss
checkpoint_cb = ModelCheckpoint(
    monitor="val_loss",
    dirpath="checkpoints",
    filename="best-checkpoint",
    save_top_k=1,
    mode="min",
)

# stop once validation loss has not improved for 5 consecutive checks
early_stopping_cb = EarlyStopping(monitor="val_loss", patience=5, mode="min")

# trainer
trainer = Trainer(
    max_epochs=50,
    logger=logger,
    callbacks=[checkpoint_cb, early_stopping_cb],
    accelerator="auto",
)

# fit model
trainer.fit(model, datamodule=data_module)

/storage/home/hcoda1/9/mgustineli3/clef/birdclef-2025/.venv/lib/python3.10/site-packages/lightning_fabric/plugins/environments/slurm.py:204: The `srun` command is available on your system but is not used. HINT: If your intention is to run Lightning on SLURM, prepend your python command with `srun` like so: srun python3.10 /storage/home/hcoda1/9/mgustineli3/clef/birdclef ...
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
2025-05-30 23:39:57.925623: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-05-30 23:40:08.939736: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
202

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/storage/home/hcoda1/9/mgustineli3/clef/birdclef-2025/.venv/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:425: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=5` in the `DataLoader` to improve performance.
/storage/home/hcoda1/9/mgustineli3/clef/birdclef-2025/.venv/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=5` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

In [7]:
# Embedding width, read from the second dimension of the training matrix.
input_dim = X_train.shape[1]
input_dim

1280