In [2]:
import pandas as pd
import torch
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from tqdm import tqdm
from transformers import AutoImageProcessor, AutoModelForImageClassification
from torch.utils.data import Dataset
from PIL import Image
import torch

In [3]:
# Pick the compute backend in order of preference: CUDA, then MPS, then CPU.
# Training on "cpu" is possible but not recommended.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

print(device)

mps


In [4]:
# Fetch the pretrained image processor and classifier from the Hugging Face Hub.
# We request a 200-class head; the checkpoint's head has a different size (see the
# warning printed below), so `ignore_mismatched_sizes=True` lets the classifier
# layer be freshly initialised instead of raising. It must be trained before use.
CHECKPOINT = "Hemg/Birds-Species-classification"

processor = AutoImageProcessor.from_pretrained(CHECKPOINT)
model = AutoModelForImageClassification.from_pretrained(
    CHECKPOINT,
    num_labels=200,
    ignore_mismatched_sizes=True,
)

Some weights of ViTForImageClassification were not initialized from the model checkpoint at Hemg/Birds-Species-classification and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([526]) in the checkpoint and torch.Size([200]) in the model instantiated
- classifier.weight: found shape torch.Size([526, 768]) in the checkpoint and torch.Size([200, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
class BirdDataset(Dataset):
    """Dataset that yields processor-encoded images described by a DataFrame.

    Every item is a dict containing whatever ``processor`` returns for a single
    image (with the leading batch dimension squeezed away) plus one extra key:

    * ``"labels"`` - integer class label taken from the ``label`` column
      (when ``is_test`` is False), or
    * ``"id"`` - integer sample id taken from the ``id`` column
      (when ``is_test`` is True).

    Args:
        df: DataFrame with an ``image_path`` column and either a ``label``
            column (training) or an ``id`` column (test).
        processor: Callable invoked as ``processor(image, return_tensors="pt")``
            whose result exposes ``.items()`` yielding tensors.
        is_test: Return the sample id instead of a label when True.
    """

    def __init__(self, df, processor, is_test=False):
        self.df = df
        self.processor = processor
        self.is_test = is_test

    def __len__(self):
        """Return the number of rows (samples) in the backing DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Load, encode and return the sample at positional index ``idx``."""
        row = self.df.iloc[idx]

        # Open the file inside a context manager so its handle is released as
        # soon as the pixels are converted; `convert` returns a separate image
        # object, so it stays usable after the file is closed.
        with Image.open(row["image_path"]) as img:
            image = img.convert("RGB")

        encoding = self.processor(image, return_tensors="pt")
        # The processor returns tensors with a batch dimension of 1; drop it so
        # the DataLoader can stack samples into a batch itself.
        encoding = {k: v.squeeze(0) for k, v in encoding.items()}

        if not self.is_test:
            encoding["labels"] = torch.tensor(int(row["label"]))
        else:
            # Only the test split carries an id (needed to build the submission).
            encoding["id"] = torch.tensor(int(row["id"]))

        return encoding

In [6]:
def load_data(data_dir="../aml-2025-feathers-in-focus"):
    """Read the train/test CSVs and return them ready for ``BirdDataset``.

    Args:
        data_dir: Root folder of the dataset. It must contain
            ``train_images.csv``, ``test_images_path.csv`` and the image folders
            ``train_images/train_images/`` and ``test_images/test_images/``.
            Defaults to the location used by this notebook.

    Returns:
        ``(train_df, test_df)`` where ``train_df["label"]`` has been shifted
        down by one and ``image_path`` in both frames points inside ``data_dir``.
    """
    root = str(data_dir)

    # reads csvs
    train_df = pd.read_csv(f"{root}/train_images.csv")
    test_df = pd.read_csv(f"{root}/test_images_path.csv")

    # The CSV labels are shifted down by one so they become 0-based indices
    # for the model.
    train_df["label"] = train_df["label"] - 1

    # Keep only the file name from the CSV path and re-root it under the
    # folder where the images actually live.
    train_df["image_path"] = (
        f"{root}/train_images/train_images/"
        + train_df["image_path"].str.split("/").str[-1]
    )
    test_df["image_path"] = (
        f"{root}/test_images/test_images/"
        + test_df["image_path"].str.split("/").str[-1]
    )

    # print sizes
    print(f"Train: {len(train_df)} | Test: {len(test_df)}")
    return train_df, test_df


In [18]:
# Build the datasets and wrap them in loaders. Training batches are shuffled;
# test batches keep the DataFrame order.
BATCH_SIZE = 32

train_df, test_df = load_data()

train_ds = BirdDataset(df=train_df, processor=processor)
test_ds = BirdDataset(df=test_df, processor=processor, is_test=True)

train_loader = DataLoader(train_ds, shuffle=True, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_ds, shuffle=False, batch_size=BATCH_SIZE)


Train: 3926 | Test: 4000


In [8]:
# Freeze everything in the backbone so that only the final layer is trained.
# NOTE(review): this assumes `model.base_model` covers everything except the
# classifier (the printed model shows a `vit` submodule) - confirm.
for backbone_param in model.base_model.parameters():
    backbone_param.requires_grad_(False)

# Explicitly enable gradients on the classification head.
for head_param in model.classifier.parameters():
    head_param.requires_grad_(True)


In [10]:
# Train the model and record the sample-weighted loss and accuracy per epoch.
model.to(device)

# Hand the optimizer only the parameters that can receive gradients; frozen
# parameters would never be updated anyway.
trainable_params = [p for p in model.parameters() if p.requires_grad]
optimizer = AdamW(trainable_params, lr=5e-4)
epochs = 10

train_losses = []
train_accuracies = []

for epoch in range(epochs):
    model.train()
    epoch_loss = 0.0
    correct = 0
    total = 0

    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1} [TRAIN]"):
        batch = {k: v.to(device) for k, v in batch.items()}

        # The batch includes "labels", so the model output carries the loss.
        outputs = model(**batch)
        loss = outputs.loss
        logits = outputs.logits

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Weight the batch loss by its size so the epoch average is correct
        # even when the last batch is smaller than the others.
        batch_size = logits.size(0)
        epoch_loss += loss.item() * batch_size
        preds = torch.argmax(logits, dim=1)
        correct += (preds == batch["labels"]).sum().item()
        total += batch_size

    if total == 0:
        # Fail with a clear message instead of an opaque ZeroDivisionError.
        raise ValueError("train_loader produced no samples; cannot compute epoch metrics")

    epoch_loss /= total
    epoch_acc = correct / total

    train_losses.append(epoch_loss)
    train_accuracies.append(epoch_acc)

    print(f"Epoch {epoch+1} | Train Loss: {epoch_loss:.4f} | Train Acc: {epoch_acc:.4f}")


Epoch 1 [TRAIN]: 100%|██████████| 123/123 [02:28<00:00,  1.21s/it]


Epoch 1 | Train Loss: 4.0454 | Train Acc: 0.3576


Epoch 2 [TRAIN]: 100%|██████████| 123/123 [02:36<00:00,  1.28s/it]


Epoch 2 | Train Loss: 2.2532 | Train Acc: 0.5813


Epoch 3 [TRAIN]: 100%|██████████| 123/123 [02:55<00:00,  1.43s/it]


Epoch 3 | Train Loss: 1.5858 | Train Acc: 0.6803


Epoch 4 [TRAIN]: 100%|██████████| 123/123 [02:52<00:00,  1.40s/it]


Epoch 4 | Train Loss: 1.2737 | Train Acc: 0.7282


Epoch 5 [TRAIN]: 100%|██████████| 123/123 [02:50<00:00,  1.39s/it]


Epoch 5 | Train Loss: 1.0842 | Train Acc: 0.7723


Epoch 6 [TRAIN]: 100%|██████████| 123/123 [02:51<00:00,  1.39s/it]


Epoch 6 | Train Loss: 0.9535 | Train Acc: 0.7947


Epoch 7 [TRAIN]: 100%|██████████| 123/123 [02:51<00:00,  1.39s/it]


Epoch 7 | Train Loss: 0.8499 | Train Acc: 0.8192


Epoch 8 [TRAIN]: 100%|██████████| 123/123 [02:50<00:00,  1.38s/it]


Epoch 8 | Train Loss: 0.7661 | Train Acc: 0.8329


Epoch 9 [TRAIN]: 100%|██████████| 123/123 [02:49<00:00,  1.38s/it]


Epoch 9 | Train Loss: 0.6994 | Train Acc: 0.8500


Epoch 10 [TRAIN]: 100%|██████████| 123/123 [02:49<00:00,  1.38s/it]

Epoch 10 | Train Loss: 0.6417 | Train Acc: 0.8658





In [11]:
# Write the model and the processor into the same local directory.
save_dir = "saved_model"
model.save_pretrained(save_dir)
processor.save_pretrained(save_dir)

['saved_model/preprocessor_config.json']

In [12]:
# Load the processor and model back from the local "saved_model" directory,
# then move the model to the selected device.
checkpoint_dir = "saved_model"
processor = AutoImageProcessor.from_pretrained(checkpoint_dir)
model = AutoModelForImageClassification.from_pretrained(checkpoint_dir)

model.to(device)

ViTForImageClassification(
  (vit): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): ViTPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ViTLayer(
          (attention): ViTAttention(
            (attention): ViTSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(in_features=7

In [19]:
# Predict a label for every test image and write the results to submission.csv.
model.to(device)
model.eval()

predictions = []

with torch.no_grad():
    for batch in tqdm(test_loader):
        # Take the ids out of the batch; they must not be passed to the model.
        batch_ids = batch.pop("id")
        batch = {k: v.to(device) for k, v in batch.items()}

        logits = model(**batch).logits

        # Add 1 to undo the "- 1" shift applied to the training labels,
        # giving predictions in the range 1-200.
        preds = torch.argmax(logits, dim=1) + 1

        # Convert each tensor to a Python list once per batch rather than
        # calling .item() on every element.
        predictions.extend(
            {"id": sample_id, "label": label}
            for sample_id, label in zip(batch_ids.tolist(), preds.tolist())
        )

# Build the DataFrame with explicit columns so that the CSV header is written
# even if there were no predictions.
pred_df = pd.DataFrame(predictions, columns=["id", "label"])
pred_df.to_csv("submission.csv", index=False)

print("Predictions saved to submission.csv")


'Predictions saved to submission.csv'