
# **Data preparation**

Creating a dataset that is compatible with torch.DataLoader


In [24]:
# importing packages

import numpy as np
import pandas as pd
import torch as tc

from pathlib import Path
from PIL import Image

from torch.utils.data import Dataset, DataLoader
from torchvision.transforms import v2

Transforming images to make them compatible for CNN

In [25]:
TRANSFORM_DEFAULT = v2.Compose([
    v2.ToImage(),
    v2.Resize((224, 224)),
    v2.ToDtype(tc.float32, scale=True), 
])

In [26]:
DATA_DIR = Path("/kaggle/input/feathers-in-focus-model/aml-2025-feathers-in-focus")

attributes = np.load(DATA_DIR / "attributes.npy")

In [27]:
train_df = pd.read_csv("/kaggle/input/feathers-in-focus-model//aml-2025-feathers-in-focus/train_images.csv")
print(train_df.head())

            image_path  label
0  /train_images/1.jpg      1
1  /train_images/2.jpg      1
2  /train_images/3.jpg      1
3  /train_images/4.jpg      1
4  /train_images/5.jpg      1


Defining the type and config parameters of the data

In [28]:
# idx, image, label, path

ItemType = tuple[int, tc.Tensor, int, tc.Tensor, str] 

Define how to read a sample given the index

In [29]:
class ImageClassification(Dataset[ItemType]):
    def __init__(
        self,
        df: pd.DataFrame,
        search_root: Path,
        attributes: np.ndarray, 
        transform: v2.Transform = TRANSFORM_DEFAULT,
    ):
        self.df = df.reset_index(drop=True)
        self.search_root = search_root
        self.transform = transform
        self.attributes = tc.tensor(attributes, dtype=tc.float32)
    
    def __len__(self) -> int:
        return len(self.df)
    
    def _find_image_path(self, filename: str) -> Path:
        path = self.search_root / filename.lstrip('/')
        if not path.exists():
            raise FileNotFoundError(f"Image file '{filename}' not found at {path}")
        return path
    
    def __getitem__(self, idx: int) -> ItemType:
        row = self.df.iloc[idx]
        filename = Path(str(row["image_path"])).name
        path = self._find_image_path(filename)
        
        image = Image.open(path).convert("RGB")
        image = self.transform(image)
        
        label = int(row["label"]) - 1
        attr = self.attributes[label]  
        
        return idx, image, label, attr, str(path)

Check if set-up works

In [30]:
attributes = np.load("/kaggle/input/feathers-in-focus-model/aml-2025-feathers-in-focus/attributes.npy")

SEARCH_ROOT = Path("/kaggle/input/feathers-in-focus-model/aml-2025-feathers-in-focus/train_images")

train_dataset = ImageClassification(
    df=train_df,
    search_root=SEARCH_ROOT,
    attributes=attributes, 
    transform=TRANSFORM_DEFAULT,
)

idx0, img0, label0, attr0, path0 = train_dataset[0] 
print("Pad eerste image:", path0)
print("Image shape:", img0.shape)
print("Label:", label0)
print("Attributes shape:", attr0.shape)  

Pad eerste image: /kaggle/input/feathers-in-focus-model/aml-2025-feathers-in-focus/train_images/1.jpg
Image shape: torch.Size([3, 224, 224])
Label: 0
Attributes shape: torch.Size([312])


Splitting the data in training and validation set (80/20) and creating DataLoader batches

In [31]:
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Train/validation split
from torch.utils.data import random_split

train_size = int(0.8 * len(train_dataset))
val_size = len(train_dataset) - train_size
train_set, val_set = random_split(train_dataset, [train_size, val_size])

# DataLoaders batches
train_loader = DataLoader(train_set, batch_size=32, shuffle=True)
val_loader = DataLoader(val_set, batch_size=32, shuffle=False)

# **CREATING, TRAINING, VALIDATING MODEL**

In [32]:
import argparse
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import transforms
from torch.optim.lr_scheduler import StepLR

from pathlib import Path

In [33]:
DATA_DIR  = Path("/kaggle/input/feathers-in-focus-model/aml-2025-feathers-in-focus")

In [34]:
def load_argparser():
    parser = argparse.ArgumentParser(description="Feathers in Focus")
    parser.add.argument("--batch-size", type=int, default=64)
    parser.add.argument("__test-batch-size", type=int, default=1000)
    parser.add.argument("--epochs", type=int, default=14)
    parser.add.argument("--lr", type=float, default=1.0)
    parder.add.argument("--gamma", type=float, default=0.7)
    parser.add.argument("--dry-run", action="store_true")
    parser.add.argument("--seed", type=int, default=1)
    parser.add.argument("--log-interval", type=int, default=10)
    parder.add.argument("--save-model", action="store_true")
    return parser

**Defining the model to use**

Model with 4 convolional layers (RGB, simple features, complex pattorns, higher level features). 
This leads to a fully convolutional layer with 9216 input and 128 output. 
This leads to 200 classes, bird species. 

**Forward pass**

Forward pass trough the model, using the 4 convolutional layers, combined with the ReLu activation function for non-linearity. Making use of max pooling and creating the fully connected layer followed by the output layer and softmax function. 

In [35]:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(3, 32, 3, 1)
        self.conv2 = nn.Conv2d(32, 64, 3, 1)
        self.conv3 = nn.Conv2d(64, 64, 3, 1)
        self.conv4 = nn.Conv2d(64, 64, 3, 1)
        self.dropout1 = nn.Dropout(0.25)
        self.dropout2 = nn.Dropout(0.5)
        self.fc1 = nn.Linear(9216, 128)
        self.fc2 = nn.Linear(128, 200)     
        self.fc_attr = nn.Linear(128, 312)   

    def forward(self, x):
        x = self.conv1(x)
        x = F.relu(x)
        x = F.max_pool2d(x, 2)

        x = self.conv2(x)
        x = F.relu(x)
        x = F.max_pool2d(x, 2)

        x = self.conv3(x)
        x = F.relu(x)
        x = F.max_pool2d(x, 2)

        x = self.conv4(x)
        x = F.relu(x)
        x = F.max_pool2d(x, 2)

        x = self.dropout1(x)
        x = torch.flatten(x, 1)
        x = self.fc1(x)
        x = F.relu(x)
        features = self.dropout2(x) 
        
        class_output = F.log_softmax(self.fc2(features), dim=1)
        attr_output = torch.sigmoid(self.fc_attr(features))  
        
        return class_output, attr_output 

**Training loop: forward - backward propagation**

From forward pass, to calculating the loss, to backward propagation calculating the gradient to adjusting the weights. 

In [36]:
def train(model, device, train_loader, optimizer, epoch):
    model.train()
    for batch_idx, (idx, data, target, attr_target, path) in enumerate(train_loader):
        data = data.to(device)
        target = target.to(device)
        attr_target = attr_target.to(device)
        
        optimizer.zero_grad()
        class_output, attr_output = model(data) 
        
        # Gecombineerde loss
        class_loss = F.nll_loss(class_output, target)
        attr_loss = F.mse_loss(attr_output, attr_target) 
        loss = class_loss + 0.5 * attr_loss 
        
        loss.backward()
        optimizer.step()

        if batch_idx % 10 == 0:
            print(f"Epoch: {epoch} [{batch_idx * len(data)}/{len(train_loader.dataset)}] "
                  f"Loss: {loss.item():.6f} (class: {class_loss.item():.4f}, attr: {attr_loss.item():.4f})")

**Testing / validation loop**

In [37]:
def test(model, device, test_loader):
    model.eval()
    test_loss = 0
    correct = 0

    with torch.no_grad():
        for idx, data, target, attr_target, path in test_loader:
            data = data.to(device)
            target = target.to(device)
            
            class_output, attr_output = model(data) 
            test_loss += F.nll_loss(class_output, target, reduction="sum").item()
            pred = class_output.argmax(dim=1, keepdim=True)
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)
    accuracy = 100. * correct / len(test_loader.dataset)
    print(f"Val set: Avg loss: {test_loss:.4f}, Accuracy: {correct}/{len(test_loader.dataset)} ({accuracy:.2f}%)")

In [38]:
# Config
torch.manual_seed(42)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Hyperparameters
EPOCHS = 3
LR = 1.0
GAMMA = 0.7

# Model, optimizer, scheduler
model = Net().to(device)
optimizer = optim.Adadelta(model.parameters(), lr=LR)
scheduler = StepLR(optimizer, step_size=1, gamma=GAMMA)

# Training loop
for epoch in range(1, EPOCHS + 1):
    train(model, device, train_loader, optimizer, epoch)
    test(model, device, val_loader)
    scheduler.step()

# Model opslaan
torch.save(model.state_dict(), "bird_cnn.pt")

Epoch: 1 [0/3140] Loss: 5.421504 (class: 5.3083, attr: 0.2264)
Epoch: 1 [320/3140] Loss: 5.421217 (class: 5.3081, attr: 0.2263)
Epoch: 1 [640/3140] Loss: 5.397836 (class: 5.2857, attr: 0.2242)
Epoch: 1 [960/3140] Loss: 5.460761 (class: 5.3502, attr: 0.2211)
Epoch: 1 [1280/3140] Loss: 5.389899 (class: 5.2798, attr: 0.2201)
Epoch: 1 [1600/3140] Loss: 5.389816 (class: 5.2816, attr: 0.2165)
Epoch: 1 [1920/3140] Loss: 5.405420 (class: 5.2993, attr: 0.2122)
Epoch: 1 [2240/3140] Loss: 5.374874 (class: 5.2698, attr: 0.2102)
Epoch: 1 [2560/3140] Loss: 5.357296 (class: 5.2531, attr: 0.2083)
Epoch: 1 [2880/3140] Loss: 5.372732 (class: 5.2753, attr: 0.1948)
Val set: Avg loss: 5.2613, Accuracy: 3/786 (0.38%)
Epoch: 2 [0/3140] Loss: 5.334018 (class: 5.2402, attr: 0.1876)
Epoch: 2 [320/3140] Loss: 5.388085 (class: 5.2993, attr: 0.1776)
Epoch: 2 [640/3140] Loss: 5.362556 (class: 5.2753, attr: 0.1745)
Epoch: 2 [960/3140] Loss: 5.436480 (class: 5.3500, attr: 0.1731)
Epoch: 2 [1280/3140] Loss: 5.342386 (

**Creating predictions on the test set and creating a file for submission in Kaggle**

In [47]:
class ImageClassification(Dataset):
    def __init__(
        self,
        df: pd.DataFrame,
        search_root: Path,
        attributes: np.ndarray = None,  # nu optioneel
        transform: v2.Transform = TRANSFORM_DEFAULT,
    ):
        self.df = df.reset_index(drop=True)
        self.search_root = search_root
        self.transform = transform
        self.attributes = tc.tensor(attributes, dtype=tc.float32) if attributes is not None else None

    def __len__(self) -> int:
        return len(self.df)
    
    def _find_image_path(self, filename: str) -> Path:
        path = self.search_root / filename.lstrip('/')
        if not path.exists():
            raise FileNotFoundError(f"Image file '{filename}' not found at {path}")
        return path

    def __getitem__(self, idx: int):
        row = self.df.iloc[idx]
        filename = Path(str(row["image_path"])).name
        path = self._find_image_path(filename)
        
        image = Image.open(path).convert("RGB")
        image = self.transform(image)
        
        if "label" in row.index and self.attributes is not None:
            label = int(row["label"]) - 1
            attr = self.attributes[label]
            return idx, image, label, attr, str(path)
        else:
            return idx, image, str(path)

In [54]:
test_df = pd.read_csv("/kaggle/input/feathers-in-focus-model/aml-2025-feathers-in-focus/test_images_path.csv")
test_dataset = ImageClassification(
    df=test_df,
    search_root=Path("/kaggle/input/feathers-in-focus-model/aml-2025-feathers-in-focus/test_images"),
    transform=TRANSFORM_DEFAULT,
)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

model.eval()
predictions = []

with torch.no_grad():
    for idx, data, path in test_loader:
        data = data.to(device)
        class_output, attr_output = model(data)
        pred = class_output.argmax(dim=1) + 1
        predictions.extend(pred.tolist())

submission = pd.DataFrame({
    "id": test_df["id"].values,
    "label": predictions
})
submission.to_csv("submission.csv", index=False)

print(f"Aantal predictions: {len(predictions)}")
print(f"Aantal test samples: {len(test_df)}")
print(submission.head(10))

Aantal predictions: 4000
Aantal test samples: 4000
   id  label
0   1      4
1   2      4
2   3      4
3   4      4
4   5      4
5   6      4
6   7      4
7   8      4
8   9      4
9  10      4
