In [2]:
import pandas as pd
import torch
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from tqdm import tqdm
from transformers import AutoImageProcessor, AutoModelForImageClassification
from torch.utils.data import Dataset
from PIL import Image
import torch
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from torchvision import transforms, models
from PIL import Image
import torch.nn as nn
from tqdm import tqdm
import torch.nn.functional as F
import matplotlib.pyplot as plt

In [3]:
# Choose the compute device: prefer CUDA, then MPS, and fall back to the CPU.
# Training on "cpu" is not recommended.

if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

print(device)

mps


In [4]:
class BirdDataset(Dataset):
    """Image dataset backed by a DataFrame with one row per image.

    Every row must provide an ``image_path`` column. Rows additionally need
    an ``id`` column when ``is_test`` is true and a ``label`` column otherwise.

    Args:
        df: DataFrame describing the samples.
        transform: Optional callable applied to the RGB ``PIL`` image.
        is_test: When true, items are ``(image, id)``; otherwise they are
            ``(image, label)``.
    """

    def __init__(self, df, transform=None, is_test=False):
        self.df = df
        self.transform = transform
        self.is_test = is_test

    def __len__(self):
        """Return the number of rows in the backing DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Load the image at positional index ``idx`` with its id or label."""
        record = self.df.iloc[idx]

        # The path is used as-is; load_data() already rewrites it to a full path.
        image = Image.open(record["image_path"]).convert("RGB")
        if self.transform:
            image = self.transform(image)

        # Test rows carry the sample id, training rows carry the class label.
        target_column = "id" if self.is_test else "label"
        return image, int(record[target_column])


In [5]:
def load_data(data_dir="aml-2025-feathers-in-focus"):
    """Read the train/test CSV manifests and rewrite their image paths.

    Args:
        data_dir: Root folder of the dataset. It must contain
            ``train_images.csv`` and ``test_images_path.csv``; images are
            addressed under ``train_images/train_images`` and
            ``test_images/test_images`` inside it. The default is the folder
            name that used to be hard-coded, so ``load_data()`` behaves
            exactly as before.

    Returns:
        tuple: ``(train_df, test_df)``. ``train_df["label"]`` is shifted down
        by one, and ``image_path`` in both frames is rebuilt as
        ``<data_dir>/<split>/<split>/<file name>``.
    """
    root = str(data_dir)

    # Read the CSV manifests.
    train_df = pd.read_csv(f"{root}/train_images.csv")
    test_df = pd.read_csv(f"{root}/test_images_path.csv")

    # Shift labels down by one for the model (presumably 1-based in the CSV,
    # zero-based class indices afterwards).
    train_df["label"] = train_df["label"] - 1

    # Keep only the file name from the CSV path and prepend the real folder.
    for frame, split in ((train_df, "train_images"), (test_df, "test_images")):
        file_names = frame["image_path"].str.split("/").str[-1]
        frame["image_path"] = f"{root}/{split}/{split}/" + file_names

    # Report the split sizes.
    print(f"Train: {len(train_df)} | Test: {len(test_df)}")
    return train_df, test_df


In [6]:
# Channel-wise normalisation shared by both pipelines.
# NOTE(review): these look like the usual ImageNet statistics -- confirm they
# match what the checkpoint was trained with.
_normalize = transforms.Normalize(
    mean=[0.485, 0.456, 0.406],
    std=[0.229, 0.224, 0.225],
)

# Training pipeline: resize, random augmentation, tensor conversion, random
# erasing on the tensor, then normalisation.
_train_steps = [
    transforms.Resize((256, 256)),
    transforms.RandomResizedCrop(224, scale=(0.7, 1.0)),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(15),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2),
    transforms.ToTensor(),
    transforms.RandomErasing(p=0.2, scale=(0.02, 0.2), ratio=(0.3, 3.3), value='random'),
    _normalize,
]
train_tfms = transforms.Compose(_train_steps)

# Evaluation pipeline: deterministic resize to the 224x224 input size only.
_test_steps = [
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    _normalize,
]
test_tfms = transforms.Compose(_test_steps)


In [7]:
train_df, test_df = load_data()

# Training items are (image, label) with augmentation; test items are (image, id).
train_ds = BirdDataset(train_df, transform=train_tfms, is_test=False)
test_ds = BirdDataset(test_df, transform=test_tfms, is_test=True)

# Both loaders share batch size and worker count; only the training set is shuffled.
_loader_kwargs = dict(batch_size=32, num_workers=0)
train_loader = DataLoader(train_ds, shuffle=True, **_loader_kwargs)
test_loader = DataLoader(test_ds, shuffle=False, **_loader_kwargs)

# Number of batches per loader.
print(f"Train: {len(train_loader)}")
print(f"Test: {len(test_loader)}")


Train: 3926 | Test: 4000
Train: 123
Test: 125


In [8]:
# ----------------------------
# SE Block
# ----------------------------
class SEBlock(nn.Module):
    """Squeeze-and-Excitation channel gate.

    Averages each channel over its spatial extent, passes the resulting
    vector through a two-layer bottleneck (ReLU, then sigmoid) and rescales
    the input channels by the resulting per-channel weights.

    Args:
        channels: Number of channels of the input feature map.
        reduction: Bottleneck ratio; the hidden width is
            ``channels // reduction``, but never less than one unit.
    """

    def __init__(self, channels, reduction=16):
        super().__init__()
        # Plain floor division yields a zero-width bottleneck whenever
        # channels < reduction, which would make the gate ignore its input.
        hidden = max(1, channels // reduction)
        self.fc1 = nn.Linear(channels, hidden)
        self.fc2 = nn.Linear(hidden, channels)

    def forward(self, x):
        """Return ``x`` scaled per channel; expects a 4-D ``(b, c, h, w)`` tensor."""
        b, c, _, _ = x.size()
        squeezed = x.mean((2, 3))              # global average pooling -> (b, c)
        gates = torch.sigmoid(self.fc2(F.relu(self.fc1(squeezed))))
        # Reshape to (b, c, 1, 1) so the gates broadcast over height and width.
        return x * gates.view(b, c, 1, 1)

# ----------------------------
# Basic Residual Block with SE
# ----------------------------
class BasicBlockSE(nn.Module):
    """Two 3x3 conv/BN layers followed by an SE gate and a residual connection.

    Args:
        in_channels: Channels of the incoming feature map.
        out_channels: Channels produced by the block.
        stride: Stride of the first convolution (and of the projection
            shortcut, when one is needed).
        reduction: Bottleneck ratio forwarded to ``SEBlock``.
    """

    expansion = 1

    def __init__(self, in_channels, out_channels, stride=1, reduction=16):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels, 3, stride, 1, bias=False)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.conv2 = nn.Conv2d(out_channels, out_channels, 3, 1, 1, bias=False)
        self.bn2 = nn.BatchNorm2d(out_channels)
        self.se = SEBlock(out_channels, reduction)

        # Shortcut branch: a 1x1 conv + BN projection when the main branch
        # changes resolution or width, otherwise an empty (identity) Sequential.
        needs_projection = stride != 1 or in_channels != out_channels
        if needs_projection:
            shortcut = nn.Sequential(
                nn.Conv2d(in_channels, out_channels, 1, stride, bias=False),
                nn.BatchNorm2d(out_channels),
            )
        else:
            shortcut = nn.Sequential()
        self.shortcut = shortcut
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        """Apply the block to ``x`` and return the activated residual sum."""
        residual = self.shortcut(x)
        y = self.relu(self.bn1(self.conv1(x)))
        y = self.se(self.bn2(self.conv2(y)))   # channel re-weighting before the skip add
        y += residual
        return self.relu(y)

# ----------------------------
# Enhanced ResNet18: modern stem + SE blocks
# ----------------------------
class ResNet18Enhanced(nn.Module):
    """ResNet18-style classifier with a three-conv stem and SE residual blocks.

    Args:
        num_classes: Size of the final linear layer's output.

    ``forward`` takes a batch of 3-channel images and returns a
    ``(batch, num_classes)`` tensor of logits.
    """

    def __init__(self, num_classes=200):
        super().__init__()

        # Modern stem: three 3x3 conv/BN/ReLU groups (instead of a single
        # 7x7 conv), followed by a max pool. Entries are (in, out, stride).
        stem_layers = []
        for c_in, c_out, conv_stride in ((3, 32, 2), (32, 32, 1), (32, 64, 1)):
            stem_layers.append(
                nn.Conv2d(c_in, c_out, 3, stride=conv_stride, padding=1, bias=False)
            )
            stem_layers.append(nn.BatchNorm2d(c_out))
            stem_layers.append(nn.ReLU(inplace=True))
        stem_layers.append(nn.MaxPool2d(3, stride=2, padding=1))
        self.stem = nn.Sequential(*stem_layers)

        # Residual stages with SE: two blocks each; every stage after the
        # first doubles the channel count and uses stride 2.
        self.layer1 = self._make_layer(64, 64, 2, stride=1)
        self.layer2 = self._make_layer(64, 128, 2, stride=2)
        self.layer3 = self._make_layer(128, 256, 2, stride=2)
        self.layer4 = self._make_layer(256, 512, 2, stride=2)

        # Classifier: global average pooling followed by a linear layer.
        self.gap = nn.AdaptiveAvgPool2d((1,1))
        self.fc = nn.Linear(512, num_classes)

    def _make_layer(self, in_channels, out_channels, blocks, stride):
        """Build one stage: only the first block receives ``stride`` and the channel change."""
        stage = [BasicBlockSE(in_channels, out_channels, stride)]
        stage.extend(
            BasicBlockSE(out_channels, out_channels) for _ in range(blocks - 1)
        )
        return nn.Sequential(*stage)

    def forward(self, x):
        """Run stem, the four residual stages, pooling and the classifier."""
        x = self.stem(x)
        for stage in (self.layer1, self.layer2, self.layer3, self.layer4):
            x = stage(x)
        pooled = torch.flatten(self.gap(x), 1)
        return self.fc(pooled)

In [9]:
# Build the network and load the saved weights onto the selected device.
model = ResNet18Enhanced(num_classes=200)
# torch.load unpickles the file; weights_only=True restricts it to tensors and
# plain containers so a tampered checkpoint cannot run arbitrary code.
state_dict = torch.load(
    "resnet18_enhanced_best.pth",
    map_location=torch.device(device),
    weights_only=True,
)
# Kept as the last expression so the cell still displays the key-matching result.
model.load_state_dict(state_dict)

<All keys matched successfully>

In [10]:
model.to(device)
model.eval()  # evaluation mode for inference

predictions = []

# Gradients are not needed for inference.
with torch.no_grad():
    for images, ids in tqdm(test_loader):

        images = images.to(device)

        logits = model(images)
        preds = torch.argmax(logits, dim=1) + 1   # convert class indices 0-199 -> labels 1-200

        # One bulk .tolist() per batch instead of an .item() call (and a
        # device round-trip) for every single sample.
        predictions.extend(
            {"id": sample_id, "label": label}
            for sample_id, label in zip(ids.tolist(), preds.tolist())
        )

# Explicit columns keep the "id,label" header even if there are no predictions.
pred_df = pd.DataFrame(predictions, columns=["id", "label"])
pred_df.to_csv("submission_enhanced.csv", index=False)
print("Predictions saved to submission_enhanced.csv")


100%|██████████| 125/125 [00:31<00:00,  4.01it/s]

Predictions saved to submission_enhanced.csv



