In [2]:
import kagglehub
kagglehub.login()

VBox(children=(HTML(value='<center> <img\nsrc=https://www.kaggle.com/static/images/site-logo.png\nalt=\'Kaggle…

In [3]:
aa_iv_2025_i_object_localization_path = kagglehub.competition_download('aa-iv-2025-i-object-localization')
print('Data source import complete.')

Data source import complete.


In [4]:
!pip install typing



In [5]:
import numpy as np
import pandas as pd
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from torchsummary import summary
import albumentations as A
from sklearn.model_selection import train_test_split
from efficientnet_pytorch import EfficientNet
from torchvision.models import resnet50
from tqdm import tqdm
import cv2
from PIL import Image
import matplotlib.pyplot as plt
import os.path as osp
import typing as ty

In [6]:
# Set device
torch.manual_seed(32)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'Using {device}')
test = torch.ones((100, 100)).to(device)
del test
torch.cuda.empty_cache()

Using cuda



## Dataset and preprocessing

In [7]:
# Configuración de rutas y parámetros
DATA_DIR = aa_iv_2025_i_object_localization_path
WORK_DIR = '/kaggle/working'
BATCH_SIZE = 16
IMG_SIZE = (128, 200)  # (height, width)
h_real, w_real = 720, 1280

In [8]:
# Cargar datos
df = pd.read_csv(osp.join(DATA_DIR, "train.csv"))
obj2id = {'f16':0, 'cougar':1, 'chinook':2, 'ah64':3, 'f15':4, 'seahawk':5}
id2obj = {v:k for k,v in obj2id.items()}
df["class_id"] = df["class"].map(obj2id)


In [9]:
df

Unnamed: 0,filename,class,xmin,ymin,xmax,ymax,object_count,class_id
0,image_00373.jpeg,cougar,506,303,628,414,1,1
1,image_00065.jpeg,seahawk,381,35,856,273,1,5
2,image_00111.jpeg,cougar,467,136,830,302,1,1
3,image_00112.jpeg,cougar,447,122,799,287,1,1
4,image_00113.jpeg,cougar,418,141,631,291,1,1
...,...,...,...,...,...,...,...,...
184,image_00107.jpeg,cougar,439,181,837,319,1,1
185,image_00210.jpeg,chinook,631,208,1064,402,1,2
186,image_00002.jpeg,f16,140,268,912,518,1,0
187,image_00044.jpeg,f16,764,311,1042,437,1,0


In [10]:
# Normalizar bboxes
df[["ymin", "ymax"]] = df[["ymin", "ymax"]].div(h_real, axis=0)
df[["xmin", "xmax"]] = df[["xmin", "xmax"]].div(w_real, axis=0)

# Split dataset
train_df, val_df = train_test_split(df, stratify=df['class_id'], test_size=0.25, random_state=42)


In [11]:
def collate_fn(batch):
    batch = [item for item in batch if item is not None]  # Filtrar None
    return torch.utils.data.dataloader.default_collate(batch)

## Custom CNN Backbone

In [12]:
class CustomBackbone(nn.Module):
    def __init__(self):
        super().__init__()
        self.features = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),

            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),

            nn.Conv2d(128, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(256, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.AdaptiveAvgPool2d((7, 7)),
            nn.Flatten()
        )

    def forward(self, x):
        return self.features(x)

## Model with multiple backbone options

In [13]:
class MultiTaskModel(nn.Module):
    def __init__(self, backbone_type='custom', n_classes=6):
        super().__init__()

        # Selección de backbone
        if backbone_type == 'resnet':
            self.backbone = resnet50(pretrained=True)
            self.backbone = nn.Sequential(*list(self.backbone.children())[:-2])
            self.pool = nn.AdaptiveAvgPool2d((1,1))
            out_features = 2048
        elif backbone_type == 'efficientnet':
            # Cargar modelo preentrenado y eliminar la capa FC
            self.backbone = EfficientNet.from_pretrained('efficientnet-b0')
            # Extraer características hasta antes de la capa FC
            self.backbone._avg_pooling = nn.Identity()  # Deshabilitar avg pooling original
            self.backbone._fc = nn.Identity()  # Deshabilitar FC original
            self.pool = nn.AdaptiveAvgPool2d((1,1))  # Nuevo pooling
            out_features = 1280
        else:  # Custom
            self.backbone = CustomBackbone()
            out_features = 256 * 7 * 7

        # Cabezas
        self.cls_head = nn.Sequential(
            nn.Linear(out_features, 1024),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(1024, 512),
            nn.ReLU(),
            nn.Linear(512, n_classes)
        )

        self.reg_head = nn.Sequential(
            nn.Linear(out_features, 1024),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(1024, 512),
            nn.ReLU(),
            nn.Linear(512, 4),
            nn.Sigmoid()
        )

    def forward(self, x):

        # features = self.backbone(x)
        # # Check if the features have less than 4 dimensions (batch, channel, height, width)
        # if features.dim() < 4:
        #     # Add a dimension for height and width
        #     features = features.unsqueeze(-1).unsqueeze(-1)

        # # Apply AdaptiveAvgPool2d before flattening for all backbone types if self.pool exists
        # if hasattr(self, 'pool'):
        #     features = self.pool(features)

        # features = features.view(features.size(0), -1)
        if isinstance(self.backbone, EfficientNet):
            # Extraer características y aplicar pooling
            features = self.backbone.extract_features(x)
            features = self.pool(features)
            features = features.view(features.size(0), -1)
        else:
            features = self.backbone(x)
            if hasattr(self, 'pool'):
                features = self.pool(features)
            # if features.dim() < 4:
            #   # Add a dimension for height and width
            #   features = features.unsqueeze(-1).unsqueeze(-1)
            features = features.view(features.size(0), -1)


        return {
            'class_id': self.cls_head(features),
            'bbox': self.reg_head(features)
        }

## Data Augmentation Strategies

In [14]:

# Estrategia 1 - Transformaciones básicas
aug_strategy1 = A.Compose([
    A.HorizontalFlip(p=0.5),
    A.Rotate(limit=30, p=0.5),
    A.RandomBrightnessContrast(p=0.2),
], bbox_params=A.BboxParams(format='albumentations',label_fields=['class_labels'],  min_area=0.01))



In [15]:
# Estrategia 2 - Transformaciones más agresivas
aug_strategy2 = A.Compose([
    A.CoarseDropout(max_holes=8, max_height=32, max_width=32, p=0.5), # Use CoarseDropout instead of Cutout
    A.GaussNoise(var_limit=(10.0, 50.0), p=0.3),
    A.HueSaturationValue(hue_shift_limit=20, sat_shift_limit=30, val_shift_limit=20, p=0.3),
    A.RandomShadow(p=0.2),
], bbox_params=A.BboxParams(format='albumentations',label_fields=['class_labels'],  min_area=0.01))

  A.CoarseDropout(max_holes=8, max_height=32, max_width=32, p=0.5), # Use CoarseDropout instead of Cutout
  A.GaussNoise(var_limit=(10.0, 50.0), p=0.3),


## Dataset and Transforms

In [16]:
class MilitaryDataset(Dataset):
    def __init__(self, df, root_dir, transform=None, output_size=IMG_SIZE):
        self.df = df
        self.root_dir = root_dir
        self.transform = transform
        self.output_size = output_size
        self.has_annotations = 'xmin' in df.columns

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        img_path = os.path.join(self.root_dir, row['filename'])
        image = cv2.imread(img_path)
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        if self.output_size:
            image = cv2.resize(image, (self.output_size[1], self.output_size[0]))

        bbox = np.zeros(4, dtype=np.float32)
        class_id = -1
        if self.has_annotations:  # Solo si existen anotaciones
            bbox = row[['xmin', 'ymin', 'xmax', 'ymax']].values.astype(np.float32)
            class_id = row['class_id']


        # bbox = row[['xmin', 'ymin', 'xmax', 'ymax']].values.astype(np.float32)
        # class_id = row['class_id']

        if self.transform:
            transformed = self.transform(image=image, bboxes=[bbox], class_labels=[class_id])
            image = transformed['image']

            if len(transformed['bboxes']) == 0:
                return None  # Omitir este ejemplo
            # bbox = transformed['bboxes'][0]
            if transformed['bboxes']:
                bbox = transformed['bboxes'][0]
                # if transformed['class_labels']:
                #   class_id = transformed['class_labels'][0]
            else:
                # Handle empty 'bboxes' - you might want to skip this sample or assign a default bbox
                print(f"Warning: Empty bboxes for image {img_path}. Skipping this sample.")
                if self.output_size:
                    bbox = bbox * [self.output_size[1] / w_real, self.output_size[0] / h_real,
                                   self.output_size[1] / w_real, self.output_size[0] / h_real]

                # New Check if 'class_labels' is empty and use default class_id if necessary
                if transformed['class_labels']:
                    class_id = transformed['class_labels'][0]
                else:
                    print(f"Warning: Empty class_labels for image {img_path}. Using default class_id: {class_id}")
                # end

            class_id = transformed['class_labels'][0]

        image = image.transpose(2, 0, 1).astype(np.float32) / 255.0
        return {
            'image': torch.tensor(image, dtype=torch.float),
            'bbox': torch.tensor(bbox, dtype=torch.float),
            'class_id': torch.tensor(class_id, dtype=torch.long)
        }


## Training Loop


In [17]:
def train_model(backbone_type, aug_strategy, num_epochs=10):
    # Datasets
    train_dataset = MilitaryDataset(
        train_df,
        osp.join(DATA_DIR, "images/images"),
        transform=aug_strategy,
        output_size=IMG_SIZE
    )

    val_dataset = MilitaryDataset(
        val_df,
        osp.join(DATA_DIR, "images/images"),
        output_size=IMG_SIZE
    )

    # DataLoaders
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=0, collate_fn=collate_fn)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, num_workers=0)

    # Model
    model = MultiTaskModel(backbone_type=backbone_type).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
    cls_criterion = nn.CrossEntropyLoss()
    reg_criterion = nn.SmoothL1Loss()

    best_iou = 0.0
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0.0

        for batch in tqdm(train_loader):
            images = batch['image'].to(device)
            bboxes = batch['bbox'].to(device)
            classes = batch['class_id'].to(device)

            optimizer.zero_grad()
            outputs = model(images)

            cls_loss = cls_criterion(outputs['class_id'], classes)
            reg_loss = reg_criterion(outputs['bbox'], bboxes)
            loss = cls_loss + reg_loss

            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        # Validation
        model.eval()
        val_cls_correct = 0
        val_iou = 0.0
        with torch.no_grad():
            for batch in val_loader:
                images = batch['image'].to(device)
                bboxes = batch['bbox'].to(device)
                classes = batch['class_id'].to(device)

                outputs = model(images)

                # Classification
                pred_classes = outputs['class_id'].argmax(dim=1)
                val_cls_correct += (pred_classes == classes).sum().item()

                # Regression (IoU)
                pred_boxes = outputs['bbox'].cpu().numpy()
                true_boxes = bboxes.cpu().numpy()
                iou = calculate_batch_iou(pred_boxes, true_boxes)
                val_iou += iou * images.size(0)

        avg_loss = total_loss / len(train_loader)
        val_acc = val_cls_correct / len(val_dataset)
        val_iou = val_iou / len(val_dataset)

        print(f"Epoch {epoch+1}/{num_epochs}")
        print(f"Train Loss: {avg_loss:.4f} | Val Acc: {val_acc:.4f} | Val IoU: {val_iou:.4f}")

        if val_iou > best_iou:
            best_iou = val_iou
            torch.save(model.state_dict(), f"best_model_{backbone_type}.pth")

    return model

## Evaluation Metrics

In [18]:
def calculate_iou(box1, box2):
    x1 = max(box1[0], box2[0])
    y1 = max(box1[1], box2[1])
    x2 = min(box1[2], box2[2])
    y2 = min(box1[3], box2[3])

    inter_area = max(0, x2 - x1) * max(0, y2 - y1)
    area1 = (box1[2] - box1[0]) * (box1[3] - box1[1])
    area2 = (box2[2] - box2[0]) * (box2[3] - box2[1])

    return inter_area / (area1 + area2 - inter_area + 1e-6)

def calculate_batch_iou(pred_boxes, true_boxes):
    ious = []
    for pred, true in zip(pred_boxes, true_boxes):
        pred = pred * [w_real, h_real, w_real, h_real]
        true = true * [w_real, h_real, w_real, h_real]
        ious.append(calculate_iou(pred, true))
    return np.mean(ious)

In [19]:
## Entrenamiento Comparativo
backbones = ['custom', 'resnet', 'efficientnet']
# backbones = ['efficientnet']
aug_strategies = [aug_strategy1, aug_strategy2]
#aug_strategies = [aug_strategy2]

for backbone in backbones:
    for strategy in aug_strategies:
        print(f"\nTraining with {backbone} and augmentation strategy:{strategy} ")
        model = train_model(backbone_type=backbone, aug_strategy=strategy, num_epochs=10)


Training with custom and augmentation strategy:Compose([
  HorizontalFlip(p=0.5),
  Rotate(p=0.5, limit=(-30, 30), interpolation=1, border_mode=0, fill=0.0, fill_mask=0.0, rotate_method='largest_box', crop_border=False, mask_interpolation=0),
  RandomBrightnessContrast(p=0.2, brightness_limit=(-0.2, 0.2), contrast_limit=(-0.2, 0.2), brightness_by_max=True, ensure_safe_range=False),
], p=1.0, bbox_params={'format': 'albumentations', 'label_fields': ['class_labels'], 'min_area': 0.01, 'min_visibility': 0.0, 'min_width': 0.0, 'min_height': 0.0, 'check_each_transform': True, 'clip': False, 'max_accept_ratio': None}, keypoint_params=None, additional_targets={}, is_check_shapes=True) 


100%|████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:08<00:00,  1.03it/s]


Epoch 1/10
Train Loss: 1.8197 | Val Acc: 0.2083 | Val IoU: 0.1697


100%|████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:07<00:00,  1.20it/s]


Epoch 2/10
Train Loss: 1.7849 | Val Acc: 0.2083 | Val IoU: 0.2744


100%|████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:07<00:00,  1.20it/s]


Epoch 3/10
Train Loss: 1.7779 | Val Acc: 0.2083 | Val IoU: 0.2122


100%|████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:07<00:00,  1.23it/s]


Epoch 4/10
Train Loss: 1.7654 | Val Acc: 0.2083 | Val IoU: 0.2645


100%|████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:07<00:00,  1.21it/s]


Epoch 5/10
Train Loss: 1.7572 | Val Acc: 0.3542 | Val IoU: 0.2715


100%|████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:07<00:00,  1.21it/s]


Epoch 6/10
Train Loss: 1.7415 | Val Acc: 0.2292 | Val IoU: 0.2773


100%|████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:07<00:00,  1.17it/s]


Epoch 7/10
Train Loss: 1.7509 | Val Acc: 0.2292 | Val IoU: 0.1459


100%|████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:07<00:00,  1.18it/s]


Epoch 8/10
Train Loss: 1.7291 | Val Acc: 0.3542 | Val IoU: 0.2771


100%|████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:07<00:00,  1.21it/s]


Epoch 9/10
Train Loss: 1.6939 | Val Acc: 0.3125 | Val IoU: 0.2645


100%|████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:07<00:00,  1.23it/s]


Epoch 10/10
Train Loss: 1.6668 | Val Acc: 0.3750 | Val IoU: 0.2109

Training with custom and augmentation strategy:Compose([
  CoarseDropout(p=0.5, fill=0.0, fill_mask=None, num_holes_range=(1, 2), hole_height_range=(0.1, 0.2), hole_width_range=(0.1, 0.2)),
  GaussNoise(p=0.3, std_range=(0.2, 0.44), mean_range=(0.0, 0.0), per_channel=True, noise_scale_factor=1.0),
  HueSaturationValue(p=0.3, hue_shift_limit=(-20, 20), sat_shift_limit=(-30, 30), val_shift_limit=(-20, 20)),
  RandomShadow(p=0.2, shadow_roi=(0.0, 0.5, 1.0, 1.0), num_shadows_limit=(1, 2), shadow_dimension=5),
], p=1.0, bbox_params={'format': 'albumentations', 'label_fields': ['class_labels'], 'min_area': 0.01, 'min_visibility': 0.0, 'min_width': 0.0, 'min_height': 0.0, 'check_each_transform': True, 'clip': False, 'max_accept_ratio': None}, keypoint_params=None, additional_targets={}, is_check_shapes=True) 


100%|████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:08<00:00,  1.09it/s]


Epoch 1/10
Train Loss: 1.8163 | Val Acc: 0.2083 | Val IoU: 0.1894


100%|████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:08<00:00,  1.06it/s]


Epoch 2/10
Train Loss: 1.7906 | Val Acc: 0.2083 | Val IoU: 0.2418


100%|████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:07<00:00,  1.15it/s]


Epoch 3/10
Train Loss: 1.7826 | Val Acc: 0.2292 | Val IoU: 0.2148


100%|████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:08<00:00,  1.08it/s]


Epoch 4/10
Train Loss: 1.7580 | Val Acc: 0.2083 | Val IoU: 0.2617


100%|████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:07<00:00,  1.13it/s]


Epoch 5/10
Train Loss: 1.7633 | Val Acc: 0.2083 | Val IoU: 0.2380


100%|████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:07<00:00,  1.20it/s]


Epoch 6/10
Train Loss: 1.7333 | Val Acc: 0.3542 | Val IoU: 0.2621


100%|████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:07<00:00,  1.19it/s]


Epoch 7/10
Train Loss: 1.7422 | Val Acc: 0.2917 | Val IoU: 0.1932


100%|████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:07<00:00,  1.13it/s]


Epoch 8/10
Train Loss: 1.7095 | Val Acc: 0.3542 | Val IoU: 0.2814


100%|████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:07<00:00,  1.20it/s]


Epoch 9/10
Train Loss: 1.6545 | Val Acc: 0.3333 | Val IoU: 0.2367


100%|████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:07<00:00,  1.18it/s]


Epoch 10/10
Train Loss: 1.6055 | Val Acc: 0.2917 | Val IoU: 0.2463

Training with resnet and augmentation strategy:Compose([
  HorizontalFlip(p=0.5),
  Rotate(p=0.5, limit=(-30, 30), interpolation=1, border_mode=0, fill=0.0, fill_mask=0.0, rotate_method='largest_box', crop_border=False, mask_interpolation=0),
  RandomBrightnessContrast(p=0.2, brightness_limit=(-0.2, 0.2), contrast_limit=(-0.2, 0.2), brightness_by_max=True, ensure_safe_range=False),
], p=1.0, bbox_params={'format': 'albumentations', 'label_fields': ['class_labels'], 'min_area': 0.01, 'min_visibility': 0.0, 'min_width': 0.0, 'min_height': 0.0, 'check_each_transform': True, 'clip': False, 'max_accept_ratio': None}, keypoint_params=None, additional_targets={}, is_check_shapes=True) 


100%|████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:13<00:00,  1.51s/it]


Epoch 1/10
Train Loss: 1.7800 | Val Acc: 0.3542 | Val IoU: 0.2702


100%|████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:13<00:00,  1.49s/it]


Epoch 2/10
Train Loss: 1.5414 | Val Acc: 0.4375 | Val IoU: 0.2989


100%|████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:13<00:00,  1.49s/it]


Epoch 3/10
Train Loss: 1.1673 | Val Acc: 0.5833 | Val IoU: 0.2870


100%|████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:13<00:00,  1.47s/it]


Epoch 4/10
Train Loss: 0.8345 | Val Acc: 0.6875 | Val IoU: 0.2485


100%|████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:13<00:00,  1.47s/it]


Epoch 5/10
Train Loss: 0.4666 | Val Acc: 0.8125 | Val IoU: 0.2786


100%|████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:13<00:00,  1.48s/it]


Epoch 6/10
Train Loss: 0.3160 | Val Acc: 0.8333 | Val IoU: 0.3137


100%|████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:13<00:00,  1.51s/it]


Epoch 7/10
Train Loss: 0.2867 | Val Acc: 0.8750 | Val IoU: 0.3093


100%|████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:13<00:00,  1.49s/it]


Epoch 8/10
Train Loss: 0.2778 | Val Acc: 0.8333 | Val IoU: 0.3152


100%|████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:13<00:00,  1.50s/it]


Epoch 9/10
Train Loss: 0.1930 | Val Acc: 0.8750 | Val IoU: 0.3222


100%|████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:13<00:00,  1.49s/it]


Epoch 10/10
Train Loss: 0.2304 | Val Acc: 0.8542 | Val IoU: 0.3200

Training with resnet and augmentation strategy:Compose([
  CoarseDropout(p=0.5, fill=0.0, fill_mask=None, num_holes_range=(1, 2), hole_height_range=(0.1, 0.2), hole_width_range=(0.1, 0.2)),
  GaussNoise(p=0.3, std_range=(0.2, 0.44), mean_range=(0.0, 0.0), per_channel=True, noise_scale_factor=1.0),
  HueSaturationValue(p=0.3, hue_shift_limit=(-20, 20), sat_shift_limit=(-30, 30), val_shift_limit=(-20, 20)),
  RandomShadow(p=0.2, shadow_roi=(0.0, 0.5, 1.0, 1.0), num_shadows_limit=(1, 2), shadow_dimension=5),
], p=1.0, bbox_params={'format': 'albumentations', 'label_fields': ['class_labels'], 'min_area': 0.01, 'min_visibility': 0.0, 'min_width': 0.0, 'min_height': 0.0, 'check_each_transform': True, 'clip': False, 'max_accept_ratio': None}, keypoint_params=None, additional_targets={}, is_check_shapes=True) 


100%|████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:14<00:00,  1.56s/it]


Epoch 1/10
Train Loss: 1.7715 | Val Acc: 0.4167 | Val IoU: 0.2987


100%|████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:13<00:00,  1.49s/it]


Epoch 2/10
Train Loss: 1.6072 | Val Acc: 0.5417 | Val IoU: 0.2765


100%|████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:13<00:00,  1.50s/it]


Epoch 3/10
Train Loss: 1.3857 | Val Acc: 0.5625 | Val IoU: 0.2702


100%|████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:13<00:00,  1.52s/it]


Epoch 4/10
Train Loss: 1.0368 | Val Acc: 0.7083 | Val IoU: 0.2659


100%|████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:13<00:00,  1.52s/it]


Epoch 5/10
Train Loss: 0.7450 | Val Acc: 0.8125 | Val IoU: 0.2637


100%|████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:13<00:00,  1.49s/it]


Epoch 6/10
Train Loss: 0.5274 | Val Acc: 0.8125 | Val IoU: 0.2627


100%|████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:13<00:00,  1.49s/it]


Epoch 7/10
Train Loss: 0.5071 | Val Acc: 0.8125 | Val IoU: 0.2665


100%|████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:13<00:00,  1.50s/it]


Epoch 8/10
Train Loss: 0.4300 | Val Acc: 0.8542 | Val IoU: 0.2843


100%|████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:13<00:00,  1.48s/it]


Epoch 9/10
Train Loss: 0.3379 | Val Acc: 0.8125 | Val IoU: 0.2523


100%|████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:13<00:00,  1.52s/it]


Epoch 10/10
Train Loss: 0.3740 | Val Acc: 0.8125 | Val IoU: 0.2924

Training with efficientnet and augmentation strategy:Compose([
  HorizontalFlip(p=0.5),
  Rotate(p=0.5, limit=(-30, 30), interpolation=1, border_mode=0, fill=0.0, fill_mask=0.0, rotate_method='largest_box', crop_border=False, mask_interpolation=0),
  RandomBrightnessContrast(p=0.2, brightness_limit=(-0.2, 0.2), contrast_limit=(-0.2, 0.2), brightness_by_max=True, ensure_safe_range=False),
], p=1.0, bbox_params={'format': 'albumentations', 'label_fields': ['class_labels'], 'min_area': 0.01, 'min_visibility': 0.0, 'min_width': 0.0, 'min_height': 0.0, 'check_each_transform': True, 'clip': False, 'max_accept_ratio': None}, keypoint_params=None, additional_targets={}, is_check_shapes=True) 
Loaded pretrained weights for efficientnet-b0


100%|████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:10<00:00,  1.14s/it]


Epoch 1/10
Train Loss: 1.8153 | Val Acc: 0.2292 | Val IoU: 0.1136


100%|████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:10<00:00,  1.15s/it]


Epoch 2/10
Train Loss: 1.7533 | Val Acc: 0.2292 | Val IoU: 0.2581


100%|████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:10<00:00,  1.15s/it]


Epoch 3/10
Train Loss: 1.7000 | Val Acc: 0.2500 | Val IoU: 0.3064


100%|████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:10<00:00,  1.15s/it]


Epoch 4/10
Train Loss: 1.6035 | Val Acc: 0.4167 | Val IoU: 0.3070


100%|████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:10<00:00,  1.17s/it]


Epoch 5/10
Train Loss: 1.4615 | Val Acc: 0.4375 | Val IoU: 0.2946


100%|████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:10<00:00,  1.17s/it]


Epoch 6/10
Train Loss: 1.2938 | Val Acc: 0.4792 | Val IoU: 0.2971


100%|████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:10<00:00,  1.22s/it]


Epoch 7/10
Train Loss: 1.0738 | Val Acc: 0.6042 | Val IoU: 0.3168


100%|████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:10<00:00,  1.15s/it]


Epoch 8/10
Train Loss: 0.9236 | Val Acc: 0.6875 | Val IoU: 0.3338


100%|████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:10<00:00,  1.17s/it]


Epoch 9/10
Train Loss: 0.7726 | Val Acc: 0.6875 | Val IoU: 0.3329


100%|████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:10<00:00,  1.15s/it]


Epoch 10/10
Train Loss: 0.6786 | Val Acc: 0.6875 | Val IoU: 0.3327

Training with efficientnet and augmentation strategy:Compose([
  CoarseDropout(p=0.5, fill=0.0, fill_mask=None, num_holes_range=(1, 2), hole_height_range=(0.1, 0.2), hole_width_range=(0.1, 0.2)),
  GaussNoise(p=0.3, std_range=(0.2, 0.44), mean_range=(0.0, 0.0), per_channel=True, noise_scale_factor=1.0),
  HueSaturationValue(p=0.3, hue_shift_limit=(-20, 20), sat_shift_limit=(-30, 30), val_shift_limit=(-20, 20)),
  RandomShadow(p=0.2, shadow_roi=(0.0, 0.5, 1.0, 1.0), num_shadows_limit=(1, 2), shadow_dimension=5),
], p=1.0, bbox_params={'format': 'albumentations', 'label_fields': ['class_labels'], 'min_area': 0.01, 'min_visibility': 0.0, 'min_width': 0.0, 'min_height': 0.0, 'check_each_transform': True, 'clip': False, 'max_accept_ratio': None}, keypoint_params=None, additional_targets={}, is_check_shapes=True) 
Loaded pretrained weights for efficientnet-b0


100%|████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:10<00:00,  1.16s/it]


Epoch 1/10
Train Loss: 1.8175 | Val Acc: 0.2083 | Val IoU: 0.0537


100%|████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:10<00:00,  1.17s/it]


Epoch 2/10
Train Loss: 1.7699 | Val Acc: 0.2500 | Val IoU: 0.1993


100%|████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:10<00:00,  1.16s/it]


Epoch 3/10
Train Loss: 1.7264 | Val Acc: 0.3125 | Val IoU: 0.2915


100%|████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:10<00:00,  1.15s/it]


Epoch 4/10
Train Loss: 1.6786 | Val Acc: 0.3542 | Val IoU: 0.2947


100%|████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:10<00:00,  1.15s/it]


Epoch 5/10
Train Loss: 1.6093 | Val Acc: 0.5000 | Val IoU: 0.2861


100%|████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:11<00:00,  1.22s/it]


Epoch 6/10
Train Loss: 1.4751 | Val Acc: 0.6042 | Val IoU: 0.2842


100%|████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:10<00:00,  1.17s/it]


Epoch 7/10
Train Loss: 1.3604 | Val Acc: 0.5833 | Val IoU: 0.3079


100%|████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:10<00:00,  1.16s/it]


Epoch 8/10
Train Loss: 1.2322 | Val Acc: 0.6250 | Val IoU: 0.3165


100%|████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:10<00:00,  1.16s/it]


Epoch 9/10
Train Loss: 1.1101 | Val Acc: 0.6250 | Val IoU: 0.3217


100%|████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:10<00:00,  1.16s/it]


Epoch 10/10
Train Loss: 0.9949 | Val Acc: 0.6042 | Val IoU: 0.3239


In [None]:
import torch
print(torch.__version__)  # Debe ser ≥2.0.0
print(torch.cuda.is_available())  # Debe ser True
print(torch.cuda.get_device_name(0))  # Debe mostrar "NVIDIA GeForce MX110"

2.5.1+cu121
True
NVIDIA GeForce MX110



## Generación de Submission

In [None]:
def create_submission(model_path, backbone_type):
    # Verificar existencia del modelo
    if not os.path.exists(model_path):
        print(f"Modelo {model_path} no encontrado. Entrenando primero...")
        model = train_model(backbone_type, aug_strategy1)  # Usar estrategia default
        torch.save(model.state_dict(), model_path)

    # Cargar modelo
    model = MultiTaskModel(backbone_type=backbone_type)

    try:
        model.load_state_dict(torch.load(model_path, map_location=device))
    except Exception as e:
        print(f"Error cargando {model_path}: {str(e)}")
        return None

    model.to(device).eval()

    # Resto del código original...
    test_df = pd.read_csv(osp.join(DATA_DIR, "test.csv"))
    test_dataset = MilitaryDataset(
        test_df,
        osp.join(DATA_DIR, "images/images"),
        output_size=IMG_SIZE
    )
    test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

    all_preds = []
    global_idx = 0  # Contador de índices globales
    with torch.no_grad():
        for batch in test_loader:
            images = batch['image'].to(device)
            outputs = model(images)

            pred_classes = outputs['class_id'].argmax(dim=1).cpu().numpy()
            pred_boxes = outputs['bbox'].cpu().numpy()

            # Obtener nombres de archivo usando el contador global
            for cls, box in zip(pred_classes, pred_boxes):
                filename = test_df.iloc[global_idx]['filename']
                all_preds.append({
                    'filename': filename,
                    'class': id2obj[cls],
                    'xmin': max(0, int(box[0] * w_real)),
                    'ymin': max(0, int(box[1] * h_real)),
                    'xmax': min(w_real, int(box[2] * w_real)),
                    'ymax': min(h_real, int(box[3] * h_real))
                })
                global_idx += 1  # Incrementar índice global

    submission_df = pd.DataFrame(all_preds)
    submission_df = submission_df[['filename', 'class', 'xmin', 'ymin', 'xmax', 'ymax']]

    # Validar formato final
    assert set(submission_df.columns) == {'filename', 'class', 'xmin', 'ymin', 'xmax', 'ymax'}, \
        "Columnas incorrectas en el submission"

    submission_df.to_csv(f'submission_{backbone_type}_v3.csv', index=False)
    print(f"Submission generado para {backbone_type} en submission_{backbone_type}_v3.csv")
    return submission_df

# Asegurar que los modelos existen antes de generar submissions
backbones = ['custom', 'resnet', 'efficientnet']
for backbone in backbones:
    model_path = f"best_model_{backbone}.pth"
    if not os.path.exists(model_path):
        print(f"Entrenando modelo {backbone}...")
        model = train_model(backbone, aug_strategy1, num_epochs=15)
        torch.save(model.state_dict(), model_path)

    create_submission(model_path, backbone)

  model.load_state_dict(torch.load(model_path, map_location=device))


Submission generado para custom en submission_custom_v3.csv


  model.load_state_dict(torch.load(model_path, map_location=device))


Submission generado para resnet en submission_resnet_v3.csv
Loaded pretrained weights for efficientnet-b0


  model.load_state_dict(torch.load(model_path, map_location=device))


Submission generado para efficientnet en submission_efficientnet_v3.csv


In [None]:
test_df = pd.read_csv(osp.join(DATA_DIR, "test.csv"))


In [None]:
test_df = pd.read_csv(osp.join(DATA_DIR, "test.csv"))
test_dataset = MilitaryDataset(
        test_df,
        osp.join(DATA_DIR, "images/images"),
        output_size=IMG_SIZE
    )
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader

<torch.utils.data.dataloader.DataLoader at 0x211eab82f00>

In [None]:
test_loader

<torch.utils.data.dataloader.DataLoader at 0x211eab82f00>

In [None]:
# prompt: crea un codigo para descargar el csv test_df

import pandas as pd
import os

DATA_DIR = aa_iv_2025_i_object_localization_path  # Assuming this variable is defined from previous code

test_df = pd.read_csv(os.path.join(DATA_DIR, "test.csv"))
test_df.to_csv('test_df.csv', index=False)
