## ResNet

ResNet y VGG comparten la idea de bloques repetitivos, pero difieren en lo que hacen dentro del bloque y en cómo conectan los bloques entre sí.

ResNet se basa en "bloques residuales".
Cada bloque tiene un atajo (“skip connection”) que suma la entrada original del bloque (x) con la salida procesada (F(x)) para tratar de solucionar el problema de desvanecimiento del gradiente.

De esta forma, el gradiente puede fluir hacia atrás sin perderse en redes mucho más profundas.

In [11]:
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
import torch
from datetime import datetime
from torchmetrics.classification import MulticlassPrecision, MulticlassRecall, MulticlassF1Score

# TensorBoard run directory; the timestamp suffix gives every execution its own folder.
# NOTE(review): the "step20_lr0015" tag does not obviously match the optimizer and
# scheduler settings configured later in this notebook - confirm the name is still accurate.
TENSORBOARD_EXP = f"runs/cifar10_cnn_step20_lr0015_{datetime.now().strftime('%Y%m%d-%H%M%S')}"


# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

# Seed the global PyTorch RNG.
torch.manual_seed(3)

# Only convert images to tensors here; normalization is added later, once the
# per-channel statistics computed below are known.
transform = transforms.ToTensor()

train_full = datasets.CIFAR10("./data", train=True, download=True, transform=transform)


# Load the complete training split as one single batch
loader_train = DataLoader(train_full, batch_size=len(train_full), shuffle=False, drop_last=True)
# Fetch that single batch
imgs, _ = next(iter(loader_train))
print(imgs.shape) # [50000,3,32,32]

# Per-channel mean / std: reduce over every dimension except dim 1 (the 3 channels).
CIFAR10_MEAN = imgs.mean(dim=(0,2,3))
CIFAR10_STD  = imgs.std(dim=(0,2,3))
print(CIFAR10_MEAN, CIFAR10_STD)


cuda
torch.Size([50000, 3, 32, 32])
tensor([0.4914, 0.4822, 0.4465]) tensor([0.2470, 0.2435, 0.2616])


In [12]:

# Precomputed statistics (the values printed by the previous cell), kept for reference:
#CIFAR10_MEAN = torch.tensor([0.4914, 0.4822, 0.4465])
#CIFAR10_STD  = torch.tensor([0.2470, 0.2435, 0.2616])



# Training transform: data augmentation followed by normalization

train_tf = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(CIFAR10_MEAN.tolist(), CIFAR10_STD.tolist())
    
])

# Transform without augmentation (normalization only), used for the clean copy
# of the training data and for evaluation
no_aug = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(CIFAR10_MEAN.tolist(), CIFAR10_STD.tolist())
])

# Re-create the datasets with the new transforms, plus a second copy of the
# training data that skips augmentation (used for validation)
train_full_aug = datasets.CIFAR10(root="./data", train=True, download=True, transform=train_tf)

train_full_no_aug = datasets.CIFAR10(root="./data", train=True, download=True, transform=no_aug)

test_set = datasets.CIFAR10(root="./data", train=False, download=True, transform=no_aug)


# Check that the data really is normalized
check_loader_train_full = DataLoader(train_full_no_aug, batch_size=len(train_full_no_aug), shuffle=False) # whole dataset in one batch, to measure mean/std of the already-normalized data
x, _ = next(iter(check_loader_train_full))
mean_check = x.mean(dim=(0, 2, 3))
std_check  = x.std(dim=(0, 2, 3))
print("Mean Appx: 0):", mean_check)
print("Std Appx: 1):", std_check)
# Per-channel mean must be close to 0 and std close to 1 after normalization.
assert mean_check.abs().max() < 0.05
assert (std_check - 1).abs().max() < 0.05



# Split the non-augmented dataset into train and validation
train_set, val_set = torch.utils.data.random_split(generator=torch.Generator().manual_seed(3), dataset=train_full_no_aug, lengths=[40000, 10000])

# Split the augmented dataset with an identically seeded generator
train_set_aug, val_set_aug = torch.utils.data.random_split(generator=torch.Generator().manual_seed(3), dataset=train_full_aug, lengths=[40000, 10000])

# Check the split: sizes, and that both splits picked exactly the same indices
# (otherwise validation samples could leak into the augmented training subset).
print(len(train_set), len(val_set))
assert train_set.indices == train_set_aug.indices
assert val_set.indices   == val_set_aug.indices


# Loaders for training, validation and testing; all of them yield normalized data.

# Loader settings below are meant to improve GPU utilisation

NUM_WORKERS = 4

# Training uses the augmented subset; validation and test use the non-augmented transform.
loader_train = DataLoader(train_set_aug, batch_size=128, shuffle=True, num_workers=NUM_WORKERS, pin_memory=True, persistent_workers=True, prefetch_factor=2) # shuffled because this is the training loader; shuffling helps generalization
loader_val = DataLoader(val_set, batch_size=256, shuffle=False, num_workers=NUM_WORKERS, pin_memory=True, persistent_workers=True, prefetch_factor=2)
loader_test = DataLoader(test_set, batch_size=256, shuffle=False, num_workers=NUM_WORKERS, pin_memory=True, persistent_workers=True, prefetch_factor=2)








Mean Appx: 0): tensor([-4.5197e-06, -2.3309e-06, -4.8446e-08])
Std Appx: 1): tensor([1.0000, 1.0000, 1.0000])
40000 10000


Vamos a implementar una clase con un bloque residual simple.

In [13]:
# Residual Block
# ===========================================================
# Main path F(x): Conv3x3(stride s) -> BN -> ReLU -> Conv3x3(stride 1) -> BN
# Skip path:      Identity, or Conv1x1(stride s) -> BN when the shape changes
# Block output:   ReLU(F(x) + Skip(x))
# ===========================================================
import torch
import torch.nn as nn

class SimpleResidualBlock(nn.Module):
    """Two-convolution residual block computing ``ReLU(F(x) + shortcut(x))``.

    ``F`` is Conv3x3 -> BN -> ReLU -> Conv3x3 -> BN. The shortcut is the
    identity when the block keeps both the stride and the channel count, and
    a Conv1x1 -> BN projection otherwise.

    Args:
        in_ch: number of input channels.
        out_ch: number of output channels.
        stride: stride of the first 3x3 convolution and, when a projection
            shortcut is needed, of its 1x1 convolution.
    """

    def __init__(self, in_ch, out_ch, stride=1):
        super().__init__()

        # Main path F(x). Only the first convolution may change the
        # resolution / channel count; the second one keeps both.
        conv_in = nn.Conv2d(in_ch, out_ch, kernel_size=3, stride=stride, padding=1, bias=False)
        norm_in = nn.BatchNorm2d(out_ch)
        act_mid = nn.ReLU(inplace=True)
        conv_out = nn.Conv2d(out_ch, out_ch, kernel_size=3, stride=1, padding=1, bias=False)
        norm_out = nn.BatchNorm2d(out_ch)
        self.main = nn.Sequential(conv_in, norm_in, act_mid, conv_out, norm_out)

        # Shortcut path. The input can be added as-is only if the main path
        # leaves the tensor shape untouched; otherwise project it with a 1x1
        # convolution so that both operands of the sum match.
        shape_preserved = stride == 1 and in_ch == out_ch
        if shape_preserved:
            self.skip = nn.Identity()
        else:
            self.skip = nn.Sequential(
                nn.Conv2d(in_ch, out_ch, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_ch),
            )

        # Activation applied after the residual sum.
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        """Return ``ReLU(main(x) + skip(x))``."""
        residual = self.main(x)
        shortcut = self.skip(x)
        return self.relu(residual + shortcut)


# ===========================================================
# Helper para crear una capa con varios bloques seguidos
# ===========================================================
def _make_layer(block, in_ch, out_ch, num_blocks, first_stride):
    """Chain several blocks of the same type into one ``nn.Sequential`` stage.

    Args:
        block: callable invoked as ``block(in_channels, out_channels, stride=...)``
            that returns a module.
        in_ch: channels entering the stage.
        out_ch: channels produced by every block of the stage.
        num_blocks: number of blocks in the stage; the leading block is
            always created, even for values below 1.
        first_stride: stride handed to the leading block.

    Returns:
        ``nn.Sequential`` holding the blocks in order.
    """
    # Leading block: the only one that may change resolution and/or channels.
    stage = [block(in_ch, out_ch, stride=first_stride)]
    # Remaining blocks: stride 1 and a constant channel count.
    stage.extend(block(out_ch, out_ch, stride=1) for _ in range(num_blocks - 1))
    return nn.Sequential(*stage)


Ahora implementaremos el resto de la red e incluiremos los bloques residuales.

In [14]:
class ResNet20(nn.Module):
    """ResNet-20 classifier for CIFAR-10-sized inputs.

    Layout: one 3x3 convolutional stem, three stages of three residual blocks
    each (16, 32 and 64 channels), global average pooling and a linear
    classifier.

    Args:
        num_classes: number of output logits.
    """

    def __init__(self, num_classes=10):
        super().__init__()

        # Stem: stride 1, so the input resolution is kept.
        self.conv1 = nn.Sequential(
            nn.Conv2d(3, 16, kernel_size=3, stride=1, padding=1, bias=False),
            nn.BatchNorm2d(16),
            nn.ReLU(inplace=True),
        )

        # Residual stages as (in_channels, out_channels, stride of first block):
        #   layer1: 16 channels, 32x32 -> 32x32
        #   layer2: 32 channels, 32x32 -> 16x16
        #   layer3: 64 channels, 16x16 -> 8x8
        stage_plan = ((16, 16, 1), (16, 32, 2), (32, 64, 2))
        for idx, (c_in, c_out, stride) in enumerate(stage_plan, start=1):
            stage = _make_layer(SimpleResidualBlock, c_in, c_out, num_blocks=3, first_stride=stride)
            setattr(self, f"layer{idx}", stage)

        # Classification head.
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(64, num_classes)

    def forward(self, x):
        """Map an image batch to class logits.

        Shape comments assume 32x32 inputs.
        """
        feats = self.conv1(x)                        # [B, 16, 32, 32]
        for stage in (self.layer1, self.layer2, self.layer3):
            feats = stage(feats)                     # ends at [B, 64, 8, 8]
        pooled = self.avgpool(feats)                 # [B, 64, 1, 1]
        flat = pooled.view(pooled.size(0), -1)       # [B, 64]
        return self.fc(flat)                         # [B, num_classes]

In [15]:
# GPU performance settings: cuDNN autotuning and TF32 matmul/convolution kernels.
torch.backends.cudnn.benchmark = True
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
torch.set_float32_matmul_precision("high")
from torchsummary import summary
# Instantiate the model and run a quick sanity check.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = ResNet20().to(device)

# One forward pass on random data to confirm the output shape.
# NOTE(review): the model is still in its default training mode here, so this
# pass (and the one run by summary() below) presumably nudges the BatchNorm
# running statistics with random noise before training starts - confirm that
# this is acceptable.
with torch.no_grad():
    xb = torch.randn(4, 3, 32, 32, device=device)
    yb = model(xb)
print("Input:", xb.shape, "| Output:", yb.shape)
print("Params totales:", sum(p.numel() for p in model.parameters()))

# torchsummary.summary() prints the layer table itself and returns None, so it
# is called directly; wrapping it in print() only appended a stray "None".
summary(model, (3, 32, 32))

Input: torch.Size([4, 3, 32, 32]) | Output: torch.Size([4, 10])
Params totales: 272474
----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1           [-1, 16, 32, 32]             432
       BatchNorm2d-2           [-1, 16, 32, 32]              32
              ReLU-3           [-1, 16, 32, 32]               0
            Conv2d-4           [-1, 16, 32, 32]           2,304
       BatchNorm2d-5           [-1, 16, 32, 32]              32
              ReLU-6           [-1, 16, 32, 32]               0
            Conv2d-7           [-1, 16, 32, 32]           2,304
       BatchNorm2d-8           [-1, 16, 32, 32]              32
          Identity-9           [-1, 16, 32, 32]               0
             ReLU-10           [-1, 16, 32, 32]               0
SimpleResidualBlock-11           [-1, 16, 32, 32]               0
           Conv2d-12           [-1, 16, 32, 32]           2,304
      BatchNor

In [16]:
# TorchMetrics
NUM_CLASSES = 10  # CIFAR-10

# Macro-averaged precision / recall / F1, placed on the same device as the model.
precision_metric = MulticlassPrecision(num_classes=NUM_CLASSES, average="macro").to(device)
recall_metric    = MulticlassRecall(num_classes=NUM_CLASSES, average="macro").to(device)
f1_metric        = MulticlassF1Score(num_classes=NUM_CLASSES, average="macro").to(device)


# Optimisation hyperparameters.
EPOCHS = 100  
lr = 0.1
momentum = 0.9

# Cross-entropy with label smoothing; the same criterion is passed to evaluate()
# for validation and test, so those losses include the smoothing term as well.
criterion = nn.CrossEntropyLoss(label_smoothing=0.1)
weight_decay=1e-4

optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=momentum, weight_decay=weight_decay)


from torch.optim.lr_scheduler import ReduceLROnPlateau

# mode='min': the monitored quantity is expected to decrease (the training
# cell steps this scheduler with the validation loss).
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=8, threshold=1e-3)
# Learning-rate history; re-initialised by the training cell.
lrs = []

# Last expression of the cell: switches to training mode and echoes the model structure.
model.train()    



ResNet20(
  (conv1): Sequential(
    (0): Conv2d(3, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (1): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
  )
  (layer1): Sequential(
    (0): SimpleResidualBlock(
      (main): Sequential(
        (0): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (1): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): ReLU(inplace=True)
        (3): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (4): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (skip): Identity()
      (relu): ReLU(inplace=True)
    )
    (1): SimpleResidualBlock(
      (main): Sequential(
        (0): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (1): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=

In [17]:
from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter(log_dir=TENSORBOARD_EXP)  # TensorBoard writer for this run's log directory

def evaluate(model, loader, device, criterion):
    """Measure the average loss and the accuracy of ``model`` over ``loader``.

    The model is switched to eval mode and is left in that mode on return.
    Gradients are disabled for the whole pass.

    Args:
        model: module called as ``model(images)``, returning per-class scores.
        loader: iterable yielding ``(images, labels)`` batches.
        device: device each batch is moved to before the forward pass.
        criterion: callable ``criterion(logits, labels)`` returning a scalar
            tensor; its value is weighted by the batch size when averaging.

    Returns:
        ``(mean_loss, accuracy)`` as Python floats.

    Raises:
        ZeroDivisionError: if ``loader`` yields no samples.
    """
    model.eval()
    loss_sum = 0.0
    n_correct = 0
    n_seen = 0
    with torch.no_grad():
        for inputs, targets in loader:
            inputs = inputs.to(device)
            targets = targets.to(device)
            outputs = model(inputs)
            # Weight the batch loss by the batch size so that a smaller last
            # batch does not skew the average.
            batch_loss = criterion(outputs, targets).item()
            loss_sum += batch_loss * inputs.size(0)
            hits = outputs.argmax(dim=1) == targets
            n_correct += hits.sum().item()
            n_seen += targets.size(0)
    mean_loss = loss_sum / n_seen
    accuracy = n_correct / n_seen
    return mean_loss, accuracy


# Per-epoch history, reused by the final hyperparameter summary.
train_losses, val_losses = [], []
train_accuracies, val_accuracies = [], []
lrs = []

for epoch in range(1, EPOCHS + 1):
    # ---------------- Training pass ----------------
    model.train()
    running_loss, correct, total = 0.0, 0, 0

    for images, labels in loader_train:
        images, labels = images.to(device), labels.to(device)

        optimizer.zero_grad()
        logits = model(images)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()

        # Weight the batch loss by the batch size so the epoch average is per sample.
        running_loss += loss.item() * images.size(0)
        preds = logits.argmax(dim=1)
        correct += (preds == labels).sum().item()
        total += labels.size(0)

    train_loss = running_loss / total
    train_acc = correct / total

    # ---------------- Validation: loss and accuracy ----------------
    val_loss, val_acc = evaluate(model, loader_val, device, criterion)

    # ---------------- Validation: macro precision / recall / F1 ----------------
    # NOTE(review): this is a second full pass over loader_val; the metrics
    # could be accumulated during the pass above to halve validation cost.
    precision_metric.reset()
    recall_metric.reset()
    f1_metric.reset()

    # Set eval mode explicitly instead of relying on evaluate() having left
    # the model in it; in training mode this pass would update the BatchNorm
    # running statistics with validation data.
    model.eval()
    with torch.no_grad():
        for images, labels in loader_val:
            images, labels = images.to(device), labels.to(device)
            preds = model(images).argmax(dim=1)
            precision_metric.update(preds, labels)
            recall_metric.update(preds, labels)
            f1_metric.update(preds, labels)

    precision_val = precision_metric.compute().item()
    recall_val    = recall_metric.compute().item()
    f1_val        = f1_metric.compute().item()

    writer.add_scalar("Val/Precision_macro", precision_val, epoch)
    writer.add_scalar("Val/Recall_macro",    recall_val,  epoch)
    writer.add_scalar("Val/F1_macro",        f1_val,      epoch)

    train_losses.append(train_loss)
    val_losses.append(val_loss)
    train_accuracies.append(train_acc)
    val_accuracies.append(val_acc)

    print(
        f"Época {epoch}/{EPOCHS} | "
        f"Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f} | "
        f"Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.4f}"
    )

    # Learning rate used during this epoch. It is read BEFORE scheduler.step()
    # so that the `lrs` history and the TensorBoard "LR" curve agree; reading
    # it after the step would log the next epoch's rate under this epoch.
    epoch_lr = optimizer.param_groups[0]['lr']
    lrs.append(epoch_lr)
    scheduler.step(val_loss)

    # Write this epoch's results to TensorBoard.
    writer.add_scalar("Loss/train", train_loss, epoch)
    writer.add_scalar("Loss/val",  val_loss,  epoch)
    writer.add_scalar("Acc/train",  train_acc,  epoch)
    writer.add_scalar("Acc/val",   val_acc,   epoch)
    writer.add_scalar("LR", epoch_lr, epoch)
    writer.flush()  # force an immediate write so the curves can be followed live

# Evaluate the trained model on the test set and record the outcome, together
# with the run's hyperparameters, in TensorBoard.
test_loss, test_acc = evaluate(model, loader_test, device, criterion)

# Hyperparameters describing this run.
hparams = {
    'model': type(model).__name__,
    'seed': 3,
    'optimizer': type(optimizer).__name__,
    'lr_init': float(lr),
    'momentum': float(momentum),
    'batch_size': int(loader_train.batch_size),
    'weight_decay': float(optimizer.param_groups[0].get('weight_decay', 0.0)),
    'scheduler': type(scheduler).__name__,
}

# Test results plus the last-epoch train / validation numbers.
metrics = {
    'metrics/test_acc': float(test_acc),
    'metrics/test_loss': float(test_loss),
    'metrics/val_acc_last': float(val_accuracies[-1]),
    'metrics/val_loss_last': float(val_losses[-1]),
    'metrics/train_acc_last': float(train_accuracies[-1]),
    'metrics/train_loss_last': float(train_losses[-1]),
}

writer.add_hparams(hparams, metrics)
writer.close()



Época 1/100 | Train Loss: 1.8139 | Train Acc: 0.3755 | Val Loss: 2.0461 | Val Acc: 0.4072
Época 2/100 | Train Loss: 1.4653 | Train Acc: 0.5555 | Val Loss: 1.5101 | Val Acc: 0.5551
Época 3/100 | Train Loss: 1.2876 | Train Acc: 0.6453 | Val Loss: 1.5015 | Val Acc: 0.5788
Época 4/100 | Train Loss: 1.1723 | Train Acc: 0.7011 | Val Loss: 1.2089 | Val Acc: 0.6969
Época 5/100 | Train Loss: 1.0947 | Train Acc: 0.7411 | Val Loss: 1.0979 | Val Acc: 0.7458
Época 6/100 | Train Loss: 1.0403 | Train Acc: 0.7669 | Val Loss: 1.1124 | Val Acc: 0.7444
Época 7/100 | Train Loss: 0.9959 | Train Acc: 0.7880 | Val Loss: 1.2837 | Val Acc: 0.6897
Época 8/100 | Train Loss: 0.9663 | Train Acc: 0.7998 | Val Loss: 1.0205 | Val Acc: 0.7845
Época 9/100 | Train Loss: 0.9379 | Train Acc: 0.8124 | Val Loss: 1.0032 | Val Acc: 0.7877
Época 10/100 | Train Loss: 0.9218 | Train Acc: 0.8189 | Val Loss: 1.0750 | Val Acc: 0.7526
Época 11/100 | Train Loss: 0.9002 | Train Acc: 0.8294 | Val Loss: 0.9739 | Val Acc: 0.8016
Época 12