## Parameter Tuning

Vamos a mejorar todavía más el entrenamiento del apartado anterior, hasta conseguir una convergencia que nos sirva.

Aquí experimentaremos:

- Subiendo EPOCHS

- Aumentando el tamaño de LR

- Tocando step_size y gamma de StepLR

- Manteniendo Data Augmentation


Además, introducimos aquí TensorBoard para visualizar el LR y otros escalares, y lo prepararemos todo correctamente para hacer el test final.


Comenzamos igual que en la sección anterior; lo que cambiamos aquí serán los parámetros.

In [15]:
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
import torch
from datetime import datetime
from torchmetrics.classification import MulticlassPrecision, MulticlassRecall, MulticlassF1Score

# Log directory for this run; the timestamp suffix gives each execution its own directory.
# NOTE(review): the name says "step20_lr0015", but the training cell below uses
# ReduceLROnPlateau with lr=0.05 -- confirm whether the run name should be updated.
TENSORBOARD_EXP = f"runs/cifar10_cnn_step20_lr0015_{datetime.now().strftime('%Y%m%d-%H%M%S')}"


# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

# Seed PyTorch's global RNG.
torch.manual_seed(3)

# Plain tensor conversion only (no normalization yet): used to measure the raw statistics.
transform = transforms.ToTensor()

train_full = datasets.CIFAR10("./data", train=True, download=True, transform=transform)


# Load the complete training set as one single batch (batch_size == dataset size).
loader_train = DataLoader(train_full, batch_size=len(train_full), shuffle=False)
# Fetch that single batch; the labels are not needed here.
imgs, _ = next(iter(loader_train))
# Cast to float64 before reducing -- presumably to limit accumulation error over 50k images.
imgs = imgs.to(torch.float64)
print(imgs.shape) # [50000,3,32,32]

# Reduce over batch, height and width (dims 0, 2, 3): one mean/std value per channel.
CIFAR10_MEAN = imgs.mean(dim=(0,2,3))
CIFAR10_STD  = imgs.std(dim=(0,2,3))
print(CIFAR10_MEAN, CIFAR10_STD)


cuda
torch.Size([50000, 3, 32, 32])
tensor([0.4914, 0.4822, 0.4465], dtype=torch.float64) tensor([0.2470, 0.2435, 0.2616], dtype=torch.float64)


In [16]:

# Reference values printed by the previous cell (kept as a fallback, not used):
#CIFAR10_MEAN = torch.tensor([0.4914, 0.4822, 0.4465])
#CIFAR10_STD  = torch.tensor([0.2470, 0.2435, 0.2616])



# Transform for the training split: data augmentation followed by normalization.

train_tf = transforms.Compose([
    # Random 32x32 crop taken after padding the image by 4 pixels.
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    # Normalize each channel with the statistics computed from the training set.
    transforms.Normalize(CIFAR10_MEAN.tolist(), CIFAR10_STD.tolist())

])

# Transform without augmentation (normalization only), used for validation and test.
no_aug = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(CIFAR10_MEAN.tolist(), CIFAR10_STD.tolist())
])

# Re-create the datasets with the new transforms: an augmented view for training and a
# non-augmented view of the same training data from which the validation split is taken.
train_full_aug = datasets.CIFAR10(root="./data", train=True, download=True, transform=train_tf)

train_full_no_aug = datasets.CIFAR10(root="./data", train=True, download=True, transform=no_aug)

test_set = datasets.CIFAR10(root="./data", train=False, download=True, transform=no_aug)


# Sanity check: the normalized data should have per-channel mean ~0 and std ~1.
check_loader_train_full = DataLoader(train_full_no_aug, batch_size=len(train_full_no_aug), shuffle=False) # whole dataset in one batch, to measure mean/std of the already-normalized data
x, _ = next(iter(check_loader_train_full))
mean_check = x.mean(dim=(0, 2, 3))
std_check  = x.std(dim=(0, 2, 3))
print("Mean Appx: 0):", mean_check)
print("Std Appx: 1):", std_check)
assert mean_check.abs().max() < 0.05
assert (std_check - 1).abs().max() < 0.05



# Split the non-augmented dataset into train (40000) and validation (10000).
train_set, val_set = torch.utils.data.random_split(generator=torch.Generator().manual_seed(3), dataset=train_full_no_aug, lengths=[40000, 10000])

# Split the augmented dataset with an identically seeded generator so both splits
# pick the same indices (verified by the asserts below).
train_set_aug, val_set_aug = torch.utils.data.random_split(generator=torch.Generator().manual_seed(3), dataset=train_full_aug, lengths=[40000, 10000])

# Check the split sizes and that both splits share exactly the same indices, so the
# augmented training subset never overlaps the non-augmented validation subset.
print(len(train_set), len(val_set))
assert train_set.indices == train_set_aug.indices
assert val_set.indices   == val_set_aug.indices


# Loaders for training, validation and test; all of them yield normalized tensors.

# Loader settings intended to keep the GPU fed (worker processes + pinned host memory).

NUM_WORKERS = 4

# Training uses the augmented subset; validation and test use non-augmented data.
loader_train = DataLoader(train_set_aug, batch_size=128, shuffle=True, num_workers=NUM_WORKERS, pin_memory=True, persistent_workers=True, prefetch_factor=2) # shuffled because this is the training loader; helps generalization
loader_val = DataLoader(val_set, batch_size=256, shuffle=False, num_workers=NUM_WORKERS, pin_memory=True, persistent_workers=True, prefetch_factor=2)
loader_test = DataLoader(test_set, batch_size=256, shuffle=False, num_workers=NUM_WORKERS, pin_memory=True, persistent_workers=True, prefetch_factor=2)








Mean Appx: 0): tensor([ 6.3542e-08, -1.9292e-08, -4.8446e-08])
Std Appx: 1): tensor([1.0000, 1.0000, 1.0000])
40000 10000


Vamos ahora a coger el primer batch

In [17]:
# Pull the first batch from the training loader and print the image/label tensor shapes.
images, labels = next(iter(loader_train))
print(images.shape, labels.shape)

torch.Size([128, 3, 32, 32]) torch.Size([128])


# Declaración de arquitectura

In [None]:

import torch.nn as nn

def get_flatten_size(model_features, input_shape=(1, 3, 32, 32)):
    """Return the number of features per sample produced by ``model_features``.

    A dummy all-zeros batch of shape ``input_shape`` is pushed through
    ``model_features`` and every dimension after the batch one is flattened.

    Args:
        model_features: An ``nn.Module`` (or any callable taking a tensor) whose
            output size is being probed.
        input_shape: Shape of the dummy input, batch dimension included.

    Returns:
        int: Flattened feature count per sample, i.e. the ``in_features`` a
        following ``nn.Linear`` would need.
    """
    is_module = isinstance(model_features, torch.nn.Module)

    # Create the dummy input where the module's weights live (and with their
    # dtype); a CPU tensor would fail once the module has been moved to the GPU.
    first_param = next(model_features.parameters(), None) if is_module else None
    if first_param is not None:
        x = torch.zeros(input_shape, device=first_param.device, dtype=first_param.dtype)
    else:
        x = torch.zeros(input_shape)

    # Probe in eval mode so BatchNorm running statistics are not updated with
    # the all-zeros batch; every submodule's own mode is restored afterwards.
    saved_modes = [(m, m.training) for m in model_features.modules()] if is_module else []
    if is_module:
        model_features.eval()
    try:
        with torch.no_grad():
            out = model_features(x)
    finally:
        for module, mode in saved_modes:
            module.training = mode

    # flatten() also handles non-contiguous outputs, where view() would raise.
    return out.flatten(1).size(1)

class CIFAR10CNN(nn.Module):
    """Four-stage convolutional classifier for 3-channel images.

    Every stage is ``Conv2d(3x3, padding=1) -> BatchNorm2d -> ReLU -> MaxPool2d(2)``,
    with channel widths 64, 128, 256 and 512. A dropout layer sits between the
    third and fourth stages. The resulting feature map is reduced by global
    average pooling and classified by a single linear layer.

    Args:
        num_classes: Number of output logits (defaults to 10, as for CIFAR-10).
    """

    def __init__(self, num_classes=10):
        super().__init__()

        # The convolutions use bias=False because each one is followed
        # directly by a BatchNorm2d layer.
        self.features = nn.Sequential(

            # Stage 1: 3 input channels -> 64 feature maps.
            # 3x3 kernel with padding=1 keeps the spatial size ("same" padding).
            nn.Conv2d(in_channels=3, out_channels=64, kernel_size=3, stride=1, padding=1, bias=False),
            nn.BatchNorm2d(num_features=64),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),  # halves height and width

            # Stage 2: 64 -> 128 feature maps.
            nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, stride=1, padding=1, bias=False),
            nn.BatchNorm2d(num_features=128),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),  # halves height and width

            # Stage 3: 128 -> 256 feature maps, followed by dropout against overfitting.
            nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3, stride=1, padding=1, bias=False),
            nn.BatchNorm2d(num_features=256),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),  # halves height and width
            nn.Dropout(p=0.3),

            # Stage 4: 256 -> 512 feature maps.
            nn.Conv2d(in_channels=256, out_channels=512, kernel_size=3, stride=1, padding=1, bias=False),
            nn.BatchNorm2d(num_features=512),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),  # halves height and width
        )

        # Global average pooling collapses each of the 512 maps to one value,
        # so the classifier input size does not depend on the image resolution.
        self.gap = nn.AdaptiveAvgPool2d(1)
        self.fc = nn.Linear(512, num_classes)

    def forward(self, x):
        """Return class logits of shape ``(batch, num_classes)`` for image batch ``x``."""
        x = self.features(x)
        x = self.gap(x).flatten(1)
        return self.fc(x)





# Model instantiation and backend configuration.
# Let cuDNN benchmark convolution algorithms.
torch.backends.cudnn.benchmark = True
# Request TF32 precision for cuDNN convolutions and CUDA matmuls.
# NOTE(review): the two `fp32_precision` attributes and `set_float32_matmul_precision`
# look like two different APIs for the same TF32 setting -- confirm against the
# installed PyTorch version whether both are needed / may be combined.
torch.backends.cudnn.conv.fp32_precision = 'tf32'
torch.backends.cuda.matmul.fp32_precision = 'tf32'
torch.set_float32_matmul_precision("high")

model = CIFAR10CNN()
model = model.to(device)
# Store the weights in channels-last (NHWC) memory format.
model = model.to(memory_format=torch.channels_last)
print(model)

# Smoke test: a random batch of 4 images should produce logits of shape [4, 10].
# NOTE(review): no eval()/no_grad() here, so this forward pass presumably runs in
# training mode (BatchNorm statistics updated, dropout active) -- confirm this is acceptable.
x = torch.randn(4, 3, 32, 32).to(device)
logits = model(x)
print(logits.shape)



CIFAR10CNN(
  (features): Sequential(
    (0): Conv2d(3, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (4): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (5): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): ReLU()
    (7): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (8): Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (9): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (10): ReLU()
    (11): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (12): Dropout(p=0.3, inplace=False)
    (13): Conv2d(512, 1024, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=F

In [19]:
from torchsummary import summary

# Print a layer-by-layer summary (output shapes and parameter counts) for a 3x32x32 input.
summary(model, (3, 32, 32))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1          [-1, 128, 32, 32]           3,456
       BatchNorm2d-2          [-1, 128, 32, 32]             256
              ReLU-3          [-1, 128, 32, 32]               0
         MaxPool2d-4          [-1, 128, 16, 16]               0
            Conv2d-5          [-1, 256, 16, 16]         294,912
       BatchNorm2d-6          [-1, 256, 16, 16]             512
              ReLU-7          [-1, 256, 16, 16]               0
         MaxPool2d-8            [-1, 256, 8, 8]               0
            Conv2d-9            [-1, 512, 8, 8]       1,179,648
      BatchNorm2d-10            [-1, 512, 8, 8]           1,024
             ReLU-11            [-1, 512, 8, 8]               0
        MaxPool2d-12            [-1, 512, 4, 4]               0
          Dropout-13            [-1, 512, 4, 4]               0
           Conv2d-14           [-1, 102

### Vamos a declarar el bucle de entrenamiento

Vamos a recordar los parámetros típicos de SGD

| Parámetro  | Rol                                                      | Valor típico |
| ---------- | -------------------------------------------------------- | ------------ |
| `lr`       | tamaño del paso (cuánto cambian los pesos por gradiente) | 0.01–0.1     |
| `momentum` | cuánto “recuerda” del gradiente anterior                 | 0.8–0.95     |


In [20]:
# TorchMetrics: macro-averaged precision / recall / F1, kept on the same device as the model.
NUM_CLASSES = 10  # CIFAR-10

precision_metric = MulticlassPrecision(num_classes=NUM_CLASSES, average="macro").to(device)
recall_metric    = MulticlassRecall(num_classes=NUM_CLASSES, average="macro").to(device)
f1_metric        = MulticlassF1Score(num_classes=NUM_CLASSES, average="macro").to(device)

# Training hyperparameters.
EPOCHS = 80  # number of passes over the training set
lr = 0.05
momentum = 0.9
# Loss function and optimizer.
criterion = nn.CrossEntropyLoss()
weight_decay=5e-4
# lr is the learning rate (step size), momentum is the inertia factor (how much of the
# previous update direction is kept), and weight_decay is passed to SGD as its weight-decay term.
optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=momentum, weight_decay=weight_decay)


# Learning-rate scheduler: multiplies the LR by `factor` when the monitored value
# (the training loop passes the validation loss, hence mode='min') stops improving.
from torch.optim.lr_scheduler import ReduceLROnPlateau
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=3, threshold=1e-3)
lrs = []

model.to(device)  # move the model to `device`
model.train()     # switch the model to training mode (enables dropout, etc.)



CIFAR10CNN(
  (features): Sequential(
    (0): Conv2d(3, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (4): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (5): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): ReLU()
    (7): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (8): Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (9): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (10): ReLU()
    (11): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (12): Dropout(p=0.3, inplace=False)
    (13): Conv2d(512, 1024, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=F

In [21]:
from torch.utils.tensorboard import SummaryWriter

# TensorBoard writer for this run; events are written under TENSORBOARD_EXP.
writer = SummaryWriter(log_dir=TENSORBOARD_EXP)  # <-- added

def evaluate(model, loader, device, criterion):
    """Compute the average loss and the accuracy of ``model`` over ``loader``.

    The model is switched to eval mode and is left in eval mode on return, so
    callers must call ``model.train()`` again before resuming training.

    Args:
        model: Network that maps a batch of images to class logits.
        loader: Iterable yielding ``(images, labels)`` batches.
        device: Device the batches are moved to before the forward pass.
        criterion: Loss callable ``criterion(logits, labels)``. Its value is
            weighted by the batch size, which assumes it returns the batch
            mean (as ``nn.CrossEntropyLoss()`` does by default).

    Returns:
        tuple[float, float]: ``(mean loss per sample, accuracy in [0, 1])``.

    Raises:
        ValueError: If ``loader`` yields no samples, since the averages would
            otherwise be an opaque division by zero.
    """
    model.eval()
    total, correct, total_loss = 0, 0, 0.0
    with torch.no_grad():
        for images, labels in loader:
            # non_blocking only has an effect for pinned host memory; it is a
            # plain synchronous copy otherwise.
            images = images.to(device, non_blocking=True)
            labels = labels.to(device, non_blocking=True)
            logits = model(images)
            batch_size = labels.size(0)
            # Weight the batch loss by its size so a smaller final batch does
            # not skew the per-sample average.
            total_loss += criterion(logits, labels).item() * batch_size
            correct += (logits.argmax(dim=1) == labels).sum().item()
            total += batch_size
    if total == 0:
        raise ValueError("evaluate() received a loader that yields no samples")
    return total_loss / total, correct / total


# Per-epoch history, kept in memory for later inspection/plotting.
train_losses, val_losses = [], []
train_accuracies, val_accuracies = [], []
lrs = []

# try/finally guarantees the TensorBoard writer is closed even if training is
# interrupted (e.g. a KeyboardInterrupt raised from the notebook).
try:
    for epoch in range(1, EPOCHS + 1):
        model.train()
        running_loss, correct, total = 0.0, 0, 0

        for images, labels in loader_train:
            # The loaders use pinned memory, so the host-to-device copies can be asynchronous.
            images = images.to(device, non_blocking=True)
            labels = labels.to(device, non_blocking=True)

            optimizer.zero_grad()
            logits = model(images)
            loss = criterion(logits, labels)
            loss.backward()
            optimizer.step()

            # Weight the batch-mean loss by the batch size to get a per-sample average.
            running_loss += loss.item() * images.size(0)
            preds = logits.argmax(dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)

        train_loss = running_loss / total
        train_acc = correct / total
        val_loss, val_acc = evaluate(model, loader_val, device, criterion)

        precision_metric.reset()
        recall_metric.reset()
        f1_metric.reset()

        # Second pass over the validation set to accumulate the macro metrics.
        # eval mode is set explicitly instead of relying on evaluate() having left it on.
        model.eval()
        with torch.no_grad():
            for images, labels in loader_val:
                images = images.to(device, non_blocking=True)
                labels = labels.to(device, non_blocking=True)
                preds = model(images).argmax(dim=1)
                precision_metric.update(preds, labels)
                recall_metric.update(preds, labels)
                f1_metric.update(preds, labels)

        precision_val = precision_metric.compute().item()
        recall_val    = recall_metric.compute().item()
        f1_val        = f1_metric.compute().item()

        writer.add_scalar("Val/Precision_macro", precision_val, epoch)
        writer.add_scalar("Val/Recall_macro",    recall_val,  epoch)
        writer.add_scalar("Val/F1_macro",        f1_val,      epoch)

        train_losses.append(train_loss)
        val_losses.append(val_loss)
        train_accuracies.append(train_acc)
        val_accuracies.append(val_acc)

        print(
            f"Época {epoch}/{EPOCHS} | "
            f"Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f} | "
            f"Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.4f}"
        )

        # Read the LR that was actually applied during this epoch BEFORE stepping the
        # scheduler, so the `lrs` history and the TensorBoard "LR" scalar agree.
        epoch_lr = optimizer.param_groups[0]['lr']
        lrs.append(epoch_lr)
        scheduler.step(val_loss)

        # Write this epoch's results to TensorBoard.
        writer.add_scalar("Loss/train", train_loss, epoch)
        writer.add_scalar("Loss/val",  val_loss,  epoch)
        writer.add_scalar("Acc/train",  train_acc,  epoch)
        writer.add_scalar("Acc/val",   val_acc,   epoch)
        writer.add_scalar("LR", epoch_lr, epoch)
        writer.flush()  # force an immediate write so the curves can be followed live

    # Evaluate the final model on the test set and record the results in TensorBoard.
    test_loss, test_acc = evaluate(model, loader_test, device, criterion)

    hparams = {
        'model': model.__class__.__name__,
        'seed': 3,
        'optimizer': optimizer.__class__.__name__,
        'lr_init': float(lr),
        'momentum': float(momentum),
        'batch_size': int(loader_train.batch_size),
        'weight_decay': float(optimizer.param_groups[0].get('weight_decay', 0.0)),
        'scheduler': type(scheduler).__name__,
    }

    metrics = {
        'metrics/test_acc': float(test_acc),
        'metrics/test_loss': float(test_loss),
        'metrics/val_acc_last': float(val_accuracies[-1]),
        'metrics/val_loss_last': float(val_losses[-1]),
        'metrics/train_acc_last': float(train_accuracies[-1]),
        'metrics/train_loss_last': float(train_losses[-1]),
    }

    writer.add_hparams(hparams, metrics)
finally:
    writer.close()



Época 1/80 | Train Loss: 2.1157 | Train Acc: 0.3396 | Val Loss: 1.5727 | Val Acc: 0.4304
Época 2/80 | Train Loss: 1.3464 | Train Acc: 0.5092 | Val Loss: 1.3082 | Val Acc: 0.5307
Época 3/80 | Train Loss: 1.1218 | Train Acc: 0.5988 | Val Loss: 1.0793 | Val Acc: 0.6228
Época 4/80 | Train Loss: 0.9870 | Train Acc: 0.6480 | Val Loss: 0.9013 | Val Acc: 0.6813
Época 5/80 | Train Loss: 0.8698 | Train Acc: 0.6923 | Val Loss: 0.8446 | Val Acc: 0.7019
Época 6/80 | Train Loss: 0.7905 | Train Acc: 0.7216 | Val Loss: 0.8587 | Val Acc: 0.7101
Época 7/80 | Train Loss: 0.7245 | Train Acc: 0.7447 | Val Loss: 0.8447 | Val Acc: 0.7195
Época 8/80 | Train Loss: 0.6729 | Train Acc: 0.7639 | Val Loss: 0.6567 | Val Acc: 0.7735
Época 9/80 | Train Loss: 0.6344 | Train Acc: 0.7777 | Val Loss: 0.8353 | Val Acc: 0.7258
Época 10/80 | Train Loss: 0.5938 | Train Acc: 0.7941 | Val Loss: 0.6871 | Val Acc: 0.7731
Época 11/80 | Train Loss: 0.5641 | Train Acc: 0.8058 | Val Loss: 0.6754 | Val Acc: 0.7719
Época 12/80 | Train

Exception in thread Thread-16 (_pin_memory_loop):
Traceback (most recent call last):
  File "/usr/lib/python3.12/threading.py", line 1073, in _bootstrap_inner
    self.run()
  File "/root/.pyenv/versions/dlvs/lib/python3.12/site-packages/ipykernel/ipkernel.py", line 788, in run_closure
    _threading_Thread_run(self)
  File "/usr/lib/python3.12/threading.py", line 1010, in run
    self._target(*self._args, **self._kwargs)
  File "/root/.pyenv/versions/dlvs/lib/python3.12/site-packages/torch/utils/data/_utils/pin_memory.py", line 52, in _pin_memory_loop
    do_one_step()
  File "/root/.pyenv/versions/dlvs/lib/python3.12/site-packages/torch/utils/data/_utils/pin_memory.py", line 28, in do_one_step
    r = in_queue.get(timeout=MP_STATUS_CHECK_INTERVAL)
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.12/multiprocessing/queues.py", line 122, in get
    return _ForkingPickler.loads(res)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/.pyenv/versions/dlvs/l

KeyboardInterrupt: 