In [1]:
import torch
import torchvision
import pandas as pd
import torch.nn as nn
from tqdm import tqdm
import multiprocessing
import torch.optim as optim
import torch.nn.functional as  F
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

print("Torch version: ", torch. __version__)

####################################################################
# Set Device
####################################################################

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device: ", device)


####################################################################
# Prepare Data
####################################################################

transform = torchvision.transforms.Compose([
    torchvision.transforms.ToTensor(),
    torchvision.transforms.Normalize((0.1307,), (0.3081,))
])

train_set = torchvision.datasets.MNIST('.data/', train=True, download=True, transform=transform)
#? Considera usar transform=transforms.Compose([ToTensor(), Normalize((0.1307,), (0.3081,))]) para centrar/escala antes del flatten.
#train_loader = torch.utils.data.DataLoader(train_set, batch_size=batch_size, shuffle=True)

test_set = torchvision.datasets.MNIST('.data/', train=False, download=True, transform=transform)
#test_loader = torch.utils.data.DataLoader(test_set, batch_size=batch_size, shuffle=True)

print("Train images: ", train_set)
print("Image: ", train_set[0][0])
print("Label: ", train_set[0][1])
print("Label one hot: ", F.one_hot(torch.tensor(train_set[0][1]), num_classes=10))


####################################################################
# Dataset Class
####################################################################

class MNIST_dataset(Dataset):

    def __init__(self, data, partition = "train"):

        print("\nLoading MNIST ", partition, " Dataset...")
        self.data = data
        self.partition = partition
        print("\tTotal Len.: ", len(self.data), "\n", 50*"-")

    def __len__(self):
        return len(self.data)

    def from_pil_to_tensor(self, image):
        return torchvision.transforms.ToTensor()(image)

    def __getitem__(self, idx):

        # Image
        image_tensor = self.data[idx][0].view(-1)
        # PIL Image to torch tensor
        #! image_tensor = self.from_pil_to_tensor(image)
        # care! net expect a 784 size vector and our dataset
        # provide 1x28x28 (channels, height, width) -> Reshape!
        #! image_tensor = image_tensor.view(-1)
#? Tambien puedes normalizar aqui (image_tensor = (image_tensor - mean) / std) si no usas transforms.

        # Label
        label = torch.tensor(self.data[idx][1])
        # label = F.one_hot(label, num_classes=10).float()
        label = torch.tensor(self.data[idx][1], dtype=torch.long)
#? Alternativa: devuelve label como entero y usa CrossEntropyLoss(label_smoothing=0.1) para regularizar sin one-hot.

        return {"img": image_tensor, "label": label}

train_dataset = MNIST_dataset(train_set, partition="train")
test_dataset = MNIST_dataset(test_set, partition="test")


####################################################################
# DataLoader Class
####################################################################

batch_size = 100
num_workers = 0
print("Num workers", num_workers)
train_dataloader = DataLoader(train_dataset, batch_size, shuffle=True, num_workers=num_workers)
test_dataloader = DataLoader(test_dataset, batch_size, shuffle=False, num_workers=num_workers)
#? Para GPU ayuda pin_memory=True y persistent_workers=True cuando num_workers>0.

####################################################################
# Early stopping Class
####################################################################

import copy
import torch

class EarlyStopping:
    def __init__(self, patience=10, min_delta=0.0, mode="max"):
        """
        patience: nº de epochs sin mejora para parar
        min_delta: mejora mínima para considerar 'mejora real'
        mode: "max" si monitorizas accuracy, "min" si monitorizas loss
        """
        self.patience = patience
        self.min_delta = min_delta
        self.mode = mode
        self.best_score = None
        self.counter = 0
        self.best_state_dict = None
        self.best_epoch = -1

    def step(self, score, model, epoch):
        if self.best_score is None:
            self.best_score = score
            self.best_state_dict = copy.deepcopy(model.state_dict())
            self.best_epoch = epoch
            return False  # no parar

        improved = (score > self.best_score + self.min_delta) if self.mode == "max" \
                   else (score < self.best_score - self.min_delta)

        if improved:
            self.best_score = score
            self.best_state_dict = copy.deepcopy(model.state_dict())
            self.best_epoch = epoch
            self.counter = 0
        else:
            self.counter += 1

        return self.counter >= self.patience  # True => parar


####################################################################
# Neural Network Class
####################################################################

# Creating our Neural Network - Fully Connected
class Net(nn.Module):
    def __init__(self, num_classes):
        super(Net, self).__init__()
        #* AÑADIDO CAPA BATCHNORM1D Y DROPOUT
        self.linear1 = nn.Linear(784, 1024)
        self.relu1 = nn.ReLU()
        self.BatchNorm1d1 = nn.BatchNorm1d(1024)
        self.drop1 = nn.Dropout(0.1)
        self.linear2 = nn.Linear(1024, 512)
        self.BatchNorm1d2 = nn.BatchNorm1d(512)
        self.relu2 = nn.ReLU()
        self.drop2 = nn.Dropout(0.2)
        self.linear3 = nn.Linear(512, 256)
        self.BatchNorm1d3 = nn.BatchNorm1d(256)
        self.relu3 = nn.ReLU()
        self.drop3 = nn.Dropout(0.2)
        self.classifier = nn.Linear(256, num_classes)
#? BatchNorm1d tras cada Linear y Dropout(0.1-0.3) antes de la activacion suelen mejorar la generalizacion.
#? Un MLP mas profundo pero mas estrecho (ej. 784->512->256->128->10) reduce parametros y overfitting sin usar CNN.

    def forward(self, x):
        out = self.drop1(self.relu1(self.BatchNorm1d1(self.linear1(x))))
        out = self.drop2(self.relu2(self.BatchNorm1d2(self.linear2(out))))
        out = self.drop3(self.relu3(self.BatchNorm1d3(self.linear3(out))))
        out = self.classifier(out)
        return out


# Instantiating the network and printing its architecture
num_classes = 10
net = Net(num_classes)
print(net)

def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)
print("Params: ", count_parameters(net))

####################################################################
# Training settings
####################################################################

# Training hyperparameters
criterion = nn.CrossEntropyLoss()
# optimizer = optim.SGD(net.parameters(), lr=0.001, weight_decay=1e-6, momentum=0.9) # Original lr=0.01
optimizer = optim.AdamW(net.parameters(), lr=1e-3, weight_decay=1e-4)
epochs = 75 # Original = 25
#? Prueba AdamW con weight_decay mas alto (p.ej. 1e-2) y un scheduler CosineAnnealingLR u OneCycleLR.

#* SCHEDULER
scheduler = torch.optim.lr_scheduler.OneCycleLR(
    optimizer,
    max_lr=3e-3,
    epochs=epochs,
    steps_per_epoch=len(train_dataloader)
)




####################################################################
# Training
####################################################################

# Load model in GPU
net.to(device)

print("\n---- Start Training ----")
best_accuracy = -1
best_epoch = 0

# early_stopper = EarlyStopping(patience=10, min_delta=0.02, mode="max")
# min_delta=0.05 significa +0.05% de accuracy como mejora mínima (ajústalo si quieres)

for epoch in range(epochs):


    # TRAIN NETWORK
    train_loss, train_correct = 0, 0
    net.train()
    with tqdm(iter(train_dataloader), desc="Epoch " + str(epoch), unit="batch") as tepoch:
        for batch in tepoch:

            # Returned values of Dataset Class
            images = batch["img"].to(device)
            labels = batch["label"].to(device)

            # zero the parameter gradients
            optimizer.zero_grad()

            # Forward
            outputs = net(images)
            loss = criterion(outputs, labels)

            # Calculate gradients
            loss.backward()
#? Puedes recortar gradientes con torch.nn.utils.clip_grad_norm_(net.parameters(), max_norm=1.0) si ves inestabilidad.

            # Update gradients
            optimizer.step()
            #* ACTUALIZADO PARA USAR SCHEDULER
            scheduler.step()
            #! # one hot -> labels
            #! labels = torch.argmax(labels, dim=1)
            #! pred = torch.argmax(outputs, dim=1)
            #! train_correct += pred.eq(labels).sum().item()
            
            # sin one-hot
            pred = outputs.argmax(dim=1)
            train_correct += pred.eq(labels).sum().item()

            # print statistics
            train_loss += loss.item() * images.size(0)

    train_loss /= len(train_dataloader.dataset)

    # TEST NETWORK
    test_loss, test_correct = 0, 0
    net.eval()
    with torch.no_grad():
      with tqdm(iter(test_dataloader), desc="Test " + str(epoch), unit="batch") as tepoch:
          for batch in tepoch:

            images = batch["img"].to(device)
            labels = batch["label"].to(device)

            # Forward
            outputs = net(images)
            test_loss += criterion(outputs, labels).item() * images.size(0)

            #! # one hot -> labels
            #! labels = torch.argmax(labels, dim=1)
            #! pred = torch.argmax(outputs, dim=1)
            
            # sin one-hot
            pred = outputs.argmax(dim=1)
            ()

            test_correct += pred.eq(labels).sum().item()

    test_loss /= len(test_dataloader.dataset)
    test_accuracy = 100. * test_correct / len(test_dataloader.dataset)

    print("[Epoch {}] Train Loss: {:.6f} - Test Loss: {:.6f} - Train Accuracy: {:.2f}% - Test Accuracy: {:.2f}%".format(
        epoch + 1, train_loss, test_loss, 100. * train_correct / len(train_dataloader.dataset), test_accuracy
    ))

    if test_accuracy > best_accuracy:
        best_accuracy = test_accuracy
        best_epoch = epoch

        # Save best weights
        torch.save(net.state_dict(), "best_model.pt")
        
    # should_stop = early_stopper.step(test_accuracy, net, epoch)
    # print(f"EarlyStopping: best={early_stopper.best_score:.2f}% (epoch {early_stopper.best_epoch+1}) "
    #     f"patience_counter={early_stopper.counter}/{early_stopper.patience}")

    # if should_stop:
    #     print(f"Stopping early at epoch {epoch+1}. Best was epoch {early_stopper.best_epoch+1} "
    #         f"with acc {early_stopper.best_score:.2f}%")
    #     break

#? Agrega early stopping con paciencia (p.ej. 10 epocas) y ReduceLROnPlateau para bajar lr cuando el val loss se estanque.

print("\nBEST TEST ACCURACY: ", best_accuracy, " in epoch ", best_epoch)

# So far:
# best acc:  98.24 (default)
# best acc:  96.64 with lr: 0.001
# best acc:  98.26 with 2 hidden layers
# best acc:  98.64 with lr: 0.1
# best acc:  98.02 with lr: 0.001 & 75 epochs

####################################################################
# Load best weights
####################################################################

# Load best weights
net.load_state_dict(torch.load("best_model.pt"))

test_loss, test_correct = 0, 0
net.eval()
with torch.no_grad():
    with tqdm(iter(test_dataloader), desc="Test " + str(epoch), unit="batch") as tepoch:
        for batch in tepoch:

            images = batch["img"].to(device)
            labels = batch["label"].to(device)

            # Forward
            outputs = net(images)
            test_loss += criterion(outputs, labels)

            #! # one hot -> labels
            #! labels = torch.argmax(labels, dim=1)
            #! pred = torch.argmax(outputs, dim=1)
            
            pred = outputs.argmax(dim=1)
            ()

            test_correct += pred.eq(labels).sum().item()

    test_loss /= len(test_dataloader.dataset)
    test_accuracy = 100. * test_correct / len(test_dataloader.dataset)
print("Final best acc: ", test_accuracy)

Torch version:  2.10.0+cu130
Device:  cuda
Train images:  Dataset MNIST
    Number of datapoints: 60000
    Root location: .data/
    Split: Train
    StandardTransform
Transform: Compose(
               ToTensor()
               Normalize(mean=(0.1307,), std=(0.3081,))
           )
Image:  tensor([[[-0.4242, -0.4242, -0.4242, -0.4242, -0.4242, -0.4242, -0.4242,
          -0.4242, -0.4242, -0.4242, -0.4242, -0.4242, -0.4242, -0.4242,
          -0.4242, -0.4242, -0.4242, -0.4242, -0.4242, -0.4242, -0.4242,
          -0.4242, -0.4242, -0.4242, -0.4242, -0.4242, -0.4242, -0.4242],
         [-0.4242, -0.4242, -0.4242, -0.4242, -0.4242, -0.4242, -0.4242,
          -0.4242, -0.4242, -0.4242, -0.4242, -0.4242, -0.4242, -0.4242,
          -0.4242, -0.4242, -0.4242, -0.4242, -0.4242, -0.4242, -0.4242,
          -0.4242, -0.4242, -0.4242, -0.4242, -0.4242, -0.4242, -0.4242],
         [-0.4242, -0.4242, -0.4242, -0.4242, -0.4242, -0.4242, -0.4242,
          -0.4242, -0.4242, -0.4242, -0.4242, -0.

Epoch 0: 100%|██████████| 600/600 [00:17<00:00, 34.46batch/s]
Test 0: 100%|██████████| 100/100 [00:02<00:00, 36.58batch/s]


[Epoch 1] Train Loss: 0.450284 - Test Loss: 0.138924 - Train Accuracy: 89.04% - Test Accuracy: 96.06%
EarlyStopping: best=96.06% (epoch 1) patience_counter=0/10


Epoch 1: 100%|██████████| 600/600 [00:17<00:00, 35.17batch/s]
Test 1: 100%|██████████| 100/100 [00:02<00:00, 35.21batch/s]


[Epoch 2] Train Loss: 0.151790 - Test Loss: 0.088651 - Train Accuracy: 95.88% - Test Accuracy: 97.15%
EarlyStopping: best=97.15% (epoch 2) patience_counter=0/10


Epoch 2: 100%|██████████| 600/600 [00:18<00:00, 33.30batch/s]
Test 2: 100%|██████████| 100/100 [00:02<00:00, 34.07batch/s]


[Epoch 3] Train Loss: 0.108013 - Test Loss: 0.074144 - Train Accuracy: 96.89% - Test Accuracy: 97.74%
EarlyStopping: best=97.74% (epoch 3) patience_counter=0/10


Epoch 3: 100%|██████████| 600/600 [00:17<00:00, 33.50batch/s]
Test 3: 100%|██████████| 100/100 [00:02<00:00, 34.75batch/s]


[Epoch 4] Train Loss: 0.088504 - Test Loss: 0.073512 - Train Accuracy: 97.28% - Test Accuracy: 97.79%
EarlyStopping: best=97.79% (epoch 4) patience_counter=0/10


Epoch 4: 100%|██████████| 600/600 [00:18<00:00, 33.03batch/s]
Test 4: 100%|██████████| 100/100 [00:02<00:00, 34.73batch/s]


[Epoch 5] Train Loss: 0.083253 - Test Loss: 0.075646 - Train Accuracy: 97.36% - Test Accuracy: 97.54%
EarlyStopping: best=97.79% (epoch 4) patience_counter=1/10


Epoch 5: 100%|██████████| 600/600 [00:17<00:00, 35.15batch/s]
Test 5: 100%|██████████| 100/100 [00:02<00:00, 36.40batch/s]


[Epoch 6] Train Loss: 0.076739 - Test Loss: 0.070216 - Train Accuracy: 97.61% - Test Accuracy: 97.82%
EarlyStopping: best=97.82% (epoch 6) patience_counter=0/10


Epoch 6: 100%|██████████| 600/600 [00:17<00:00, 35.03batch/s]
Test 6: 100%|██████████| 100/100 [00:02<00:00, 36.38batch/s]


[Epoch 7] Train Loss: 0.073606 - Test Loss: 0.073912 - Train Accuracy: 97.64% - Test Accuracy: 97.73%
EarlyStopping: best=97.82% (epoch 6) patience_counter=1/10


Epoch 7: 100%|██████████| 600/600 [00:17<00:00, 34.91batch/s]
Test 7: 100%|██████████| 100/100 [00:02<00:00, 36.80batch/s]


[Epoch 8] Train Loss: 0.069009 - Test Loss: 0.070464 - Train Accuracy: 97.81% - Test Accuracy: 97.90%
EarlyStopping: best=97.90% (epoch 8) patience_counter=0/10


Epoch 8: 100%|██████████| 600/600 [00:17<00:00, 34.91batch/s]
Test 8: 100%|██████████| 100/100 [00:02<00:00, 36.39batch/s]


[Epoch 9] Train Loss: 0.064802 - Test Loss: 0.077953 - Train Accuracy: 97.94% - Test Accuracy: 97.69%
EarlyStopping: best=97.90% (epoch 8) patience_counter=1/10


Epoch 9: 100%|██████████| 600/600 [00:17<00:00, 34.60batch/s]
Test 9: 100%|██████████| 100/100 [00:02<00:00, 36.40batch/s]


[Epoch 10] Train Loss: 0.063226 - Test Loss: 0.066058 - Train Accuracy: 98.01% - Test Accuracy: 97.84%
EarlyStopping: best=97.90% (epoch 8) patience_counter=2/10


Epoch 10: 100%|██████████| 600/600 [00:17<00:00, 34.92batch/s]
Test 10: 100%|██████████| 100/100 [00:02<00:00, 36.23batch/s]


[Epoch 11] Train Loss: 0.060760 - Test Loss: 0.067462 - Train Accuracy: 98.05% - Test Accuracy: 97.86%
EarlyStopping: best=97.90% (epoch 8) patience_counter=3/10


Epoch 11: 100%|██████████| 600/600 [00:16<00:00, 36.40batch/s]
Test 11: 100%|██████████| 100/100 [00:02<00:00, 39.09batch/s]


[Epoch 12] Train Loss: 0.060158 - Test Loss: 0.073428 - Train Accuracy: 98.05% - Test Accuracy: 97.65%
EarlyStopping: best=97.90% (epoch 8) patience_counter=4/10


Epoch 12: 100%|██████████| 600/600 [00:16<00:00, 35.80batch/s]
Test 12: 100%|██████████| 100/100 [00:02<00:00, 36.22batch/s]


[Epoch 13] Train Loss: 0.056347 - Test Loss: 0.061541 - Train Accuracy: 98.22% - Test Accuracy: 98.11%
EarlyStopping: best=98.11% (epoch 13) patience_counter=0/10


Epoch 13: 100%|██████████| 600/600 [00:17<00:00, 34.21batch/s]
Test 13: 100%|██████████| 100/100 [00:02<00:00, 38.00batch/s]


[Epoch 14] Train Loss: 0.052013 - Test Loss: 0.068460 - Train Accuracy: 98.31% - Test Accuracy: 98.02%
EarlyStopping: best=98.11% (epoch 13) patience_counter=1/10


Epoch 14: 100%|██████████| 600/600 [00:16<00:00, 36.69batch/s]
Test 14: 100%|██████████| 100/100 [00:02<00:00, 37.35batch/s]


[Epoch 15] Train Loss: 0.050237 - Test Loss: 0.070712 - Train Accuracy: 98.42% - Test Accuracy: 98.07%
EarlyStopping: best=98.11% (epoch 13) patience_counter=2/10


Epoch 15: 100%|██████████| 600/600 [00:16<00:00, 36.36batch/s]
Test 15: 100%|██████████| 100/100 [00:02<00:00, 38.35batch/s]


[Epoch 16] Train Loss: 0.048071 - Test Loss: 0.074332 - Train Accuracy: 98.48% - Test Accuracy: 98.11%
EarlyStopping: best=98.11% (epoch 13) patience_counter=3/10


Epoch 16: 100%|██████████| 600/600 [00:17<00:00, 34.82batch/s]
Test 16: 100%|██████████| 100/100 [00:02<00:00, 35.77batch/s]


[Epoch 17] Train Loss: 0.048665 - Test Loss: 0.058903 - Train Accuracy: 98.44% - Test Accuracy: 98.22%
EarlyStopping: best=98.22% (epoch 17) patience_counter=0/10


Epoch 17: 100%|██████████| 600/600 [00:17<00:00, 34.48batch/s]
Test 17: 100%|██████████| 100/100 [00:02<00:00, 35.15batch/s]


[Epoch 18] Train Loss: 0.043084 - Test Loss: 0.070680 - Train Accuracy: 98.58% - Test Accuracy: 98.03%
EarlyStopping: best=98.22% (epoch 17) patience_counter=1/10


Epoch 18: 100%|██████████| 600/600 [00:17<00:00, 34.14batch/s]
Test 18: 100%|██████████| 100/100 [00:02<00:00, 35.91batch/s]


[Epoch 19] Train Loss: 0.039197 - Test Loss: 0.057949 - Train Accuracy: 98.76% - Test Accuracy: 98.30%
EarlyStopping: best=98.30% (epoch 19) patience_counter=0/10


Epoch 19: 100%|██████████| 600/600 [00:17<00:00, 33.52batch/s]
Test 19: 100%|██████████| 100/100 [00:02<00:00, 35.75batch/s]


[Epoch 20] Train Loss: 0.039432 - Test Loss: 0.059559 - Train Accuracy: 98.70% - Test Accuracy: 98.26%
EarlyStopping: best=98.30% (epoch 19) patience_counter=1/10


Epoch 20: 100%|██████████| 600/600 [00:17<00:00, 34.06batch/s]
Test 20: 100%|██████████| 100/100 [00:02<00:00, 36.06batch/s]


[Epoch 21] Train Loss: 0.035992 - Test Loss: 0.067082 - Train Accuracy: 98.84% - Test Accuracy: 98.10%
EarlyStopping: best=98.30% (epoch 19) patience_counter=2/10


Epoch 21: 100%|██████████| 600/600 [00:17<00:00, 35.02batch/s]
Test 21: 100%|██████████| 100/100 [00:02<00:00, 34.81batch/s]


[Epoch 22] Train Loss: 0.035281 - Test Loss: 0.056323 - Train Accuracy: 98.89% - Test Accuracy: 98.49%
EarlyStopping: best=98.49% (epoch 22) patience_counter=0/10


Epoch 22: 100%|██████████| 600/600 [00:16<00:00, 35.56batch/s]
Test 22: 100%|██████████| 100/100 [00:02<00:00, 38.13batch/s]


[Epoch 23] Train Loss: 0.031300 - Test Loss: 0.062212 - Train Accuracy: 98.98% - Test Accuracy: 98.28%
EarlyStopping: best=98.49% (epoch 22) patience_counter=1/10


Epoch 23: 100%|██████████| 600/600 [00:16<00:00, 35.87batch/s]
Test 23: 100%|██████████| 100/100 [00:02<00:00, 38.55batch/s]


[Epoch 24] Train Loss: 0.030328 - Test Loss: 0.062342 - Train Accuracy: 99.02% - Test Accuracy: 98.39%
EarlyStopping: best=98.49% (epoch 22) patience_counter=2/10


Epoch 24: 100%|██████████| 600/600 [00:16<00:00, 36.86batch/s]
Test 24: 100%|██████████| 100/100 [00:02<00:00, 38.92batch/s]


[Epoch 25] Train Loss: 0.026698 - Test Loss: 0.059466 - Train Accuracy: 99.09% - Test Accuracy: 98.42%
EarlyStopping: best=98.49% (epoch 22) patience_counter=3/10


Epoch 25: 100%|██████████| 600/600 [00:16<00:00, 36.99batch/s]
Test 25: 100%|██████████| 100/100 [00:02<00:00, 38.89batch/s]


[Epoch 26] Train Loss: 0.025616 - Test Loss: 0.059966 - Train Accuracy: 99.18% - Test Accuracy: 98.49%
EarlyStopping: best=98.49% (epoch 22) patience_counter=4/10


Epoch 26: 100%|██████████| 600/600 [00:16<00:00, 36.19batch/s]
Test 26: 100%|██████████| 100/100 [00:02<00:00, 38.22batch/s]


[Epoch 27] Train Loss: 0.023038 - Test Loss: 0.059906 - Train Accuracy: 99.28% - Test Accuracy: 98.42%
EarlyStopping: best=98.49% (epoch 22) patience_counter=5/10


Epoch 27: 100%|██████████| 600/600 [00:16<00:00, 36.17batch/s]
Test 27: 100%|██████████| 100/100 [00:02<00:00, 36.48batch/s]


[Epoch 28] Train Loss: 0.021383 - Test Loss: 0.063534 - Train Accuracy: 99.29% - Test Accuracy: 98.48%
EarlyStopping: best=98.49% (epoch 22) patience_counter=6/10


Epoch 28: 100%|██████████| 600/600 [00:17<00:00, 34.93batch/s]
Test 28: 100%|██████████| 100/100 [00:02<00:00, 36.40batch/s]


[Epoch 29] Train Loss: 0.021595 - Test Loss: 0.061707 - Train Accuracy: 99.24% - Test Accuracy: 98.35%
EarlyStopping: best=98.49% (epoch 22) patience_counter=7/10


Epoch 29: 100%|██████████| 600/600 [00:17<00:00, 34.81batch/s]
Test 29: 100%|██████████| 100/100 [00:02<00:00, 36.23batch/s]


[Epoch 30] Train Loss: 0.019651 - Test Loss: 0.063161 - Train Accuracy: 99.36% - Test Accuracy: 98.38%
EarlyStopping: best=98.49% (epoch 22) patience_counter=8/10


Epoch 30: 100%|██████████| 600/600 [00:18<00:00, 32.52batch/s]
Test 30: 100%|██████████| 100/100 [00:02<00:00, 34.51batch/s]


[Epoch 31] Train Loss: 0.019187 - Test Loss: 0.063622 - Train Accuracy: 99.40% - Test Accuracy: 98.42%
EarlyStopping: best=98.49% (epoch 22) patience_counter=9/10


Epoch 31: 100%|██████████| 600/600 [00:17<00:00, 34.25batch/s]
Test 31: 100%|██████████| 100/100 [00:02<00:00, 34.47batch/s]


[Epoch 32] Train Loss: 0.016898 - Test Loss: 0.059427 - Train Accuracy: 99.49% - Test Accuracy: 98.58%
EarlyStopping: best=98.58% (epoch 32) patience_counter=0/10


Epoch 32: 100%|██████████| 600/600 [00:17<00:00, 34.09batch/s]
Test 32: 100%|██████████| 100/100 [00:02<00:00, 36.90batch/s]


[Epoch 33] Train Loss: 0.015917 - Test Loss: 0.058277 - Train Accuracy: 99.47% - Test Accuracy: 98.60%
EarlyStopping: best=98.58% (epoch 32) patience_counter=1/10


Epoch 33: 100%|██████████| 600/600 [00:17<00:00, 35.15batch/s]
Test 33: 100%|██████████| 100/100 [00:02<00:00, 36.58batch/s]


[Epoch 34] Train Loss: 0.016991 - Test Loss: 0.066414 - Train Accuracy: 99.47% - Test Accuracy: 98.44%
EarlyStopping: best=98.58% (epoch 32) patience_counter=2/10


Epoch 34: 100%|██████████| 600/600 [00:17<00:00, 35.22batch/s]
Test 34: 100%|██████████| 100/100 [00:02<00:00, 36.47batch/s]


[Epoch 35] Train Loss: 0.014047 - Test Loss: 0.061252 - Train Accuracy: 99.51% - Test Accuracy: 98.52%
EarlyStopping: best=98.58% (epoch 32) patience_counter=3/10


Epoch 35: 100%|██████████| 600/600 [00:17<00:00, 35.29batch/s]
Test 35: 100%|██████████| 100/100 [00:02<00:00, 36.88batch/s]


[Epoch 36] Train Loss: 0.013524 - Test Loss: 0.064143 - Train Accuracy: 99.57% - Test Accuracy: 98.56%
EarlyStopping: best=98.58% (epoch 32) patience_counter=4/10


Epoch 36: 100%|██████████| 600/600 [00:17<00:00, 35.00batch/s]
Test 36: 100%|██████████| 100/100 [00:02<00:00, 36.22batch/s]


[Epoch 37] Train Loss: 0.012572 - Test Loss: 0.070742 - Train Accuracy: 99.58% - Test Accuracy: 98.48%
EarlyStopping: best=98.58% (epoch 32) patience_counter=5/10


Epoch 37: 100%|██████████| 600/600 [00:16<00:00, 37.06batch/s]
Test 37: 100%|██████████| 100/100 [00:02<00:00, 39.21batch/s]


[Epoch 38] Train Loss: 0.012371 - Test Loss: 0.063690 - Train Accuracy: 99.60% - Test Accuracy: 98.52%
EarlyStopping: best=98.58% (epoch 32) patience_counter=6/10


Epoch 38: 100%|██████████| 600/600 [00:16<00:00, 37.16batch/s]
Test 38: 100%|██████████| 100/100 [00:02<00:00, 39.32batch/s]


[Epoch 39] Train Loss: 0.012813 - Test Loss: 0.055516 - Train Accuracy: 99.58% - Test Accuracy: 98.83%
EarlyStopping: best=98.83% (epoch 39) patience_counter=0/10


Epoch 39: 100%|██████████| 600/600 [00:16<00:00, 36.24batch/s]
Test 39: 100%|██████████| 100/100 [00:02<00:00, 37.66batch/s]


[Epoch 40] Train Loss: 0.010027 - Test Loss: 0.060633 - Train Accuracy: 99.66% - Test Accuracy: 98.63%
EarlyStopping: best=98.83% (epoch 39) patience_counter=1/10


Epoch 40: 100%|██████████| 600/600 [00:16<00:00, 36.77batch/s]
Test 40: 100%|██████████| 100/100 [00:02<00:00, 37.52batch/s]


[Epoch 41] Train Loss: 0.009243 - Test Loss: 0.061651 - Train Accuracy: 99.71% - Test Accuracy: 98.60%
EarlyStopping: best=98.83% (epoch 39) patience_counter=2/10


Epoch 41: 100%|██████████| 600/600 [00:16<00:00, 37.04batch/s]
Test 41: 100%|██████████| 100/100 [00:02<00:00, 39.00batch/s]


[Epoch 42] Train Loss: 0.008649 - Test Loss: 0.064129 - Train Accuracy: 99.70% - Test Accuracy: 98.70%
EarlyStopping: best=98.83% (epoch 39) patience_counter=3/10


Epoch 42: 100%|██████████| 600/600 [00:16<00:00, 36.91batch/s]
Test 42: 100%|██████████| 100/100 [00:02<00:00, 38.88batch/s]


[Epoch 43] Train Loss: 0.007933 - Test Loss: 0.058747 - Train Accuracy: 99.75% - Test Accuracy: 98.75%
EarlyStopping: best=98.83% (epoch 39) patience_counter=4/10


Epoch 43: 100%|██████████| 600/600 [00:16<00:00, 37.04batch/s]
Test 43: 100%|██████████| 100/100 [00:02<00:00, 38.97batch/s]


[Epoch 44] Train Loss: 0.008077 - Test Loss: 0.067778 - Train Accuracy: 99.72% - Test Accuracy: 98.70%
EarlyStopping: best=98.83% (epoch 39) patience_counter=5/10


Epoch 44: 100%|██████████| 600/600 [00:16<00:00, 37.03batch/s]
Test 44: 100%|██████████| 100/100 [00:02<00:00, 38.93batch/s]


[Epoch 45] Train Loss: 0.007370 - Test Loss: 0.061528 - Train Accuracy: 99.73% - Test Accuracy: 98.66%
EarlyStopping: best=98.83% (epoch 39) patience_counter=6/10


Epoch 45: 100%|██████████| 600/600 [00:16<00:00, 37.08batch/s]
Test 45: 100%|██████████| 100/100 [00:02<00:00, 39.12batch/s]


[Epoch 46] Train Loss: 0.006156 - Test Loss: 0.063015 - Train Accuracy: 99.79% - Test Accuracy: 98.64%
EarlyStopping: best=98.83% (epoch 39) patience_counter=7/10


Epoch 46: 100%|██████████| 600/600 [00:16<00:00, 37.16batch/s]
Test 46: 100%|██████████| 100/100 [00:02<00:00, 39.30batch/s]


[Epoch 47] Train Loss: 0.006254 - Test Loss: 0.069423 - Train Accuracy: 99.80% - Test Accuracy: 98.68%
EarlyStopping: best=98.83% (epoch 39) patience_counter=8/10


Epoch 47: 100%|██████████| 600/600 [00:16<00:00, 37.18batch/s]
Test 47: 100%|██████████| 100/100 [00:02<00:00, 39.08batch/s]


[Epoch 48] Train Loss: 0.005220 - Test Loss: 0.069681 - Train Accuracy: 99.84% - Test Accuracy: 98.60%
EarlyStopping: best=98.83% (epoch 39) patience_counter=9/10


Epoch 48: 100%|██████████| 600/600 [00:16<00:00, 37.25batch/s]
Test 48: 100%|██████████| 100/100 [00:02<00:00, 39.03batch/s]


[Epoch 49] Train Loss: 0.005073 - Test Loss: 0.072014 - Train Accuracy: 99.85% - Test Accuracy: 98.64%
EarlyStopping: best=98.83% (epoch 39) patience_counter=10/10
Stopping early at epoch 49. Best was epoch 39 with acc 98.83%

BEST TEST ACCURACY:  98.83  in epoch  38


Test 48: 100%|██████████| 100/100 [00:02<00:00, 39.40batch/s]

Final best acc:  98.83



