In [None]:
import torch
import torchvision
import pandas as pd
import torch.nn as nn
from tqdm import tqdm
import multiprocessing
import torch.optim as optim
import torch.nn.functional as  F
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

print("Torch version: ", torch.__version__)

####################################################################
# Set Device
####################################################################

# Prefer the GPU whenever CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Device: ", device)


####################################################################
# Prepare Data
####################################################################

# Both splits are downloaded into the '.data/' directory on first use.
# No transform is passed, so every sample is a (PIL image, int label) pair.
train_set = torchvision.datasets.MNIST(root='.data/', train=True, download=True)
# Suggestion: consider transform=transforms.Compose([ToTensor(),
# Normalize((0.1307,), (0.3081,))]) to center/scale the pixels before flattening.

test_set = torchvision.datasets.MNIST(root='.data/', train=False, download=True)

# Quick sanity check: dataset summary plus the first training sample.
print("Train images: ", train_set)
print("Image: ", train_set[0][0])
print("Label: ", train_set[0][1])
print(
    "Label one hot: ",
    F.one_hot(torch.tensor(train_set[0][1]), num_classes=10),
)


####################################################################
# Dataset Class
####################################################################

class MNIST_dataset(Dataset):
    """Serve MNIST-style samples as flattened image vectors with one-hot labels.

    ``data`` is any indexable, sized collection whose items are
    ``(image, label)`` pairs, e.g. a ``torchvision.datasets.MNIST`` instance.
    Images may be PIL images (no transform on the wrapped dataset) or tensors
    (the wrapped dataset already applies ``ToTensor``/``Normalize``).
    """

    def __init__(self, data, partition = "train"):
        """Store the wrapped dataset and print a short loading summary.

        Args:
            data: indexable collection of ``(image, label)`` pairs.
            partition: free-form split name, only stored and printed.
        """
        print("\nLoading MNIST ", partition, " Dataset...")
        self.data = data
        self.partition = partition
        print("\tTotal Len.: ", len(self.data), "\n", 50*"-")

    def __len__(self):
        """Return the number of samples in the wrapped dataset."""
        return len(self.data)

    def from_pil_to_tensor(self, image):
        """Convert ``image`` to a tensor; tensors are returned unchanged.

        Letting tensors pass through keeps this class usable when the wrapped
        dataset was built with a tensor-producing transform, in which case
        ``ToTensor`` would reject the input.
        """
        if isinstance(image, torch.Tensor):
            return image
        return torchvision.transforms.ToTensor()(image)

    def __getitem__(self, idx):
        """Return ``{"img": flat image tensor, "label": one-hot float tensor}``.

        Args:
            idx: index forwarded to the wrapped dataset.
        """
        # Fetch the sample once: indexing the wrapped dataset again for the
        # label would make it build the image a second time.
        sample = self.data[idx]
        image, target = sample[0], sample[1]

        # The network expects one flat vector per image (784 values for a
        # 1x28x28 MNIST digit), so collapse channel/height/width.
        image_tensor = self.from_pil_to_tensor(image).reshape(-1)
        # Suggestion: normalisation could also be done here
        # (image_tensor = (image_tensor - mean) / std) if no transforms are used.

        # One-hot encode over the 10 digit classes; float so the label can be
        # used as a class-probability target.
        label = F.one_hot(torch.as_tensor(target), num_classes=10).float()
        # Alternative: return the integer label and use
        # CrossEntropyLoss(label_smoothing=0.1) to regularise without one-hot.

        return {"img": image_tensor, "label": label}

# Wrap both raw MNIST splits so they yield flattened tensors and one-hot labels.
train_dataset = MNIST_dataset(data=train_set, partition="train")
test_dataset = MNIST_dataset(data=test_set, partition="test")


####################################################################
# DataLoader Class
####################################################################

batch_size = 100
num_workers = 0  # 0 => batches are loaded in the main process
print("Num workers", num_workers)

# Only the training split is shuffled; evaluation order stays fixed.
train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=num_workers,
)
test_dataloader = DataLoader(
    dataset=test_dataset,
    batch_size=batch_size,
    shuffle=False,
    num_workers=num_workers,
)
# Suggestion: on GPU, pin_memory=True and persistent_workers=True help when num_workers > 0.

####################################################################
# Early stopping Class
####################################################################

import copy
import torch

class EarlyStopping:
    """Track a monitored score and signal when training should stop.

    Remembers the best score seen so far, the epoch it was reached in, and a
    deep copy of the model's ``state_dict`` taken at that epoch.
    """

    def __init__(self, patience=10, min_delta=0.0, mode="max"):
        """
        Args:
            patience: number of consecutive non-improving calls to ``step``
                after which ``step`` returns True.
            min_delta: margin by which the score must beat the best one to
                count as a real improvement.
            mode: "max" when monitoring a score that should grow (accuracy),
                "min" when monitoring one that should shrink (loss).

        Raises:
            ValueError: if ``mode`` is neither "max" nor "min".
        """
        # Without this check a typo such as "maximize" would silently be
        # treated as "min" by the comparison in step().
        if mode not in ("max", "min"):
            raise ValueError(f"mode must be 'max' or 'min', got {mode!r}")
        self.patience = patience
        self.min_delta = min_delta
        self.mode = mode
        self.best_score = None
        self.counter = 0
        self.best_state_dict = None
        self.best_epoch = -1

    def step(self, score, model, epoch):
        """Record ``score`` for ``epoch`` and report whether to stop.

        Args:
            score: value of the monitored metric for this epoch.
            model: object exposing ``state_dict()``; its state is deep-copied
                whenever a new best score is recorded.
            epoch: epoch identifier stored alongside the best score.

        Returns:
            True when the number of consecutive non-improving calls has
            reached ``patience``; False otherwise. The very first call only
            records the baseline and always returns False.
        """
        if self.best_score is None:
            self.best_score = score
            self.best_state_dict = copy.deepcopy(model.state_dict())
            self.best_epoch = epoch
            return False  # never stop on the baseline epoch

        if self.mode == "max":
            improved = score > self.best_score + self.min_delta
        else:
            improved = score < self.best_score - self.min_delta

        if improved:
            self.best_score = score
            # Deep copy so later optimizer steps cannot alter the snapshot.
            self.best_state_dict = copy.deepcopy(model.state_dict())
            self.best_epoch = epoch
            self.counter = 0
        else:
            self.counter += 1

        return self.counter >= self.patience  # True => stop


####################################################################
# Neural Network Class
####################################################################

# Creating our Neural Network - Fully Connected
class Net(nn.Module):
    """Fully connected classifier: 784 -> 512 -> 256 -> 128 -> num_classes.

    Every hidden stage applies Linear, BatchNorm1d, ReLU and Dropout in that
    order; the final Linear layer returns raw, un-normalised class scores.
    """

    def __init__(self, num_classes):
        """Build the layers; ``num_classes`` is the width of the output layer."""
        super().__init__()
        # BatchNorm1d and Dropout layers were added to the original plain MLP.
        # Note: relu1 is registered before BatchNorm1d1, so print(net) lists it
        # first, but forward() applies the batch norm before the activation.
        self.linear1 = nn.Linear(784, 512)
        self.relu1 = nn.ReLU()
        self.BatchNorm1d1 = nn.BatchNorm1d(512)
        self.drop1 = nn.Dropout(0.2)

        self.linear2 = nn.Linear(512, 256)
        self.BatchNorm1d2 = nn.BatchNorm1d(256)
        self.relu2 = nn.ReLU()
        self.drop2 = nn.Dropout(0.3)

        self.linear3 = nn.Linear(256, 128)
        self.BatchNorm1d3 = nn.BatchNorm1d(128)
        self.relu3 = nn.ReLU()
        self.drop3 = nn.Dropout(0.4)

        self.classifier = nn.Linear(128, num_classes)
        # Suggestion: BatchNorm1d after each Linear and Dropout(0.1-0.3) before
        # the activation usually improve generalisation.
        # Suggestion: a deeper but narrower MLP (e.g. 784->512->256->128->10)
        # reduces parameters and overfitting without resorting to a CNN.

    def forward(self, x):
        """Map a batch of flattened images to a batch of class scores."""
        # Hidden stage 1
        hidden = self.linear1(x)
        hidden = self.BatchNorm1d1(hidden)
        hidden = self.relu1(hidden)
        hidden = self.drop1(hidden)

        # Hidden stage 2
        hidden = self.linear2(hidden)
        hidden = self.BatchNorm1d2(hidden)
        hidden = self.relu2(hidden)
        hidden = self.drop2(hidden)

        # Hidden stage 3
        hidden = self.linear3(hidden)
        hidden = self.BatchNorm1d3(hidden)
        hidden = self.relu3(hidden)
        hidden = self.drop3(hidden)

        return self.classifier(hidden)


# Instantiating the network and printing its architecture
num_classes = 10  # one output score per digit class
net = Net(num_classes=num_classes)
print(net)

def count_parameters(model):
    """Return how many scalar values the trainable parameters of ``model`` hold.

    Parameters whose ``requires_grad`` flag is False are not counted.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total
print("Params: ", count_parameters(net))

####################################################################
# Training settings
####################################################################

# Training hyperparameters
criterion = nn.CrossEntropyLoss()
# Earlier SGD configuration, kept for reference (the original run used lr=0.01):
# optimizer = optim.SGD(net.parameters(), lr=0.001, weight_decay=1e-6, momentum=0.9)
optimizer = optim.AdamW(params=net.parameters(), lr=1e-3, weight_decay=1e-2)
epochs = 75  # upper bound on epochs; the original value was 25
# Suggestion: try AdamW with a higher weight_decay (e.g. 1e-2) together with a
# CosineAnnealingLR or OneCycleLR scheduler.


####################################################################
# Training
####################################################################

# Move the model to the selected device (GPU when available)
net.to(device)

print("\n---- Start Training ----")
best_accuracy = -1
best_epoch = 0  # 0-indexed epoch of the best test accuracy

# min_delta is expressed in accuracy percentage points: an epoch only counts as
# an improvement when test accuracy beats the best one by more than 0.1 points.
# The plain best_accuracy tracking below has no such margin, so the two "best"
# epochs can legitimately differ.
early_stopper = EarlyStopping(patience=10, min_delta=0.1, mode="max")

# NOTE(review): checkpoint selection and early stopping both monitor the test
# split, so the reported best accuracy is optimistic; consider holding out a
# validation split from the training data.

for epoch in range(epochs):

    # TRAIN NETWORK
    train_loss, train_correct = 0.0, 0
    net.train()
    with tqdm(iter(train_dataloader), desc="Epoch " + str(epoch), unit="batch") as tepoch:
        for batch in tepoch:

            # Values returned by the Dataset class
            images = batch["img"].to(device)
            labels = batch["label"].to(device)

            # Zero the parameter gradients
            optimizer.zero_grad()

            # Forward
            outputs = net(images)
            loss = criterion(outputs, labels)

            # Calculate gradients
            loss.backward()
            # Suggestion: clip gradients with
            # torch.nn.utils.clip_grad_norm_(net.parameters(), max_norm=1.0)
            # if training becomes unstable.

            # Update the weights
            optimizer.step()

            # One-hot -> class indices
            labels = torch.argmax(labels, dim=1)
            pred = torch.argmax(outputs, dim=1)
            train_correct += pred.eq(labels).sum().item()

            # The criterion returns the mean over the batch, so weight it by
            # the batch size; dividing the sum by the number of samples below
            # then gives a true per-sample average.
            train_loss += loss.item() * images.size(0)

    train_loss /= len(train_dataloader.dataset)

    # TEST NETWORK
    test_loss, test_correct = 0.0, 0
    net.eval()
    with torch.no_grad():
        with tqdm(iter(test_dataloader), desc="Test " + str(epoch), unit="batch") as tepoch:
            for batch in tepoch:

                images = batch["img"].to(device)
                labels = batch["label"].to(device)

                # Forward
                outputs = net(images)
                # .item() keeps the running total a Python float instead of a
                # tensor living on the device.
                test_loss += criterion(outputs, labels).item() * images.size(0)

                # One-hot -> class indices
                labels = torch.argmax(labels, dim=1)
                pred = torch.argmax(outputs, dim=1)

                test_correct += pred.eq(labels).sum().item()

    test_loss /= len(test_dataloader.dataset)
    test_accuracy = 100. * test_correct / len(test_dataloader.dataset)

    print("[Epoch {}] Train Loss: {:.6f} - Test Loss: {:.6f} - Train Accuracy: {:.2f}% - Test Accuracy: {:.2f}%".format(
        epoch + 1, train_loss, test_loss, 100. * train_correct / len(train_dataloader.dataset), test_accuracy
    ))

    if test_accuracy > best_accuracy:
        best_accuracy = test_accuracy
        best_epoch = epoch

        # Save best weights
        torch.save(net.state_dict(), "best_model.pt")

    should_stop = early_stopper.step(test_accuracy, net, epoch)
    print(f"EarlyStopping: best={early_stopper.best_score:.2f}% (epoch {early_stopper.best_epoch+1}) "
        f"patience_counter={early_stopper.counter}/{early_stopper.patience}")

    if should_stop:
        print(f"Stopping early at epoch {epoch+1}. Best was epoch {early_stopper.best_epoch+1} "
            f"with acc {early_stopper.best_score:.2f}%")
        break

# Suggestion: besides early stopping, add ReduceLROnPlateau to lower the
# learning rate when the validation loss plateaus.

# Report the epoch 1-indexed so it matches the per-epoch log lines above.
print("\nBEST TEST ACCURACY: ", best_accuracy, " in epoch ", best_epoch + 1)

# So far:
# best acc:  98.24 (default)
# best acc:  96.64 with lr: 0.001
# best acc:  98.26 with 2 hidden layers
# best acc:  98.64 with lr: 0.1
# best acc:  98.02 with lr: 0.001 & 75 epochs

####################################################################
# Load best weights
####################################################################

# Load best weights
# map_location lets the checkpoint load even when it was saved from a
# different device than the one selected for this run.
net.load_state_dict(torch.load("best_model.pt", map_location=device))

# Re-evaluate the restored checkpoint on the test split.
test_loss, test_correct = 0.0, 0
net.eval()
with torch.no_grad():
    # `epoch` is whatever value the training loop ended on; it only labels
    # the progress bar.
    with tqdm(iter(test_dataloader), desc="Test " + str(epoch), unit="batch") as tepoch:
        for batch in tepoch:

            images = batch["img"].to(device)
            labels = batch["label"].to(device)

            # Forward
            outputs = net(images)
            # The criterion returns a batch mean: convert it to a Python float
            # and weight it by the batch size so the division below yields a
            # per-sample average.
            test_loss += criterion(outputs, labels).item() * images.size(0)

            # One-hot -> class indices
            labels = torch.argmax(labels, dim=1)
            pred = torch.argmax(outputs, dim=1)

            test_correct += pred.eq(labels).sum().item()

    test_loss /= len(test_dataloader.dataset)
    test_accuracy = 100. * test_correct / len(test_dataloader.dataset)
print("Final best acc: ", test_accuracy)

Torch version:  2.10.0+cu130
Device:  cuda
Train images:  Dataset MNIST
    Number of datapoints: 60000
    Root location: .data/
    Split: Train
Image:  <PIL.Image.Image image mode=L size=28x28 at 0x2A4841D52B0>
Label:  5
Label one hot:  tensor([0, 0, 0, 0, 0, 1, 0, 0, 0, 0])

Loading MNIST  train  Dataset...
	Total Len.:  60000 
 --------------------------------------------------

Loading MNIST  test  Dataset...
	Total Len.:  10000 
 --------------------------------------------------
Num workers 0
Net(
  (linear1): Linear(in_features=784, out_features=512, bias=True)
  (relu1): ReLU()
  (BatchNorm1d1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (drop1): Dropout(p=0.2, inplace=False)
  (linear2): Linear(in_features=512, out_features=256, bias=True)
  (BatchNorm1d2): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu2): ReLU()
  (drop2): Dropout(p=0.3, inplace=False)
  (linear3): Linear(in_features=256, out_

Epoch 0: 100%|██████████| 600/600 [00:06<00:00, 97.99batch/s] 
Test 0: 100%|██████████| 100/100 [00:00<00:00, 112.67batch/s]


[Epoch 1] Train Loss: 0.008315 - Test Loss: 0.002893 - Train Accuracy: 78.08% - Test Accuracy: 92.80%
EarlyStopping: best=92.80% (epoch 1) patience_counter=0/10


Epoch 1: 100%|██████████| 600/600 [00:06<00:00, 99.29batch/s] 
Test 1: 100%|██████████| 100/100 [00:00<00:00, 111.36batch/s]


[Epoch 2] Train Loss: 0.003520 - Test Loss: 0.001836 - Train Accuracy: 90.56% - Test Accuracy: 94.88%
EarlyStopping: best=94.88% (epoch 2) patience_counter=0/10


Epoch 2: 100%|██████████| 600/600 [00:06<00:00, 99.58batch/s] 
Test 2: 100%|██████████| 100/100 [00:00<00:00, 112.05batch/s]


[Epoch 3] Train Loss: 0.002640 - Test Loss: 0.001445 - Train Accuracy: 92.63% - Test Accuracy: 95.76%
EarlyStopping: best=95.76% (epoch 3) patience_counter=0/10


Epoch 3: 100%|██████████| 600/600 [00:06<00:00, 99.85batch/s] 
Test 3: 100%|██████████| 100/100 [00:00<00:00, 112.86batch/s]


[Epoch 4] Train Loss: 0.002231 - Test Loss: 0.001225 - Train Accuracy: 93.68% - Test Accuracy: 96.36%
EarlyStopping: best=96.36% (epoch 4) patience_counter=0/10


Epoch 4: 100%|██████████| 600/600 [00:05<00:00, 100.04batch/s]
Test 4: 100%|██████████| 100/100 [00:00<00:00, 110.99batch/s]


[Epoch 5] Train Loss: 0.001906 - Test Loss: 0.001085 - Train Accuracy: 94.59% - Test Accuracy: 96.75%
EarlyStopping: best=96.75% (epoch 5) patience_counter=0/10


Epoch 5: 100%|██████████| 600/600 [00:06<00:00, 93.14batch/s]
Test 5: 100%|██████████| 100/100 [00:00<00:00, 102.88batch/s]


[Epoch 6] Train Loss: 0.001700 - Test Loss: 0.000965 - Train Accuracy: 95.15% - Test Accuracy: 97.11%
EarlyStopping: best=97.11% (epoch 6) patience_counter=0/10


Epoch 6: 100%|██████████| 600/600 [00:06<00:00, 94.52batch/s]
Test 6: 100%|██████████| 100/100 [00:00<00:00, 103.57batch/s]


[Epoch 7] Train Loss: 0.001581 - Test Loss: 0.000886 - Train Accuracy: 95.45% - Test Accuracy: 97.27%
EarlyStopping: best=97.27% (epoch 7) patience_counter=0/10


Epoch 7: 100%|██████████| 600/600 [00:06<00:00, 94.11batch/s]
Test 7: 100%|██████████| 100/100 [00:00<00:00, 105.65batch/s]


[Epoch 8] Train Loss: 0.001436 - Test Loss: 0.000829 - Train Accuracy: 95.88% - Test Accuracy: 97.47%
EarlyStopping: best=97.47% (epoch 8) patience_counter=0/10


Epoch 8: 100%|██████████| 600/600 [00:06<00:00, 94.15batch/s]
Test 8: 100%|██████████| 100/100 [00:00<00:00, 105.82batch/s]


[Epoch 9] Train Loss: 0.001321 - Test Loss: 0.000764 - Train Accuracy: 96.12% - Test Accuracy: 97.63%
EarlyStopping: best=97.63% (epoch 9) patience_counter=0/10


Epoch 9: 100%|██████████| 600/600 [00:06<00:00, 94.70batch/s]
Test 9: 100%|██████████| 100/100 [00:00<00:00, 106.50batch/s]


[Epoch 10] Train Loss: 0.001226 - Test Loss: 0.000758 - Train Accuracy: 96.49% - Test Accuracy: 97.61%
EarlyStopping: best=97.63% (epoch 9) patience_counter=1/10


Epoch 10: 100%|██████████| 600/600 [00:06<00:00, 95.02batch/s]
Test 10: 100%|██████████| 100/100 [00:00<00:00, 105.99batch/s]


[Epoch 11] Train Loss: 0.001142 - Test Loss: 0.000716 - Train Accuracy: 96.72% - Test Accuracy: 97.78%
EarlyStopping: best=97.78% (epoch 11) patience_counter=0/10


Epoch 11: 100%|██████████| 600/600 [00:06<00:00, 95.07batch/s]
Test 11: 100%|██████████| 100/100 [00:00<00:00, 105.87batch/s]


[Epoch 12] Train Loss: 0.001041 - Test Loss: 0.000683 - Train Accuracy: 96.95% - Test Accuracy: 97.86%
EarlyStopping: best=97.78% (epoch 11) patience_counter=1/10


Epoch 12: 100%|██████████| 600/600 [00:06<00:00, 93.51batch/s]
Test 12: 100%|██████████| 100/100 [00:00<00:00, 106.04batch/s]


[Epoch 13] Train Loss: 0.001005 - Test Loss: 0.000650 - Train Accuracy: 97.10% - Test Accuracy: 97.98%
EarlyStopping: best=97.98% (epoch 13) patience_counter=0/10


Epoch 13: 100%|██████████| 600/600 [00:06<00:00, 92.33batch/s]
Test 13: 100%|██████████| 100/100 [00:00<00:00, 105.65batch/s]


[Epoch 14] Train Loss: 0.000956 - Test Loss: 0.000603 - Train Accuracy: 97.18% - Test Accuracy: 98.18%
EarlyStopping: best=98.18% (epoch 14) patience_counter=0/10


Epoch 14: 100%|██████████| 600/600 [00:06<00:00, 93.98batch/s]
Test 14: 100%|██████████| 100/100 [00:00<00:00, 102.35batch/s]


[Epoch 15] Train Loss: 0.000919 - Test Loss: 0.000627 - Train Accuracy: 97.31% - Test Accuracy: 97.99%
EarlyStopping: best=98.18% (epoch 14) patience_counter=1/10


Epoch 15: 100%|██████████| 600/600 [00:06<00:00, 91.61batch/s]
Test 15: 100%|██████████| 100/100 [00:00<00:00, 104.22batch/s]


[Epoch 16] Train Loss: 0.000879 - Test Loss: 0.000600 - Train Accuracy: 97.36% - Test Accuracy: 98.12%
EarlyStopping: best=98.18% (epoch 14) patience_counter=2/10


Epoch 16: 100%|██████████| 600/600 [00:06<00:00, 91.73batch/s]
Test 16: 100%|██████████| 100/100 [00:00<00:00, 103.04batch/s]


[Epoch 17] Train Loss: 0.000807 - Test Loss: 0.000589 - Train Accuracy: 97.64% - Test Accuracy: 98.16%
EarlyStopping: best=98.18% (epoch 14) patience_counter=3/10


Epoch 17: 100%|██████████| 600/600 [00:06<00:00, 94.78batch/s]
Test 17: 100%|██████████| 100/100 [00:00<00:00, 104.54batch/s]


[Epoch 18] Train Loss: 0.000783 - Test Loss: 0.000574 - Train Accuracy: 97.66% - Test Accuracy: 98.25%
EarlyStopping: best=98.18% (epoch 14) patience_counter=4/10


Epoch 18: 100%|██████████| 600/600 [00:06<00:00, 89.65batch/s]
Test 18: 100%|██████████| 100/100 [00:00<00:00, 101.11batch/s]


[Epoch 19] Train Loss: 0.000766 - Test Loss: 0.000580 - Train Accuracy: 97.68% - Test Accuracy: 98.13%
EarlyStopping: best=98.18% (epoch 14) patience_counter=5/10


Epoch 19: 100%|██████████| 600/600 [00:06<00:00, 90.28batch/s]
Test 19: 100%|██████████| 100/100 [00:01<00:00, 97.85batch/s]


[Epoch 20] Train Loss: 0.000730 - Test Loss: 0.000550 - Train Accuracy: 97.81% - Test Accuracy: 98.29%
EarlyStopping: best=98.29% (epoch 20) patience_counter=0/10


Epoch 20: 100%|██████████| 600/600 [00:06<00:00, 86.61batch/s]
Test 20: 100%|██████████| 100/100 [00:01<00:00, 98.32batch/s]


[Epoch 21] Train Loss: 0.000686 - Test Loss: 0.000555 - Train Accuracy: 97.96% - Test Accuracy: 98.22%
EarlyStopping: best=98.29% (epoch 20) patience_counter=1/10


Epoch 21: 100%|██████████| 600/600 [00:06<00:00, 91.51batch/s]
Test 21: 100%|██████████| 100/100 [00:00<00:00, 104.11batch/s]


[Epoch 22] Train Loss: 0.000681 - Test Loss: 0.000555 - Train Accuracy: 97.91% - Test Accuracy: 98.32%
EarlyStopping: best=98.29% (epoch 20) patience_counter=2/10


Epoch 22: 100%|██████████| 600/600 [00:06<00:00, 93.95batch/s]
Test 22: 100%|██████████| 100/100 [00:00<00:00, 105.10batch/s]


[Epoch 23] Train Loss: 0.000661 - Test Loss: 0.000537 - Train Accuracy: 98.00% - Test Accuracy: 98.26%
EarlyStopping: best=98.29% (epoch 20) patience_counter=3/10


Epoch 23: 100%|██████████| 600/600 [00:06<00:00, 94.03batch/s]
Test 23: 100%|██████████| 100/100 [00:00<00:00, 105.37batch/s]


[Epoch 24] Train Loss: 0.000631 - Test Loss: 0.000545 - Train Accuracy: 98.18% - Test Accuracy: 98.38%
EarlyStopping: best=98.29% (epoch 20) patience_counter=4/10


Epoch 24: 100%|██████████| 600/600 [00:06<00:00, 93.35batch/s]
Test 24: 100%|██████████| 100/100 [00:00<00:00, 103.20batch/s]


[Epoch 25] Train Loss: 0.000607 - Test Loss: 0.000554 - Train Accuracy: 98.22% - Test Accuracy: 98.15%
EarlyStopping: best=98.29% (epoch 20) patience_counter=5/10


Epoch 25: 100%|██████████| 600/600 [00:06<00:00, 93.99batch/s]
Test 25: 100%|██████████| 100/100 [00:00<00:00, 106.38batch/s]


[Epoch 26] Train Loss: 0.000610 - Test Loss: 0.000543 - Train Accuracy: 98.12% - Test Accuracy: 98.36%
EarlyStopping: best=98.29% (epoch 20) patience_counter=6/10


Epoch 26: 100%|██████████| 600/600 [00:06<00:00, 94.04batch/s]
Test 26: 100%|██████████| 100/100 [00:00<00:00, 104.88batch/s]


[Epoch 27] Train Loss: 0.000551 - Test Loss: 0.000531 - Train Accuracy: 98.36% - Test Accuracy: 98.41%
EarlyStopping: best=98.41% (epoch 27) patience_counter=0/10


Epoch 27: 100%|██████████| 600/600 [00:06<00:00, 93.82batch/s]
Test 27: 100%|██████████| 100/100 [00:00<00:00, 102.77batch/s]


[Epoch 28] Train Loss: 0.000569 - Test Loss: 0.000533 - Train Accuracy: 98.27% - Test Accuracy: 98.35%
EarlyStopping: best=98.41% (epoch 27) patience_counter=1/10


Epoch 28: 100%|██████████| 600/600 [00:06<00:00, 90.83batch/s]
Test 28: 100%|██████████| 100/100 [00:00<00:00, 100.91batch/s]


[Epoch 29] Train Loss: 0.000509 - Test Loss: 0.000543 - Train Accuracy: 98.41% - Test Accuracy: 98.32%
EarlyStopping: best=98.41% (epoch 27) patience_counter=2/10


Epoch 29: 100%|██████████| 600/600 [00:07<00:00, 80.46batch/s]
Test 29: 100%|██████████| 100/100 [00:00<00:00, 105.93batch/s]


[Epoch 30] Train Loss: 0.000510 - Test Loss: 0.000520 - Train Accuracy: 98.44% - Test Accuracy: 98.36%
EarlyStopping: best=98.41% (epoch 27) patience_counter=3/10


Epoch 30: 100%|██████████| 600/600 [00:06<00:00, 93.43batch/s]
Test 30: 100%|██████████| 100/100 [00:00<00:00, 105.82batch/s]


[Epoch 31] Train Loss: 0.000519 - Test Loss: 0.000516 - Train Accuracy: 98.39% - Test Accuracy: 98.44%
EarlyStopping: best=98.41% (epoch 27) patience_counter=4/10


Epoch 31: 100%|██████████| 600/600 [00:06<00:00, 92.03batch/s]
Test 31: 100%|██████████| 100/100 [00:01<00:00, 94.79batch/s]


[Epoch 32] Train Loss: 0.000469 - Test Loss: 0.000531 - Train Accuracy: 98.52% - Test Accuracy: 98.41%
EarlyStopping: best=98.41% (epoch 27) patience_counter=5/10


Epoch 32: 100%|██████████| 600/600 [00:06<00:00, 91.42batch/s]
Test 32: 100%|██████████| 100/100 [00:00<00:00, 103.46batch/s]


[Epoch 33] Train Loss: 0.000493 - Test Loss: 0.000528 - Train Accuracy: 98.45% - Test Accuracy: 98.37%
EarlyStopping: best=98.41% (epoch 27) patience_counter=6/10


Epoch 33: 100%|██████████| 600/600 [00:06<00:00, 89.88batch/s]
Test 33: 100%|██████████| 100/100 [00:00<00:00, 102.50batch/s]


[Epoch 34] Train Loss: 0.000455 - Test Loss: 0.000520 - Train Accuracy: 98.60% - Test Accuracy: 98.40%
EarlyStopping: best=98.41% (epoch 27) patience_counter=7/10


Epoch 34: 100%|██████████| 600/600 [00:06<00:00, 92.75batch/s]
Test 34: 100%|██████████| 100/100 [00:00<00:00, 102.83batch/s]


[Epoch 35] Train Loss: 0.000427 - Test Loss: 0.000497 - Train Accuracy: 98.69% - Test Accuracy: 98.43%
EarlyStopping: best=98.41% (epoch 27) patience_counter=8/10


Epoch 35: 100%|██████████| 600/600 [00:06<00:00, 94.15batch/s]
Test 35: 100%|██████████| 100/100 [00:00<00:00, 105.43batch/s]


[Epoch 36] Train Loss: 0.000453 - Test Loss: 0.000532 - Train Accuracy: 98.55% - Test Accuracy: 98.38%
EarlyStopping: best=98.41% (epoch 27) patience_counter=9/10


Epoch 36: 100%|██████████| 600/600 [00:06<00:00, 87.05batch/s]
Test 36: 100%|██████████| 100/100 [00:00<00:00, 103.46batch/s]


[Epoch 37] Train Loss: 0.000440 - Test Loss: 0.000488 - Train Accuracy: 98.62% - Test Accuracy: 98.43%
EarlyStopping: best=98.41% (epoch 27) patience_counter=10/10
Stopping early at epoch 37. Best was epoch 27 with acc 98.41%

BEST TEST ACCURACY:  98.44  in epoch  30


Test 36: 100%|██████████| 100/100 [00:00<00:00, 105.15batch/s]

Final best acc:  98.44



