In [None]:
import os
import sys
import json
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
import torchvision.models as models
import io
from contextlib import redirect_stdout
from PIL import Image
from torch.utils.data import Subset, Dataset, random_split
from tqdm import tqdm
from torch.utils.tensorboard import SummaryWriter

# ----- Define a Tee class to write to multiple streams -----
class Tee:
    """Write-only stream that duplicates everything written to several streams.

    Intended as a stand-in for ``sys.stdout`` (e.g. with
    ``contextlib.redirect_stdout``) so output is shown on the console and
    captured in a buffer at the same time.

    Args:
        *fileobjects: Any objects exposing ``write(text)`` and ``flush()``.
    """

    def __init__(self, *fileobjects):
        self.fileobjects = fileobjects

    def write(self, text):
        """Write ``text`` to every wrapped stream and flush each one.

        Flushing after every write keeps the console output live while
        a long-running job is printing through this object.

        Returns:
            The number of characters in ``text``, matching the return
            convention of text-stream ``write`` methods.
        """
        for stream in self.fileobjects:
            stream.write(text)
            stream.flush()
        return len(text)

    def flush(self):
        """Flush every wrapped stream."""
        for stream in self.fileobjects:
            stream.flush()

# ----- Specify paths for saving logs, metrics, and models -----
log_file_path = "./logs/cifar10_training_log.txt"
metrics_file_path = "./metrics/cifar10_training_metrics.json"
best_model_path = "./models/cifar10_best_model"
save_path = "./models/cifar10_resnet18"  # prefix; "_<epoch>.pth" / "_final.pth" is appended when saving

# Neither open(..., "w") nor torch.save() creates missing parent directories,
# so make sure every output directory exists before anything is written.
for _output_path in (log_file_path, metrics_file_path, best_model_path, save_path):
    os.makedirs(os.path.dirname(_output_path), exist_ok=True)

# ----- Prepare the CIFAR-10 DataLoaders -----
# Convert images to tensors, then normalise each of the three channels with
# mean 0.5 and std 0.5. The same transform is used for train, val and test.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])

# Download (if needed) and load the official training split
full_trainset = torchvision.datasets.CIFAR10(root='./data', train=True,
                                            download=True, transform=transform)

# Define the sizes for train and validation sets
train_size = int(0.8 * len(full_trainset))  # 80% for training
val_size = len(full_trainset) - train_size   # Remaining 20% for validation

# Split the dataset with a fixed seed so the train/validation membership is the
# same on every run. Without this, a run resumed from a checkpoint would draw a
# new split and validate on samples the checkpoint had already trained on.
split_seed = 42
split_generator = torch.Generator().manual_seed(split_seed)
trainset, valset = random_split(full_trainset, [train_size, val_size],
                                generator=split_generator)

# Download (if needed) and load the official test split
testset = torchvision.datasets.CIFAR10(root='./data', train=False,
                                      download=True, transform=transform)

batch_size = 256
num_workers = 2
# Only the training loader shuffles; validation and test are iterated in a
# fixed order.
train_loader = torch.utils.data.DataLoader(trainset, batch_size=batch_size,
                                         shuffle=True, num_workers=num_workers)
val_loader = torch.utils.data.DataLoader(valset, batch_size=batch_size,
                                       shuffle=False, num_workers=num_workers)
test_loader = torch.utils.data.DataLoader(testset, batch_size=batch_size,
                                        shuffle=False, num_workers=num_workers)



# ----- Prepare the Model for CIFAR-10 -----
# weights=None: no pretrained weights are requested for the ResNet-18 backbone.
model = models.resnet18(weights=None)
# Adapt the stem for 32x32 images: swap the stock first conv for a 3x3,
# stride-1 conv and turn the max-pool into a no-op (nn.Identity).
model.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
model.maxpool = nn.Identity()
# Replace the classification head with a 10-output linear layer (CIFAR-10),
# keeping the input width of the existing head.
num_ftrs = model.fc.in_features
model.fc = nn.Linear(num_ftrs, 10)

# ----- Setup Training Parameters -----
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Number of epochs to run in this invocation (counted from start_epoch).
num_epochs = 200
# Learning rate passed to SGD below; no scheduler is attached in this script.
learning_rate = 0.1
# A checkpoint is written whenever the 0-indexed epoch is a multiple of this.
checkpoint_frequency = 10
# 0-indexed epoch to start counting from; set together with start_checkpoint
# when resuming.
start_epoch = 0
# Path to a saved state_dict to resume from, or None to train from scratch.
start_checkpoint = None

# If starting from a checkpoint, load the model state.
# Only the model weights are restored; the optimizer below is always created
# fresh, so its momentum buffers start empty even when resuming.
if start_checkpoint:
    state_dict = torch.load(start_checkpoint, map_location=device)
    model.load_state_dict(state_dict)

model.to(device)
criterion = nn.CrossEntropyLoss()
# SGD with momentum 0.9 and weight decay 5e-4 over all model parameters.
optimizer = optim.SGD(model.parameters(), lr=learning_rate, momentum=0.9, weight_decay=5e-4)

# ----- Initialize Metrics Storage -----
# One independent list per tracked series; every epoch appends one value to each.
metrics = {
    series: []
    for series in ("epochs", "train_loss", "train_acc", "val_loss", "val_acc")
}
# Best validation accuracy seen so far, and the epoch number it was reached at
# (None until the first improvement).
best_val_acc, best_epoch = 0.0, None

# ----- Capture Standard Output During Training and Print to Console -----
log_capture = io.StringIO()
tee = Tee(sys.stdout, log_capture)  # Prints to stdout and also keeps a copy in log_capture

# TensorBoard writer for the per-epoch loss/accuracy scalars logged below
writer = SummaryWriter(log_dir='./runs/cifar10_experiment')

# Number of the last epoch of this run (1-indexed). Shared by the progress bar
# and the per-epoch summary so both show the same total when start_epoch > 0.
total_epochs = start_epoch + num_epochs

try:
    with redirect_stdout(tee):
        print("Training Resnet18 CIFAR-10 model...")
        for epoch in range(start_epoch, total_epochs):
            model.train()
            running_loss = 0.0
            correct = 0
            total = 0

            # Save a checkpoint at the specified frequency. This happens before
            # the epoch's training, so "<save_path>_<N>.pth" holds the weights
            # as they were when entering 0-indexed epoch N.
            if epoch % checkpoint_frequency == 0:
                torch.save(model.state_dict(), save_path + "_" + str(epoch) + ".pth")

            # ----- Training pass (with a tqdm progress bar) -----
            for inputs, labels in tqdm(train_loader, desc=f"Epoch {epoch+1}/{total_epochs}"):
                inputs, labels = inputs.to(device), labels.to(device)
                optimizer.zero_grad()
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()

                # criterion returns the batch mean; weight it by the batch size
                # so the epoch figure is a true per-sample average even when
                # the last batch is smaller.
                running_loss += loss.item() * inputs.size(0)
                _, predicted = outputs.max(1)
                total += labels.size(0)
                correct += predicted.eq(labels).sum().item()

            train_epoch_loss = running_loss / total
            train_epoch_acc = 100. * correct / total

            # ----- Evaluate on the validation set (no gradients) -----
            model.eval()
            val_loss = 0.0
            val_correct = 0
            val_total = 0
            with torch.no_grad():
                for inputs, labels in val_loader:
                    inputs, labels = inputs.to(device), labels.to(device)
                    outputs = model(inputs)
                    loss = criterion(outputs, labels)
                    val_loss += loss.item() * inputs.size(0)
                    _, predicted = outputs.max(1)
                    val_total += labels.size(0)
                    val_correct += predicted.eq(labels).sum().item()

            val_epoch_loss = val_loss / val_total
            val_epoch_acc = 100. * val_correct / val_total

            # Store metrics for this epoch
            metrics["epochs"].append(epoch+1)  # Epochs are stored 1-indexed for clarity
            metrics["train_loss"].append(train_epoch_loss)
            metrics["train_acc"].append(train_epoch_acc)
            metrics["val_loss"].append(val_epoch_loss)
            metrics["val_acc"].append(val_epoch_acc)

            print(f"Epoch [{epoch+1}/{total_epochs}] - Train Loss: {train_epoch_loss:.4f}, Train Acc: {train_epoch_acc:.2f}% | Val Loss: {val_epoch_loss:.4f}, Val Acc: {val_epoch_acc:.2f}%")

            # TensorBoard logging; the step is the 0-indexed epoch
            writer.add_scalar('Loss/Train', train_epoch_loss, epoch)
            writer.add_scalar('Accuracy/Train', train_epoch_acc, epoch)
            writer.add_scalar('Loss/Validation', val_epoch_loss, epoch)
            writer.add_scalar('Accuracy/Validation', val_epoch_acc, epoch)

            # Save the best model based on validation accuracy (strict
            # improvement only, so ties keep the earlier epoch)
            if val_epoch_acc > best_val_acc:
                best_val_acc = val_epoch_acc
                best_epoch = epoch+1
                torch.save(model.state_dict(), best_model_path)
                print(f"New best model found at epoch {epoch+1} with Val Acc: {val_epoch_acc:.2f}%")

            # ----- Save metrics JSON after every epoch -----
            # Rewritten in full each epoch so the file on disk is always current.
            with open(metrics_file_path, "w") as f:
                json.dump(metrics, f, indent=4)

        print("Training complete.")
        torch.save(model.state_dict(), save_path + "_final.pth")
finally:
    # Runs on normal completion and on errors or interrupts alike, so the
    # TensorBoard writer is closed and the log captured so far reaches disk
    # even if training stops partway through.
    writer.close()

    # Write the captured logs to the specified log file
    with open(log_file_path, "w") as f:
        f.write(log_capture.getvalue())

# Confirmation messages; only reached when training finished without error
print("Training log captured and saved to", log_file_path)
print("Training metrics saved to", metrics_file_path)
print("Best model saved from epoch", best_epoch, "with validation accuracy of", best_val_acc)

2025-04-05 04:01:31.965875: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-04-05 04:01:32.647579: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-04-05 04:01:32.647691: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-04-05 04:01:32.757048: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-04-05 04:01:32.978816: I tensorflow/core/platform/cpu_feature_guar

Files already downloaded and verified
Files already downloaded and verified
Training Resnet18 CIFAR-10 model...


Epoch 1/200: 100%|██████████| 157/157 [00:13<00:00, 11.36it/s]


Epoch [1/200] - Train Loss: 2.2341, Train Acc: 25.86% | Val Loss: 1.7402, Val Acc: 35.73%
New best model found at epoch 1 with Val Acc: 35.73%


Epoch 2/200: 100%|██████████| 157/157 [00:12<00:00, 12.49it/s]


Epoch [2/200] - Train Loss: 1.5303, Train Acc: 43.49% | Val Loss: 1.4478, Val Acc: 46.84%
New best model found at epoch 2 with Val Acc: 46.84%


Epoch 3/200: 100%|██████████| 157/157 [00:12<00:00, 12.52it/s]


Epoch [3/200] - Train Loss: 1.2894, Train Acc: 53.26% | Val Loss: 1.2702, Val Acc: 54.28%
New best model found at epoch 3 with Val Acc: 54.28%


Epoch 4/200: 100%|██████████| 157/157 [00:12<00:00, 12.45it/s]


Epoch [4/200] - Train Loss: 1.0684, Train Acc: 61.71% | Val Loss: 1.1837, Val Acc: 60.05%
New best model found at epoch 4 with Val Acc: 60.05%


Epoch 5/200: 100%|██████████| 157/157 [00:12<00:00, 12.39it/s]


Epoch [5/200] - Train Loss: 0.8756, Train Acc: 69.19% | Val Loss: 1.0485, Val Acc: 63.94%
New best model found at epoch 5 with Val Acc: 63.94%


Epoch 6/200: 100%|██████████| 157/157 [00:12<00:00, 12.34it/s]


Epoch [6/200] - Train Loss: 0.7371, Train Acc: 73.64% | Val Loss: 0.8699, Val Acc: 69.44%
New best model found at epoch 6 with Val Acc: 69.44%


Epoch 7/200: 100%|██████████| 157/157 [00:12<00:00, 12.35it/s]


Epoch [7/200] - Train Loss: 0.6164, Train Acc: 78.35% | Val Loss: 0.7827, Val Acc: 72.80%
New best model found at epoch 7 with Val Acc: 72.80%


Epoch 8/200: 100%|██████████| 157/157 [00:12<00:00, 12.32it/s]


Epoch [8/200] - Train Loss: 0.5185, Train Acc: 81.40% | Val Loss: 0.7995, Val Acc: 72.28%


Epoch 9/200: 100%|██████████| 157/157 [00:12<00:00, 12.28it/s]


Epoch [9/200] - Train Loss: 0.4121, Train Acc: 85.32% | Val Loss: 0.8971, Val Acc: 70.38%


Epoch 10/200: 100%|██████████| 157/157 [00:12<00:00, 12.34it/s]


Epoch [10/200] - Train Loss: 0.3248, Train Acc: 88.46% | Val Loss: 0.7748, Val Acc: 74.83%
New best model found at epoch 10 with Val Acc: 74.83%


Epoch 11/200: 100%|██████████| 157/157 [00:12<00:00, 12.31it/s]


Epoch [11/200] - Train Loss: 0.2632, Train Acc: 90.83% | Val Loss: 0.7953, Val Acc: 75.30%
New best model found at epoch 11 with Val Acc: 75.30%


Epoch 12/200: 100%|██████████| 157/157 [00:12<00:00, 12.30it/s]


Epoch [12/200] - Train Loss: 0.2126, Train Acc: 92.63% | Val Loss: 0.7732, Val Acc: 75.81%
New best model found at epoch 12 with Val Acc: 75.81%


Epoch 13/200: 100%|██████████| 157/157 [00:12<00:00, 12.27it/s]


Epoch [13/200] - Train Loss: 0.1646, Train Acc: 94.36% | Val Loss: 1.0608, Val Acc: 71.74%


Epoch 14/200: 100%|██████████| 157/157 [00:12<00:00, 12.28it/s]


Epoch [14/200] - Train Loss: 0.1592, Train Acc: 94.50% | Val Loss: 0.9517, Val Acc: 75.09%


Epoch 15/200: 100%|██████████| 157/157 [00:12<00:00, 12.31it/s]


Epoch [15/200] - Train Loss: 0.1316, Train Acc: 95.38% | Val Loss: 0.7968, Val Acc: 78.30%
New best model found at epoch 15 with Val Acc: 78.30%


Epoch 16/200: 100%|██████████| 157/157 [00:12<00:00, 12.26it/s]


Epoch [16/200] - Train Loss: 0.1029, Train Acc: 96.41% | Val Loss: 1.3036, Val Acc: 70.53%


Epoch 17/200: 100%|██████████| 157/157 [00:12<00:00, 12.32it/s]


Epoch [17/200] - Train Loss: 0.1319, Train Acc: 95.46% | Val Loss: 0.8709, Val Acc: 76.66%


Epoch 18/200: 100%|██████████| 157/157 [00:12<00:00, 12.23it/s]


Epoch [18/200] - Train Loss: 0.1059, Train Acc: 96.43% | Val Loss: 0.9628, Val Acc: 75.81%


Epoch 19/200: 100%|██████████| 157/157 [00:12<00:00, 12.27it/s]


Epoch [19/200] - Train Loss: 0.0991, Train Acc: 96.59% | Val Loss: 0.9444, Val Acc: 75.70%


Epoch 20/200: 100%|██████████| 157/157 [00:12<00:00, 12.30it/s]


Epoch [20/200] - Train Loss: 0.1032, Train Acc: 96.42% | Val Loss: 1.0008, Val Acc: 75.42%


Epoch 21/200: 100%|██████████| 157/157 [00:12<00:00, 12.29it/s]


Epoch [21/200] - Train Loss: 0.1119, Train Acc: 96.20% | Val Loss: 0.9278, Val Acc: 75.51%


Epoch 22/200: 100%|██████████| 157/157 [00:12<00:00, 12.26it/s]


Epoch [22/200] - Train Loss: 0.1020, Train Acc: 96.53% | Val Loss: 0.8464, Val Acc: 77.59%


Epoch 23/200: 100%|██████████| 157/157 [00:12<00:00, 12.30it/s]


Epoch [23/200] - Train Loss: 0.1057, Train Acc: 96.35% | Val Loss: 0.9893, Val Acc: 74.89%


Epoch 24/200: 100%|██████████| 157/157 [00:12<00:00, 12.33it/s]


Epoch [24/200] - Train Loss: 0.0848, Train Acc: 97.21% | Val Loss: 0.8826, Val Acc: 77.36%


Epoch 25/200: 100%|██████████| 157/157 [00:12<00:00, 12.27it/s]


Epoch [25/200] - Train Loss: 0.0934, Train Acc: 96.82% | Val Loss: 1.0234, Val Acc: 74.58%


Epoch 26/200: 100%|██████████| 157/157 [00:12<00:00, 12.28it/s]


Epoch [26/200] - Train Loss: 0.1189, Train Acc: 95.98% | Val Loss: 0.7809, Val Acc: 78.38%
New best model found at epoch 26 with Val Acc: 78.38%


Epoch 27/200: 100%|██████████| 157/157 [00:12<00:00, 12.28it/s]


Epoch [27/200] - Train Loss: 0.0750, Train Acc: 97.53% | Val Loss: 0.7732, Val Acc: 79.04%
New best model found at epoch 27 with Val Acc: 79.04%


Epoch 28/200: 100%|██████████| 157/157 [00:12<00:00, 12.29it/s]


Epoch [28/200] - Train Loss: 0.0721, Train Acc: 97.63% | Val Loss: 0.9619, Val Acc: 76.02%


Epoch 29/200: 100%|██████████| 157/157 [00:12<00:00, 12.30it/s]


Epoch [29/200] - Train Loss: 0.1031, Train Acc: 96.52% | Val Loss: 1.0732, Val Acc: 74.55%


Epoch 30/200: 100%|██████████| 157/157 [00:12<00:00, 12.28it/s]


Epoch [30/200] - Train Loss: 0.0816, Train Acc: 97.28% | Val Loss: 0.9120, Val Acc: 77.10%


Epoch 31/200: 100%|██████████| 157/157 [00:12<00:00, 12.31it/s]


Epoch [31/200] - Train Loss: 0.0939, Train Acc: 96.80% | Val Loss: 0.9930, Val Acc: 74.59%


Epoch 32/200:  52%|█████▏    | 81/157 [00:06<00:06, 12.36it/s]