# 07. PyTorch Experiment Tracking

In [1]:
import torchvision
import torch
from torch import nn
from torchvision import transforms
from torchinfo import summary
from going_modular import data_setup, engine

In [2]:
device = "cuda" if torch.cuda.is_available() else "cpu"

In [3]:
def set_seeds(seed: int=42):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)

In [4]:
set_seeds()

## 1. Get data


In [5]:
import os
import zipfile
from pathlib import Path
import requests

def download_data(
    source: str,
    destination: str,
    remove_source: True
) -> Path:
    data_path = Path("data/")
    image_path = data_path / destination

    if image_path.is_dir():
        print(f"[INFO] {image_path} directory already exists. Skipping download.")
    else:
        print(f"[INFO] Did not find {image_path} directory. Creating now.")
        image_path.mkdir(parents=True, exist_ok=True)

        target_file = Path(source).name
        with open(data_path / target_file, "wb") as f:
            request = requests.get(source)
            print(f"[INFO] Downloading {source} to {data_path / target_file}")
            f.write(request.content)

        with zipfile.ZipFile(data_path / target_file, "r") as zip_ref:
            print(f"[INFO] Extracting {data_path / target_file} to {image_path}")
            zip_ref.extractall(image_path)

        if remove_source:
            os.remove(data_path / target_file)

    return image_path

In [6]:
image_path = download_data(
    source="https://github.com/mrdbourke/pytorch-deep-learning/raw/refs/heads/main/data/pizza_steak_sushi.zip",
    destination="pizza_steak_sushi",
    remove_source=True
)


[INFO] data\pizza_steak_sushi directory already exists. Skipping download.


## 2. Create datasets and dataloaders

## 2.1 Create DataLoaders with manual transforms

The goal with transforms is to ensure your custom data is formatted in a reproducile way as well as a way that will suit pretrained models.

In [7]:
train_dir = image_path / "train"
test_dir = image_path / "test"

In [8]:
normalize = transforms.Normalize(
    mean=[0.485, 0.456, 0.406],
    std=[0.229, 0.224, 0.225]
)

manual_transforms = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    normalize
])

print(f"Manually created transforms: {manual_transforms}")

train_dataloader, test_dataloader, class_names = data_setup.create_dataloaders(
    train_dir, 
    test_dir, 
    manual_transforms, 
    batch_size=32
)

train_dataloader, test_dataloader, class_names

Manually created transforms: Compose(
    Resize(size=(224, 224), interpolation=bilinear, max_size=None, antialias=True)
    ToTensor()
    Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
)


(<torch.utils.data.dataloader.DataLoader at 0x1751360b110>,
 <torch.utils.data.dataloader.DataLoader at 0x175135e1d90>,
 ['pizza', 'steak', 'sushi'])

## 2.2 Create dataloaders using automatically created transforms

In [9]:
weights = torchvision.models.EfficientNet_B0_Weights.DEFAULT

automatic_transforms = weights.transforms()

train_dataloader, test_dataloader, class_names = data_setup.create_dataloaders(
    train_dir, 
    test_dir, 
    automatic_transforms, 
    batch_size=32
)

train_dataloader, test_dataloader, class_names

(<torch.utils.data.dataloader.DataLoader at 0x1751360b410>,
 <torch.utils.data.dataloader.DataLoader at 0x1751360b350>,
 ['pizza', 'steak', 'sushi'])

## 3. Getting a pretrained model, freeze the base layers and change the classifier head

In [10]:
model = torchvision.models.efficientnet_b0(weights=weights).to(device)

In [11]:
for param in model.features.parameters():
    param.requires_grad = False

In [12]:
model.classifier = nn.Sequential(
    nn.Dropout(0.2, inplace=True),
    nn.Linear(1280, len(class_names))
).to(device)

In [13]:
summary(
    model,
    input_size=(32, 3, 224, 224),
    col_names=[
        "input_size",
        "output_size",
        "num_params",
        "trainable"],
    col_width=20,
    row_settings=["var_names"]
)

Layer (type (var_name))                                      Input Shape          Output Shape         Param #              Trainable
EfficientNet (EfficientNet)                                  [32, 3, 224, 224]    [32, 3]              --                   Partial
├─Sequential (features)                                      [32, 3, 224, 224]    [32, 1280, 7, 7]     --                   False
│    └─Conv2dNormActivation (0)                              [32, 3, 224, 224]    [32, 32, 112, 112]   --                   False
│    │    └─Conv2d (0)                                       [32, 3, 224, 224]    [32, 32, 112, 112]   (864)                False
│    │    └─BatchNorm2d (1)                                  [32, 32, 112, 112]   [32, 32, 112, 112]   (64)                 False
│    │    └─SiLU (2)                                         [32, 32, 112, 112]   [32, 32, 112, 112]   --                   --
│    └─Sequential (1)                                        [32, 32, 112, 112]   [32, 

## 4. Train a single model and track results

In [14]:
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [15]:
from torch.utils.tensorboard import SummaryWriter
writer = SummaryWriter()

In [16]:
from going_modular.engine import train_step, test_step
from tqdm.auto import tqdm
from typing import List, Dict, Tuple

In [17]:
def train(model: torch.nn.Module, 
          train_dataloader: torch.utils.data.DataLoader, 
          test_dataloader: torch.utils.data.DataLoader, 
          optimizer: torch.optim.Optimizer,
          loss_fn: torch.nn.Module,
          epochs: int,
          device: torch.device) -> Dict[str, List[float]]:
    """Trains and tests a PyTorch model.

    Passes a target PyTorch models through train_step() and test_step()
    functions for a number of epochs, training and testing the model
    in the same epoch loop.

    Calculates, prints and stores evaluation metrics throughout.

    Args:
    model: A PyTorch model to be trained and tested.
    train_dataloader: A DataLoader instance for the model to be trained on.
    test_dataloader: A DataLoader instance for the model to be tested on.
    optimizer: A PyTorch optimizer to help minimize the loss function.
    loss_fn: A PyTorch loss function to calculate loss on both datasets.
    epochs: An integer indicating how many epochs to train for.
    device: A target device to compute on (e.g. "cuda" or "cpu").

    Returns:
    A dictionary of training and testing loss as well as training and
    testing accuracy metrics. Each metric has a value in a list for 
    each epoch.
    In the form: {train_loss: [...],
              train_acc: [...],
              test_loss: [...],
              test_acc: [...]} 
    For example if training for epochs=2: 
             {train_loss: [2.0616, 1.0537],
              train_acc: [0.3945, 0.3945],
              test_loss: [1.2641, 1.5706],
              test_acc: [0.3400, 0.2973]} 
    """
    # Create empty results dictionary
    results = {"train_loss": [],
               "train_acc": [],
               "test_loss": [],
               "test_acc": []
    }

    # Loop through training and testing steps for a number of epochs
    for epoch in tqdm(range(epochs)):
        train_loss, train_acc = train_step(model=model,
                                          dataloader=train_dataloader,
                                          loss_fn=loss_fn,
                                          optimizer=optimizer,
                                          device=device)
        test_loss, test_acc = test_step(model=model,
          dataloader=test_dataloader,
          loss_fn=loss_fn,
          device=device)

        # Print out what's happening
        print(
          f"Epoch: {epoch+1} | "
          f"train_loss: {train_loss:.4f} | "
          f"train_acc: {train_acc:.4f} | "
          f"test_loss: {test_loss:.4f} | "
          f"test_acc: {test_acc:.4f}"
        )

        # Update results dictionary
        results["train_loss"].append(train_loss)
        results["train_acc"].append(train_acc)
        results["test_loss"].append(test_loss)
        results["test_acc"].append(test_acc)

    ### NEW: Experiment tracking with TensorBoard
        writer.add_scalars(
            main_tag="Loss", 
            tag_scalar_dict={
                "train_loss": train_loss, 
                "test_loss": test_loss},
            global_step=epoch
        )


        writer.add_scalars(
            main_tag="Accuracy", 
            tag_scalar_dict={
                "train_acc": train_acc, 
                "test_acc": test_acc},
            global_step=epoch
        )

        writer.add_graph(
            model=model,
            input_to_model=torch.randn(32, 3, 224, 224).to(device)
        )

    writer.close()
    ### END NEW

    # Return the filled results at the end of the epochs
    return results

In [18]:
set_seeds()
results = train(
    model=model,
    train_dataloader=train_dataloader,
    test_dataloader=test_dataloader,
    optimizer=optimizer,
    loss_fn=loss_fn,
    epochs=25,
    device=device
)

  0%|          | 0/25 [00:00<?, ?it/s]

Epoch: 1 | train_loss: 1.0743 | train_acc: 0.4219 | test_loss: 0.8657 | test_acc: 0.7737
Epoch: 2 | train_loss: 0.8918 | train_acc: 0.6641 | test_loss: 0.7681 | test_acc: 0.7945
Epoch: 3 | train_loss: 0.7432 | train_acc: 0.7500 | test_loss: 0.6499 | test_acc: 0.8655
Epoch: 4 | train_loss: 0.6621 | train_acc: 0.8867 | test_loss: 0.6412 | test_acc: 0.8456
Epoch: 5 | train_loss: 0.6875 | train_acc: 0.7617 | test_loss: 0.6664 | test_acc: 0.8144
Epoch: 6 | train_loss: 0.5658 | train_acc: 0.8906 | test_loss: 0.5798 | test_acc: 0.8864
Epoch: 7 | train_loss: 0.5309 | train_acc: 0.9180 | test_loss: 0.5272 | test_acc: 0.8665
Epoch: 8 | train_loss: 0.5247 | train_acc: 0.8164 | test_loss: 0.5353 | test_acc: 0.8456
Epoch: 9 | train_loss: 0.4915 | train_acc: 0.8320 | test_loss: 0.5227 | test_acc: 0.8456
Epoch: 10 | train_loss: 0.4232 | train_acc: 0.9453 | test_loss: 0.4505 | test_acc: 0.8759
Epoch: 11 | train_loss: 0.5290 | train_acc: 0.7930 | test_loss: 0.4020 | test_acc: 0.9167
Epoch: 12 | train_l

In [19]:
%load_ext tensorboard
%tensorboard --logdir=runs

Reusing TensorBoard on port 6006 (pid 32024), started 0:05:14 ago. (Use '!kill 32024' to kill it.)