# 09: PyTorch Profiling

This notebook is an experiment to try out the PyTorch profiler.

See here for more:
* https://pytorch.org/blog/introducing-pytorch-profiler-the-new-and-improved-performance-tool/
* https://pytorch.org/docs/stable/profiler.html

In [10]:
import torch
import torchvision
from torch import nn
from torchvision import transforms, datasets
from torchinfo import summary

import numpy as np
import matplotlib.pyplot as plt

from going_modular import data_setup, engine

## Setup device

In [11]:
# Prefer the GPU when CUDA is available, otherwise fall back to the CPU.
# `device` is kept as a plain string and is read by later cells.
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

## Get and load data

In [12]:
import os
import requests
from zipfile import ZipFile

def get_food_image_data():
    """Download and extract the 10 whole foods image dataset unless it is already on disk.

    If `data/10_whole_foods` exists the function prints a message and does nothing else.
    Otherwise it downloads the zip archive into `data/`, extracts it into
    `data/10_whole_foods` and removes the archive afterwards.

    Raises:
        requests.HTTPError: If the server answers the download request with an error status.
    """
    targ_dir = "data/10_whole_foods"
    if os.path.exists(targ_dir):
        print("data/10_whole_foods dir exists, skipping download")
        return

    os.makedirs("data/", exist_ok=True)

    # Download data
    data_url = "https://storage.googleapis.com/food-vision-image-playground/10_whole_foods.zip"
    zip_path = os.path.join("data", "10_whole_foods.zip")
    print(f"Downloading data from {data_url}...")
    response = requests.get(data_url, timeout=60)
    # Fail loudly on 4xx/5xx instead of trying to unzip an error page
    response.raise_for_status()
    # The response body has to be written to disk before ZipFile can open it
    with open(zip_path, "wb") as zip_file:
        zip_file.write(response.content)

    # Unzip data
    print(f"Extracting data to {targ_dir}...")
    with ZipFile(zip_path) as zip_ref:
        zip_ref.extractall(targ_dir)

    # The archive is no longer needed once its contents are extracted
    os.remove(zip_path)

get_food_image_data()

data/10_whole_foods dir exists, skipping download


In [38]:
# Locations of the train/test image folders inside the extracted dataset
train_dir = "data/10_whole_foods/train"
test_dir = "data/10_whole_foods/test"

# Per-channel mean/std used for ImageNet normalization, so inputs follow a
# similar distribution to the data the pretrained model was trained on
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225])

# Starter transform: resize to 224x224, convert to a tensor, then normalize
simple_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    normalize
])           

# Create data loaders with the project helper; the same transform is passed for
# both directories. The helper returns three values, unpacked below.
# NOTE(review): presumably the train loader, the test loader and the list of
# class names, in that order — confirm against going_modular.data_setup.
train_dataloader, test_dataloader, class_names = data_setup.create_dataloaders(
    train_dir=train_dir,
    test_dir=test_dir,
    transform=simple_transform,
    batch_size=32,
    num_workers=8
)

# Display the three returned objects as the cell output
train_dataloader, test_dataloader, class_names

(<torch.utils.data.dataloader.DataLoader at 0x7f052ab26e20>,
 <torch.utils.data.dataloader.DataLoader at 0x7f05299eed00>,
 ['apple',
  'banana',
  'beef',
  'blueberries',
  'carrots',
  'chicken_wings',
  'egg',
  'honey',
  'mushrooms',
  'strawberries'])

## Load model 

In [66]:
# Load EfficientNet-B0 with pretrained weights and move it to the target device.
# NOTE(review): newer torchvision releases deprecate `pretrained=True` in favour of
# the `weights=` argument — confirm the installed version before switching.
model = torchvision.models.efficientnet_b0(pretrained=True).to(device)
# model

In [67]:
# Swap in a new classification head with one output per class in `class_names`
output_layer = nn.Linear(in_features=1280, out_features=len(class_names)).to(device)
model.classifier = nn.Sequential(nn.Dropout(p=0.2), output_layer)

# Freeze the feature extractor so only the new head receives gradient updates
for base_param in model.features.parameters():
    base_param.requires_grad = False

## Train model and track results

In [68]:
# Define loss and optimizer.
# The optimizer is given every model parameter, including the `model.features`
# parameters frozen in the previous cell, so the same optimizer is reused later
# in the notebook after those layers are unfrozen.
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

### Adjust training function to track results with `SummaryWriter`

In [69]:
# Attach a human-readable name to the model instance and display it.
# NOTE(review): "EfficietNetB0" looks like a typo for "EfficientNetB0"; the string is left as-is here.
model.name = "EfficietNetB0"
model.name

'EfficietNetB0'

In [70]:
from torch.utils.tensorboard import SummaryWriter
# NOTE: the `train_step` imported here is replaced by the profiler-enabled
# `train_step` defined later in this notebook; `test_step` is used as imported.
from going_modular.engine import train_step, test_step
from tqdm import tqdm
# Notebook-wide writer, created with default arguments; `train()` logs to it and closes it
writer = SummaryWriter()

Update the `train_step()` function to include the PyTorch profiler.

In [71]:
def train_step(model, dataloader, loss_fn, optimizer):
    """Run one training epoch under the PyTorch profiler and return averaged metrics.

    The whole epoch is wrapped in `torch.profiler.profile`; the collected trace is
    handed to `tensorboard_trace_handler`, which is pointed at
    `logs/<YYYY-mm-dd-HH-MM>/`.

    Args:
        model: Module to train; it is switched to training mode here.
        dataloader: Iterable of `(X, y)` batches that also supports `len()`.
        loss_fn: Callable taking `(y_pred, y)` and returning a scalar loss tensor.
        optimizer: Optimizer used to update the model's parameters.

    Returns:
        Tuple `(train_loss, train_acc)`: the per-batch loss and per-batch accuracy,
        each averaged over `len(dataloader)`.

    Note:
        Reads the notebook-level `device` variable for data placement and autocast.
    """
    # Imported locally: the notebook never imports `datetime`, so without this
    # the log directory line below raises NameError on a fresh kernel.
    from datetime import datetime

    model.train()
    train_loss, train_acc = 0, 0

    ## NEW: Add PyTorch profiler
    dir_to_save_logs = os.path.join("logs", datetime.now().strftime("%Y-%m-%d-%H-%M"))
    with torch.profiler.profile(
        on_trace_ready=torch.profiler.tensorboard_trace_handler(dir_name=dir_to_save_logs),
        # with_stack=True # this adds a lot of overhead to training (tracing all the stack)
    ):
        for batch, (X, y) in enumerate(dataloader):
            # Send data to the target device
            X, y = X.to(device, non_blocking=True), y.to(device, non_blocking=True)

            # Turn on mixed precision if available
            with torch.autocast(device_type=device, enabled=True):
                # 1. Forward pass
                y_pred = model(X)

                # 2. Calculate loss
                loss = loss_fn(y_pred, y)

            # 3. Optimizer zero grad
            optimizer.zero_grad()

            # 4. Loss backward
            loss.backward()

            # 5. Optimizer step
            optimizer.step()

            # 6. Calculate metrics
            train_loss += loss.item()
            # Softmax is monotonic, so argmax over the raw logits picks the same class
            y_pred_class = y_pred.argmax(dim=1)
            train_acc += (y_pred_class == y).sum().item() / len(y_pred)

    # Average the accumulated per-batch metrics over the number of batches
    return train_loss / len(dataloader), train_acc / len(dataloader)

Now, to use the writer, we need to adjust the `train()` function so that it logs each epoch's loss and accuracy to the `SummaryWriter`.

In [72]:
def train(
    model,
    train_dataloader,
    test_dataloader,
    optimizer,
    loss_fn=nn.CrossEntropyLoss(),
    epochs=5,
):
    """Train and evaluate `model` for `epochs` epochs, logging metrics to TensorBoard.

    Each epoch runs `train_step` on `train_dataloader` and `test_step` on
    `test_dataloader`, prints the four metrics, stores them, and sends them to the
    notebook-level `writer`. The writer is closed once all epochs have finished.

    Args:
        model: Model handed to `train_step` / `test_step`.
        train_dataloader: Dataloader used for the training pass.
        test_dataloader: Dataloader used for the evaluation pass.
        optimizer: Optimizer handed to `train_step`.
        loss_fn: Loss callable shared by both steps.
        epochs: Number of epochs to run.

    Returns:
        Dict with the keys "train_loss", "train_acc", "test_loss" and "test_acc",
        each mapping to a list holding one value per epoch.
    """
    metric_names = ("train_loss", "train_acc", "test_loss", "test_acc")
    results = {metric_name: [] for metric_name in metric_names}

    for epoch in tqdm(range(epochs)):
        # One pass over the training data, then one over the test data
        train_loss, train_acc = train_step(
            model=model, dataloader=train_dataloader, loss_fn=loss_fn, optimizer=optimizer
        )
        test_loss, test_acc = test_step(
            model=model, dataloader=test_dataloader, loss_fn=loss_fn
        )

        # Report progress for this epoch
        progress_line = (
            f"Epoch: {epoch+1} | "
            f"train_loss: {train_loss:.4f} | "
            f"train_acc: {train_acc:.4f} | "
            f"test_loss: {test_loss:.4f} | "
            f"test_acc: {test_acc:.4f}"
        )
        print(progress_line)

        # Record this epoch's values in the history
        epoch_values = (train_loss, train_acc, test_loss, test_acc)
        for metric_name, metric_value in zip(metric_names, epoch_values):
            results[metric_name].append(metric_value)

        # Send loss and accuracy pairs to the SummaryWriter
        loss_scalars = {"train_loss": train_loss, "test_loss": test_loss}
        accuracy_scalars = {"train_acc": train_acc, "test_acc": test_acc}
        writer.add_scalars(main_tag="Loss", tag_scalar_dict=loss_scalars, global_step=epoch)
        writer.add_scalars(main_tag="Accuracy", tag_scalar_dict=accuracy_scalars, global_step=epoch)

    # All epochs done: close the writer
    writer.close()

    return results

In [None]:
# Train model
# Note: Not using engine.train() since the original script isn't updated
# Run 1: train with the base layers frozen, so only the new classifier head is updated.
# Uses the notebook's own train() rather than engine.train(), because the
# version in going_modular has not been updated with the profiler/writer changes.
results = train(model=model,
        train_dataloader=train_dataloader,
        test_dataloader=test_dataloader,
        optimizer=optimizer,
        loss_fn=loss_fn,
        epochs=5)

## Try mixed precision with larger model

Now we'll try turning on mixed precision with a larger model (e.g. EfficientNetB0 with all layers tunable).

In [74]:
# Unfreeze all base layers 
# Make every parameter in the feature extractor trainable again
for feature_param in model.features.parameters():
    feature_param.requires_grad = True

In [None]:
# Train model
# Note: Not using engine.train() since the original script isn't updated
# Run 2: train again, now with all base layers unfrozen.
# Uses the notebook's own train() rather than engine.train(), because the
# version in going_modular has not been updated with the profiler/writer changes.
# NOTE(review): this reuses the same `model`, `optimizer` and `writer` as the first
# run, and train() logs with global_step starting at 0 again under the same tags —
# confirm the two runs are distinguishable in TensorBoard.
results = train(model=model,
        train_dataloader=train_dataloader,
        test_dataloader=test_dataloader,
        optimizer=optimizer,
        loss_fn=loss_fn,
        epochs=5)

According to the PyTorch profiler, mixed precision does make use of the Tensor Cores; however, the utilisation is low.

For example, Tensor Core utilisation is only about 9-12%. One possible explanation for the slowdown when using mixed precision is that tensors have to be converted to lower precision, and there isn't enough Tensor Core work to make that worthwhile: if only around 9-12% of the workload benefits, the speed-up gained there may be cancelled out by the conversion time.

## Extensions
* Does changing the input image size for EfficientNetB4 change its results? E.g. using an input image size of (380, 380) instead of (224, 224)?