In [1]:
# Make sure we're using a NVIDIA GPU
import torch
if torch.cuda.is_available():
  gpu_info = !nvidia-smi
  gpu_info = '\n'.join(gpu_info)
  if gpu_info.find("failed") >= 0:
    print("Not connected to a GPU, to leverage the best of PyTorch 2.0, you should connect to a GPU.")

  # Get GPU name
  gpu_name = !nvidia-smi --query-gpu=gpu_name --format=csv
  gpu_name = gpu_name[1]
  GPU_NAME = gpu_name.replace(" ", "_") # remove underscores for easier saving
  print(f'GPU name: {GPU_NAME}')

  # Get GPU capability score
  GPU_SCORE = torch.cuda.get_device_capability()
  print(f"GPU capability score: {GPU_SCORE}")
  if GPU_SCORE >= (8, 0):
    print(f"GPU score higher than or equal to (8, 0), PyTorch 2.x speedup features available.")
  else:
    print(f"GPU score lower than (8, 0), PyTorch 2.x speedup features will be limited (PyTorch 2.x speedups happen most on newer GPUs).")

  # Print GPU info
  print(f"GPU information:\n{gpu_info}")

else:
  print("PyTorch couldn't find a GPU, to leverage the best of PyTorch 2.0, you should connect to a GPU.")

GPU name: NVIDIA_GeForce_RTX_4060_Ti
GPU capability score: (8, 9)
GPU score higher than or equal to (8, 0), PyTorch 2.x speedup features available.
GPU information:
Thu Apr 18 16:32:17 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.171.04             Driver Version: 535.171.04   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 4060 Ti     Off | 00000000:01:00.0 Off |                  N/A |
|  0%   39C    P8              13W / 165W |      4MiB / 16380MiB |      0%      Default |
|                                         |                      |                 

In [2]:
# Check available GPU memory and total GPU memory
# mem_get_info() is unpacked as (free, total); both values are scaled by 1e-9 below so
# they print as decimal GB. `total_free_gpu_memory` is read again by the batch-size cell.
total_free_gpu_memory, total_gpu_memory = torch.cuda.mem_get_info()
print(f"Total free GPU memory: {round(total_free_gpu_memory * 1e-9, 3)} GB")
print(f"Total GPU memory: {round(total_gpu_memory * 1e-9, 3)} GB")

Total free GPU memory: 16.734 GB
Total GPU memory: 16.864 GB


In [3]:
# Toggle TensorFloat32 for CUDA matmuls based on the GPU capability score:
# anything below (8, 0) gets TF32 switched off, everything else gets it switched on.
if GPU_SCORE < (8, 0):
  print(f"[INFO] Using GPU with score: {GPU_SCORE}, TensorFloat32 (TF32) not available, to use it you need a GPU with score >= (8, 0)")
  torch.backends.cuda.matmul.allow_tf32 = False
else:
  print(f"[INFO] Using GPU with score: {GPU_SCORE}, enabling TensorFloat32 (TF32) computing (faster on new GPUs)")
  torch.backends.cuda.matmul.allow_tf32 = True

[INFO] Using GPU with score: (8, 9), enabling TensorFloat32 (TF32) computing (faster on new GPUs)


In [4]:
import wandb
# Log in to Weights & Biases; wandb.init / wandb.log / wandb.save are used further down.
wandb.login()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mdate3k2[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [4]:
import torch
import torchvision
# Device string used by the model and evaluation code below: "cuda" when available, otherwise "cpu".
device = "cuda" if torch.cuda.is_available() else "cpu"

In [5]:
# Choose batch size and image size from the free GPU memory measured earlier (scaled to decimal GB).
total_free_gpu_memory_gb = round(total_free_gpu_memory * 1e-9, 3)

# With at least 16 GB free use the larger settings (you could experiment with higher
# batch sizes here if you like); otherwise fall back to the smaller ones.
BATCH_SIZE, IMAGE_SIZE = (128, 224) if total_free_gpu_memory_gb >= 16 else (32, 128)
print(f"GPU memory available is {total_free_gpu_memory_gb} GB, using batch size of {BATCH_SIZE} and image size {IMAGE_SIZE}")

GPU memory available is 16.734 GB, using batch size of 128 and image size 224


In [7]:
def create_model(num_classes=10):
    """Create a ResNet50 with IMAGENET1K_V2 weights and a new classification head.

    Args:
      num_classes: Number of output features for the replacement `fc` layer.

    Returns:
      A tuple (model, transforms): the ResNet50 with its `fc` layer replaced, and the
      preprocessing transforms object bundled with the IMAGENET1K_V2 weights.
    """
    model_weights = torchvision.models.ResNet50_Weights.IMAGENET1K_V2
    transforms = model_weights.transforms()
    model = torchvision.models.resnet50(weights=model_weights)

    # Swap the head for one sized to the dataset. The input width is read from the
    # existing head rather than hard-coded, so it always matches the backbone.
    model.fc = torch.nn.Linear(
        in_features=model.fc.in_features, out_features=num_classes
    )
    return model, transforms

## Data loaders

In [13]:
import os
from torch.utils.data import DataLoader


def get_train_test_dataloader(transforms, batch_size: int, num_workers: int):
    """Build CIFAR10 train/test datasets and a DataLoader for each.

    Both splits are stored under the current directory (downloaded if needed) and
    use the same `transforms`. Only the training loader shuffles.

    Args:
      transforms: Transform applied to every image of both splits.
      batch_size: Batch size for both loaders.
      num_workers: Number of worker processes for both loaders.

    Returns:
      (train_dataloader, test_dataloader, train_dataset, test_dataset)
    """
    # Keyed by the `train` flag; insertion order (True, then False) keeps the
    # train split first for both dataset creation and loader creation.
    datasets = {
        is_train: torchvision.datasets.CIFAR10(
            root=".", train=is_train, download=True, transform=transforms
        )
        for is_train in (True, False)
    }
    loaders = {
        is_train: DataLoader(
            dataset=split,
            batch_size=batch_size,
            shuffle=is_train,  # shuffle the train split only
            num_workers=num_workers,
        )
        for is_train, split in datasets.items()
    }
    return loaders[True], loaders[False], datasets[True], datasets[False]

In [14]:
# Build loaders and datasets for a quick look at the data, using the transforms bundled
# with the ResNet50 IMAGENET1K_V2 weights.
# Heads-up: `train` and `test` are also the names of functions defined further down, so
# whichever cell ran last decides what those two names refer to.
train, test, tr, t = get_train_test_dataloader(
    transforms=torchvision.models.ResNet50_Weights.IMAGENET1K_V2.transforms(),
    batch_size=BATCH_SIZE,
    num_workers=4,
)

Files already downloaded and verified
Files already downloaded and verified


In [17]:
# Peek at the first item of the training dataset returned by the call above.
tr[0]

AttributeError: 'tuple' object has no attribute 'shape'

In [11]:
# Pull one batch from the training loader, print it, and stop after that first batch.
for X, y in train:
    print(X, y)
    break

tensor([[[[-1.6042, -1.6042, -1.6042,  ..., -1.5870, -1.5528, -1.5357],
          [-1.5870, -1.6042, -1.5870,  ..., -1.5870, -1.5699, -1.5528],
          [-1.5699, -1.5870, -1.5870,  ..., -1.5870, -1.5699, -1.5699],
          ...,
          [ 0.2282,  0.2624,  0.2967,  ..., -0.3369, -0.3369, -0.3541],
          [ 0.2111,  0.2453,  0.2796,  ..., -0.4054, -0.4054, -0.4054],
          [ 0.1768,  0.2282,  0.2624,  ..., -0.4739, -0.4739, -0.4911]],

         [[-1.1779, -1.1779, -1.1954,  ..., -1.1429, -1.1253, -1.1078],
          [-1.1429, -1.1429, -1.1604,  ..., -1.1253, -1.1253, -1.1253],
          [-1.1253, -1.1253, -1.1253,  ..., -1.1253, -1.1253, -1.1253],
          ...,
          [ 0.4503,  0.4853,  0.5203,  ...,  0.0301,  0.0126, -0.0224],
          [ 0.4328,  0.4678,  0.5203,  ..., -0.0399, -0.0749, -0.1099],
          [ 0.3978,  0.4503,  0.5028,  ..., -0.1275, -0.1450, -0.1800]],

         [[-0.9504, -0.9504, -0.9678,  ..., -0.9853, -0.9853, -0.9853],
          [-0.9504, -0.9330, -

In [15]:
# Keep the pretrained-weights entry and its bundled preprocessing transforms as top-level names.
model_weights = torchvision.models.ResNet50_Weights.IMAGENET1K_V2
transforms = model_weights.transforms()

In [17]:
# Load the CIFAR10 training split again, this time without passing a transform.
train_dataset = torchvision.datasets.CIFAR10(
    root=".", train=True, download=True
)

Files already downloaded and verified


In [34]:
# Display the dataset's summary repr.
train_dataset

Dataset CIFAR10
    Number of datapoints: 50000
    Root location: .
    Split: Train

In [41]:
# Import the transforms module under a short alias. Binding it to the name `transforms`
# would overwrite the preset transform object that another cell stores under that name.
import torchvision.transforms as T

# Convert the image to a tensor, then normalize each channel with the given mean and std
transform = T.Compose(
    [
        T.ToTensor(),
        T.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
    ]
)

# Apply the pipeline to the first element of the first training sample
tensor = transform(train_dataset[0][0])
tensor

tensor([[[-1.1075, -1.3815, -1.2617,  ...,  0.5878,  0.4851,  0.4166],
         [-1.8439, -2.1179, -1.8097,  ..., -0.0116, -0.0801, -0.0287],
         [-1.6898, -1.8439, -1.2788,  ..., -0.0972, -0.0629, -0.2513],
         ...,
         [ 1.4440,  1.3242,  1.2728,  ...,  0.6221, -1.1589, -1.2103],
         [ 0.9646,  0.8447,  1.0673,  ...,  1.0331, -0.4568, -0.6965],
         [ 0.9132,  0.7591,  0.9474,  ...,  1.5810,  0.4679, -0.0116]],

        [[-0.9503, -1.2304, -1.1954,  ...,  0.2752,  0.1527,  0.1352],
         [-1.6856, -2.0357, -1.8957,  ..., -0.4951, -0.5826, -0.5126],
         [-1.6155, -1.9132, -1.5630,  ..., -0.5651, -0.5651, -0.7577],
         ...,
         [ 0.9405,  0.6429,  0.7829,  ...,  0.2927, -1.4930, -1.4405],
         [ 0.3978,  0.1176,  0.4853,  ...,  0.5553, -0.9503, -1.1078],
         [ 0.4853,  0.2227,  0.4503,  ...,  1.1856,  0.0301, -0.4251]],

        [[-0.7064, -1.0201, -1.0550,  ...,  0.0779, -0.0267, -0.0092],
         [-1.4559, -1.8044, -1.8044,  ..., -0

## Train function

In [9]:
import time
from tqdm.notebook import tqdm as tqdm
from typing import Tuple


def train_step(
    epoch: int,
    model: torch.nn.Module,
    dataloader: torch.utils.data.DataLoader,
    loss_fn: torch.nn.Module,
    optimizer: torch.optim.Optimizer,
    device: torch.device,
    disable_progress_bar: bool = False,
) -> Tuple[float, float]:
    """Trains a PyTorch model for a single epoch.

    Turns a target PyTorch model to training mode and then
    runs through all of the required training steps (forward
    pass, loss calculation, optimizer step). Running averages are
    sent to Weights & Biases every 20 batches and once more at the
    end of the epoch.

    Args:
      epoch: Epoch index, used only to label the progress bar.
      model: A PyTorch model to be trained.
      dataloader: A DataLoader instance for the model to be trained on.
      loss_fn: A PyTorch loss function to minimize.
      optimizer: A PyTorch optimizer to help minimize the loss function.
      device: A target device to compute on (e.g. "cuda" or "cpu").
      disable_progress_bar: Passed to tqdm as `disable`; True hides the bar.

    Returns:
      A tuple of training loss and training accuracy metrics, each
      averaged over batches. In the form (train_loss, train_accuracy).
      For example:

      (0.1112, 0.8743)
    """
    # Put model in train mode
    model.train()

    # Running sums of per-batch loss and per-batch accuracy
    train_loss, train_acc = 0, 0

    # Loop through data loader data batches
    progress_bar = tqdm(
        enumerate(dataloader),
        desc=f"Training Epoch {epoch}",
        total=len(dataloader),
        disable=disable_progress_bar,
    )

    for batch, (X, y) in progress_bar:
        # Send data to target device
        X, y = X.to(device), y.to(device)

        # 1. Forward pass
        y_pred = model(X)

        # 2. Calculate and accumulate loss
        loss = loss_fn(y_pred, y)
        train_loss += loss.item()

        # 3. Optimizer zero grad
        optimizer.zero_grad()

        # 4. Loss backward
        loss.backward()

        # 5. Optimizer step
        optimizer.step()

        # Accumulate this batch's accuracy. Softmax is monotonic, so the argmax of the
        # raw logits gives the predicted class directly (same approach as test_step).
        y_pred_class = y_pred.argmax(dim=1)
        train_acc += (y_pred_class == y).sum().item() / len(y_pred)

        # Running averages over the batches seen so far
        batches_seen = batch + 1
        running_loss = train_loss / batches_seen
        running_acc = train_acc / batches_seen

        # Update progress bar
        progress_bar.set_postfix(
            {
                "train_loss": running_loss,
                "train_acc": running_acc,
            }
        )
        # Log to W&B every 20th batch
        if batches_seen % 20 == 0:
            wandb.log(
                {
                    "train_loss": running_loss,
                    "train_accuracy": running_acc,
                }
            )

    # Adjust metrics to get average loss and accuracy per batch
    train_loss = train_loss / len(dataloader)
    train_acc = train_acc / len(dataloader)
    wandb.log({"train_loss": train_loss, "train_accuracy": train_acc})
    return train_loss, train_acc


def test_step(
    epoch: int,
    model: torch.nn.Module,
    dataloader: torch.utils.data.DataLoader,
    loss_fn: torch.nn.Module,
    device: torch.device,
) -> Tuple[float, float]:
    """Tests a PyTorch model for a single epoch.

    Turns a target PyTorch model to "eval" mode and then performs
    a forward pass on a testing dataset.

    Args:
      model: A PyTorch model to be tested.
      dataloader: A DataLoader instance for the model to be tested on.
      loss_fn: A PyTorch loss function to calculate loss on the test data.
      device: A target device to compute on (e.g. "cuda" or "cpu").

    Returns:
      A tuple of testing loss and testing accuracy metrics.
      In the form (test_loss, test_accuracy). For example:

      (0.0223, 0.8985)
    """
    # Put model in eval mode
    model.eval()

    # Setup test loss and test accuracy values
    test_loss, test_acc = 0, 0

    # Loop through data loader data batches
    progress_bar = tqdm(
        enumerate(dataloader),
        desc=f"Testing Epoch {epoch}",
        total=len(dataloader),
    )

    # Turn on inference context manager
    with torch.inference_mode():  # no_grad() required for PyTorch 2.0, I found some errors with `torch.inference_mode()`, please let me know if this is not the case
        # Loop through DataLoader batches
        for batch, (X, y) in progress_bar:
            # Send data to target device
            X, y = X.to(device), y.to(device)

            # 1. Forward pass
            test_pred_logits = model(X)

            # 2. Calculate and accumulate loss
            loss = loss_fn(test_pred_logits, y)
            test_loss += loss.item()

            # Calculate and accumulate accuracy
            test_pred_labels = test_pred_logits.argmax(dim=1)
            test_acc += (test_pred_labels == y).sum().item() / len(test_pred_labels)

            # Update progress bar
            progress_bar.set_postfix(
                {
                    "test_loss": test_loss / (batch + 1),
                    "test_acc": test_acc / (batch + 1),
                }
            )

    # Adjust metrics to get average loss and accuracy per batch
    test_loss = test_loss / len(dataloader)
    test_acc = test_acc / len(dataloader)
    wandb.log({"test_loss": test_loss, "test_accuracy": test_acc})
    return test_loss, test_acc


def train(
    model: torch.nn.Module,
    train_dataloader: torch.utils.data.DataLoader,
    test_dataloader: torch.utils.data.DataLoader,
    optimizer: torch.optim.Optimizer,
    loss_fn: torch.nn.Module,
    epochs: int,
    device: torch.device,
) -> dict:
    """Runs train_step and test_step for a number of epochs and collects the metrics.

    Registers the model with `wandb.watch`, then for every epoch runs one
    training pass and one testing pass, times each with `time.time()`, and
    prints a one-line summary.

    Args:
      model: A PyTorch model to be trained and tested.
      train_dataloader: DataLoader passed to train_step.
      test_dataloader: DataLoader passed to test_step.
      optimizer: Optimizer passed to train_step.
      loss_fn: Loss function passed to both steps and to `wandb.watch`.
      epochs: Number of epochs to run.
      device: A target device to compute on (e.g. "cuda" or "cpu").

    Returns:
      A dict of lists with one entry per epoch, under the keys "train_loss",
      "train_acc", "test_loss", "test_acc", "train_epoch_time" and
      "test_epoch_time" (times in seconds).
    """
    wandb.watch(model, loss_fn, log="all", log_freq=10)

    # Per-epoch history. Previously these values were computed and then discarded.
    results = {
        "train_loss": [],
        "train_acc": [],
        "test_loss": [],
        "test_acc": [],
        "train_epoch_time": [],
        "test_epoch_time": [],
    }

    # Loop through training and testing steps for a number of epochs
    for epoch in tqdm(range(epochs)):

        # Perform training step and time it
        train_epoch_start_time = time.time()
        train_loss, train_acc = train_step(
            epoch=epoch,
            model=model,
            dataloader=train_dataloader,
            loss_fn=loss_fn,
            optimizer=optimizer,
            device=device,
        )
        train_epoch_end_time = time.time()
        train_epoch_time = train_epoch_end_time - train_epoch_start_time

        # Perform testing step and time it
        test_epoch_start_time = time.time()
        test_loss, test_acc = test_step(
            epoch=epoch,
            model=model,
            dataloader=test_dataloader,
            loss_fn=loss_fn,
            device=device,
        )
        test_epoch_end_time = time.time()
        test_epoch_time = test_epoch_end_time - test_epoch_start_time

        # Print out what's happening (epoch shown 1-based here)
        print(
            f"Epoch: {epoch+1} | "
            f"train_loss: {train_loss:.4f} | "
            f"train_acc: {train_acc:.4f} | "
            f"test_loss: {test_loss:.4f} | "
            f"test_acc: {test_acc:.4f} | "
            f"train_epoch_time: {train_epoch_time:.4f} | "
            f"test_epoch_time: {test_epoch_time:.4f}"
        )

        # Update results dictionary
        results["train_loss"].append(train_loss)
        results["train_acc"].append(train_acc)
        results["test_loss"].append(test_loss)
        results["test_acc"].append(test_acc)
        results["train_epoch_time"].append(train_epoch_time)
        results["test_epoch_time"].append(test_epoch_time)

    # Return the filled results at the end of the epochs
    return results


def test(model, dataloader):
    test_acc = 0
    model.eval()
    progress_bar = tqdm(
        enumerate(dataloader),
        desc=f"Testing",
        total=len(dataloader),
    )
    with torch.inference_mode():
        for _, (X, y) in progress_bar:
            X, y = X.to(device), y.to(device)
            test_pred_logits = model(X)
            test_pred_labels = test_pred_logits.argmax(dim=1)
            test_acc += (test_pred_labels == y).sum().item() / len(test_pred_labels)
        test_acc = test_acc / len(dataloader)
        print(f"Test accuracy: {test_acc}")
        wandb.log({"test_accuracy": test_acc})
    # Save the model in the exchangeable ONNX format
    torch.onnx.export(model, f="model.onnx")
    wandb.save("model.onnx")

In [10]:
# Hyperparameters and run settings; the same dict is attached to the W&B run below.
config = {
    "epochs": 3,
    "classes": 10,
    "learning_rate": 2e-3,
    "batch_size": BATCH_SIZE,
    "image_size": IMAGE_SIZE,
    "num_workers": os.cpu_count(),
    "weight_decay": 1e-4,
}
wandb.init(project="new-sota-model", name="dat-test-resnet50-v2", config=config)

In [11]:
# Build the model (plus its preprocessing transforms), move it to the target device and compile it
model, transforms = create_model(num_classes=config["classes"])
model.to(device)
compiled_model = torch.compile(model)

# get_train_test_dataloader returns four values (train loader, test loader, train
# dataset, test dataset). Only the loaders are needed here, so take the first two
# rather than unpacking four values into two names, which raises ValueError.
train_dataloader, test_dataloader = get_train_test_dataloader(
    transforms, config["batch_size"], config["num_workers"]
)[:2]

# Adam over the model's parameters, with cross-entropy as the training objective
optimizer = torch.optim.Adam(
    model.parameters(), lr=config["learning_rate"], weight_decay=config["weight_decay"]
)
loss_fn = torch.nn.CrossEntropyLoss()

Files already downloaded and verified
Files already downloaded and verified


In [14]:
# Display the training loader object's repr.
train_dataloader

<torch.utils.data.dataloader.DataLoader at 0x7f260a94aef0>

In [12]:
# train(
#     model=compiled_model,
#     train_dataloader=train_dataloader,
#     test_dataloader=test_dataloader,
#     loss_fn=loss_fn,
#     optimizer=optimizer,
#     epochs=config["epochs"],
#     device=device,
# )

In [13]:
# torch.save(
#     {
#         "model_state_dict": model.state_dict(),
#         "optimizer_state_dict": optimizer.state_dict(),
#     },
#     f"compile_model_v2.pt",
# )