In [31]:
import torch

if torch.backends.mps.is_available():
    mps_device = torch.device("mps")
    x = torch.ones(1, device = mps_device)
    print(x)
else:
    print("MPS device not found")

tensor([1.], device='mps:0')


In [32]:
import timeit

import random

x = torch.ones(500000000, device='mps')
timeit.timeit(lambda:x * random.randint(0, 100), number=1)

0.00038287509232759476

In [33]:
x = torch.ones(50000000, device="cpu")
timeit.timeit(lambda:x * random.randint(0,100), number = 1)

0.019521832931786776

There definitely is a time difference, but the source I am comparing this to had 0.0003982 and 0.0112392, so overall the numbers I am seeing are off by a bit.


In [34]:
from time import process_time
import torch

def testgpu():
    if torch.backends.mps.is_available():
        mps_device = torch.device("mps")
    t0 = process_time()
    x = torch.ones(n1, device=mps_device)
    y = x + torch.rand(n1, device=mps_device)
    t1 = process_time()
    print(f"Total time with gpu ({n1}): {t1-t0}")
    t0 = process_time()
    x = torch.ones(n2, device=mps_device)
    y = x + torch.rand(n2, device=mps_device)
    t1 = process_time()
    print(f"Total time with gpu ({n2}): {t1-t0}")

def testcpu():
    t0 = process_time()
    x = torch.ones(n1)
    y = x + torch.rand(n1)
    t1 = process_time()
    print(f"Total time with cpu ({n1}): {t1-t0}")
    t0 = process_time()
    x = torch.ones(n2)
    y = x + torch.rand(n2)
    t1 = process_time()
    print(f"Total time with cpu ({n2}): {t1-t0}")

if __name__ == '__main__':
    n1 = 10000
    n2 = 100000000
    testcpu()
    testgpu()

Total time with cpu (10000): 0.0005199999999998539
Total time with cpu (100000000): 0.7512539999999994
Total time with gpu (10000): 0.0005629999999996471
Total time with gpu (100000000): 0.0002760000000012752


In [35]:
import argparse
import os
from pathlib import Path
from timeit import default_timer as timer

import torch
import torchvision
import torchvision.transforms.v2 as transforms # use v2 transforms for faster augmentations
import pandas as pd

from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets
from tqdm.auto import tqdm



As this was a command line function, need some modifications to make it work in a notebook.
See: https://stackoverflow.com/questions/48796169/how-to-fix-ipykernel-launcher-py-error-unrecognized-arguments-in-jupyter

In [36]:
print(torchvision.__version__)

0.15.2a0


Latest version of torchvision is 0.17, but I can't convince conda to install that...
https://pytorch.org/vision/0.15/transforms.html#transforms-scriptability

In [37]:
    # Create DataLoaders
    def create_dataloaders(batch_size, num_workers=NUM_WORKERS):
        train_dataloader = DataLoader(train_data,
                                      batch_size=batch_size,
                                      shuffle=True,
                                      num_workers=num_workers,
                                      pin_memory=False) # note: if you pin memory, you may get "too many workers" errors when recreating DataLoaders, see: https://github.com/Lightning-AI/pytorch-lightning/issues/18487#issuecomment-1740244601

        test_dataloader = DataLoader(test_data,
                                     batch_size=batch_size,
                                     shuffle=False,
                                     num_workers=num_workers,
                                     pin_memory=False)

        return train_dataloader, test_dataloader

In [38]:

    ### Train Step ###
    def train_step(model: torch.nn.Module, 
                   dataloader: torch.utils.data.DataLoader, 
                   loss_fn: torch.nn.Module, 
                   optimizer: torch.optim.Optimizer,
                   device: torch.device):
        # Put model in train mode
        model.train()
        
        # Setup train loss and train accuracy values
        train_loss, train_acc = 0, 0
        
        # Loop through data loader data batches
        for batch, (X, y) in tqdm(enumerate(dataloader), total=len(dataloader)):
            # Send data to target device
            X, y = X.to(device, non_blocking=True), y.to(device, non_blocking=True)
    #         X, y = X.to(device, non_blocking=True, memory_format=torch.channels_last), y.to(device, non_blocking=True)
    #         X, y = X.to(device), y.to(device)

            # 1. Forward pass
            y_pred = model(X)

            # 2. Calculate  and accumulate loss
            loss = loss_fn(y_pred, y)
            train_loss += loss.item() 

            # 3. Optimizer zero grad
            optimizer.zero_grad()

            # 4. Loss backward
            loss.backward()

            # 5. Optimizer step
            optimizer.step()

            # Calculate and accumulate accuracy metric across all batches
            y_pred_class = torch.argmax(torch.softmax(y_pred, dim=1), dim=1)
            train_acc += (y_pred_class == y).sum().item()/len(y_pred)
            
        # Adjust metrics to get average loss and accuracy per batch 
        train_loss = train_loss / len(dataloader)
        train_acc = train_acc / len(dataloader)
        return train_loss, train_acc

In [39]:

    ### Test Step ###
    def test_step(model: torch.nn.Module, 
                dataloader: torch.utils.data.DataLoader, 
                loss_fn: torch.nn.Module,
                device: torch.device):
        # Put model in eval mode
        model.eval() 
        
        # Setup test loss and test accuracy values
        test_loss, test_acc = 0, 0
        
        # Turn on inference context manager
        with torch.inference_mode():
            # Loop through DataLoader batches
            for batch, (X, y) in tqdm(enumerate(dataloader), total=len(dataloader)):
                # Send data to target device
                X, y = X.to(device, non_blocking=True), y.to(device, non_blocking=True)
    #             X, y = X.to(device, non_blocking=True, memory_format=torch.channels_last), y.to(device, non_blocking=True)
    #             X, y = X.to(device), y.to(device)
        
                # 1. Forward pass
                test_pred_logits = model(X)

                # 2. Calculate and accumulate loss
                loss = loss_fn(test_pred_logits, y)
                test_loss += loss.item()
                
                # Calculate and accumulate accuracy
                test_pred_labels = test_pred_logits.argmax(dim=1)
                test_acc += ((test_pred_labels == y).sum().item()/len(test_pred_labels))
                
        # Adjust metrics to get average loss and accuracy per batch 
        test_loss = test_loss / len(dataloader)
        test_acc = test_acc / len(dataloader)
        return test_loss, test_acc

In [40]:
    def train_and_time(batch_sizes=BATCH_SIZES,
                       epochs=EPOCHS,
                       device=device):

        batch_size_training_results = []

        for batch_size in batch_sizes:
            print(f"[INFO] Training with batch size {batch_size} for {epochs} epochs...")
            # Create an instance of resnet50
            model = torchvision.models.resnet50(num_classes=100).to(device)
            # model = torch.compile(model) # potential way to speed up model

            # Setup loss function and optimizer
            loss_fn = nn.CrossEntropyLoss()
            optimizer = torch.optim.Adam(params=model.parameters(), lr=0.001)

            # Create DataLoaders
            train_dataloader, test_dataloader = create_dataloaders(batch_size=batch_size)

            try:
                # Start the timer
                start_time = timer()

                # Train model
                model_results = train_and_test_model(model=model, 
                                                    train_dataloader=train_dataloader,
                                                    test_dataloader=test_dataloader,
                                                    optimizer=optimizer,
                                                    loss_fn=loss_fn, 
                                                    epochs=epochs,
                                                    device=device,
                                                    eval=False) # don't eval, just test training time

                # End the timer
                end_time = timer()

                total_training_time = end_time - start_time
                avg_time_per_epoch = total_training_time / epochs

                batch_size_training_results.append({"batch_size": batch_size,
                                                    "avg_time_per_epoch": avg_time_per_epoch})
                save_results(batch_size_training_results)
                print(f"[INFO] Finished training with batch size {batch_size} for {epochs} epochs, total time: {round(total_training_time, 3)} seconds, avg time per epoch: {round(avg_time_per_epoch, 3)} seconds\n\n")

            except Exception as e:
                print(f"[INFO] Error: {e}")
                print(f"[INFO] Failed training with batch size {batch_size} for {epochs} epochs...\n\n")
                batch_size_training_results.append({"batch_size": batch_size,
                                                    "avg_time_per_epoch": "FAILED"})
                save_results(batch_size_training_results)
                break
                
        return batch_size_training_results

In [41]:
    
    def save_results(batch_size_training_results, target_dir="results_pytorch_cv"):
        # Create CSV filename
        if GPU_NAME:
            csv_filename = f"{GPU_NAME.replace(' ', '_')}_{DATASET_NAME}_{MODEL_NAME}_{INPUT_SHAPE[-1]}_{BACKEND}_results.csv"
        else:
            csv_filename = f"{CPU_PROCESSOR}_{DATASET_NAME}_{MODEL_NAME}_{INPUT_SHAPE[-1]}_{BACKEND}_results.csv"

        # Make the target results directory if it doesn't exist (include the parents)
        target_results_dir = target_dir
        results_path = Path("results") / target_results_dir
        results_path.mkdir(parents=True, exist_ok=True)
        csv_filepath = results_path / csv_filename

        # Turn dict into DataFrame 
        df = pd.DataFrame(batch_size_training_results) 

        # Save to CSV
        print(f"[INFO] Saving results to: {csv_filepath}")
        df.to_csv(csv_filepath, index=False)

In [42]:
    # 1. Take in various parameters required for training and test steps
    def train_and_test_model(model: torch.nn.Module, 
                            train_dataloader: torch.utils.data.DataLoader, 
                            test_dataloader: torch.utils.data.DataLoader, 
                            optimizer: torch.optim.Optimizer,
                            loss_fn: torch.nn.Module,
                            epochs: int,
                            device: torch.device,
                            eval: bool=False):
        
        print(f"[INFO] Training model {model.__class__.__name__} on device '{device}' for {epochs} epochs...")
        
        results = {"train_loss": [], "train_acc": [], "test_loss": [], "test_acc": []}

        # Loop through training and testing steps for a number of epochs
        for epoch in tqdm(range(epochs)):
            # Do eval before training (to see if there's any errors)
            if eval:
                test_loss, test_acc = test_step(model=model,
                                            dataloader=test_dataloader,
                                            loss_fn=loss_fn,
                                            device=device)
            
            train_loss, train_acc = train_step(model=model,
                                            dataloader=train_dataloader,
                                            loss_fn=loss_fn,
                                            optimizer=optimizer,
                                            device=device)
            
            
            # Print out what's happening
            print(
                f"Epoch: {epoch+1} | "
                f"train_loss: {train_loss:.4f} | "
                f"train_acc: {train_acc:.4f} | "
            )

            if eval:
                print(
                    f"Epoch: {epoch+1} | "
                    f"test_loss: {test_loss:.4f} | "
                    f"test_acc: {test_acc:.4f} | "
                )

            # Save results to dictionary
            results["train_loss"].append(train_loss)
            results["train_acc"].append(train_acc)
            if eval:
                results["test_loss"].append(test_loss)
                results["test_acc"].append(test_acc)
        
        return results


In [None]:

    CPU_PROCESSOR = None

    ### Get CPU Processor name ###
    if not CPU_PROCESSOR:
        try:
            import cpuinfo
            CPU_PROCESSOR = cpuinfo.get_cpu_info().get("brand_raw").replace(" ", "_")
            print(f"[INFO] CPU Processor: {CPU_PROCESSOR}")
        except Exception as e:
            print(f"Error: {e}, may have failed to get CPU_PROCESSOR name from cpuinfo, please install cpuinfo or set CPU_PROCESSOR manually") 

    ### Setup device ###
    if torch.backends.mps.is_available():
        device = torch.device("mps")
        print(f"[INFO] MPS device found, using device: {device}")
    elif torch.cuda.is_available():
        device = torch.device("cuda")
        print(f"[INFO] CUDA device found, using device: {device}")
    else:
        device = torch.device("cpu")
        print (f"[INFO] MPS or CUDA device not found, using device: {device} (results will be much slower than using MPS or CUDA)")

    # Prevent torch from erroring with too many files open (happens on M3)
    # See: https://github.com/pytorch/pytorch/issues/11201, https://github.com/CVMI-Lab/PLA/issues/20 
    torch.multiprocessing.set_sharing_strategy('file_system')

    # Set random seed
    torch.manual_seed(42)

    ### Set constants ###
    GPU_NAME = "mps"
    BACKEND = "pytorch"
    MODEL_NAME = "resnet50"
    IMAGE_SIZE = 32
    INPUT_SHAPE = (3, IMAGE_SIZE, IMAGE_SIZE)
    NUM_WORKERS = os.cpu_count()
    EPOCHS = 5
    BATCH_SIZES = [16, 32, 64, 128, 256, 512, 1024]
    DATASET_NAME = "CIFAR100"

    print(f"[INFO] Testing model: {MODEL_NAME} on {DATASET_NAME} dataset with input shape {INPUT_SHAPE} for {EPOCHS} epochs across batch sizes: {BATCH_SIZES}")


    ### Prepare Data ### 
    simple_transform = transforms.Compose([
        transforms.Resize(size=IMAGE_SIZE),
        # for torchvision 0.15 I believe ToImage must be ToTensor
        transforms.ToTensor(), 
        # again, since I seems stuck on 0.15, where there is no scale option
        #transforms.ToDtype(torch.float32, scale=True)
        transforms.ConvertDtype(torch.float32)

    ])

    # Get Datasets
    train_data = datasets.CIFAR10(root="data",
                                train=True,
                                transform=simple_transform,
                                download=True)

    test_data = datasets.CIFAR10(root="data",
                                train=False,
                                transform=simple_transform,
                                download=True)

    print(f"[INFO] Number of training samples: {len(train_data)}, number of testing samples: {len(test_data)}")

    ### Train an time model ### 
    batch_size_training_results = train_and_time(batch_sizes=BATCH_SIZES,
                                                 epochs=EPOCHS,
                                                 device=device)

    print("[INFO] Finished training with all batch sizes.")        

    print(f"[INFO] Results:\n{batch_size_training_results}")

[INFO] CPU Processor: Apple_M1_Max
[INFO] MPS device found, using device: mps
[INFO] Testing model: resnet50 on CIFAR100 dataset with input shape (3, 32, 32) for 5 epochs across batch sizes: [16, 32, 64, 128, 256, 512, 1024]
Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to data/cifar-10-python.tar.gz


59.7%IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

100.0%


Extracting data/cifar-10-python.tar.gz to data
Files already downloaded and verified
[INFO] Number of training samples: 50000, number of testing samples: 10000
[INFO] Training with batch size 16 for 5 epochs...
[INFO] Training model ResNet on device 'mps' for 5 epochs...


  0%|          | 0/5 [00:00<?, ?it/s]

  Referenced from: <CFED5F8E-EC3F-36FD-AAA3-2C6C7F8D3DD9> /Users/rff/anaconda3/envs/MacML/lib/python3.11/site-packages/torchvision/image.so
  warn(
  Referenced from: <CFED5F8E-EC3F-36FD-AAA3-2C6C7F8D3DD9> /Users/rff/anaconda3/envs/MacML/lib/python3.11/site-packages/torchvision/image.so
  warn(
  Referenced from: <CFED5F8E-EC3F-36FD-AAA3-2C6C7F8D3DD9> /Users/rff/anaconda3/envs/MacML/lib/python3.11/site-packages/torchvision/image.so
  warn(
  Referenced from: <CFED5F8E-EC3F-36FD-AAA3-2C6C7F8D3DD9> /Users/rff/anaconda3/envs/MacML/lib/python3.11/site-packages/torchvision/image.so
  warn(
  Referenced from: <CFED5F8E-EC3F-36FD-AAA3-2C6C7F8D3DD9> /Users/rff/anaconda3/envs/MacML/lib/python3.11/site-packages/torchvision/image.so
  warn(
  Referenced from: <CFED5F8E-EC3F-36FD-AAA3-2C6C7F8D3DD9> /Users/rff/anaconda3/envs/MacML/lib/python3.11/site-packages/torchvision/image.so
  warn(
  Referenced from: <CFED5F8E-EC3F-36FD-AAA3-2C6C7F8D3DD9> /Users/rff/anaconda3/envs/MacML/lib/python3.11/site-pa

  0%|          | 0/3125 [00:00<?, ?it/s]

Epoch: 1 | train_loss: 2.3956 | train_acc: 0.2003 | 


  Referenced from: <CFED5F8E-EC3F-36FD-AAA3-2C6C7F8D3DD9> /Users/rff/anaconda3/envs/MacML/lib/python3.11/site-packages/torchvision/image.so
  warn(
  Referenced from: <CFED5F8E-EC3F-36FD-AAA3-2C6C7F8D3DD9> /Users/rff/anaconda3/envs/MacML/lib/python3.11/site-packages/torchvision/image.so
  warn(
  Referenced from: <CFED5F8E-EC3F-36FD-AAA3-2C6C7F8D3DD9> /Users/rff/anaconda3/envs/MacML/lib/python3.11/site-packages/torchvision/image.so
  warn(
  Referenced from: <CFED5F8E-EC3F-36FD-AAA3-2C6C7F8D3DD9> /Users/rff/anaconda3/envs/MacML/lib/python3.11/site-packages/torchvision/image.so
  warn(
  Referenced from: <CFED5F8E-EC3F-36FD-AAA3-2C6C7F8D3DD9> /Users/rff/anaconda3/envs/MacML/lib/python3.11/site-packages/torchvision/image.so
  warn(
  Referenced from: <CFED5F8E-EC3F-36FD-AAA3-2C6C7F8D3DD9> /Users/rff/anaconda3/envs/MacML/lib/python3.11/site-packages/torchvision/image.so
  warn(
  Referenced from: <CFED5F8E-EC3F-36FD-AAA3-2C6C7F8D3DD9> /Users/rff/anaconda3/envs/MacML/lib/python3.11/site-pa

  0%|          | 0/3125 [00:00<?, ?it/s]

In [45]:
??transforms

[0;31mType:[0m        module
[0;31mString form:[0m <module 'torchvision.transforms.v2' from '/Users/rff/anaconda3/envs/MacML/lib/python3.11/site-packages/torchvision/transforms/v2/__init__.py'>
[0;31mFile:[0m        ~/anaconda3/envs/MacML/lib/python3.11/site-packages/torchvision/transforms/v2/__init__.py
[0;31mSource:[0m     
[0;32mfrom[0m [0mtorchvision[0m[0;34m.[0m[0mtransforms[0m [0;32mimport[0m [0mAutoAugmentPolicy[0m[0;34m,[0m [0mInterpolationMode[0m  [0;31m# usort: skip[0m[0;34m[0m
[0;34m[0m[0;34m[0m
[0;34m[0m[0;32mfrom[0m [0;34m.[0m [0;32mimport[0m [0mfunctional[0m[0;34m,[0m [0mutils[0m  [0;31m# usort: skip[0m[0;34m[0m
[0;34m[0m[0;34m[0m
[0;34m[0m[0;32mfrom[0m [0;34m.[0m[0m_transform[0m [0;32mimport[0m [0mTransform[0m  [0;31m# usort: skip[0m[0;34m[0m
[0;34m[0m[0;34m[0m
[0;34m[0m[0;32mfrom[0m [0;34m.[0m[0m_augment[0m [0;32mimport[0m [0mRandomErasing[0m[0;34m[0m
[0;34m[0m[0;32mfrom[0m [0;3