<div style="background-color:teal; color:white; padding:10px; font-size:20px">
Installs

In [10]:
import sys

In [11]:
# update boto3 and sagemaker to ensure latest SDK version
!{sys.executable} -m pip install --upgrade pip
!{sys.executable} -m pip install --upgrade boto3
!{sys.executable} -m pip install --upgrade sagemaker
!{sys.executable} -m pip install torch
!{sys.executable} -m pip install torchvision

[0m

In [12]:
!pip install torchinfo

[0m

<div style="background-color:teal; color:white; padding:10px; font-size:20px">
Imports

In [13]:
# AWS
from sagemaker.pytorch import PyTorch
from sagemaker.experiments.run import Run
from sagemaker.session import Session
from sagemaker import get_execution_role
from sagemaker.utils import unique_name_from_base
import boto3

# Model
import torch
import torchvision

# Aux
import os
import sys
import logging
from IPython.display import set_matplotlib_formats
from matplotlib import pyplot as plt
import itertools
import time
import sklearn

logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
logger.addHandler(logging.StreamHandler(sys.stdout))

<div style="background-color:teal; color:white; padding:10px; font-size:20px">
Train Script

In [14]:
%%writefile ./script/toy_model.py

#=======================================================================
#                               SET UP
#=======================================================================

# ensure that the latest version of the SageMaker SDK is available
import os

os.system("pip install -U sagemaker")

import argparse
import json
import logging
import traceback
import sys
import time
from os.path import join
import boto3
from sagemaker.session import Session
from sagemaker.experiments.run import load_run
import torch
import torchvision
from torchvision import datasets, transforms
import torch.nn as nn
import torch.nn.functional as F

logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
logger.addHandler(logging.StreamHandler(sys.stdout)) # Sends logs to std out

# Saving copy of logs to .json file
if "SAGEMAKER_METRICS_DIRECTORY" in os.environ:
    log_file_handler = logging.FileHandler(
        join(os.environ["SAGEMAKER_METRICS_DIRECTORY"], "metrics.json")
    )
    formatter = logging.Formatter(
        "{'time':'%(asctime)s', 'name': '%(name)s', \
        'level': '%(levelname)s', 'message': '%(message)s'}",
        style="%",
    )
    log_file_handler.setFormatter(formatter)
    logger.addHandler(log_file_handler)

#=======================================================================
#                               MODEL
#=======================================================================

class CIFAR10Net(nn.Module):
    def __init__(self):
        super(CIFAR10Net, self).__init__()

        self.conv1 = nn.Conv2d(3, 32, 3, padding=1)
        self.bn1 = nn.BatchNorm2d(32)
        self.conv2 = nn.Conv2d(32, 64, 3, padding=1)
        self.bn2 = nn.BatchNorm2d(64)
        self.conv3 = nn.Conv2d(64, 128, 3, padding=1)
        self.bn3 = nn.BatchNorm2d(128)
        self.conv4 = nn.Conv2d(128, 256, 3, padding=1)
        self.bn4 = nn.BatchNorm2d(256)
        
        self.pool = nn.MaxPool2d(2, 2)
        self.adaptive_pool = nn.AdaptiveAvgPool2d((1, 1))
        
        self.fc1 = nn.Linear(256, 512)
        self.bn5 = nn.BatchNorm1d(512)
        self.fc2 = nn.Linear(512, 256)
        self.bn6 = nn.BatchNorm1d(256)
        self.fc3 = nn.Linear(256, 10)
        
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        x = self.pool(F.leaky_relu(self.bn1(self.conv1(x))))
        x = self.pool(F.leaky_relu(self.bn2(self.conv2(x))))
        x = self.pool(F.leaky_relu(self.bn3(self.conv3(x))))
        x = self.pool(F.leaky_relu(self.bn4(self.conv4(x))))
        
        x = self.adaptive_pool(x)
        x = x.view(-1, 256)
        
        x = F.leaky_relu(self.bn5(self.fc1(x)))
        x = self.dropout(x)
        
        x = F.leaky_relu(self.bn6(self.fc2(x)))
        x = self.dropout(x)
        
        x = self.fc3(x)
        return x

#=======================================================================
#                               EVAL LOG FUNCTION
#=======================================================================


def log_performance(model, data_loader, device, epoch, run, metric_type="Test"):
    model.eval()
    loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in data_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            # loss += torch.nn.functional.nll_loss(
            #     output, target, reduction="sum"
            # ).item()  # sum up batch loss
            criterion = nn.CrossEntropyLoss(reduction = "sum")
            loss += criterion(output, target).item()
            pred = output.max(1, keepdim=True)[1] # get the index of the max log-probability
            correct += pred.eq(target.view_as(pred)).sum().item()
    loss /= len(data_loader.dataset)
    accuracy = 100.0 * correct / len(data_loader.dataset)
    # ⛳️SM: log metrics
    run.log_metric(name=metric_type + ":loss", value=loss, step=epoch)
    run.log_metric(name=metric_type + ":accuracy", value=accuracy, step=epoch)
    logger.info(
        "{} Average loss: {:.4f}, {} Accuracy: {:.4f}%;\n".format(
            metric_type, loss, metric_type, accuracy
        )
    )

#=======================================================================
#                               TRAIN FUNCTION
#=======================================================================

def train_model(
    run, 
    train_set, 
    test_set,
    epochs, 
    lr
):
    """
    Args:
        run (sagemaker.experiments.run.Run): SageMaker Experiment run object
        train_set (torchvision.datasets.mnist.MNIST): train dataset
        test_set (torchvision.datasets.mnist.MNIST): test dataset
        data_dir (str): local directory where the MNIST datasource is stored
        optimizer (str): the optimization algorthm to use for training your CNN
                         available options are sgd and adam
        epochs (int): number of complete pass of the training dataset through the algorithm
        hidden_channels (int): number of hidden channels in your model
    """

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    train_loader = torch.utils.data.DataLoader(train_set, batch_size= 64, shuffle=True)
    test_loader = torch.utils.data.DataLoader(test_set, batch_size= 500, shuffle=False)
    
    #📍Logger: Progress Output
    logger.info(
        "Processes {}/{} ({:.0f}%) of train data".format(
            len(train_loader.sampler),
            len(train_loader.dataset),
            100.0 * len(train_loader.sampler) / len(train_loader.dataset),))

    logger.info(
        "Processes {}/{} ({:.0f}%) of test data".format(
            len(test_loader.sampler),
            len(test_loader.dataset),
            100.0 * len(test_loader.sampler) / len(test_loader.dataset),))
    
    # Train Set Up
    model = CIFAR10Net().to(device)
    # model = torch.nn.DataParallel(model) # if multiple GPU's
    lr = lr
    log_interval = 100
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()
        
    # ⛳️SM: log model run parameters
    run.log_parameters({"optimizer": "Adam",
                        "epochs": epochs,})

    # Train Loop = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
    for epoch in range(1, epochs + 1):
        print("Training Epoch:", epoch)
        model.train()
        for batch_idx, (data, target) in enumerate(train_loader, 1):
            data, target = data.to(device), target.to(device)
            optimizer.zero_grad()
            output = model(data)
            # loss = torch.nn.functional.nll_loss(output, target)
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()
            
            #📍 Logger: Train Status
            if batch_idx % log_interval == 0:
                logger.info(
                    "Train Epoch: {} [{}/{} ({:.0f}%)], Train Loss: {:.6f};".format(
                        epoch,
                        batch_idx * len(data),
                        len(train_loader.sampler),
                        100.0 * batch_idx / len(train_loader),
                        loss.item(),
                    )
                )
        # ⛳️ SM: Metric Logging
        run.log_metric(name = 'epoch', value = epoch)
        log_performance(model, train_loader, device, epoch, run, "Train")
        log_performance(model, test_loader, device, epoch, run, "Test")
    # = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
        
    # ⛳️ SM: Confusion Matrix Logging
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            pred = output.max(1, keepdim=True)[1]
            run.log_confusion_matrix(target, pred, "Confusion-Matrix-Test-Data")
    return model


def model_fn(model_dir):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    hidden_channels = int(os.environ.get("hidden_channels", "5"))
    kernel_size = int(os.environ.get("kernel_size", "5"))
    dropout = float(os.environ.get("dropout", "0.5"))
    model = torch.nn.DataParallel(Net(hidden_channels, kernel_size, dropout))
    with open(os.path.join(model_dir, "model.pth"), "rb") as f:
        model.load_state_dict(torch.load(f))
        return model.to(device)


def save_model(model, model_dir, run):
    logger.info("Saving the model.")
    path = os.path.join(model_dir, "model.pth")
    # recommended way from http://pytorch.org/docs/master/notes/serialization.html
    torch.save(model.cpu().state_dict(), path)

#=======================================================================
#                               ARGPARSE
#=======================================================================

if __name__ == "__main__":
    
    try:
        parser = argparse.ArgumentParser()

        parser.add_argument("--epochs",type=int,default=10,metavar="N",help="number of epochs to train (default: 10)")
        parser.add_argument('--lr', type=float, default=0.01)

        # Data, model, and output directories
        # parser.add_argument('--model-dir', type=str, default=os.environ['SM_MODEL_DIR'])
        parser.add_argument('--training', type=str, default=os.environ['SM_CHANNEL_TRAINING'])
        parser.add_argument('--validation', type=str, default=os.environ['SM_CHANNEL_VALIDATION'])
        # parser.add_argument('--output-data-dir', type=str, default=os.environ['SM_OUTPUT_DATA_DIR'])

        # Container environment
        parser.add_argument("--hosts", type=list, default=json.loads(os.environ["SM_HOSTS"]))
        parser.add_argument("--current-host", type=str, default=os.environ["SM_CURRENT_HOST"])
        parser.add_argument("--model-dir", type=str, default=os.environ["SM_MODEL_DIR"])
        parser.add_argument("--num-gpus", type=int, default=os.environ["SM_NUM_GPUS"])
        parser.add_argument("--region", type=str, default="us-east-1", help="SageMaker Region")

        args, _ = parser.parse_known_args()

# = = = = = = = = = = = = = = DATA = = = = = = = = = = = = = = = = = = = =

        load_transform = transforms.Compose([transforms.ToTensor(),
                                             transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))])

        train_set = torchvision.datasets.ImageFolder(root=args.training, transform=load_transform)
        test_set = torchvision.datasets.ImageFolder(root=args.validation, transform=load_transform)

#=======================================================================
#                               RUN
#=======================================================================

        session = Session(boto3.session.Session(region_name=args.region))

        with load_run(sagemaker_session=session) as run:

            model = train_model(
                run,
                train_set = train_set,
                test_set = test_set,
                epochs=args.epochs,
                lr = args.lr
            )
            save_model(model, args.model_dir, run)
            
    except Exception as e:
        logger.error("Exception occurred: %s", e)
        logger.error(traceback.format_exc())

Overwriting ./script/toy_model.py


<div style="background-color:teal; color:white; padding:10px; font-size:20px">
SageMaker Experiments

Below is a diagram outlining the various components of an SM Experiments run and how they must be structured in your code. Steps below involve:
- Creation of a SageMaker experiment name (which can be thought of as a project or exploration containing all of your runs)
- Creation of desired hyperparameter search space (the cartesian product of all possible HP combinations)
- A for-loop that creates an indexed experiment run for every HP combination:
    - Every HP combination is passed to an Estimator object (training job) that trains the model under its respective configuration
    - Training metadata and model artifacts are saved between a specified s3 location and a SM Experiments database

<img src="./images/sm_experiments_overview.png" width="800" />

<div style="background-color:teal; color:white; padding:10px; font-size:20px">
Hyperparameters

In [15]:
from sklearn.model_selection import ParameterGrid

# Set Region - annoying SM detail that it must be passed as an arg
region = Session().boto_session.region_name

param_grid = {
    'lr': [0.001],
    'epochs': [10],
    'region':[region]
}

param_combinations = list(ParameterGrid(param_grid))
param_combinations

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml


[{'epochs': 10, 'lr': 0.001, 'region': 'us-east-1'}]

<div style="background-color:teal; color:white; padding:10px; font-size:20px">
Train Model w/ Run Context

In [16]:
# Data Sources
full_train_s3_path = f's3://pytorch-aws-train-deploy-project/cifar-dataset/foldered-dataset/train/'
full_val_s3_path = f's3://pytorch-aws-train-deploy-project/cifar-dataset/foldered-dataset/val/'

subset_train_s3_path = f's3://pytorch-aws-train-deploy-project/cifar-dataset/foldered-dataset-subset/train/'
subset_val_s3_path = f's3://pytorch-aws-train-deploy-project/cifar-dataset/foldered-dataset-subset/val/'

run_id = 0

In [18]:
from sagemaker import get_execution_role

# Set role for train job
role = get_execution_role()

experiment_name = "pytorch-experiments-train"

# Param Grid
for params in param_combinations:
        run_id += 1
        run_name = "run-" + str(run_id)
        print(run_name)
    
        # Sagemaker Run
        with Run(experiment_name=experiment_name, 
                     run_name=run_name, 
                     sagemaker_session=Session()) as run:
            
            # Estimator
            est = PyTorch(
                entry_point="./script/toy_model.py", # train script
                role=role,
                model_dir="/opt/ml/model/", # dir inside container
                framework_version="1.12",
                py_version="py38",
                instance_type="ml.m5.xlarge", # $0.23 / hr
                instance_count=1,
                hyperparameters = params,
                keep_alive_period_in_seconds=3600,
                output_path = "s3://pytorch-aws-train-deploy-project/training/"
            )

            est.fit(
            inputs={
                        'training': full_train_s3_path,
                        'validation': full_val_s3_path 
                    })

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
run-2
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml


INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.


Using provided s3_resource


INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: pytorch-training-2023-09-12-20-24-06-264


2023-09-12 20:24:06 Starting - Starting the training job...
2023-09-12 20:24:22 Starting - Preparing the instances for training......
2023-09-12 20:25:23 Downloading - Downloading input data.........
2023-09-12 20:26:59 Training - Training image download completed. Training in progress..[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2023-09-12 20:27:05,284 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2023-09-12 20:27:05,286 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2023-09-12 20:27:05,288 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)[0m
[34m2023-09-12 20:27:05,299 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2023-09-12 20:27:05,301 sagemaker_pytorch_container.training INFO     Invoking user training 

<div style="background-color:teal; color:white; padding:10px; font-size:20px">
Clear SageMaker Experiment Data (skip)