### This notebook demonstrates how to train and deploy a U-Net model on SageMaker

In [1]:
import logging
import os
from datetime import datetime, timezone

import boto3
import sagemaker
from sagemaker.tensorflow import TensorFlow

# Configure environment

In [2]:
# Execution role handed to the estimator below.
SAGEMAKER_ROLE = "AmazonSageMaker-ExecutionRole-20200714T182988"

# Build the SageMaker session on top of a named local AWS profile;
# replace the profile name with the one configured on your machine.
boto_session = boto3.Session(profile_name="crayon-site")
sagemaker_session = sagemaker.Session(boto_session=boto_session)

In [3]:
# Local checkout of the ai-automation repo; its src/unet directory is used as the
# estimator's source_dir. Export AI_AUTOMATION_REPO_DIR to point at your own checkout
# instead of editing this cell; the value below is only the fallback.
AI_AUTOMATION_REPO_DIR = os.environ.get(
    "AI_AUTOMATION_REPO_DIR", "/home/tailaiw/work/site/ai-automation/"
)

# Setup hyperparameters

In [4]:
encoder = "resnet50"
pretrained = 1

# Classes to consider; all other labels are merged into "others".
classes = "asphalt,concrete,rooftop,landscape,gravel"
# Strip every kind of whitespace (spaces, tabs, newlines), not only plain spaces,
# so the comma-separated list is safe to pass on as a single argument.
classes = "".join(classes.split())

crop_width = 896  # width of the crop taken from the original image
crop_height = 896  # height of the crop taken from the original image
input_width = 224  # model input width (crops are rescaled to this size before being fed to the model)
input_height = 224  # model input height (crops are rescaled to this size before being fed to the model)

train_dataset_mode = "random"
crops_per_train_image = 64  # crops from a training image, if "random" mode
crops_per_train_image_w = 8  # crops from a training image along the horizontal direction, if "tile" mode
crops_per_train_image_h = 4  # crops from a training image along the vertical direction, if "tile" mode
crops_per_val_image_w = 8  # crops from a validation image along the horizontal direction
crops_per_val_image_h = 4  # crops from a validation image along the vertical direction
train_ratio = 0.8  # ratio of training data in the entire dataset

train_batch_size = 32
val_batch_size = 32
epochs = 100

cache_size = 500  # number of images to cache in RAM

# Setup training job

In [5]:
# The job name encodes the main settings so runs can be told apart at a glance:
#   <encoder>-<class initials>-<input WxH>-<crop WxH>-<train mode>-<train crops>-
#   <val crops WxH>-<train ratio %>-<train batch>-<val batch>
# NOTE(review): the endpoint name recorded further down in this notebook is cut off
# after "random-6", so the trailing fields do not survive into the generated resource
# names -- presumably a name-length limit; consider a shorter scheme.

# First letter of every class; empty entries (e.g. from a trailing comma) are skipped.
class_initials = "".join(name[0] for name in classes.split(",") if name)

# "random" mode is described by a single crop count, any other mode by a WxH grid.
if train_dataset_mode == "random":
    train_crops_tag = str(crops_per_train_image)
else:
    train_crops_tag = f"{crops_per_train_image_w}x{crops_per_train_image_h}"

base_job_name = "-".join(
    [
        encoder,
        class_initials,
        f"{input_width}x{input_height}",
        f"{crop_width}x{crop_height}",
        train_dataset_mode,
        train_crops_tag,
        f"{crops_per_val_image_w}x{crops_per_val_image_h}",
        # round() rather than int(): int() truncates float error (0.57 * 100 -> 56).
        str(round(train_ratio * 100)),
        str(train_batch_size),
        str(val_batch_size),
    ]
)
print(base_job_name)

output_path = "s3://st-crayon-dev/tf-outputs/"

# A UTC timestamp (microsecond resolution) gives every run its own checkpoint prefix.
now = datetime.now(timezone.utc).strftime("%Y-%m-%d-%H-%M-%S-%f")
checkpoint_s3_uri = f"s3://st-crayon-dev/tf-checkpoints/{base_job_name}_{now}/"

resnet50-acrlg-224x224-896x896-random-64-8x4-80-32-32


# Create estimator and train

In [6]:
# Pattern for one numeric value in the training log line.
# NOTE(review): this does not match exponent notation (e.g. 1.0e-04); confirm the
# training output never uses that form, otherwise such values are not captured correctly.
_VALUE = "[0-9\\.]+"

# Field labels in the order the metric patterns expect them. The IoU label allows an
# optional "_<digits>" suffix (mean_io_u, mean_io_u_1, ...).
_TRAIN_FIELDS = ("loss", "mean_io_u[_0-9]*", "categorical_accuracy")
_VAL_FIELDS = tuple("val_" + field for field in _TRAIN_FIELDS)

# (metric name suffix, index of the matching field in _TRAIN_FIELDS / _VAL_FIELDS)
_METRICS = (("loss", 0), ("categorical_accuracy", 2), ("miou", 1))


def _metric_regex(fields, capture_index):
    """Build a regex that matches `fields` in order, separated by " - ".

    Only the value of the field at `capture_index` is wrapped in a capture group;
    every other value is matched but not captured.
    """
    pieces = [
        f"{field}: ({_VALUE})" if position == capture_index else f"{field}: {_VALUE}"
        for position, field in enumerate(fields)
    ]
    return ".*" + " - ".join(pieces) + ".*"


# Training metrics are read from the three training fields alone; validation metrics
# require the training fields followed by the three val_* fields on the same line.
metric_definitions = [
    {"Name": f"train:{label}", "Regex": _metric_regex(_TRAIN_FIELDS, index)}
    for label, index in _METRICS
] + [
    {
        "Name": f"validation:{label}",
        "Regex": _metric_regex(_TRAIN_FIELDS + _VAL_FIELDS, len(_TRAIN_FIELDS) + index),
    }
    for label, index in _METRICS
]

# Settings forwarded to the entry-point script.
# NOTE(review): train_ratio, crops_per_train_image_w and crops_per_train_image_h are
# defined in the hyperparameter cell (and used in the job name) but are not forwarded
# here -- confirm whether unet.py expects them, especially for "tile" mode.
training_hyperparameters = {
    "encoder": encoder,
    "pretrained": pretrained,
    "classes": classes,
    "input-width": input_width,
    "input-height": input_height,
    "crop-width": crop_width,
    "crop-height": crop_height,
    "train-dataset-mode": train_dataset_mode,
    "crops-per-train-image": crops_per_train_image,
    "crops-per-val-image-w": crops_per_val_image_w,
    "crops-per-val-image-h": crops_per_val_image_h,
    "train-batch-size": train_batch_size,
    "val-batch-size": val_batch_size,
    "epochs": epochs,
    "cache-size": cache_size,
}

_HOUR = 60 * 60  # seconds

estimator = TensorFlow(
    sagemaker_session=sagemaker_session,
    role=SAGEMAKER_ROLE,
    entry_point="unet.py",
    source_dir=os.path.join(AI_AUTOMATION_REPO_DIR, "src", "unet"),
    framework_version="2.1",
    py_version="py3",
    instance_count=1,
    instance_type="ml.p3.2xlarge",
    # Spot training is switched ON by the three arguments below.
    use_spot_instances=True,
    max_run=12 * _HOUR,
    max_wait=18 * _HOUR,
    base_job_name=base_job_name,
    output_path=output_path,
    checkpoint_s3_uri=checkpoint_s3_uri,
    container_log_level=logging.WARNING,
    hyperparameters=training_hyperparameters,
    metric_definitions=metric_definitions,
)

In [7]:
# S3 folder where the images and masks are saved; passed to the training job as its input.
input_dir = "s3://st-crayon-dev/annotation/phase_1/export-2020-08-25T18_12_45.454Z"
estimator.fit(inputs=input_dir)

# Deploy estimator

In [8]:
# Deploy the trained estimator behind a single-instance endpoint.
predictor = estimator.deploy(
    initial_instance_count=1,
    instance_type="ml.p2.xlarge",
)

---------------!

In [9]:
# Display the name of the endpoint returned by estimator.deploy(...).
predictor.endpoint_name

'resnet50-acrlg-224x224-896x896-random-6-2020-09-02-21-07-38-621'

# Delete deployed endpoint

In [None]:
# Tear down the endpoint created by estimator.deploy(...) once it is no longer needed.
predictor.delete_endpoint()