### This notebook demonstrates how to train a U-Net model with different hyperparameter settings, each tracked as an experiment trial on SageMaker

In [1]:
import os
import logging
import sagemaker
import boto3
from sagemaker.tensorflow import TensorFlow
from datetime import datetime
from smexperiments import experiment

# Configure environment

In [2]:
# AWS session built from a named profile in the local AWS configuration.
boto_session = boto3.Session(
    profile_name="crayon-site",  # specify your local aws profile
)
# SageMaker session that reuses the boto3 session above.
sagemaker_session = sagemaker.Session(boto_session=boto_session)
# Name of the IAM role handed to the estimator as its `role`.
SAGEMAKER_ROLE = "AmazonSageMaker-ExecutionRole-20200714T182988"

INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials


In [3]:
# Local path to the ai-automation repo checkout. Set the AI_AUTOMATION_REPO_DIR
# environment variable to use your own checkout without editing this cell;
# when it is unset or empty, the default path below is used.
AI_AUTOMATION_REPO_DIR = (
    os.environ.get("AI_AUTOMATION_REPO_DIR")
    or "/home/tailaiw/work/site/ai-automation/"
)

# Setup hyperparameters

### Fixed hyperparameters

In [4]:
# Values in this cell are identical for every trial; they are forwarded to the
# training script as hyperparameters when the estimators are created.

# Model settings.
encoder = "resnet50"
pretrained = 1

# Comma-separated class names; any spaces are removed so the value is a
# single compact token.
classes = "asphalt,concrete,rooftop,landscape,gravel".replace(" ", "")

# Dataset settings.
train_dataset_mode = "random"
train_ratio = 0.8

# Training settings.
epochs = 100
cache_size = 500

### Hyperparameters to experiment with

In [5]:
# The grid is the product of three input sizes and two crop sizes.
# Each input size comes with its own batch size (used for both training and
# validation); each crop size comes with its own crops-per-image counts.
# Order: for every input size, the smaller crop first, then the larger crop.
experiment_grid = [
    {
        "input_width": input_size,
        "input_height": input_size,
        "crop_width": crop_size,
        "crop_height": crop_size,
        "crops_per_train_image": train_crops,
        "crops_per_val_image_w": val_crops_w,
        "crops_per_val_image_h": val_crops_h,
        "train_batch_size": batch_size,
        "val_batch_size": batch_size,
    }
    for input_size, batch_size in (
        (224, 32),
        (224 * 2, 8),
        (224 * 4, 4),
    )
    for crop_size, train_crops, val_crops_w, val_crops_h in (
        (896, 64, 8, 4),
        (896 * 2, 16, 4, 2),
    )
]

# Setup experiment and create trials

In [6]:
# Get-or-create: load the experiment if one with this name already exists,
# otherwise create it.
experiment_name = "unet-cropsize"

experiment_exists = any(
    exp.experiment_name == experiment_name
    for exp in experiment.Experiment.list(
        sagemaker_boto_client=sagemaker_session.sagemaker_client
    )
)

if experiment_exists:
    my_experiment = experiment.Experiment.load(
        sagemaker_boto_client=sagemaker_session.sagemaker_client,
        experiment_name=experiment_name,
    )
else:
    my_experiment = experiment.Experiment.create(
        experiment_name=experiment_name,
        sagemaker_boto_client=sagemaker_session.sagemaker_client,
    )

In [7]:
# Launch one asynchronous training job per entry in `experiment_grid`, each
# attached to its own trial of `my_experiment`.
#
# Everything that is identical for all trials is built once, before the loop.
output_path = "s3://st-crayon-dev/tf-outputs/"
input_dir = "s3://st-crayon-dev/annotation/phase_1/export-2020-08-25T18_12_45.454Z"  # S3 folder where images and masks are saved

# First letter of every class, e.g. "acrlg"; used to keep job names short.
class_initials = "".join(cl[0] for cl in classes.split(","))

# Metric definitions passed to the estimator. Each regex has exactly one
# capture group, which picks a single number out of a training-log line of the
# form "loss: ... - mean_io_u: ... - categorical_accuracy: ... [- val_...]".
metric_definitions = [
    {
        "Name": "train:loss",
        "Regex": ".*loss: ([0-9\\.]+) - mean_io_u[_0-9]*: [0-9\\.]+ - categorical_accuracy: [0-9\\.]+.*",
    },
    {
        "Name": "train:categorical_accuracy",
        "Regex": ".*loss: [0-9\\.]+ - mean_io_u[_0-9]*: [0-9\\.]+ - categorical_accuracy: ([0-9\\.]+).*",
    },
    {
        "Name": "train:miou",
        "Regex": ".*loss: [0-9\\.]+ - mean_io_u[_0-9]*: ([0-9\\.]+) - categorical_accuracy: [0-9\\.]+.*",
    },
    {
        "Name": "validation:loss",
        "Regex": ".*loss: [0-9\\.]+ - mean_io_u[_0-9]*: [0-9\\.]+ - categorical_accuracy: [0-9\\.]+ - val_loss: ([0-9\\.]+) - val_mean_io_u[_0-9]*: [0-9\\.]+ - val_categorical_accuracy: [0-9\\.]+.*",
    },
    {
        "Name": "validation:categorical_accuracy",
        "Regex": ".*loss: [0-9\\.]+ - mean_io_u[_0-9]*: [0-9\\.]+ - categorical_accuracy: [0-9\\.]+ - val_loss: [0-9\\.]+ - val_mean_io_u[_0-9]*: [0-9\\.]+ - val_categorical_accuracy: ([0-9\\.]+).*",
    },
    {
        "Name": "validation:miou",
        "Regex": ".*loss: [0-9\\.]+ - mean_io_u[_0-9]*: [0-9\\.]+ - categorical_accuracy: [0-9\\.]+ - val_loss: [0-9\\.]+ - val_mean_io_u[_0-9]*: ([0-9\\.]+) - val_categorical_accuracy: [0-9\\.]+.*",
    },
]

# List the experiment's trials once instead of once per grid entry; names of
# trials created inside the loop are added to the set as we go.
existing_trial_names = {trial.trial_name for trial in my_experiment.list_trials()}

for hp in experiment_grid:
    # load experimental hyperparameters
    input_width = hp["input_width"]
    input_height = hp["input_height"]
    crop_width = hp["crop_width"]
    crop_height = hp["crop_height"]
    crops_per_train_image = hp["crops_per_train_image"]
    crops_per_val_image_w = hp["crops_per_val_image_w"]
    crops_per_val_image_h = hp["crops_per_val_image_h"]
    train_batch_size = hp["train_batch_size"]
    val_batch_size = hp["val_batch_size"]

    # configure trial
    # The name encodes the full configuration. NOTE(review): the logged
    # training-job names show this base name cut short before the timestamp,
    # so the trial name is the place to read the complete configuration.
    base_job_name = (
        f"{encoder}-{class_initials}-"
        f"{input_width}x{input_height}-{crop_width}x{crop_height}-"
        f"{train_dataset_mode}-{crops_per_train_image}-"
        f"{crops_per_val_image_w}x{crops_per_val_image_h}-"
        f"{int(train_ratio*100)}-"
        f"{train_batch_size}-{val_batch_size}"
    )
    # A UTC timestamp gives every launch its own checkpoint prefix.
    now = datetime.utcnow().strftime("%Y-%m-%d-%H-%M-%S-%f")
    checkpoint_s3_uri = f"s3://st-crayon-dev/tf-checkpoints/{base_job_name}_{now}/"

    # Reuse the trial when one with this name already exists; otherwise create it.
    trial_name = base_job_name
    if trial_name in existing_trial_names:
        my_trial = experiment.trial.Trial.load(
            sagemaker_boto_client=sagemaker_session.sagemaker_client,
            trial_name=trial_name,
        )
    else:
        my_trial = my_experiment.create_trial(trial_name=trial_name)
        existing_trial_names.add(trial_name)
    print(my_trial.trial_name)

    # create estimator
    estimator = TensorFlow(
        sagemaker_session=sagemaker_session,
        entry_point="unet.py",
        source_dir=os.path.join(AI_AUTOMATION_REPO_DIR, "src", "unet"),
        role=SAGEMAKER_ROLE,
        instance_count=1,
        instance_type="ml.p3.2xlarge",
        #         use_spot_instances=True,  # uncomment this line to use spot instance
        #         max_run=60 * 60 * 12,  # uncomment this line to use spot instance
        #         max_wait=60 * 60 * 18,  # uncomment this line to use spot instance
        framework_version="2.1",
        py_version="py3",
        base_job_name=base_job_name,
        output_path=output_path,
        checkpoint_s3_uri=checkpoint_s3_uri,
        container_log_level=logging.WARNING,
        hyperparameters={
            "encoder": encoder,
            "pretrained": pretrained,
            "classes": classes,
            "input-width": input_width,
            "input-height": input_height,
            "crop-width": crop_width,
            "crop-height": crop_height,
            "train-dataset-mode": train_dataset_mode,
            "crops-per-train-image": crops_per_train_image,
            "crops-per-val-image-w": crops_per_val_image_w,
            "crops-per-val-image-h": crops_per_val_image_h,
            "train-batch-size": train_batch_size,
            "val-batch-size": val_batch_size,
            "epochs": epochs,
            "cache-size": cache_size,
        },
        metric_definitions=metric_definitions,
    )

    # train estimator asynchronously (wait=False returns without waiting for
    # the job to finish) and attach the job to this trial
    estimator.fit(
        input_dir,
        wait=False,
        experiment_config={
            "ExperimentName": my_experiment.experiment_name,
            "TrialName": my_trial.trial_name,
            "TrialComponentDisplayName": f"{my_experiment.experiment_name}-{my_trial.trial_name}",
        },
    )

resnet50-acrlg-224x224-896x896-random-64-8x4-80-32-32


INFO:sagemaker:Creating training-job with name: resnet50-acrlg-224x224-896x896-random-6-2020-09-03-01-42-15-994


resnet50-acrlg-224x224-1792x1792-random-16-4x2-80-32-32


INFO:sagemaker:Creating training-job with name: resnet50-acrlg-224x224-1792x1792-random-2020-09-03-01-42-17-540


resnet50-acrlg-448x448-896x896-random-64-8x4-80-8-8


INFO:sagemaker:Creating training-job with name: resnet50-acrlg-448x448-896x896-random-6-2020-09-03-01-42-21-765


resnet50-acrlg-448x448-1792x1792-random-16-4x2-80-8-8


INFO:sagemaker:Creating training-job with name: resnet50-acrlg-448x448-1792x1792-random-2020-09-03-01-42-24-575


resnet50-acrlg-896x896-896x896-random-64-8x4-80-4-4


INFO:sagemaker:Creating training-job with name: resnet50-acrlg-896x896-896x896-random-6-2020-09-03-01-42-26-109


resnet50-acrlg-896x896-1792x1792-random-16-4x2-80-4-4


INFO:sagemaker:Creating training-job with name: resnet50-acrlg-896x896-1792x1792-random-2020-09-03-01-42-28-120
