## Building and pushing the image to an Amazon ECR repository

In [51]:
# Shell escape: change to the parent directory and run scripts/build_and_push.sh from there.
!cd .. && scripts/build_and_push.sh

scripts/build_and_push.sh: line 1: ·!/bin/bash: No such file or directory
ECR Login
https://docs.docker.com/engine/reference/commandline/login/#credentials-store

Login Succeeded
Building image
Sending build context to Docker daemon  822.8kB
Step 1/9 : FROM nvcr.io/nvidia/pytorch:20.08-py3
 ---> c710aa2340b4
Step 2/9 : ENV PYTHONDONTWRITEBYTECODE=1
 ---> Using cache
 ---> 856cd9c0126f
Step 3/9 : ENV PYTHONUNBUFFERED=1
 ---> Using cache
 ---> c56d3042c81a
Step 4/9 : RUN apt-get update && apt-get install -y --no-install-recommends nginx curl
 ---> Using cache
 ---> 9f9821291e14
Step 5/9 : WORKDIR /opt/ml/
 ---> Using cache
 ---> 3b56b46fe6a0
Step 6/9 : RUN pip install sagemaker-training
 ---> Using cache
 ---> 6327a4ef2c30
Step 7/9 : COPY src/ /opt/ml/code/
 ---> c8fabf2119a6
Step 8/9 : ENV SAGEMAKER_PROGRAM train
 ---> Running in 061b5e301872
Removing intermediate container 061b5e301872
 ---> 38952655b53c
Step 9/9 : ENV PATH="/opt/ml/code:${PATH}"
 ---> Running in 266de44f2e00
Removing 

## SageMaker configuration

In [52]:
from sagemaker.session import get_execution_role, Session
import os

# Create the session once and reuse it for the role lookup. Calling
# get_execution_role() with no argument makes it build its own Session, so the
# role and the session used below would come from two separately built sessions.
sagemaker_session = Session()
sagemaker_role = get_execution_role(sagemaker_session)

In [53]:
from time import gmtime, strftime

# --- S3 layout: everything sits under <default bucket>/<key_name>/... ---
bucket_name = sagemaker_session.default_bucket()
key_name = "TiendaApp"
# The data and model URIs end with a slash; the checkpoint and tensorboard URIs do not.
s3_uri_data = "s3://{bucket}/{key}/data/".format(bucket=bucket_name, key=key_name)
s3_uri_output = "s3://{bucket}/{key}/model/".format(bucket=bucket_name, key=key_name)
s3_uri_checkpoint = "s3://{bucket}/{key}/checkpoints".format(bucket=bucket_name, key=key_name)
s3_output_path_tensorboard = "s3://{bucket}/{key}/tensorboard".format(bucket=bucket_name, key=key_name)

# --- Container image: <account>.dkr.ecr.<region>.amazonaws.com/<image_name> ---
account = (
    sagemaker_session.boto_session
    .client("sts")
    .get_caller_identity()["Account"]
)
region = sagemaker_session.boto_session.region_name
image_name = "yolov5_train"
image_uri = "{acct}.dkr.ecr.{reg}.amazonaws.com/{repo}".format(
    acct=account, reg=region, repo=image_name
)

# --- Job name: fixed prefix plus the current UTC timestamp ---
base_job_name = strftime("test-training-job-%Y-%m-%d-%H-%M-%S", gmtime())

# Exported so shell escapes (see the commented commands below) can read
# $account and $s3_uri.
os.environ.update({"account": account, "s3_uri": s3_uri_data})

# Summary of the resolved settings, one per line.
print(
    "Training Job name : {}".format(base_job_name),
    "S3 uri input: {}".format(s3_uri_data),
    "S3 uri output: {}".format(s3_uri_output),
    "image uri: {}".format(image_uri),
    sep="\n",
)
# Optional manual checks (disabled):
#!aws s3 ls $s3_uri
#!aws ecr describe-repositories --registry-id $account

Training Job name : test-training-job-2020-09-21-17-49-47
S3 uri input: s3://sagemaker-us-west-2-430127992102/TiendaApp/data/
S3 uri output: s3://sagemaker-us-west-2-430127992102/TiendaApp/model/
image uri: 430127992102.dkr.ecr.us-west-2.amazonaws.com/yolov5_train


## Starting a SageMaker training job

In [54]:
from sagemaker.estimator import Estimator
from sagemaker.debugger import TensorBoardOutputConfig

# TensorBoard output settings: pairs the in-container directory with the
# S3 tensorboard prefix built in the configuration cell.
tensorboard_output_config = TensorBoardOutputConfig(
    container_local_output_path="/opt/ml/output/tensorboard",
    s3_output_path=s3_output_path_tensorboard,
)

# Metric extraction patterns handed to the Estimator as `metric_definitions`.
# Each regex captures the number (digits and dots) that follows
# "train_error=" / "val_error=" in a line of text.
# Raw strings keep the backslashes literal: "\S" and "\." are not valid Python
# escape sequences, so a plain string literal triggers an invalid-escape warning.
metric_definitions = [
    {"Name": "train:error", "Regex": r"\S*train_error=([0-9\.]+)"},
    {"Name": "val:error", "Regex": r"\S*val_error=([0-9\.]+)"},
]

# Checkpoint directory inside the container; passed to the Estimator as
# `checkpoint_local_path`.
checkpoint_local_path = "/opt/ml/checkpoints/"

# Estimator for the custom training image. The variable says "yolov3" while the
# configured image name is "yolov5_train"; the name is left unchanged because
# other cells may refer to it.
yolov3_estimator = Estimator(
    # Image and identity.
    # NOTE: `image_name` is the SDK v1 keyword; the SDK warns it is renamed to
    # `image_uri` in v2.
    image_name=image_uri,
    role=sagemaker_role,
    sagemaker_session=sagemaker_session,
    # Naming and tagging.
    base_job_name=base_job_name,
    tags=[
        {"Key": "Name", "Value": "test-job"},
        {"Key": "Description", "Value": "Test training job"},
    ],
    # Compute resources.
    train_instance_type="ml.g4dn.xlarge",
    train_instance_count=2,
    train_volume_size=35,
    # Output, checkpoint and TensorBoard locations.
    output_path=s3_uri_output,
    checkpoint_s3_uri=s3_uri_checkpoint,
    checkpoint_local_path=checkpoint_local_path,
    tensorboard_output_config=tensorboard_output_config,
    # Metrics.
    enable_sagemaker_metrics=True,
    metric_definitions=metric_definitions,
    # Hyperparameters for the training job.
    hyperparameters={"test": "this is a test", "batch": 32},
)

# Start the job under the explicit job name; no input channels are passed.
# wait=True keeps the call attached to the job (the log stream appears in the cell output).
yolov3_estimator.fit(wait=True, job_name=base_job_name)
# Alternative call that also passes a "training" input channel (disabled):
#yolov3_estimator.fit(inputs={"training": s3_uri_data}, job_name=base_job_name, wait=True)

Parameter image_name will be renamed to image_uri in SageMaker Python SDK v2.


2020-09-21 17:49:48 Starting - Starting the training job...
2020-09-21 17:49:52 Starting - Launching requested ML instances......
2020-09-21 17:51:02 Starting - Preparing the instances for training......
2020-09-21 17:51:58 Downloading - Downloading input data
[34m== PyTorch ==[0m
[0m
[34mNVIDIA Release 20.08 (build 15516749)[0m
[34mPyTorch Version 1.7.0a0+8deb4fe
[0m
[34mContainer image Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
[0m
[34mCopyright (c) 2014-2020 Facebook Inc.[0m
[34mCopyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)[0m
[34mCopyright (c) 2012-2014 Deepmind Technologies    (Koray Kavukcuoglu)[0m
[34mCopyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu)[0m
[34mCopyright (c) 2011-2013 NYU                      (Clement Farabet)[0m
[34mCopyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston)[0m
[34mCopyright (c) 2006      Idiap Research Institute (Samy Ben