# AutoGluon Object Detection using SageMaker

로컬 개발 환경이 아닌 SageMaker 훈련 인스턴스를 사용하여 훈련 수행

## Build Docker image and Push to ECR
---

In [None]:
%%bash

#!/usr/bin/env bash

# Configure the local Docker daemon for building the GPU training image:
#   1. install a daemon config that registers the "nvidia" container runtime,
#   2. add a "data-root" and a "default-shm-size" entry to that config,
#   3. restart Docker and print its root directory.

# Write a config that declares only the "nvidia" runtime into a scratch file.
echo '{
    "runtimes": {
        "nvidia": {
            "path": "nvidia-container-runtime",
            "runtimeArgs": []
        }
    }
}' > daemon.json

# Replace the system-wide daemon config with the scratch file; the scratch file
# is removed only if the copy succeeded.
sudo cp daemon.json /etc/docker/daemon.json && rm daemon.json

DAEMON_PATH="/etc/docker"
# Value written to "default-shm-size" below.
MEMORY_SIZE=10G

# jq prints "true" when the config already has a top-level "data-root" key.
# NOTE(review): the file installed above contains no "data-root" key, so after a
# successful copy this is always false and the else branch (including the Docker
# restart) runs on every execution -- confirm the unconditional overwrite is intended.
FLAG=$(cat $DAEMON_PATH/daemon.json | jq 'has("data-root")')
# echo $FLAG

if [ "$FLAG" == true ]; then
    echo "Already revised"
else
    echo "Add data-root and default-shm-size=$MEMORY_SIZE"
    # Keep a backup, then merge the two extra keys into it and write the result
    # back through `sudo tee` (the redirect itself would not run with sudo rights).
    sudo cp $DAEMON_PATH/daemon.json $DAEMON_PATH/daemon.json.bak
    sudo cat $DAEMON_PATH/daemon.json.bak | jq '. += {"data-root":"/home/ec2-user/SageMaker/.container/docker","default-shm-size":"'$MEMORY_SIZE'"}' | sudo tee $DAEMON_PATH/daemon.json > /dev/null
    # Restart so the daemon picks up the edited config.
    sudo service docker restart
    echo "Docker Restart"
fi

# Print the lines of `docker info` that contain "Root".
sudo docker info | grep Root

In [None]:
import sagemaker
from sagemaker import utils

# Resolve the execution role, SageMaker session, region, and default bucket
# that the rest of the notebook relies on.
role = sagemaker.get_execution_role()
sagemaker_session = sagemaker.session.Session()
# Read the region through the public `boto_region_name` property instead of
# the private `_region_name` attribute, which is an implementation detail.
region = sagemaker_session.boto_region_name

bucket = sagemaker_session.default_bucket()
# S3 prefix in the default bucket, suffixed with a SageMaker timestamp.
output_path = f"s3://{bucket}/ag-od-{utils.sagemaker_timestamp()}"

In [None]:
container = "autogluon-objdetect-training-gpu"
tag = "latest"
! ./build_and_push.sh $container $tag docker/Dockerfile.gpu

## Training Start
---

In [None]:
import sys
sys.path.append("./ag_scripts")

from ag_scripts.ag_model import (
    AutoGluonSagemakerEstimator,
    AutoGluonNonRepackInferenceModel,
    AutoGluonSagemakerInferenceModel,
    AutoGluonRealtimePredictor,
    AutoGluonBatchPredictor,
)

container_uri = "143656149352.dkr.ecr.us-east-1.amazonaws.com/autogluon-objdetect-training-gpu:latest"
# container_uri = "784880277394.dkr.ecr.ap-northeast-2.amazonaws.com/ag-od-training-gpu:ag-gpu-0.7-src"

task = 'objdetection-pikachu'
s3_train_path = f"s3://{bucket}/{task}"
!aws s3 sync {task} {s3_train_path}

In [None]:
# !aws s3 ls {s3_train_path}/annotations/

In [None]:
# Hyperparameters handed to the estimator through its `hyperparameters` argument.
ag_hyper_params = dict(
    checkpoint_name="yolox_l_8x8_300e_coco",
    num_gpus=-1,  # -1 means use all GPUs
    val_metric="map",
    annotations_path="annotations/train-coco.json",
    learning_rate=3e-5,  # two-stage learning rate; the detection head uses 100x this value
    per_gpu_batch_size=8,  # lower this when the model is large
    max_epochs=10,  # for a real use case, use at least 50
    check_val_every_n_epoch=2,  # make sure at least one validation runs
    patience=3,  # early stop after this many consecutive validations without a new best
)

In [None]:
# Build the estimator for the object-detection training job.
estimator = AutoGluonSagemakerEstimator(
    # What to run: the entry script, its source directory, and the custom image.
    entry_point="train.py",
    source_dir="scripts",
    custom_image_uri=container_uri,
    framework_version="0.7",
    py_version="py39",
    hyperparameters=ag_hyper_params,
    # Where to run it.
    role=role,
    region=region,
    instance_type="ml.g5.xlarge",
    # instance_type="local",
    instance_count=1,
    volume_size=30,
    base_job_name="autogluon-objdetect-train",
    # Profiler and debugger hook are switched off for this job.
    disable_profiler=True,
    debugger_hook_config=False,
)

In [None]:
# Derive a unique job name, start training on the "train" channel with
# wait=False, and print the name so the job can be looked up afterwards.
job_name = utils.unique_name_from_base("autogluon-objdetect")
estimator.fit({"train": s3_train_path}, wait=False, job_name=job_name)
print(job_name)