# <B>SageMaker Training with Experiments and Processing</B>
* Container: conda_python3

## 학습 작업의 실행 노트북 개요

- SageMaker Training에 SageMaker 실험을 추가하여 여러 실험의 결과를 비교할 수 있습니다.
    - [작업 실행 시 필요 라이브러리 import](#작업-실행-시-필요-라이브러리-import)
    - [SageMaker 세션과 Role, 사용 버킷 정의](#SageMaker-세션과-Role,-사용-버킷-정의)
    - [하이퍼파라미터 정의](#하이퍼파라미터-정의)
    - [학습 실행 작업 정의](#학습-실행-작업-정의)
        - 학습 코드 명
        - 학습 코드 폴더 명
        - 학습 코드가 사용한 Framework 종류, 버전 등
        - 학습 인스턴스 타입과 개수
        - SageMaker 세션
        - 학습 작업 하이퍼파라미터 정의
        - 학습 작업 산출물 관련 S3 버킷 설정 등
    - [학습 데이터셋 지정](#학습-데이터셋-지정)
        - 학습에 사용하는 데이터셋의 S3 URI 지정
    - [SageMaker 실험 설정](#SageMaker-실험-설정)
    - [학습 실행](#학습-실행)
    - [데이터 세트 설명](#데이터-세트-설명)
    - [실험 결과 보기](#실험-결과-보기)

## AutoReload

In [None]:
# Load IPython's autoreload extension and select mode 2, so edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

## 0. Install packages

In [None]:
# Gate for the dependency-install cell of this notebook: when True, that cell
# installs/updates packages and then restarts the kernel.
install_needed = True  # should only be True once
# install_needed = False

In [None]:
%%bash
#!/bin/bash
# Reconfigure the Docker daemon: add a "data-root" under the notebook's
# SageMaker directory and a "default-shm-size", then restart Docker.
# Nothing is changed when daemon.json already contains a "data-root" key.

DAEMON_PATH="/etc/docker"
# Value written to "default-shm-size" below.
MEMORY_SIZE=10G

# jq prints true/false depending on whether the top-level "data-root" key exists.
# NOTE(review): if daemon.json is missing or unreadable, FLAG is empty and the
# else-branch still runs against it -- confirm the file always exists here.
FLAG=$(cat $DAEMON_PATH/daemon.json | jq 'has("data-root")')
# echo $FLAG

if [ "$FLAG" == true ]; then
    echo "Already revised"
else
    echo "Add data-root and default-shm-size=$MEMORY_SIZE"
    # Keep a backup of the current config before overwriting it.
    sudo cp $DAEMON_PATH/daemon.json $DAEMON_PATH/daemon.json.bak
    # Merge the two keys into the backed-up config and write the result back via
    # `sudo tee`; $MEMORY_SIZE is spliced into the jq program by closing and
    # reopening the single-quoted string.
    sudo cat $DAEMON_PATH/daemon.json.bak | jq '. += {"data-root":"/home/ec2-user/SageMaker/.container/docker","default-shm-size":"'$MEMORY_SIZE'"}' | sudo tee $DAEMON_PATH/daemon.json > /dev/null
    sudo service docker restart
    echo "Docker Restart"
fi

In [None]:
import sys
import IPython

# One-time environment setup, controlled by install_needed: upgrade pip and the
# SageMaker-related packages, install the docker-compose binary, then restart
# the kernel so the freshly installed versions are the ones that get imported.
if install_needed:
    print("installing deps and restarting kernel")
    # Run pip through the kernel's own interpreter so packages land in this environment.
    !{sys.executable} -m pip install -U pip
    !{sys.executable} -m pip install -U smdebug sagemaker-experiments
    !{sys.executable} -m pip install -U sagemaker
    # Download the docker-compose v2.7.0 release binary matching this OS/arch and
    # make it executable (presumably needed for SageMaker local mode -- confirm).
    !sudo curl -L "https://github.com/docker/compose/releases/download/v2.7.0/docker-compose-$(uname -s)-$(uname -m)" -o /usr/local/bin/docker-compose
    !sudo chmod +x /usr/local/bin/docker-compose
    
    # do_shutdown(True): shut the kernel down and request a restart.
    IPython.Application.instance().kernel.do_shutdown(True)

## 1. parameter store 설정

In [None]:
import boto3
from utils.ssm import parameter_store

In [None]:
# Build the project's parameter-store helper for the region of the default boto3
# session, then read the value stored under "PREFIX"; later cells combine that
# prefix with a suffix to form the other parameter keys.
strRegionName=boto3.Session().region_name
pm = parameter_store(strRegionName)
strPrefix = pm.get_params(key="PREFIX")

In [None]:
# Fetch the bucket name and the execution-role ARN from the parameter store.
# Keys are "<prefix>-BUCKET" and "<prefix>-SAGEMAKER-ROLE-ARN".
strBucketName = pm.get_params(
    key="-".join((strPrefix, "BUCKET")),
)
strExecutionRole = pm.get_params(
    key="-".join((strPrefix, "SAGEMAKER-ROLE-ARN")),
)

In [None]:
# Echo the resolved bucket and role, one per line, for a quick sanity check.
print(
    f'strBucketName: {strBucketName}',
    f'strExecutionRole: {strExecutionRole}',
    sep="\n",
)

## 2. Dataset

In [None]:
import os

In [None]:
# S3 prefix ("<bucket>/dataset") that receives the dataset, and the local
# "data" folder under the current working directory that it is copied from.
strS3DataPath = f"s3://{strBucketName}/dataset" 
strLocalDataPath = os.path.join(os.getcwd(), "data")

In [None]:
# Sync the local data folder up to the S3 dataset prefix; IPython substitutes
# the $-prefixed Python variables before handing the line to the shell.
# NOTE(review): the paths are unquoted, so a working directory containing
# spaces would split the argument -- confirm this cannot happen here.
!aws s3 sync $strLocalDataPath $strS3DataPath

## 3. Training-job

In [None]:
import os
import sagemaker
from sagemaker.xgboost.estimator import XGBoost

* **Set Up SageMaker Experiment**
    - Create or load [SageMaker Experiment](https://docs.aws.amazon.com/sagemaker/latest/dg/experiments.html) for the example training job. This will create an experiment trial object in SageMaker.

In [None]:
from time import strftime
from smexperiments.trial import Trial
from smexperiments.experiment import Experiment

In [None]:
def create_experiment(experiment_name):
    """Load the SageMaker Experiment named *experiment_name*, creating it if needed.

    Args:
        experiment_name: Name of the experiment to load or create.

    Returns:
        The ``Experiment`` object that was loaded or created. Callers that
        only need the side effect can ignore the return value.

    Raises:
        Exception: Whatever ``Experiment.create`` raises when the experiment
            can neither be loaded nor created.
    """
    try:
        return Experiment.load(experiment_name)
    except Exception:
        # Any load failure is treated as "experiment does not exist yet".
        # `Exception` (not a bare except) lets KeyboardInterrupt / SystemExit
        # propagate instead of silently triggering a create call.
        return Experiment.create(experiment_name=experiment_name)

In [None]:
def create_trial(experiment_name):
    """Create a new Trial inside *experiment_name* and return the trial's name.

    The trial is named ``<experiment_name>-<MMDD>-<HHMMSS>`` using local time.
    The caller in this notebook reuses the returned name as the training job
    name as well.

    Args:
        experiment_name: Name of an existing experiment to attach the trial to.

    Returns:
        The name of the newly created trial.
    """
    # "%S" (seconds, 00-59) replaces a lowercase "%s": that directive is not
    # part of Python's documented strftime set, so its output is platform
    # dependent (epoch seconds on glibc, undefined elsewhere).
    create_date = strftime("%m%d-%H%M%S")
    sm_trial = Trial.create(
        trial_name=f"{experiment_name}-{create_date}",
        experiment_name=experiment_name,
    )
    return f"{sm_trial.trial_name}"

* params for training job

In [None]:
# Choose between SageMaker local mode (containers run on this machine, data is
# read from local files) and a managed training job (data is read from S3).
import posixpath  # S3 URIs always use "/", independent of the host OS

# Set to True to enable SageMaker to run locally
local_mode = True

if local_mode:
    from sagemaker.local import LocalSession

    strInstanceType = "local"
    sagemaker_session = LocalSession()
    # 'local_code' is a local-mode setting; see the SageMaker local mode docs
    # for its exact effect on how source_dir is handled.
    sagemaker_session.config = {'local': {'local_code': True}}

    # Local files are passed to the channels as file:// URIs.
    dicDataChannels = {
        "training": f'file://{os.path.join(strLocalDataPath, "train.csv")}',
        "testing": f'file://{os.path.join(strLocalDataPath, "test.csv")}',
    }
else:
    # Alternatives noted by the author: ml.p3.2xlarge, ml.g4dn.8xlarge, ml.p3.16xlarge
    strInstanceType = "ml.m5.2xlarge"

    sagemaker_session = sagemaker.Session()
    # posixpath.join keeps "/" separators for the S3 URIs; os.path.join would
    # use the host OS separator instead.
    dicDataChannels = {
        "training": posixpath.join(strS3DataPath, "train.csv"),
        "testing": posixpath.join(strS3DataPath, "test.csv"),
    }

# Number of training instances requested.
nInstanceCount = 1

# Spot training toggle. The run limit is always one hour (in seconds); a wait
# limit is only set when spot training is enabled.
bSpotTraining = False
nMaxRun = 1 * 60 * 60
nMaxWait = 1 * 60 * 60 if bSpotTraining else None

# Warm pool toggle: per the original note, the training image is not downloaded
# again, which makes start-up faster. A service-quota request for warm pools is
# required, and the keep-alive used here is at most one hour.
bUseTrainWarmPool = False
if bSpotTraining:
    # Warm pools cannot be combined with spot instances.
    bUseTrainWarmPool = False
nKeepAliveSeconds = 3600 if bUseTrainWarmPool else None
    


# S3 locations under "<bucket>/<prefix>/training/" for the model artifacts and
# for the uploaded copy of the training code.
# posixpath.join is used instead of os.path.join because these are S3 URIs:
# they must be "/"-separated regardless of the host operating system.
import posixpath

strOutputPath = posixpath.join(
    "s3://{}".format(strBucketName),
    strPrefix,
    "training",
    "model-output",
)

strCodeLocation = posixpath.join(
    "s3://{}".format(strBucketName),
    strPrefix,
    "training",
    "backup_codes",
)

# Experiment name is "<prefix>-experiments"; it is handed to the experiment and
# trial helpers when the job is launched and stored in the parameter store.
strExperimentName = '-'.join([strPrefix, "experiments"])

## You can't override the metric definitions for Amazon SageMaker algorithms. 
# NOTE(review): in the disabled regex below, the character class "[[01]" also
# matches a literal "[" -- probably meant "[01]". Check before re-enabling.
# strNumeticRegEx = "([0-9\\.]+)(e-?[[01][0-9])?"
# listMetricDefinitions = [
#     {"Name": "train_loss", "Regex": f"loss={strNumeticRegEx}"},
#     {"Name": "wer", "Regex": f"wer:{strNumeticRegEx}"}
# ]

# NOTE(review): the disabled git config below refers to `prefix`, while this
# notebook defines `strPrefix` -- rename before re-enabling.
# dicGitConfig = {
#     'repo': f'https://{pm.get_params(key="-".join([prefix, "CODE_REPO"]))}',
#     'branch': 'main',
#     'username': pm.get_params(key="-".join([prefix, "CODECOMMIT-USERNAME"]), enc=True),
#     'password': pm.get_params(key="-".join([prefix, "CODECOMMIT-PWD"]), enc=True)
# }  

# Empty placeholder; nothing in the visible part of this notebook reads it.
kwargs = {}

In [None]:
# Dump the resolved training-job settings, one per line, before launching.
print(
    f'strInstanceType: {strInstanceType}',
    f'nInstanceCount: {nInstanceCount}',
    f'sagemaker_session: {sagemaker_session}',
    f'bSpotTraining: {bSpotTraining}',
    f'strExperimentName: {strExperimentName}',
    f'dicDataChannels: {dicDataChannels}',
    f'strOutputPath: {strOutputPath}',
    f'strCodeLocation: {strCodeLocation}',
    f'bUseTrainWarmPool: {bUseTrainWarmPool}/{nKeepAliveSeconds}',
    sep="\n",
)

* Define training job

In [None]:
# Hyperparameters handed to the estimator; every value is kept as a string.
dicHyperparameters = dict(
    scale_pos_weight="19",
    max_depth="2",
    eta="0.3",
    objective="binary:logistic",
    num_round="100",
)

In [None]:
# Assemble the XGBoost estimator from the settings prepared in the cells above.
estimator = XGBoost(
    # --- training code ---
    source_dir="source/train/",
    entry_point="xgboost_starter_script.py",
    framework_version="1.3-1",
    hyperparameters=dicHyperparameters,  # original note: passed into the container as env. variables
    # --- identity and session ---
    role=strExecutionRole,
    sagemaker_session=sagemaker_session,
    # --- compute ---
    instance_type=strInstanceType,
    instance_count=nInstanceCount,
    volume_size=256,  # GB
    # --- run limits, spot and warm pool ---
    max_run=nMaxRun,
    max_wait=nMaxWait,
    use_spot_instances=bSpotTraining,
    keep_alive_period_in_seconds=nKeepAliveSeconds,
    # --- S3 destinations ---
    output_path=strOutputPath,
    code_location=strCodeLocation,
    # --- metrics ---
    enable_sagemaker_metrics=True,
    # metric_definitions=listMetricDefinitions,
)

* run

In [None]:
# For the 'local_gpu' instance type the checkpoint S3 URI is cleared.
if strInstanceType == 'local_gpu':
    estimator.checkpoint_s3_uri = None

# Make sure the experiment exists, then open a new trial; the trial name is
# reused as the training job name and as the trial-component display name.
create_experiment(strExperimentName)
job_name = create_trial(strExperimentName)

# Launch the job and block until it finishes (wait=True).
estimator.fit(
    job_name=job_name,
    inputs=dicDataChannels,
    wait=True,
    experiment_config={
        'TrialName': job_name,
        'TrialComponentDisplayName': job_name,
    },
)

* save model-path, experiment-name

In [None]:
# Persist the trained model's location and the experiment name in the parameter
# store under "<prefix>-MODEL-PATH" and "<prefix>-EXPERI-NAME", overwriting any
# earlier values.
pm.put_params(
    key="-".join((strPrefix, "MODEL-PATH")),
    value=estimator.model_data,
    overwrite=True,
)
pm.put_params(
    key="-".join((strPrefix, "EXPERI-NAME")),
    value=strExperimentName,
    overwrite=True,
)

* show experiments

In [None]:
from sagemaker.analytics import ExperimentAnalytics
import pandas as pd
#pd.options.display.max_columns = 50
#pd.options.display.max_rows = 10
#pd.options.display.max_colwidth = 100

In [None]:
# Query the experiment's trial components, sorted by maximum validation AUC in
# descending order.
# NOTE(review): with local_mode enabled, sagemaker_session is a LocalSession --
# confirm ExperimentAnalytics can query through it.
trial_component_training_analytics = ExperimentAnalytics(
    experiment_name=strExperimentName,
    sagemaker_session=sagemaker_session,
    metric_names=["validation:auc"],
    sort_by="metrics.validation:auc.max",
    sort_order="Descending",
)

# Display the identifiers, the validation-AUC statistics and the hyperparameters.
trial_component_training_analytics.dataframe()[
    [
        'Experiments',
        'Trials',
        'validation:auc - Min',
        'validation:auc - Max',
        'validation:auc - Avg',
        'validation:auc - StdDev',
        'validation:auc - Last',
        'eta',
        'max_depth',
        'num_round',
        'scale_pos_weight',
    ]
]