# <B> Anomaly Detection based on AutoEncoder </B>
* Container: codna_pytorch_p310

## AutoEncoder based anomaly detection

- **RaPP** - Novelty Detection with Reconstruction along Projection Pathway <br>
<p align="center">
    <img src="imgs/rapp-f1.png" width="1100" height="300" style="display: block; margin: 0 auto"/>
</p>
<p align="center">
    <img src="imgs/rapp-f2.png" width="1100" height="300" style="display: block; margin: 0 auto"/>
</p>

    * [Paper, ICLR 2020] https://openreview.net/attachment?id=HkgeGeBYDB&name=original_pdf
    * [Desc, KOREAN] [RaPP](https://makinarocks.github.io/rapp/)
    * [Supplement #1] [Autoencoder based Anomaly Detection](https://makinarocks.github.io/Autoencoder-based-anomaly-detection/)
    * [Supplement #2] [Reference code (github)](https://github.com/Aiden-Jeon/RaPP)
        

## 0. AutoReload

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import sys, os
# Put the parent directory on the import path so that packages located one
# level above this notebook can be imported.
module_path = ".."
parent_dir = os.path.abspath(module_path)
sys.path.append(parent_dir)

## 1. Parameter store 설정

In [None]:
import boto3
from utils.ssm import parameter_store

In [None]:
# Take the region from the default boto3 session and open the parameter store there.
strRegionName = boto3.Session().region_name
pm = parameter_store(strRegionName)

# Prefix used by the following cells to build every other parameter key.
strPrefix = pm.get_params(key="PREFIX")

## 3. parameters for tasks

In [None]:
# Fetch the task parameters; each key is "<prefix>-<suffix>" and the lookups
# run in the order the targets are listed.
strAccountId, strBucketName, strExecutionRole, strS3DataPath = (
    pm.get_params(key="-".join([strPrefix, suffix]))
    for suffix in ("ACCOUNT-ID", "BUCKET", "SAGEMAKER-ROLE-ARN", "S3-DATA-PATH")
)

In [None]:
# Echo the resolved settings for a quick sanity check.
for label, value in (
    ("prefix", strPrefix),
    ("account_id", strAccountId),
    ("defaulut_bucket", strBucketName),
    ("sagemaker_role", strExecutionRole),
    ("s3_data_path", strS3DataPath),
):
    print (f"{label}: {value}")

## 1. Data manipulation and visualization

In [None]:
import os
import pandas as pd
from task_utils.util import plot_click_w_fault_and_res, plot_click_w_fault_res_ad, plot_click_w_ad_exp

* load data and derive features

In [None]:
# Read the click log found under strS3DataPath and index it by the parsed "timestamp" column.
click_csv = os.path.join(strS3DataPath, "clicks_1T.csv")
clicks_1T = pd.read_csv(click_csv, parse_dates=["timestamp"])
clicks_1T = clicks_1T.set_index("timestamp")

# Derived feature: "click" column minus "user" column.
clicks_1T["residual"] = clicks_1T["click"] - clicks_1T["user"]

# Label: the first row of the header-less label file is assigned as the "fault" column.
fault_csv = os.path.join(strS3DataPath, "fault_label_1T.csv")
clicks_1T["fault"] = pd.read_csv(fault_csv, header=None).values[0]

# Time variable: the hour field parsed out of each index value's string form ("<date> HH:MM:SS").
clicks_1T["time"] = [
    int(str(stamp).split(" ")[1].split(":")[0]) for stamp in clicks_1T.index
]

In [None]:
# Report the frame's dimensions and the first/last timestamps of its index.
tsMin, tsMax = clicks_1T.index.min(), clicks_1T.index.max()
print (f'data shape: {clicks_1T.shape}')
print (f'timestamp min: {tsMin}, max: {tsMax}')

* visualization

In [None]:
# Visualize the loaded frame with the project helper
# (presumably clicks together with fault labels and residuals, judging by its name - confirm in task_utils.util).
plot_click_w_fault_and_res(clicks_1T)

* upload data to s3 and local

In [None]:
# File name under which the merged frame (features + label) is stored; it is
# also passed to the preprocessing job as --train_data_name.
strTrainDataName = "merged_clicks_1T.csv"

# Copy next to the raw data under strS3DataPath.
clicks_1T.to_csv(os.path.join(strS3DataPath, strTrainDataName), index=True) # to s3

# Local copy; create ./data first because to_csv does not create missing directories.
os.makedirs("./data", exist_ok=True)
clicks_1T.to_csv(os.path.join("./data", strTrainDataName), index=True) # to local

print (f'train_data_name: {strTrainDataName}')

## 2. Processing-job for preprocessing

In [None]:
import os
import sagemaker
from sagemaker.pytorch.estimator import PyTorch
from sagemaker.processing import ProcessingInput, ProcessingOutput, FrameworkProcessor

### Execution based on cloud / local
* params for processing job
    - cloud mode: `local_mode=False`
    - local mode: `local_mode=True`

In [None]:
# Switch between a SageMaker-managed job (False) and a local run (True).
local_mode = False

if not local_mode:
    strInstanceType = "ml.m5.xlarge"
    sagemaker_session = sagemaker.Session()
    strDataPath = strS3DataPath

else:
    import os
    from sagemaker.local import LocalSession

    strInstanceType = 'local'
    sagemaker_session = LocalSession()
    # NOTE(review): os.getcwd() is absolute, so os.path.join discards the leading
    # "file://" and this evaluates to the plain path "<cwd>/data" - confirm which
    # form the processing input is expected to receive.
    strDataPath = str(os.path.join("file://", os.getcwd(), "data"))

# Summarize the selected execution settings.
print (f"instance-type: {strInstanceType}")
print (f'role: {strExecutionRole}')
print (f"bucket: {strBucketName}")
print (f"dataset-path: {strDataPath}")
print (f"sagemaker_session: {sagemaker_session}")

* Define processing job

In [None]:
# Container-side root directory used for the job's input/output folders.
strProcPrefix = "/opt/ml/processing"

# Value forwarded to the preprocessing script as --shingle_size.
nShingleSize = 4

# Destination of the job output: s3://<bucket>/<prefix>/preprocessing/output
strOutputPath = os.path.join(
    f"s3://{strBucketName}", strPrefix, "preprocessing", "output"
)

# Processor that runs the preprocessing script in a PyTorch framework container.
dataset_processor = FrameworkProcessor(
    estimator_cls=PyTorch,
    framework_version="2.4.0",
    py_version="py311",
    image_uri=None,
    role=strExecutionRole,
    instance_type=strInstanceType,
    instance_count=1,
    # Name shown in the bucket (when wrapped in a pipeline, the name defined by the pipeline is shown instead)
    base_job_name="preprocessing",
    sagemaker_session=sagemaker_session,
)

In [None]:
# Show the values that are handed to the processing job.
for label, value in (("strOutputPath", strOutputPath), ("nShingleSize", nShingleSize)):
    print (f'{label}: {value}')

In [None]:
# Data is made available to the script under <strProcPrefix>/input.
listProcInputs = [
    ProcessingInput(
        input_name="input-data",
        source=strDataPath,
        destination=os.path.join(strProcPrefix, "input"),
    ),
]

# Whatever the script writes under <strProcPrefix>/output is sent to strOutputPath.
listProcOutputs = [
    ProcessingOutput(
        output_name="output-data",
        source=os.path.join(strProcPrefix, "output"),
        destination=strOutputPath,
    ),
]

# Command-line arguments for preprocessing.py (all passed as strings).
listProcArgs = [
    "--proc_prefix", strProcPrefix,
    "--shingle_size", str(nShingleSize),
    "--train_data_name", strTrainDataName,
]

dataset_processor.run(
    # job_name="preprocessing",  # needed for caching to work; otherwise a date/time suffix is appended to the processor's base_job_name and the cache is not hit
    # git_config=git_config,
    code='preprocessing.py',  # file path inside the source directory
    source_dir="./src/preprocessing",  # source directory relative to this notebook; holds preprocessing.py and requirements.txt
    inputs=listProcInputs,
    outputs=listProcOutputs,
    arguments=listProcArgs,
)

* download preprocessed data to local

In [None]:
!aws s3 sync $strOutputPath ./data/preprocessing --quiet

* save params

In [None]:
# Store the preprocessing output location in the parameter store, then read it back to confirm.
strPrepPathKey = "-".join([strPrefix, "PREP-DATA-PATH"])
pm.put_params(key=strPrepPathKey, value=strOutputPath, overwrite=True)
print (f'S3-PREP-DATA-PATH: {pm.get_params(key=strPrepPathKey)}')

## 3. Training-job for anomaly detection

* check gpu

In [None]:
import torch

In [None]:
# List every visible CUDA device together with its current memory usage.
if torch.cuda.is_available():
    for i in range(torch.cuda.device_count()):
        print(f"# DEVICE {i}: {torch.cuda.get_device_name(i)}")
        print("- Memory Usage:")
        print(f"  Allocated: {round(torch.cuda.memory_allocated(i)/1024**3,1)} GB")
        print(f"  Cached:    {round(torch.cuda.memory_reserved(i)/1024**3,1)} GB\n")

else:
    print("# GPU is not available")

# Change the GPU allocation
GPU_NUM = 0 # index of the GPU to use
device = torch.device(f'cuda:{GPU_NUM}' if torch.cuda.is_available() else 'cpu')

# torch.cuda.set_device() rejects a CPU device and torch.cuda.current_device()
# needs an initialized CUDA runtime, so only touch them when a GPU was selected.
if device.type == 'cuda':
    torch.cuda.set_device(device) # change allocation of current GPU
    print ('# Current cuda device: ', torch.cuda.current_device()) # check
else:
    print ('# Current device: ', device)

In [None]:
import os
from sagemaker.pytorch.estimator import PyTorch
from sagemaker.inputs import TrainingInput

* **Set Up SageMaker Experiment**
    - Create or load [SageMaker Experiment](https://docs.aws.amazon.com/sagemaker/latest/dg/experiments.html) for the example training job. This will create an experiment trial object in SageMaker.
    - **pip install sagemaker-experiments**

### Execution based on cloud / local
* params for processing job
    - cloud mode: `local_mode=False`
    - local mode: `local_mode=True`

### Enable warmpool
* `bUseTrainWarmPool = True`

* params for training job

In [None]:
# Set to True to run the training job locally instead of on SageMaker
local_mode = True

if local_mode:
    import os
    from sagemaker.local import LocalSession

    strInstanceType = "local_gpu"

    sagemaker_session = LocalSession()
    sagemaker_session.config = {'local': {'local_code': True}}

    # Both channels point at the same local directory of preprocessed data.
    strLocalDataDir = f"{os.getcwd()}/data/preprocessing"
    dicDataChannels = {
        channel: f"file://{strLocalDataDir}" for channel in ("train", "validation")
    }

else:
    strInstanceType = "ml.p3.2xlarge"  # other options noted by the author: ml.g4dn.8xlarge, ml.p3.16xlarge

    sagemaker_session = sagemaker.Session()
    # Both channels use the location stored under "<prefix>-PREP-DATA-PATH".
    dicDataChannels = {
        channel: pm.get_params(key="-".join([strPrefix, "PREP-DATA-PATH"]))
        for channel in ("train", "validation")
    }

# Hyperparameters for the training script; every value is given as a string.
dicHyperParams = dict(
    epochs="50",
    batch_size="128",
    lr="1e-2",
    shingle_size="4",
    num_features="4",
    emb_size="4",
    workers="2",
)

nInstanceCount = 1

# Time limits in seconds; a maximum wait is only set when spot training is on.
bSpotTraining = False
nMaxRun = 1 * 60 * 60
nMaxWait = 1 * 60 * 60 if bSpotTraining else None

# Warm pool: the training image is not downloaded again, which speeds things up.
# Kept alive for at most one hour; a warm-pool service-quota request is required.
bUseTrainWarmPool = True
if bSpotTraining:
    bUseTrainWarmPool = False  # a warm pool cannot be used with spot instances
nKeepAliveSeconds = 3600 if bUseTrainWarmPool else None

strProcPrefix = "/opt/ml/processing"

# S3 locations for the model output and the uploaded training code.
strOutputPath = os.path.join(f"s3://{strBucketName}", strPrefix, "training", "model-output")
strCodeLocation = os.path.join(f"s3://{strBucketName}", strPrefix, "training", "backup_codes")

# One regex per metric, each of the form "<metric name>:<number>".
num_re = "([0-9\\.]+)(e-?[[01][0-9])?"
metric_definitions = [
    {"Name": name, "Regex": f"{name}:{num_re}"}
    for name in ("train_loss", "train_cos", "val_cos")
]

# Extra keyword arguments forwarded to the estimator (none here).
kwargs = {}

In [None]:
# Echo the training settings chosen in the previous cell.
for label, value in (
    ("local_mode", local_mode),
    ("sagemaker_session", sagemaker_session),
    ("strInstanceType", strInstanceType),
    ("dicDataChannels", dicDataChannels),
    ("strOutputPath", strOutputPath),
):
    print (f'{label}: {value}')

* Define training job

In [None]:
# Estimator for the single-process training script (main.py).
estimator = PyTorch(
    # Training code
    entry_point="main.py",  # the script we want to run
    source_dir="./src/training",  # where our conf/script is
    #git_config=git_config,
    # Container image selection
    image_uri=None,
    framework_version="2.0.0",  # version of PyTorch
    py_version="py310",
    # Compute resources
    role=strExecutionRole,
    instance_type=strInstanceType,
    instance_count=nInstanceCount,
    volume_size=128,
    # Time limits, spot and warm-pool settings
    max_run=nMaxRun,
    max_wait=nMaxWait,
    use_spot_instances=bSpotTraining,  # use spot instances
    keep_alive_period_in_seconds=nKeepAliveSeconds,
    # S3 locations
    code_location=strCodeLocation,
    output_path=strOutputPath,
    # Job inputs and metrics
    hyperparameters=dicHyperParams,
    metric_definitions=metric_definitions,
    enable_sagemaker_metrics=True,
    # Profiler and debugger hook are switched off
    disable_profiler=True,
    debugger_hook_config=False,
    sagemaker_session=sagemaker_session,
    #distribution=distribution,
    **kwargs
)

In [None]:
# Local GPU runs are started without an S3 checkpoint location.
if strInstanceType == 'local_gpu':
    estimator.checkpoint_s3_uri = None

#create_experiment(strExperimentName)
#job_name = create_trial(strExperimentName)
#job_name = "training-ad-4"

# Start training on the configured channels and block until the job finishes.
estimator.fit(
    wait=True,
    inputs=dicDataChannels,
    #job_name=job_name,
    #experiment_config={
    #  'TrialName': job_name,
    #  'TrialComponentDisplayName': job_name,
    #},
)

In [None]:
# Show the model artifact location reported by the estimator once fit() has finished.
print (f'model data: {estimator.model_data}')

* save params

In [None]:
# Store the model artifact location in the parameter store, then read it back to confirm.
strModelArtifactKey = "-".join([strPrefix, "S3-MODEL-ARTIFACT"])
pm.put_params(key=strModelArtifactKey, value=estimator.model_data, overwrite=True)
print (f'S3-MODEL-ARTIFACT: {pm.get_params(key=strModelArtifactKey)}')

### SageMaker Distributed Data Parallel (SMDDP)
* code conversion to use SMDDP (`./src/training/main_ddp.py`) 
    - wrapping model with ddp
    - local rank
    - data loader (distributed sampler)
* `distribution={"smdistributed": {"dataparallel": {"enabled": True}}}`

#distribution = {"torch_distributed": {"enabled": True}} 
#distribution = {"pytorchddp": {"enabled": True}} 


In [None]:
# Set to True to run the training job locally instead of on SageMaker
local_mode = True

if local_mode:
    import os
    from sagemaker.local import LocalSession

    strInstanceType = "local_gpu"

    sagemaker_session = LocalSession()
    sagemaker_session.config = {'local': {'local_code': True}}

    # Both channels point at the same local directory of preprocessed data.
    strLocalDataDir = f"{os.getcwd()}/data/preprocessing"
    dicDataChannels = {
        channel: f"file://{strLocalDataDir}" for channel in ("train", "validation")
    }

else:
    strInstanceType = "ml.p3.2xlarge"  # other options noted by the author: ml.g4dn.8xlarge, ml.p3.16xlarge

    sagemaker_session = sagemaker.Session()
    # Both channels use the location stored under "<prefix>-PREP-DATA-PATH".
    dicDataChannels = {
        channel: pm.get_params(key="-".join([strPrefix, "PREP-DATA-PATH"]))
        for channel in ("train", "validation")
    }

# Hyperparameters for the training script; every value is given as a string.
dicHyperParams = dict(
    epochs="50",
    batch_size="128",
    lr="1e-2",
    shingle_size="4",
    num_features="4",
    emb_size="4",
    workers="2",
)

nInstanceCount = 1

# Time limits in seconds; a maximum wait is only set when spot training is on.
bSpotTraining = False
nMaxRun = 1 * 60 * 60
nMaxWait = 1 * 60 * 60 if bSpotTraining else None

# Warm pool: the training image is not downloaded again, which speeds things up.
# Kept alive for at most one hour; a warm-pool service-quota request is required.
bUseTrainWarmPool = True
if bSpotTraining:
    bUseTrainWarmPool = False  # a warm pool cannot be used with spot instances
nKeepAliveSeconds = 3600 if bUseTrainWarmPool else None

strProcPrefix = "/opt/ml/processing"

# S3 locations for the model output and the uploaded training code.
strOutputPath = os.path.join(f"s3://{strBucketName}", strPrefix, "training", "model-output")
strCodeLocation = os.path.join(f"s3://{strBucketName}", strPrefix, "training", "backup_codes")

# Distributed-training launcher handed to the estimator.
#distribution={"smdistributed": {"dataparallel": {"enabled": True}}}
distribution = {"torch_distributed": {"enabled": True}}

# One regex per metric, each of the form "<metric name>:<number>".
num_re = "([0-9\\.]+)(e-?[[01][0-9])?"
metric_definitions = [
    {"Name": name, "Regex": f"{name}:{num_re}"}
    for name in ("train_loss", "train_cos", "val_cos")
]

# Extra keyword arguments forwarded to the estimator (none here).
kwargs = {}

In [None]:
# Estimator for the distributed training script (main_ddp.py).
estimator = PyTorch(
    # Training code
    entry_point="main_ddp.py",  # the script we want to run
    source_dir="./src/training",  # where our conf/script is
    #git_config=git_config,
    # Container image selection
    image_uri=None,
    framework_version="2.0.0",  # version of PyTorch
    py_version="py310",
    # Compute resources and distribution strategy
    role=strExecutionRole,
    instance_type=strInstanceType,
    instance_count=nInstanceCount,
    volume_size=128,
    distribution=distribution,
    # Time limits, spot and warm-pool settings
    max_run=nMaxRun,
    max_wait=nMaxWait,
    use_spot_instances=bSpotTraining,  # use spot instances
    keep_alive_period_in_seconds=nKeepAliveSeconds,
    # S3 locations
    code_location=strCodeLocation,
    output_path=strOutputPath,
    # Job inputs and metrics
    hyperparameters=dicHyperParams,
    metric_definitions=metric_definitions,
    enable_sagemaker_metrics=True,
    # Profiler and debugger hook are switched off
    disable_profiler=True,
    debugger_hook_config=False,
    sagemaker_session=sagemaker_session,
    **kwargs
)

# Local GPU runs are started without an S3 checkpoint location.
if strInstanceType == 'local_gpu':
    estimator.checkpoint_s3_uri = None

# Start training on the configured channels and block until the job finishes.
estimator.fit(wait=True, inputs=dicDataChannels)

### SageMaker Experiments using MLFlow
* code conversion to use Experiments (`./src/training/main_ddp_mlflow_exp.py`) 

In [None]:
import mlflow
from time import strftime

In [None]:
# Timestamp suffix (month-day, hour-minute-second) that keeps experiment names distinct.
# "%S" is the seconds directive; lowercase "%s" is not a documented strftime
# directive and its result depends on the platform's C library.
create_date = strftime("%m%d-%H%M%S")
tracking_server_arn = pm.get_params(key="-".join([strPrefix, "MLFLOW-TRACKING-SERVER-ARN"]))
mlflow_exp_name = f'run-{strPrefix}-exp-{create_date}'

# Point the MLflow client at the tracking server before creating the experiment,
# so the experiment is registered there rather than in MLflow's default local store.
# NOTE(review): resolving a tracking-server ARN requires the sagemaker-mlflow plugin
# in this kernel - confirm it is installed.
mlflow.set_tracking_uri(tracking_server_arn)
mlflow.create_experiment(mlflow_exp_name)

print (f'tracking_server_arn: {tracking_server_arn}')
print (f'mlflow_exp_name: {mlflow_exp_name}')

In [None]:
# Set to True to run the training job locally instead of on SageMaker
local_mode = False

if local_mode:
    import os
    from sagemaker.local import LocalSession

    strInstanceType = "local_gpu"

    sagemaker_session = LocalSession()
    sagemaker_session.config = {'local': {'local_code': True}}

    # Both channels point at the same local directory of preprocessed data.
    strLocalDataDir = f"{os.getcwd()}/data/preprocessing"
    dicDataChannels = {
        channel: f"file://{strLocalDataDir}" for channel in ("train", "validation")
    }

else:
    strInstanceType = "ml.p3.2xlarge"  # other options noted by the author: ml.g4dn.8xlarge, ml.p3.16xlarge

    sagemaker_session = sagemaker.Session()
    # Both channels use the location stored under "<prefix>-PREP-DATA-PATH".
    dicDataChannels = {
        channel: pm.get_params(key="-".join([strPrefix, "PREP-DATA-PATH"]))
        for channel in ("train", "validation")
    }

# Hyperparameters for the training script; every value is given as a string.
dicHyperParams = dict(
    epochs="50",
    batch_size="128",
    lr="1e-2",
    shingle_size="4",
    num_features="4",
    emb_size="4",
    workers="2",
)

nInstanceCount = 1

# Time limits in seconds; a maximum wait is only set when spot training is on.
bSpotTraining = False
nMaxRun = 1 * 60 * 60
nMaxWait = 1 * 60 * 60 if bSpotTraining else None

# Warm pool: the training image is not downloaded again, which speeds things up.
# Kept alive for at most one hour; a warm-pool service-quota request is required.
bUseTrainWarmPool = True
if bSpotTraining:
    bUseTrainWarmPool = False  # a warm pool cannot be used with spot instances
nKeepAliveSeconds = 3600 if bUseTrainWarmPool else None

strProcPrefix = "/opt/ml/processing"

# S3 locations for the model output and the uploaded training code.
strOutputPath = os.path.join(f"s3://{strBucketName}", strPrefix, "training", "model-output")
strCodeLocation = os.path.join(f"s3://{strBucketName}", strPrefix, "training", "backup_codes")

# Distributed-training launcher handed to the estimator.
#distribution={"smdistributed": {"dataparallel": {"enabled": True}}}
distribution = {"torch_distributed": {"enabled": True}}

# Environment variables for the training job: MLflow tracking server and experiment name.
environment = {
    "MLFLOW_TRACKING_ARN": tracking_server_arn,
    "EXPERIMENT_NAME": mlflow_exp_name,
}

strExperimentName = '-'.join([strPrefix, "experiments-test"])
# "%S" is the seconds directive; lowercase "%s" is not a documented strftime
# directive and its result depends on the platform's C library.
create_date = strftime("%m%d-%H%M%S")
strRunName = f'run-smimd-ddp-{create_date}'

# One regex per metric, each of the form "<metric name>:<number>".
num_re = "([0-9\\.]+)(e-?[[01][0-9])?"
metric_definitions = [
    {"Name": name, "Regex": f"{name}:{num_re}"}
    for name in ("train_loss", "train_cos", "val_cos")
]

# Extra keyword arguments forwarded to the estimator (none here).
kwargs = {}

In [None]:
# Environment variables for the training job: MLflow tracking server and experiment name.
environment = {
    "MLFLOW_TRACKING_ARN": tracking_server_arn,
    "EXPERIMENT_NAME": mlflow_exp_name,
}

# Estimator for the distributed training script with MLflow logging (main_ddp_mlflow_exp.py).
estimator = PyTorch(
    # Training code
    entry_point="main_ddp_mlflow_exp.py",  # the script we want to run
    source_dir="./src/training",  # where our conf/script is
    #git_config=git_config,
    # Container image selection
    image_uri=None,
    framework_version="2.0.0",  # version of PyTorch
    py_version="py310",
    # Compute resources and distribution strategy
    role=strExecutionRole,
    instance_type=strInstanceType,
    instance_count=nInstanceCount,
    volume_size=128,
    distribution=distribution,
    # Time limits, spot and warm-pool settings
    max_run=nMaxRun,
    max_wait=nMaxWait,
    use_spot_instances=bSpotTraining,  # use spot instances
    keep_alive_period_in_seconds=nKeepAliveSeconds,
    # S3 locations
    code_location=strCodeLocation,
    output_path=strOutputPath,
    # Job inputs, environment and metrics
    hyperparameters=dicHyperParams,
    environment=environment,
    metric_definitions=metric_definitions,
    enable_sagemaker_metrics=True,
    # Profiler and debugger hook are switched off
    disable_profiler=True,
    debugger_hook_config=False,
    sagemaker_session=sagemaker_session,
    **kwargs
)

# Local GPU runs are started without an S3 checkpoint location.
if strInstanceType == 'local_gpu':
    estimator.checkpoint_s3_uri = None

# Start training on the configured channels and block until the job finishes.
estimator.fit(wait=True, inputs=dicDataChannels)

### SageMaker Experiments (will be deprecated)
* code conversion to use Experiments (`./src/training/main_ddp_exp.py`) 

In [None]:
from time import strftime
from sagemaker.experiments.run import Run

In [None]:
# Set to True to run the training job locally instead of on SageMaker
local_mode = False

if local_mode:
    import os
    from sagemaker.local import LocalSession

    strInstanceType = "local_gpu"

    sagemaker_session = LocalSession()
    sagemaker_session.config = {'local': {'local_code': True}}

    # Both channels point at the same local directory of preprocessed data.
    strLocalDataDir = f"{os.getcwd()}/data/preprocessing"
    dicDataChannels = {
        channel: f"file://{strLocalDataDir}" for channel in ("train", "validation")
    }

else:
    strInstanceType = "ml.p3.2xlarge"  # other options noted by the author: ml.g4dn.8xlarge, ml.p3.16xlarge

    sagemaker_session = sagemaker.Session()
    # Both channels use the location stored under "<prefix>-PREP-DATA-PATH".
    dicDataChannels = {
        channel: pm.get_params(key="-".join([strPrefix, "PREP-DATA-PATH"]))
        for channel in ("train", "validation")
    }

# Hyperparameters for the training script; every value is given as a string.
dicHyperParams = dict(
    epochs="50",
    batch_size="128",
    lr="1e-2",
    shingle_size="4",
    num_features="4",
    emb_size="4",
    workers="2",
)

nInstanceCount = 1

# Time limits in seconds; a maximum wait is only set when spot training is on.
bSpotTraining = False
nMaxRun = 1 * 60 * 60
nMaxWait = 1 * 60 * 60 if bSpotTraining else None

# Warm pool: the training image is not downloaded again, which speeds things up.
# Kept alive for at most one hour; a warm-pool service-quota request is required.
bUseTrainWarmPool = True
if bSpotTraining:
    bUseTrainWarmPool = False  # a warm pool cannot be used with spot instances
nKeepAliveSeconds = 3600 if bUseTrainWarmPool else None

strProcPrefix = "/opt/ml/processing"

# S3 locations for the model output and the uploaded training code.
strOutputPath = os.path.join(f"s3://{strBucketName}", strPrefix, "training", "model-output")
strCodeLocation = os.path.join(f"s3://{strBucketName}", strPrefix, "training", "backup_codes")

# NOTE(review): the SageMaker data-parallel library is documented as supporting only
# specific multi-GPU instance types - confirm ml.p3.2xlarge is accepted, or switch to
# torch_distributed as the other training cells in this notebook do.
distribution = {"smdistributed": {"dataparallel": {"enabled": True}}}

# Experiment and run names used by the Run context in the next cell.
strExperimentName = '-'.join([strPrefix, "experiments-test"])
# "%S" is the seconds directive; lowercase "%s" is not a documented strftime
# directive and its result depends on the platform's C library.
create_date = strftime("%m%d-%H%M%S")
strRunName = f'run-smimd-ddp-{create_date}'

# One regex per metric, each of the form "<metric name>:<number>".
num_re = "([0-9\\.]+)(e-?[[01][0-9])?"
metric_definitions = [
    {"Name": name, "Regex": f"{name}:{num_re}"}
    for name in ("train_loss", "train_cos", "val_cos")
]

# Extra keyword arguments forwarded to the estimator (none here).
kwargs = {}

In [None]:
# Train inside a SageMaker Experiments run context named strRunName under strExperimentName.
with Run(
    experiment_name=strExperimentName,
    run_name=strRunName,
    sagemaker_session=sagemaker_session
) as run:

    estimator = PyTorch(
        entry_point="main_ddp_exp.py", # the script we want to run
        source_dir="./src/training", # where our conf/script is
        #git_config=git_config,
        role=strExecutionRole,
        instance_type=strInstanceType,
        instance_count=nInstanceCount,
        image_uri=None,
        framework_version="2.0.0", # version of PyTorch
        py_version="py310",
        volume_size=128,
        code_location=strCodeLocation,
        output_path=strOutputPath,
        disable_profiler=True,
        debugger_hook_config=False,
        hyperparameters=dicHyperParams,
        sagemaker_session=sagemaker_session,
        metric_definitions=metric_definitions,
        max_run=nMaxRun,
        use_spot_instances=bSpotTraining,  # use spot instances
        max_wait=nMaxWait,
        keep_alive_period_in_seconds=nKeepAliveSeconds,
        enable_sagemaker_metrics=True,
        distribution=distribution,
        # The value was missing here (a syntax error). The region is exported to the
        # training job, the same setting the HyperparameterTuner estimator in this notebook uses.
        # NOTE(review): confirm this is what main_ddp_exp.py expects.
        environment={"AWS_REGION": strRegionName},
        **kwargs
    )

    # Local GPU runs are started without an S3 checkpoint location.
    if strInstanceType =='local_gpu':
        estimator.checkpoint_s3_uri = None

    # Start training on the configured channels and block until the job finishes.
    estimator.fit(
        inputs=dicDataChannels,
        wait=True,
    )

### SageMaker HyperparameterTuner

In [None]:
from sagemaker.tuner import HyperparameterTuner, ContinuousParameter, IntegerParameter

In [None]:
# Set to True to run the training job locally instead of on SageMaker
local_mode = False

if local_mode:
    import os
    from sagemaker.local import LocalSession

    strInstanceType = "local_gpu"

    sagemaker_session = LocalSession()
    sagemaker_session.config = {'local': {'local_code': True}}

    # Both channels point at the same local directory of preprocessed data.
    strLocalDataDir = f"{os.getcwd()}/data/preprocessing"
    dicDataChannels = {
        channel: f"file://{strLocalDataDir}" for channel in ("train", "validation")
    }

else:
    strInstanceType = "ml.g5.2xlarge"  # other options noted by the author: ml.p3.2xlarge, ml.g4dn.8xlarge, ml.p3.16xlarge

    sagemaker_session = sagemaker.Session()
    # Both channels use the location stored under "<prefix>-PREP-DATA-PATH".
    dicDataChannels = {
        channel: pm.get_params(key="-".join([strPrefix, "PREP-DATA-PATH"]))
        for channel in ("train", "validation")
    }

# Hyperparameters for the training script; every value is given as a string.
dicHyperParams = dict(
    epochs="50",
    batch_size="128",
    lr="1e-2",
    shingle_size="4",
    num_features="4",
    emb_size="4",
    workers="2",
)

nInstanceCount = 1

# Time limits in seconds; a maximum wait is only set when spot training is on.
bSpotTraining = False
nMaxRun = 1 * 60 * 60
nMaxWait = 1 * 60 * 60 if bSpotTraining else None

# Warm pool: the training image is not downloaded again, which speeds things up.
# Kept alive for at most one hour; a warm-pool service-quota request is required.
bUseTrainWarmPool = True
if bSpotTraining:
    bUseTrainWarmPool = False  # a warm pool cannot be used with spot instances
nKeepAliveSeconds = 3600 if bUseTrainWarmPool else None

strProcPrefix = "/opt/ml/processing"

# S3 locations for the model output and the uploaded training code.
strOutputPath = os.path.join(f"s3://{strBucketName}", strPrefix, "training", "model-output")
strCodeLocation = os.path.join(f"s3://{strBucketName}", strPrefix, "training", "backup_codes")

# NOTE(review): the SageMaker data-parallel library is documented as supporting only
# specific multi-GPU instance types - confirm ml.g5.2xlarge is accepted, or switch to
# torch_distributed as the other training cells in this notebook do.
distribution = {"smdistributed": {"dataparallel": {"enabled": True}}}

# One regex per metric, each of the form "<metric name>:<number>".
num_re = "([0-9\\.]+)(e-?[[01][0-9])?"
metric_definitions = [
    {"Name": name, "Regex": f"{name}:{num_re}"}
    for name in ("train_loss", "train_cos", "val_cos")
]

## hyperparam tuner
base_tuning_job_name = '-'.join([strPrefix, "hyperparam-test"])

# Experiment and run names used by the Run context that wraps tuner.fit().
experiment_name = '-'.join([strPrefix, "exp-hyperparam"])
# "%S" is the seconds directive; lowercase "%s" is not a documented strftime
# directive and its result depends on the platform's C library.
create_date = strftime("%m%d-%H%M%S")
strRunName = f'run-smimd-ddp-hyperparam-{create_date}'

# Tuning objective and budget: maximize "val_cos" over 4 jobs, 2 at a time.
tuner_args = {
    "objective_metric_name": "val_cos",
    "objective_type": "Maximize",  # or "Minimize"
    "metric_definitions": metric_definitions,
    "max_jobs": 4,
    "max_parallel_jobs": 2,
    "early_stopping_type": "Auto",
}

# Extra keyword arguments forwarded to the estimator (none here).
kwargs = {}

In [None]:
# Base estimator that every tuning trial is launched from (main_ddp.py).
estimator = PyTorch(
    # Training code
    entry_point="main_ddp.py",  # the script we want to run
    source_dir="./src/training",  # where our conf/script is
    #git_config=git_config,
    # Container image selection
    image_uri=None,
    framework_version="2.0.0",  # version of PyTorch
    py_version="py310",
    # Compute resources and distribution strategy
    role=strExecutionRole,
    instance_type=strInstanceType,
    instance_count=nInstanceCount,
    volume_size=128,
    distribution=distribution,
    # Time limits, spot and warm-pool settings
    max_run=nMaxRun,
    max_wait=nMaxWait,
    use_spot_instances=bSpotTraining,  # use spot instances
    keep_alive_period_in_seconds=nKeepAliveSeconds,
    # S3 locations
    code_location=strCodeLocation,
    output_path=strOutputPath,
    # Job inputs, environment and metrics
    hyperparameters=dicHyperParams,
    environment={"AWS_REGION": strRegionName},
    metric_definitions=metric_definitions,
    enable_sagemaker_metrics=True,
    # Profiler and debugger hook are switched off
    disable_profiler=True,
    debugger_hook_config=False,
    sagemaker_session=sagemaker_session,
    **kwargs
)

# Define a Hyperparameter Tuning Job: only batch_size is searched (integers 50..100).
dicTuningRanges = {
    "batch_size": IntegerParameter(50, 100, "Auto"),
}
tuner = HyperparameterTuner(
    estimator=estimator,
    hyperparameter_ranges=dicTuningRanges,
    base_tuning_job_name=base_tuning_job_name,
    **tuner_args,
)

In [None]:
# Run the tuning job inside a SageMaker Experiments run context.
with Run(experiment_name=experiment_name, run_name=strRunName, sagemaker_session=sagemaker_session) as run:

    # Local GPU runs are started without an S3 checkpoint location.
    if strInstanceType == 'local_gpu':
        estimator.checkpoint_s3_uri = None

    # Start the tuning job with the specified input data and block until it finishes.
    tuner.fit(wait=True, inputs=dicDataChannels)