# 2.1 Amazon SageMaker Training using FSx for Lustre

### 작업 실행 시 필요 라이브러리 import

In [1]:
import boto3
import sagemaker

### Bucket 정의

In [2]:
# Toggle: True uses the SageMaker session's default bucket,
# False uses the bucket name typed into the placeholder below.
use_default_bucket = True
# use_default_bucket = False

# Only the selected branch is evaluated, so no session is created when a
# custom bucket name is used.
bucket = (
    sagemaker.session.Session().default_bucket()
    if use_default_bucket
    else '<Type your bucket name here>'
)

print("bucket: ", bucket)

bucket:  sagemaker-us-east-1-057716757052


###  훈련 잡에 사용할 SageMaker Role 정의 
- 로컬 머신에서 사용할 경우에는 아래의 주석을 해제하시고, SageMaker Execution Role ARN 을 넣어 주세요.

In [3]:
# Set to True when this notebook runs outside SageMaker (e.g. a local machine).
# use_local_machine = False
use_local_machine = True

if not use_local_machine:
    # Inside SageMaker: take the execution role from the environment.
    role = sagemaker.get_execution_role()
else:
    # Local machine: uncomment the next line and fill in the execution role ARN.
    # While it stays commented out, this cell does not assign `role`.
    # role = '<Type Role ARN here>'
    pass


## Local 파일을 FSx for Lustre에 복사하기

In [6]:
# Source directory on this machine and destination directory on the FSx mount.
local_data_path = "../data/dataset/"
fsx_path = "/fsx/fraud/"

# is_fsx_data gates the copy performed by prepare_fsx_data();
# is_delete_fsx_data gates the cleanup command at the end of this cell.
is_fsx_data = True
is_delete_fsx_data = False

import os

def run_os_command(cmd):
    """Run *cmd* through ``os.system`` and print the status it returns.

    Args:
        cmd: Command line string handed to ``os.system`` unchanged.

    Returns:
        None. The status is printed, not returned.
    """
    print("return_value: \n", os.system(cmd))
    

def prepare_fsx_data(is_fsx_data, local_data_path, fsx_path):
    """Copy local data onto the FSx mount and return a ``file://`` URI for it.

    Args:
        is_fsx_data: When truthy, copy the data and build the URI; when
            falsy, do nothing.
        local_data_path: Source path passed to ``sudo cp -r``.
        fsx_path: Destination directory on the mounted file system.

    Returns:
        ``f'file://{fsx_path}'`` when ``is_fsx_data`` is truthy, otherwise
        ``None``.
    """
    if not is_fsx_data:
        # Nothing was copied, so there is no input location to report.
        # Returning None avoids an UnboundLocalError on this path.
        return None

    print("fsx_path: ", fsx_path)
    # NOTE: run_os_command only prints the exit status, so a failed copy
    # does not stop this function.
    run_os_command(f'sudo cp -r {local_data_path} {fsx_path}')
    # List the destination so the copied files show up in the cell output.
    run_os_command(f'cd {fsx_path} && pwd && ls -R')
    return f'file://{fsx_path}'

# Optional cleanup: remove the `dataset` directory under fsx_path, then list
# what is left. The `&&` chain stops at the first command that fails.
if is_delete_fsx_data:
    run_os_command(f'cd {fsx_path} && sudo rm -rf dataset && ls -R')
    


In [7]:
# Copy the local dataset to the FSx mount and keep the resulting file:// URI.
inputs = prepare_fsx_data(
    is_fsx_data=is_fsx_data,
    local_data_path=local_data_path,
    fsx_path=fsx_path,
)

print("input for fsx_path: ", inputs)

fsx_path:  /fsx/fraud/
return_value: 
 0
/fsx/fraud
.:
test.csv
train.csv
return_value: 
 0
input for fsx_path:  file:///fsx/fraud/


### 하이퍼파라미터 정의

In [8]:
# Hyperparameters handed to the XGBoost estimator; every value is a string.
hyperparameters = dict(
    scale_pos_weight="29",
    max_depth="3",
    eta="0.2",
    objective="binary:logistic",
    num_round="100",
)

### 학습 실행 작업 정의

In [10]:
# Session object passed to the estimator.
sagemaker_session = sagemaker.session.Session()

# Instance settings for the training job.
instance_type = "ml.m5.large"
instance_count = 1

# Runtime limit passed to the estimator as max_run: 1 * 60 * 60 = 3600.
max_run = 1 * 60 * 60


## FSx for Lustre 데이터 채널 준비

In [19]:
from sagemaker.inputs import FileSystemInput

# FSx for Lustre file system that holds the training data.
# Replace with your own id, e.g. "fs-xxxxxxxxxxxxxx".
file_system_id = 'fs-0b14fc14ade3ce3e3'
file_system_type = 'FSxLustre'
file_system_access_mode = 'rw'

###########################
# Train Data
###########################
# Directory on the file system, given as a normalized absolute path.
# base_path is the file system's mount name ("<your-mount-name>").
base_path = '/pm6ybbev'
file_system_directory_path = f'{base_path}/fraud'
print(f'FSx file-system data input path: {file_system_directory_path}')

train = FileSystemInput(
    file_system_id=file_system_id,
    file_system_type=file_system_type,
    directory_path=file_system_directory_path,
    file_system_access_mode=file_system_access_mode,
)

###########################
# Log Data
###########################
# Optional second channel, currently disabled. To enable it, uncomment the
# lines below and use the two-channel `inputs` dict instead of the one-channel
# dict that follows.
#
# log_file_system_directory_path = f'{base_path}/fraud/log'
# print(f'FSx file-system log path: {log_file_system_directory_path}')
#
# log = FileSystemInput(
#     file_system_id=file_system_id,
#     file_system_type=file_system_type,
#     directory_path=log_file_system_directory_path,
#     file_system_access_mode=file_system_access_mode,
# )
#
# inputs = {'train': log and train, 'log': log} is NOT intended; use:
# inputs = {'train': train, 'log': log}

# Channel name -> data source. This rebinds `inputs`, replacing the file://
# string assigned earlier in the notebook.
inputs = {'train': train}

print("inputs: \n", inputs)



FSx file-system data input path: /pm6ybbev/fraud
inputs: 
 {'train': <sagemaker.inputs.FileSystemInput object at 0x7ff64eda1570>}


## Set up FSx for Lustre Network

In [20]:
# VPC settings that give the SageMaker training job access to the file system.
# Replace both ids with your own ('subnet-xxxxxxx', 'sg-xxxxxxxx').
subnet_id = 'subnet-040bdcb3d561b3606'
security_group_id = 'sg-0f29b424d0b3cb98e'

# The estimator is given lists, so wrap each single id in one.
security_group_ids = [security_group_id]
subnets = [subnet_id]

print("security_group_ids: ", security_group_ids)
print("subnets: ", subnets)


security_group_ids:  ['sg-0f29b424d0b3cb98e']
subnets:  ['subnet-040bdcb3d561b3606']


In [21]:
from sagemaker.xgboost.estimator import XGBoost

# Build the XGBoost estimator from the settings defined in the cells above.
# NOTE: `role` must already exist here. With use_local_machine = True and the
# ARN line still commented out, the role cell never assigns it and this call
# raises NameError.
estimator = XGBoost(
    # Training script and its source directory
    source_dir='src',
    entry_point="xgboost_fsx_luster_script.py",
    framework_version="1.3-1",
    hyperparameters=hyperparameters,
    # Session and permissions
    sagemaker_session=sagemaker_session,
    role=role,
    # Instances and runtime limit
    instance_type=instance_type,
    instance_count=instance_count,
    max_run=max_run,
    # VPC access to the FSx file system
    security_group_ids=security_group_ids,
    subnets=subnets,
)

### 학습 실행

In [22]:
# Start the training job on the prepared input channels, passing wait=False.
estimator.fit(inputs=inputs, wait=False)

In [23]:
# Print the training job's logs; the captured output follows this cell.
estimator.logs()

2023-05-07 15:15:59 Starting - Starting the training job...ProfilerReport-1683472559: InProgress
...
2023-05-07 15:16:44 Starting - Preparing the instances for training......
2023-05-07 15:17:47 Downloading - Downloading input data...
2023-05-07 15:18:27 Training - Downloading the training image.....[34m[2023-05-07 15:19:06.689 ip-172-30-0-47.ec2.internal:7 INFO utils.py:28] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2023-05-07 15:19:06.712 ip-172-30-0-47.ec2.internal:7 INFO profiler_config_parser.py:111] User has disabled profiler.[0m
[34m[2023-05-07:15:19:06:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2023-05-07:15:19:06:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2023-05-07:15:19:06:INFO] Invoking user training script.[0m
[34m[2023-05-07:15:19:06:INFO] Installing module with the following command:[0m
[34m/miniconda3/bin/python3 -m pip install . [0m
[34mProcessing /opt/ml/code
  Preparing metadata (setup.py): started
  Prepar