# [모듈 3.1] 전처리 스텝 개발

## 0. 기본 세이지 메이커 정보 및 기본 변수 로딩

In [1]:
import boto3
import sagemaker
import pandas as pd

# AWS region name taken from the default boto3 session.
region = boto3.Session().region_name
# SageMaker SDK session created with default settings.
sagemaker_session = sagemaker.session.Session()
# Execution role; passed as `role` to the processors and to pipeline.upsert() below.
role = sagemaker.get_execution_role()

# Restore all variables previously saved with %store, then list what is stored.
%store -r 
%store

Stored variables and their in-db values:
base_preproc_input_dir                -> 'opt/ml/processing/input'
dataset_path                          -> 'opt/ml/processing/input/dataset.csv'
input_data_uri                        -> 's3://sagemaker-ap-northeast-2-057716757052/fraud2
processing_instance_count             -> ParameterInteger(name='ProcessingInstanceCount', p
project_prefix                        -> 'fraud2scratch'
s3_dataset_path                       -> 's3://sagemaker-ap-northeast-2-057716757052/fraud2


## 1. 노트북 변수 설정
---

In [2]:
# Path of the preprocessing script. Despite the "_dir" suffix this is a .py file;
# it is passed as `code=` to the processing jobs defined below.
preprocessing_code_dir = 'fraud/preprocessing.py'
# Persist the value with IPython's %store magic.
%store preprocessing_code_dir

Stored 'preprocessing_code_dir' (str)


## 2. 로컬 노트북에서 전처리 로직 실행
---

### 로컬 환경 셋업 

로컬에서 테스트하기 위해 도커 컨테이너와 같은 환경 생성

In [3]:
import os
# Relative paths that mirror the processing container layout
# (input directory plus one output directory per data split).
base_preproc_input_dir = 'opt/ml/processing/input'
base_output_dir = 'opt/ml/processing/output'

base_train_dir = f'{base_output_dir}/train'
base_validation_dir = f'{base_output_dir}/validation'
base_test_dir = f'{base_output_dir}/test'

# Create the directories; exist_ok=True makes the cell safe to re-run.
os.makedirs(base_preproc_input_dir, exist_ok=True)
os.makedirs(base_train_dir, exist_ok=True)
os.makedirs(base_validation_dir, exist_ok=True)
os.makedirs(base_test_dir, exist_ok=True)


### 로컬에서 스크립트 실행

In [4]:
# Run the preprocessing script in a subprocess against the local directories
# created above; the braces are filled in from notebook variables.
! python fraud/preprocessing.py --base_preproc_input_dir {base_preproc_input_dir} --base_output_dir {base_output_dir} 


numpy version:  1.19.5
#############################################
args.base_output_dir: opt/ml/processing/output
args.base_preproc_input_dir: opt/ml/processing/input
args.label_column: fraud
input files: 
 ['opt/ml/processing/input/dataset.csv']
dataset sample 
    fraud  incident_type_theft  ...  collision_type_rear  collision_type_front
0      0                    0  ...                    1                     0
1      0                    0  ...                    1                     0

[2 rows x 46 columns]
df columns 
 Index(['fraud', 'incident_type_theft', 'policy_state_ca', 'policy_deductable',
       'num_witnesses', 'policy_state_or', 'incident_month',
       'customer_gender_female', 'num_insurers_past_5_years',
       'customer_gender_male', 'total_claim_amount',
       'authorities_contacted_police', 'incident_day', 'collision_type_side',
       'customer_age', 'customer_education', 'driver_relationship_child',
       'driver_relationship_spouse', 'injury_claim', 'inc

## 3. 로컬 도커 컨테이너에서 전처리 로직 실행
---


In [5]:
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.processing import ProcessingInput, ProcessingOutput

# 'local' instance type: run the processing job in SageMaker local mode.
instance_type = 'local'
sklearn_processor = SKLearnProcessor(
    framework_version="0.23-1",
    role=role,
    instance_type=instance_type,
    instance_count=1,
)

# The dataset is mounted at the container's input path; one output is
# declared per split directory, in the order train, validation, test.
sklearn_processor.run(
    code=preprocessing_code_dir,
    inputs=[
        ProcessingInput(
            source=input_data_uri,
            destination='/opt/ml/processing/input',
        ),
    ],
    outputs=[
        ProcessingOutput(source=f'/opt/ml/processing/output/{split}')
        for split in ('train', 'validation', 'test')
    ],
)


Job Name:  sagemaker-scikit-learn-2021-04-13-03-08-39-746
Inputs:  [{'InputName': 'input-1', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-ap-northeast-2-057716757052/fraud2scratch/input/dataset.csv', 'LocalPath': '/opt/ml/processing/input', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'code', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-ap-northeast-2-057716757052/sagemaker-scikit-learn-2021-04-13-03-08-39-746/input/code/preprocessing.py', 'LocalPath': '/opt/ml/processing/input/code', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}]
Outputs:  [{'OutputName': 'output-1', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://sagemaker-ap-northeast-2-057716757052/sagemaker-scikit-learn-2021-04-13-03-08-39-746/output/output-1', 'LocalPath': '/opt/ml/processing/output/train', 'S3UploadMode': 'EndOfJob'

## 4. 모델 빌딩 파이프라인에서  실행 
---



### 모델 빌딩 파이프라인 변수 생성



In [6]:
from sagemaker.workflow.parameters import (
    ParameterInteger,
    ParameterString,
)

# Pipeline parameters controlling the processing resources.
processing_instance_count = ParameterInteger(name="ProcessingInstanceCount", default_value=1)
processing_instance_type = ParameterString(name="ProcessingInstanceType", default_value="ml.m5.xlarge")

# Pipeline parameter for the input dataset location; defaults to the restored input_data_uri.
input_data = ParameterString(name="InputData", default_value=input_data_uri)


### 전처리 스텝 단계 정의

In [7]:
from sagemaker.sklearn.processing import SKLearnProcessor

framework_version = "0.23-1"

# Processor for the pipeline step: instance type and count come from the
# pipeline parameters rather than fixed values.
sklearn_processor = SKLearnProcessor(
    role=role,
    base_job_name="sklearn-fraud-process",
    framework_version=framework_version,
    instance_count=processing_instance_count,
    instance_type=processing_instance_type,
)
print("input_data: \n", input_data)

input_data: 
 s3://sagemaker-ap-northeast-2-057716757052/fraud2scratch/input/dataset.csv


In [8]:
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.workflow.steps import ProcessingStep
    

# Processing step: reads the dataset from the `input_data` parameter and
# declares one named output per split (train, validation, test).
step_process = ProcessingStep(
    name="FraudScratchProcess",
    processor=sklearn_processor,
    code=preprocessing_code_dir,
    inputs=[
        ProcessingInput(source=input_data, destination="/opt/ml/processing/input"),
    ],
    outputs=[
        ProcessingOutput(output_name=split, source=f"/opt/ml/processing/output/{split}")
        for split in ("train", "validation", "test")
    ],
)

### 파라미터, 단계, 조건을 조합하여 최종 파이프라인 정의



In [9]:
from sagemaker.workflow.pipeline import Pipeline


# Pipeline made of the three parameters and the single processing step.
pipeline_name = "FraudScratchPreprocessing"
pipeline = Pipeline(
    name=pipeline_name,
    parameters=[processing_instance_type, processing_instance_count, input_data],
    steps=[step_process],
)

#### (선택) 파이프라인 정의 확인 

파이프라인을 정의하는 JSON을 생성하고 파이프라인 내에서 사용하는 파라미터와 단계별 속성들이 잘 정의되었는지 확인할 수 있습니다.

In [10]:
import json


# pipeline.definition() yields a JSON string; parse it into Python objects for inspection.
definition = json.loads(pipeline.definition())
# Bare expression so the notebook displays the parsed definition.
definition

{'Version': '2020-12-01',
 'Metadata': {},
 'Parameters': [{'Name': 'ProcessingInstanceType',
   'Type': 'String',
   'DefaultValue': 'ml.m5.xlarge'},
  {'Name': 'ProcessingInstanceCount', 'Type': 'Integer', 'DefaultValue': 1},
  {'Name': 'InputData',
   'Type': 'String',
   'DefaultValue': 's3://sagemaker-ap-northeast-2-057716757052/fraud2scratch/input/dataset.csv'}],
 'Steps': [{'Name': 'FraudScratchProcess',
   'Type': 'Processing',
   'Arguments': {'ProcessingResources': {'ClusterConfig': {'InstanceType': {'Get': 'Parameters.ProcessingInstanceType'},
      'InstanceCount': {'Get': 'Parameters.ProcessingInstanceCount'},
      'VolumeSizeInGB': 30}},
    'AppSpecification': {'ImageUri': '366743142698.dkr.ecr.ap-northeast-2.amazonaws.com/sagemaker-scikit-learn:0.23-1-cpu-py3',
     'ContainerEntrypoint': ['python3',
      '/opt/ml/processing/input/code/preprocessing.py']},
    'RoleArn': 'arn:aws:iam::057716757052:role/service-role/AmazonSageMaker-ExecutionRole-20210120T193680',
    '

### 파이프라인을 SageMaker에 제출하고 실행하기 

파이프라인 정의를 파이프라인 서비스에 제출합니다. 함께 전달되는 역할(role)을 이용하여 AWS에서 파이프라인을 생성하고 작업의 각 단계를 실행할 것입니다.   

In [11]:
# Submit the pipeline definition to SageMaker, with `role` as the pipeline's role ARN.
pipeline.upsert(role_arn=role)

{'PipelineArn': 'arn:aws:sagemaker:ap-northeast-2:057716757052:pipeline/fraudscratchpreprocessing',
 'ResponseMetadata': {'RequestId': '443bda3d-6aea-467b-8101-ba847eb1e63b',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '443bda3d-6aea-467b-8101-ba847eb1e63b',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '98',
   'date': 'Tue, 13 Apr 2021 03:08:45 GMT'},
  'RetryAttempts': 0}}

디폴트값을 이용하여 파이프라인을 실행합니다.

In [12]:
# Start a pipeline execution; no parameter overrides are passed here.
execution = pipeline.start()

## 파이프라인 운영: 파이프라인 대기 및 실행상태 확인

워크플로우의 실행상황을 살펴봅니다. 

In [13]:
# Display the current description (including status) of the execution.
execution.describe()
# Disabled call, left for reference; NOTE(review): presumably blocks until the run finishes - confirm.
# execution.wait()

{'PipelineArn': 'arn:aws:sagemaker:ap-northeast-2:057716757052:pipeline/fraudscratchpreprocessing',
 'PipelineExecutionArn': 'arn:aws:sagemaker:ap-northeast-2:057716757052:pipeline/fraudscratchpreprocessing/execution/zmuxp5ebvie9',
 'PipelineExecutionDisplayName': 'execution-1618283326240',
 'PipelineExecutionStatus': 'Executing',
 'CreationTime': datetime.datetime(2021, 4, 13, 3, 8, 46, 175000, tzinfo=tzlocal()),
 'LastModifiedTime': datetime.datetime(2021, 4, 13, 3, 8, 46, 175000, tzinfo=tzlocal()),
 'CreatedBy': {},
 'LastModifiedBy': {},
 'ResponseMetadata': {'RequestId': '9ebe93d1-81c2-4471-aefb-9c0d3ee19391',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '9ebe93d1-81c2-4471-aefb-9c0d3ee19391',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '425',
   'date': 'Tue, 13 Apr 2021 03:08:45 GMT'},
  'RetryAttempts': 0}}

실행이 완료될 때까지 기다립니다.

실행된 단계들을 리스트업합니다. 파이프라인의 단계실행 서비스에 의해 시작되거나 완료된 단계를 보여줍니다.

In [16]:
# Display the steps of this execution with their status and metadata.
execution.list_steps()

[{'StepName': 'FraudScratchProcess',
  'StartTime': datetime.datetime(2021, 4, 13, 3, 8, 46, 826000, tzinfo=tzlocal()),
  'EndTime': datetime.datetime(2021, 4, 13, 3, 12, 33, 784000, tzinfo=tzlocal()),
  'StepStatus': 'Succeeded',
  'Metadata': {'ProcessingJob': {'Arn': 'arn:aws:sagemaker:ap-northeast-2:057716757052:processing-job/pipelines-zmuxp5ebvie9-fraudscratchprocess-ysitzyiwmb'}}}]

## 아티팩트 경로 추출

In [43]:
def get_proc_artifact(execution, client, kind=0):
    '''
    Return the S3 URI of one output of the pipeline's processing job.

    The processing job is found by scanning the execution's step list for the
    first step whose metadata contains a ``ProcessingJob`` entry (instead of
    assuming it is the first listed step). The job is then described through
    ``client`` to read its output configuration.

    Args:
        execution: object exposing ``list_steps()`` (a pipeline execution).
        client: object exposing ``describe_processing_job(ProcessingJobName=...)``
            (a boto3 SageMaker client).
        kind: index into the job's ``ProcessingOutputConfig['Outputs']`` list.
            For the processing step defined in this notebook:
            kind: 0 --> train
            kind: 1 --> validation
            kind: 2 --> test

    Returns:
        The ``S3Uri`` string of the selected output.

    Raises:
        LookupError: if no listed step carries processing-job metadata.
        IndexError: if ``kind`` is outside the range of declared outputs.
    '''
    steps = execution.list_steps()

    # Pick the first step that actually is a processing job; steps of other
    # types (or steps without metadata yet) are skipped.
    proc_arn = next(
        (
            step['Metadata']['ProcessingJob']['Arn']
            for step in steps
            if 'ProcessingJob' in step.get('Metadata', {})
        ),
        None,
    )
    if proc_arn is None:
        raise LookupError("no processing job step found in the pipeline execution")

    # The job name is the last path component of the ARN.
    proc_job_name = proc_arn.split('/')[-1]

    response = client.describe_processing_job(ProcessingJobName=proc_job_name)
    preproc_artifact = response['ProcessingOutputConfig']['Outputs'][kind]['S3Output']['S3Uri']

    return preproc_artifact

import boto3
client = boto3.client("sagemaker")

train_preproc_dir_artifact = get_proc_artifact(execution, client, kind=0 )
val_preproc_dir_artifact = get_proc_artifact(execution, client, kind=1 )
test_preproc__dir_artifact = get_proc_artifact(execution, client, kind=2 )
print("train_preproc_artifact: ", train_preproc_artifact)
print("val_preproc_artifact: ", train_preproc_artifact)
print("test_preproc_artifact: ", test_preproc_artifact)

%store train_preproc_dir_artifact
%store val_preproc_dir_artifact
%store test_preproc__dir_artifact

train_preproc_artifact:  s3://sagemaker-ap-northeast-2-057716757052/sklearn-fraud-process-2021-04-13-03-08-45-278/output/train
val_preproc_artifact:  s3://sagemaker-ap-northeast-2-057716757052/sklearn-fraud-process-2021-04-13-03-08-45-278/output/train
test_preproc_artifact:  s3://sagemaker-ap-northeast-2-057716757052/sklearn-fraud-process-2021-04-13-03-08-45-278/output/test
Stored 'train_preproc_dir_artifact' (str)
Stored 'val_preproc_dir_artifact' (str)
Stored 'test_preproc__dir_artifact' (str)
