```python
!pip install llvmlite --ignore-installed
!pip install autogluon
```

# 0. 환경설정

In [1]:
import argparse
import os
import requests
import tempfile
import subprocess, sys

import pandas as pd
import numpy as np
from glob import glob
import copy
from collections import OrderedDict
from pathlib import Path
import joblib

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder


import logging
import logging.handlers

import json
import base64
import boto3
import sagemaker
from botocore.client import Config
from botocore.exceptions import ClientError

import time
from datetime import datetime as dt
import datetime
from pytz import timezone
from dateutil.relativedelta import *

In [2]:
%store -r

In [3]:
%store

Stored variables and their in-db values:
KST_aday_before                   -> datetime.datetime(2023, 2, 26, 7, 51, 52, 826075)
datalake_base_path                -> 's3://ai-data-lake/crude-palm-oil-prices-forecast'
golden_data_dir                   -> 's3://crude-palm-oil-prices-forecast/golden-data/2
leaderboard_base_path             -> 's3://crude-palm-oil-prices-forecast/trained-model
manifest_base_path                -> 's3://crude-palm-oil-prices-forecast/trained-model
model_base_path                   -> 's3://crude-palm-oil-prices-forecast/trained-model
model_validation_code             -> 'src/v1.2/model_validation.py'
num_fold                          -> '3'
prediction_base_path              -> 's3://crude-palm-oil-prices-forecast/trained-model
preprocessing_code                -> 's3://crude-palm-oil-prices-forecast/src/preproces
project_prefix                    -> 'crude-palm-oil-prices-forecast'
raw_data_path                     -> 's3://ai-data-lake/crude-palm-oil-prices

In [4]:
def get_secret():
    secret_name = "dev/ForecastPalmOilPrice"
    region_name = "ap-northeast-2"
    
    # Create a Secrets Manager client
    session = boto3.session.Session()
    client = session.client(
        service_name='secretsmanager',
        region_name=region_name,
    )

    try:
        get_secret_value_response = client.get_secret_value(
            SecretId=secret_name
        )
    except ClientError as e:
        if e.response['Error']['Code'] == 'DecryptionFailureException': # Secrets Manager can't decrypt the protected secret text using the provided KMS key.
            raise e
        elif e.response['Error']['Code'] == 'InternalServiceErrorException': # An error occurred on the server side.
            raise e
        elif e.response['Error']['Code'] == 'InvalidParameterException': # You provided an invalid value for a parameter.
            raise e
        elif e.response['Error']['Code'] == 'InvalidRequestException': # You provided a parameter value that is not valid for the current state of the resource.
            raise e
        elif e.response['Error']['Code'] == 'ResourceNotFoundException': # We can't find the resource that you asked for.
            raise e
    else:
        if 'SecretString' in get_secret_value_response:
            secret = get_secret_value_response['SecretString']
            return secret
        else:
            decoded_binary_secret = base64.b64decode(get_secret_value_response['SecretBinary'])
            return decoded_binary_secret

keychain = json.loads(get_secret())
ACCESS_KEY_ID = keychain['AWS_ACCESS_KEY_ID']
ACCESS_SECRET_KEY = keychain['AWS_ACCESS_SECRET_KEY']

BUCKET_NAME_USECASE = keychain['PROJECT_BUCKET_NAME']
DATALAKE_BUCKET_NAME = keychain['DATALAKE_BUCKET_NAME']

S3_PATH_REUTER = keychain['S3_PATH_REUTER']
S3_PATH_WWO = keychain['S3_PATH_WWO']
S3_PATH_STAGE = keychain['S3_PATH_STAGE']
S3_PATH_GOLDEN = keychain['S3_PATH_GOLDEN']
S3_PATH_TRAIN = keychain['S3_PATH_TRAIN']
S3_PATH_FORECAST = keychain['S3_PATH_PREDICTION']

boto3_session = boto3.Session(ACCESS_KEY_ID, ACCESS_SECRET_KEY)
sm_session = sagemaker.Session(boto_session = boto3_session)
region = boto3_session.region_name

s3_resource = boto3_session.resource('s3')
palmoil_bucket = s3_resource.Bucket(BUCKET_NAME_USECASE)
datalake_bucket = s3_resource.Bucket(DATALAKE_BUCKET_NAME)

s3_client = boto3_session.client('s3')
sm_client = boto3.client('sagemaker',
                         aws_access_key_id = ACCESS_KEY_ID,
                         aws_secret_access_key = ACCESS_SECRET_KEY,
                         region_name = 'ap-northeast-2')

In [26]:
%%writefile src/v1.2/train.py

import argparse
import os
import requests
import tempfile
import subprocess, sys
import json

import glob
import pandas as pd
import joblib # from sklearn.externals import joblib
import pickle
import tarfile # model registry에는 uri만 등록된다.
from io import StringIO, BytesIO

import logging
import logging.handlers
from logging.config import dictConfig

from dateutil.relativedelta import *
from datetime import datetime as dt
import time

import boto3

KST = dt.today() + relativedelta(hours=9)

###############################
######### util 함수 설정 ##########
###############################
def _get_logger():
    '''
    로깅을 위해 파이썬 로거를 사용
    # https://stackoverflow.com/questions/17745914/python-logging-module-is-printing-lines-multiple-times
    '''
    loglevel = logging.DEBUG
    l = logging.getLogger(__name__)
    if not l.hasHandlers():
        l.setLevel(loglevel)
        logging.getLogger().addHandler(logging.StreamHandler(sys.stdout))        
        l.handler_set = True
    return l  
logger = _get_logger()

def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--train_dir", type=str, default='/opt/ml/processing/input/train')
    parser.add_argument("--test_dir", type=str, default='/opt/ml/processing/input/test')
    parser.add_argument('--output_dir', type = str, default = '/opt/ml/processing/output')
    parser.add_argument('--item', type = str, default = 'FCPOc3')
    parser.add_argument('--target', type = str, default = 'y')
    parser.add_argument('--metric', type = str, default = 'MAPE')    
    parser.add_argument('--quality', type = str, default = 'fast_training')    
    return parser.parse_args()

def create_tarfile(source_dir, output_filename=None):
    ''' create a tarfile from a source directory'''
    if output_filename == None:
        output_filename = "%s/tmptar.tar" %(tempfile.mkdtemp())
    with tarfile.open(output_filename, "w:gz") as tar:
        tar.add(source_dir, arcname=os.path.basename(source_dir))
    return output_filename 

def make_tarfile(source_dir, output_filename):
    with tarfile.open(output_filename, "w:gz") as tar:
        tar.add(source_dir, arcname=os.path.basename(source_dir))
    return os.path.join(source_dir, output_filename)

if __name__ == "__main__":
    ############################################
    ########## 필요 라이브러리 설치  ###########
    ########################################### 
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'autogluon==0.6.1'])
    from autogluon.timeseries import TimeSeriesDataFrame, TimeSeriesPredictor
    
    ######################################
    ## 커맨드 인자, Hyperparameters 처리 ##
    ######################################
    logger.info("######### Argument Info ####################################")
    logger.info("### start training code")    
    logger.info("### Argument Info ###")
    args = parse_args()
        
    logger.info(f"args.train_dir: {args.train_dir}")   
    logger.info(f"args.test_dir: {args.test_dir}")   
    logger.info(f"args.output_dir: {args.output_dir}")    
    logger.info(f"args.item: {args.item}")   
    logger.info(f"args.target: {args.target}")    
    logger.info(f"args.metric: {args.metric}")   
    logger.info(f"args.quality: {args.quality}")   
    
    train_dir = args.train_dir
    test_dir = args.test_dir
    output_dir = args.output_dir
    prediction_dir = os.path.join(output_dir, 'prediction')
    leaderboard_dir = os.path.join(output_dir, 'leaderboard')
    model_dir = os.path.join(output_dir, 'model')
    
    for path in [prediction_dir, leaderboard_dir, model_dir]:
        if not os.path.exists(path):
            os.mkdir(path)
    item = args.item
    target = args.target
    metric = args.metric
    quality = args.quality
    
    trlist = sorted(os.listdir(train_dir))
    telist = sorted(os.listdir(test_dir))
    
    logger.info(f"the list of train data {trlist}")
    logger.info(f"the list of train data {telist}")
    
    for train_file, test_file in zip(trlist, telist):
        logger.info("### Reading input data")
        logger.info(f"### train data: {train_file}")
        logger.info(f"### test data: {test_file}")
        
        df_train = pd.read_csv(os.path.join(train_dir, train_file))
        df_test = pd.read_csv(os.path.join(test_dir, test_file))      
        # df_train = df_train[df_train['ric'] != 'MCCc3']
        # df_test = df_test[df_test['ric'] != 'MCCc3']
        
        logger.info("### Convert TimeSeriesDataFrame")
        df_train.loc[:, "ds"] = pd.to_datetime(df_train.loc[:, "ds"])
        df_test.loc[:, "ds"] = pd.to_datetime(df_test.loc[:, "ds"])

        tdf_train = TimeSeriesDataFrame.from_data_frame(
            df_train,
            id_column="ric",
            timestamp_column="ds",
        )
        tdf_test = TimeSeriesDataFrame.from_data_frame(
            df_test,
            id_column="ric",
            timestamp_column="ds",
        )

        logger.info("### Show the range of date for training and test")    
        logger.info('Item:', item)
        logger.info('Target:', target)   
        logger.info('Train:',tdf_train.loc[item][target].index.min(),'~',tdf_train.loc[item][target].index.max())
        logger.info('Test:',tdf_test.loc[item][target].index.min(),'~',tdf_test.loc[item][target].index.max())
        logger.info('The number of test data:',len(tdf_test.loc[item][target]))

        logger.info("### Training AutoGluon Model")    
        predictor = TimeSeriesPredictor(
            path = model_dir,
            target = target,
            prediction_length = len(tdf_test.loc[item][target]),
            eval_metric = metric,
        )
        predictor.fit(
            train_data = tdf_train,
            presets = quality
        )
        logger.info("the list of data in model_dir {}".format(os.listdir(model_dir)))
        tar_file_path = make_tarfile(model_dir, f'{model_dir}/model.tar.gz')
        logger.info("Saving model to {}".format(tar_file_path))

        predictor_leaderboard = predictor.leaderboard(tdf_test, silent = True)
        predictor_leaderboard = predictor_leaderboard.sort_values(by = ['score_val', 'score_test'],
                                                                  ascending = False)
        predictor_leaderboard.to_csv(os.path.join(leaderboard_dir,
                                                  f'leaderboard-{test_file}'),
                                     index = False)
        logger.info(f"predictor_leaderboard sample: head(2) \n {predictor_leaderboard.head(2)}")
        
        top_model_name = predictor_leaderboard.loc[0, 'model']
        # second_model_name = predictor_leaderboard.loc[1, 'model']
        
        prediction_ag_model_01 = predictor.predict(data = tdf_train,
                                                   model = top_model_name)
#         prediction_ag_model_02 = predictor.predict(data = tdf_train,
#                                                    model = second_model_name)
        pred_result_01 = pd.merge(tdf_test.loc['FCPOc3']['y'], prediction_ag_model_01.loc['FCPOc3'],
                                  left_index = True, right_index = True, how = 'left')
        pred_result_01.to_csv(os.path.join(prediction_dir,
                                           f'pred-{top_model_name}-{test_file}'))   
        # pred_result_02 = pd.merge(tdf_test.loc['FCPOc3']['y'], prediction_ag_model_02.loc['FCPOc3'],
        #                           left_index = True, right_index = True, how = 'left')
        # pred_result_02.to_csv(os.path.join(prediction_dir,
        #                                    f'pred-{second_model_name}-{test_file}'))   

Overwriting src/v1.2/train.py


In [6]:
%store

Stored variables and their in-db values:
KST_aday_before                   -> datetime.datetime(2023, 2, 26, 7, 51, 52, 826075)
datalake_base_path                -> 's3://ai-data-lake/crude-palm-oil-prices-forecast'
golden_data_dir                   -> 's3://crude-palm-oil-prices-forecast/golden-data/2
leaderboard_base_path             -> 's3://crude-palm-oil-prices-forecast/trained-model
manifest_base_path                -> 's3://crude-palm-oil-prices-forecast/trained-model
model_base_path                   -> 's3://crude-palm-oil-prices-forecast/trained-model
model_validation_code             -> 'src/v1.2/model_validation.py'
num_fold                          -> '3'
prediction_base_path              -> 's3://crude-palm-oil-prices-forecast/trained-model
preprocessing_code                -> 's3://crude-palm-oil-prices-forecast/src/preproces
project_prefix                    -> 'crude-palm-oil-prices-forecast'
raw_data_path                     -> 's3://ai-data-lake/crude-palm-oil-prices

In [None]:
!aws s3 cp 'src/v1.2/train.py' 's3://crude-palm-oil-prices-forecast/src/train.py' --exclude ".ipynb_checkpoints*"

In [7]:
training_code = 's3://crude-palm-oil-prices-forecast/src/v1.2/train.py'
%store training_code

Stored 'training_code' (str)


In [9]:
# print(KST_aday_before)

In [10]:
# !aws s3 ls s3://crude-palm-oil-prices-forecast/golden-data/2023/02/07 --recursive |grep -v 'golden-data/2023/02/07/scaler-files/*'

# 1. 모델 빌딩 파이프라인 의 스텝(Step) 생성
## 1) 모델 빌딩 파이프라인 변수 생성
파이프라인에서 사용할 파이프라인 파라미터를 정의합니다. 파이프라인을 스케줄하고 실행할 때 파라미터를 이용하여 실행조건을 커스마이징할 수 있습니다. 파라미터를 이용하면 파이프라인 실행시마다 매번 파이프라인 정의를 수정하지 않아도 됩니다.

지원되는 파라미터 타입은 다음과 같습니다:

- ParameterString - 파이썬 타입에서 str
- ParameterInteger - 파이썬 타입에서 int
- ParameterFloat - 파이썬 타입에서 float
이들 파라미터를 정의할 때 디폴트 값을 지정할 수 있으며 파이프라인 실행시 재지정할 수도 있습니다. 지정하는 디폴트 값은 파라미터 타입과 일치하여야 합니다.

본 노트북에서 사용하는 파라미터는 다음과 같습니다.

- processing_instance_type - 프로세싱 작업에서 사용할 ml.* 인스턴스 타입
- processing_instance_count - 프로세싱 작업에서 사용할 인스턴스 개수
- training_instance_type - 학습작업에서 사용할 ml.* 인스턴스 타입
- model_approval_status - 학습된 모델을 CI/CD를 목적으로 등록할 때의 승인 상태 (디폴트는 "PendingManualApproval")
- input_data - 입력데이터에 대한 S3 버킷 URI
파이프라인의 각 스텝에서 사용할 변수를 파라미터 변수로서 정의 합니다.

In [8]:
yyyy, mm, dd = str(KST_aday_before.year), str(KST_aday_before.month).zfill(2), str(KST_aday_before.day).zfill(2)
print(yyyy,mm,dd)

2023 02 26


In [9]:
from sagemaker.workflow.parameters import (ParameterInteger,
                                           ParameterString,
                                          )

train_instance_type = ParameterString(
    name = "TrainingInstanceType",
    default_value = "ml.m5.xlarge"
)
train_instance_count = ParameterInteger(
    name = "TrainInstanceCount",
    default_value = 1
)
input_golden_data_path = ParameterString(
    name = "InputGoldenDataPath",
    default_value = golden_data_dir,
)
input_train_path = ParameterString(
    name = "InputTrainPath",
    default_value = train_data_dir,
)
input_test_path = ParameterString(
    name = "InputTestPath",
    default_value = test_data_dir,
)

## 2) 프로세서 단계 정의


In [10]:
from sagemaker.image_uris import retrieve
from sagemaker.processing import ScriptProcessor, ProcessingInput, ProcessingOutput

In [11]:
item = 'FCPOc3'
target = 'y'
metric = 'MAPE'
quality = 'fast_training'#'medium_quality'#'fast_training'

In [12]:
role = sagemaker.get_execution_role()
image_uri = retrieve(framework='mxnet',
                     region='ap-northeast-2',
                     version='1.9.0',
                     py_version='py38',
                     image_scope='training',
                     instance_type=train_instance_type)

script_processor_training = ScriptProcessor(
    command=['python3'],
    image_uri=image_uri,
    instance_type = train_instance_type,
    instance_count = train_instance_count,
    base_job_name = f"{BUCKET_NAME_USECASE}(Train Model)",
    role = role,
)

The input argument instance_type of function (sagemaker.image_uris.retrieve) is a pipeline variable (<class 'sagemaker.workflow.parameters.ParameterString'>), which is not allowed. The default_value of this Parameter object will be used to override it. Please make sure the default_value is valid.


## 3) 모델 훈련 파이프라인 스탭 단계 정의
스텝 생성시에 위에서 생성한 Estimator 입력 및 입력 데이타로서 전처리 데이터가 존재하는 S3 경로를 제공합니다.

In [13]:
trained_model_dir

's3://crude-palm-oil-prices-forecast/trained-model/2023/02/26/1677484312.0'

In [27]:
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.workflow.steps import ProcessingStep

step_training = ProcessingStep(
    name = f"{BUCKET_NAME_USECASE}-Training",
    processor = script_processor_training,
    inputs=[
        ProcessingInput(input_name = 'input_train_path',
                        source = input_train_path,
                        destination = "/opt/ml/processing/input/train"),
        ProcessingInput(input_name = 'input_test_path',
                        source = input_test_path,
                        destination = "/opt/ml/processing/input/test"),
    ],
    outputs=[
        ProcessingOutput(output_name = "prediction_data",
                         source = "/opt/ml/processing/output/prediction",
                         destination = f'{trained_model_dir}/prediction'),
        ProcessingOutput(output_name = "leaderboard_data",
                         source = "/opt/ml/processing/output/leaderboard",
                         destination = f'{trained_model_dir}/leaderboard'),        
        ProcessingOutput(output_name = "model_data",
                         source = "/opt/ml/processing/output/model",
                         destination = f'{trained_model_dir}/model')
        ],
    job_arguments = ["--item", item,
                     "--target", target,
                     "--metric", metric,
                     "--quality", quality],
    code = training_code
)


In [21]:
#     parser.add_argument('--output_data_dir', type = str, default = os.environ.get('SM_OUTPUT_DATA_DIR'))
#     parser.add_argument("--model_dir", type=str, default=os.environ["SM_MODEL_DIR"])
#     parser.add_argument("--train_dir", type=str, default=os.environ["SM_CHANNEL_TRAIN"])
#     parser.add_argument("--test_dir", type=str, default=os.environ["SM_CHANNEL_TEST"])
#     parser.add_argument('--item', type = str, default = 'FCPOc3')
#     parser.add_argument('--target', type = str, default = 'y')
#     parser.add_argument('--metric', type = str, default = 'MAPE')    
#     parser.add_argument('--quality', type = str, default = 'low_quality')    

# 2. 파이프라인 실행

In [28]:
from sagemaker.workflow.pipeline import Pipeline
from sagemaker.workflow.steps import ProcessingStep

pipeline = Pipeline(name = BUCKET_NAME_USECASE,
                    parameters = [train_instance_type,        
                                  train_instance_count,         
                                  input_train_path,
                                  input_test_path,
                                 ],
                    steps=[step_training],
)

In [29]:
import json

definition = json.loads(pipeline.definition())
definition

{'Version': '2020-12-01',
 'Metadata': {},
 'Parameters': [{'Name': 'TrainingInstanceType',
   'Type': 'String',
   'DefaultValue': 'ml.m5.xlarge'},
  {'Name': 'TrainInstanceCount', 'Type': 'Integer', 'DefaultValue': 1},
  {'Name': 'InputTrainPath',
   'Type': 'String',
   'DefaultValue': 's3://crude-palm-oil-prices-forecast/golden-data/2023/02/26/1677484312.0/train'},
  {'Name': 'InputTestPath',
   'Type': 'String',
   'DefaultValue': 's3://crude-palm-oil-prices-forecast/golden-data/2023/02/26/1677484312.0/test'}],
 'PipelineExperimentConfig': {'ExperimentName': {'Get': 'Execution.PipelineName'},
  'TrialName': {'Get': 'Execution.PipelineExecutionId'}},
 'Steps': [{'Name': 'crude-palm-oil-prices-forecast-Training',
   'Type': 'Processing',
   'Arguments': {'ProcessingResources': {'ClusterConfig': {'InstanceType': {'Get': 'Parameters.TrainingInstanceType'},
      'InstanceCount': {'Get': 'Parameters.TrainInstanceCount'},
      'VolumeSizeInGB': 30}},
    'AppSpecification': {'ImageUri'

에러조심
- ValueError: TimeSeriesPredictor does not yet support missing values. Please make sure that the provided data contains no NaNs.


In [30]:
%%time
start = time.time()
pipeline.upsert(role_arn=sagemaker.get_execution_role())
execution = pipeline.start()
execution.wait()
end = time.time()

CPU times: user 1.26 s, sys: 18.9 ms, total: 1.28 s
Wall time: 12min 41s


In [31]:
print(f"training 시간 : {((end - start)/60):.1f} min({end - start:.1f} sec)")

training 시간 : 12.7 min(761.5 sec)


   
- [2022년 11월 25일] training 시간 : 513.4sec(8.6min)
- [2023년 01월 21일] training 시간 : 16.1 min(965.5 sec)
- [2023년 02월 27일] training 시간 : 14.1 min(848.0 sec)

In [32]:
execution.describe()

{'PipelineArn': 'arn:aws:sagemaker:ap-northeast-2:108594546720:pipeline/crude-palm-oil-prices-forecast',
 'PipelineExecutionArn': 'arn:aws:sagemaker:ap-northeast-2:108594546720:pipeline/crude-palm-oil-prices-forecast/execution/ei618738nqd8',
 'PipelineExecutionDisplayName': 'execution-1677573005922',
 'PipelineExecutionStatus': 'Succeeded',
 'PipelineExperimentConfig': {'ExperimentName': 'crude-palm-oil-prices-forecast',
  'TrialName': 'ei618738nqd8'},
 'CreationTime': datetime.datetime(2023, 2, 28, 8, 30, 5, 802000, tzinfo=tzlocal()),
 'LastModifiedTime': datetime.datetime(2023, 2, 28, 8, 42, 36, 829000, tzinfo=tzlocal()),
 'CreatedBy': {},
 'LastModifiedBy': {},
 'ResponseMetadata': {'RequestId': '5f3eed44-a04f-42c6-827c-67fe54f2a1a6',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '5f3eed44-a04f-42c6-827c-67fe54f2a1a6',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '541',
   'date': 'Tue, 28 Feb 2023 08:59:50 GMT'},
  'RetryAttempts': 0}}

In [33]:
response = execution.list_steps()
response

[{'StepName': 'crude-palm-oil-prices-forecast-Training',
  'StartTime': datetime.datetime(2023, 2, 28, 8, 30, 6, 749000, tzinfo=tzlocal()),
  'EndTime': datetime.datetime(2023, 2, 28, 8, 42, 36, 625000, tzinfo=tzlocal()),
  'StepStatus': 'Succeeded',
  'AttemptCount': 0,
  'Metadata': {'ProcessingJob': {'Arn': 'arn:aws:sagemaker:ap-northeast-2:108594546720:processing-job/pipelines-ei618738nqd8-crude-palm-oil-price-ccawdktqlf'}}}]

In [34]:
proc_arn = response[0]['Metadata']['ProcessingJob']['Arn']
proc_job_name = proc_arn.split('/')[-1]
response = sm_client.describe_processing_job(ProcessingJobName = proc_job_name)
response

{'ProcessingInputs': [{'InputName': 'input_train_path',
   'AppManaged': False,
   'S3Input': {'S3Uri': 's3://crude-palm-oil-prices-forecast/golden-data/2023/02/26/1677484312.0/train',
    'LocalPath': '/opt/ml/processing/input/train',
    'S3DataType': 'S3Prefix',
    'S3InputMode': 'File',
    'S3DataDistributionType': 'FullyReplicated',
    'S3CompressionType': 'None'}},
  {'InputName': 'input_test_path',
   'AppManaged': False,
   'S3Input': {'S3Uri': 's3://crude-palm-oil-prices-forecast/golden-data/2023/02/26/1677484312.0/test',
    'LocalPath': '/opt/ml/processing/input/test',
    'S3DataType': 'S3Prefix',
    'S3InputMode': 'File',
    'S3DataDistributionType': 'FullyReplicated',
    'S3CompressionType': 'None'}},
  {'InputName': 'code',
   'AppManaged': False,
   'S3Input': {'S3Uri': 's3://sagemaker-ap-northeast-2-108594546720/crude-palm-oil-prices-forecast-Training-08f8192b066c43925206e46065093e4e/input/code/train.py',
    'LocalPath': '/opt/ml/processing/input/code',
    'S3D

In [35]:
# step_train.properties.ModelArtifacts.S3ModelArtifacts

# 3. 모델 아티펙트 경로 추출

In [36]:
def get_proc_artifact(execution, client, kind=0):
    '''
    kind: 0 --> prediction
    kind: 1 --> leaderboard
    kind: 2 --> model
    '''
    response = execution.list_steps()
    proc_arn = response[-1]['Metadata']['ProcessingJob']['Arn'] # index -1은 가장 처음 실행 step
    proc_job_name = proc_arn.split('/')[-1] # Processing job name만 추출
    response = client.describe_processing_job(ProcessingJobName = proc_job_name)
    file_uri = response['ProcessingOutputConfig']['Outputs'][kind]['S3Output']['S3Uri']
    return file_uri

In [37]:
prediction_base_path = get_proc_artifact(execution, sm_client, kind=0)
leaderboard_base_path = get_proc_artifact(execution, sm_client, kind=1)
model_base_path = get_proc_artifact(execution, sm_client, kind=2)
print("- prediction_base_path: \n", prediction_base_path)
print("\n- leaderboard_base_path: \n", leaderboard_base_path)
print("\n- model_base_path: \n", model_base_path)

- prediction_base_path: 
 s3://crude-palm-oil-prices-forecast/trained-model/2023/02/26/1677484312.0/prediction

- leaderboard_base_path: 
 s3://crude-palm-oil-prices-forecast/trained-model/2023/02/26/1677484312.0/leaderboard

- model_base_path: 
 s3://crude-palm-oil-prices-forecast/trained-model/2023/02/26/1677484312.0/model


In [29]:
manifest_base_path = model_base_path.rsplit('/',1)[0] + '/manifest'
print("\n- manifest_base_path: \n", manifest_base_path)


- manifest_base_path: 
 s3://crude-palm-oil-prices-forecast/trained-model/2023/02/26/1677484312.0/manifest


In [30]:
%store prediction_base_path
%store leaderboard_base_path
%store model_base_path
%store manifest_base_path

Stored 'prediction_base_path' (str)
Stored 'leaderboard_base_path' (str)
Stored 'model_base_path' (str)
Stored 'manifest_base_path' (str)


In [38]:
print("- prediction_base_path: \n", prediction_base_path)
print("\n- leaderboard_base_path: \n", leaderboard_base_path)
print("\n- model_base_path: \n", model_base_path)
print("\n- manifest_base_path: \n", manifest_base_path)

- prediction_base_path: 
 s3://crude-palm-oil-prices-forecast/trained-model/2023/02/26/1677484312.0/prediction

- leaderboard_base_path: 
 s3://crude-palm-oil-prices-forecast/trained-model/2023/02/26/1677484312.0/leaderboard

- model_base_path: 
 s3://crude-palm-oil-prices-forecast/trained-model/2023/02/26/1677484312.0/model

- manifest_base_path: 
 s3://crude-palm-oil-prices-forecast/trained-model/2023/02/26/1677484312.0/manifest


In [32]:
# cd workspace

In [33]:
# import pandas as pd

# df_pred = pd.read_csv('prediction-test_fold1.csv')

In [34]:
# np.array(df_pred['mean']).reshape(-1,1)

In [35]:
# import joblib
# import numpy as np
# from sklearn.preprocessing import MinMaxScaler

# # scaler = MinMaxScaler()

# scaler = joblib.load('./FCPOc3_y_scaler.pkl') 

# # scaler.inverse_transform(df_pred['y'])[:, [0]]

# # rescaled_actual = scaler.inverse_transform(df_pred['y'].values.reshape(-1,1))
# rescaled_y = scaler.inverse_transform(np.array(df_pred['y']).reshape(-1,1))

# rescaled_pred = scaler.inverse_transform(np.array(df_pred['mean']).reshape(-1,1))



In [36]:
# rescaled_y

In [37]:
# rescaled_pred