# 0. 환경설정

In [1]:
import argparse
import os
import requests
import tempfile
import subprocess, sys

import pandas as pd
import numpy as np
from glob import glob
import copy
from collections import OrderedDict
from pathlib import Path
import joblib

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder

import logging
import logging.handlers

import json
import base64
import boto3
import sagemaker
from botocore.client import Config
from botocore.exceptions import ClientError

import time
from datetime import datetime as dt
import datetime
from pytz import timezone
from dateutil.relativedelta import *

In [2]:
# Restore every variable kept in the %store database. Later cells read names such as
# train_preproc_dir_artifact1, test_preproc_dir_artifact1 and project_prefix that are
# never assigned in this notebook — presumably they were stored by earlier notebooks.
%store -r

In [3]:
# List the variables currently held in the %store database.
%store

Stored variables and their in-db values:
bucket                                  -> 'palm-oil-price-forecast'
preproc_data_dir                        -> 's3://palm-oil-price-forecast/golden-data/2022/12/
preprocessing_code                      -> 'src/v1.1/preprocessing.py'
project_prefix                          -> 'palm-oil-price-forecast'
stage_data_uri                          -> 's3://palm-oil-price-forecast/staged-data'
test_data_uri                           -> 's3://palm-oil-price-forecast/golden-data/2022/12/
test_preproc_dir_artifact1              -> 's3://palm-oil-price-forecast/golden-data/2022/12/
test_preproc_dir_artifact2              -> 's3://palm-oil-price-forecast/golden-data/2022/12/
test_preproc_dir_artifact3              -> 's3://palm-oil-price-forecast/golden-data/2022/12/
test_preproc_dir_artifact4              -> 's3://palm-oil-price-forecast/golden-data/2022/12/
test_preproc_dir_artifact5              -> 's3://palm-oil-price-forecast/golden-data/2022/12/
train_

In [4]:
def get_secret():
    """Fetch the ``prod/sagemaker`` secret from AWS Secrets Manager.

    Returns:
        The ``SecretString`` value when the response contains one, otherwise the
        base64-decoded ``SecretBinary`` payload (bytes).

    Raises:
        botocore.exceptions.ClientError: for any failure reported by the service
            (decryption failure, invalid request, missing resource, access denied, ...).
    """
    # See https://docs.aws.amazon.com/secretsmanager/latest/apireference/API_GetSecretValue.html
    secret_name = "prod/sagemaker"
    region_name = "ap-northeast-2"

    # Create a Secrets Manager client
    client = boto3.session.Session().client(
        service_name='secretsmanager',
        region_name=region_name,
    )

    # Every ClientError is deliberately left to propagate. Catching it and re-raising
    # only selected error codes would make this function return None for any other
    # code, and the caller's json.loads() would then fail with an unrelated TypeError.
    secret_value_response = client.get_secret_value(SecretId=secret_name)

    if 'SecretString' in secret_value_response:
        return secret_value_response['SecretString']
    return base64.b64decode(secret_value_response['SecretBinary'])

# The secret is a JSON document holding the access keys and the S3 path settings.
keychain = json.loads(get_secret())
ACCESS_KEY_ID = keychain['ACCESS_KEY_ID_ent']
ACCESS_SECRET_KEY = keychain['ACCESS_SECRET_KEY_ent']

BUCKET_NAME_USECASE = keychain['BUCKET_NAME_USECASE_ent']
S3_PATH_STAGE = keychain['S3_PATH_STAGE']
S3_PATH_GOLDEN = keychain['S3_PATH_GOLDEN']
S3_PATH_TRAIN = keychain['S3_PATH_TRAIN']
S3_PATH_log = keychain['S3_PATH_LOG']
S3_PATH_FORECAST = keychain['S3_PATH_FORECAST']

# Single source of truth for the region. Pinning it on the session keeps `region`
# from being None on hosts without a default AWS region, and lets every client
# below inherit the same credentials and region from one place.
REGION_NAME = 'ap-northeast-2'

boto3_session = boto3.Session(
    aws_access_key_id = ACCESS_KEY_ID,
    aws_secret_access_key = ACCESS_SECRET_KEY,
    region_name = REGION_NAME,
)
sm_session = sagemaker.Session(boto_session = boto3_session)
region = boto3_session.region_name

s3_resource = boto3_session.resource('s3')
# NOTE: this rebinds `bucket`, which `%store -r` restored as the bucket-name string,
# to an S3 Bucket resource object.
bucket = s3_resource.Bucket(BUCKET_NAME_USECASE)
s3_client = boto3_session.client('s3')
sm_client = boto3_session.client('sagemaker')

In [7]:
%%writefile src/v1.1/train.py

import os
import sys
import pickle

import argparse
import pandas as pd
import json

from autogluon.timeseries import TimeSeriesPredictor, TimeSeriesDataFrame
from autogluon.timeseries.splitter import MultiWindowSplitter

import joblib # from sklearn.externals import joblib

import logging
import logging.handlers

from dateutil.relativedelta import *
from datetime import datetime as dt

# Current time shifted forward by nine hours (KST is UTC+9).
# NOTE(review): dt.today() returns the host's local time, so this presumably assumes the
# training container clock is UTC. KST is not referenced again in the visible part of
# this script — confirm whether it is still needed.
KST = dt.today() + relativedelta(hours=9)

###############################
######### util 함수 설정 ##########
###############################
def _get_logger():
    '''
    Return this module's logger, set to DEBUG, with output going to stdout.

    A stdout handler is attached to the root logger only when neither this logger
    nor any of its ancestors already has a handler, so repeated calls do not
    produce duplicated log lines.
    # https://stackoverflow.com/questions/17745914/python-logging-module-is-printing-lines-multiple-times
    '''
    module_logger = logging.getLogger(__name__)
    # The level is set unconditionally. If it were set only when no handler exists,
    # a pre-installed root handler would leave this logger at the root's default
    # WARNING level and every logger.info() call in this script would be dropped.
    module_logger.setLevel(logging.DEBUG)
    if not module_logger.hasHandlers():
        logging.getLogger().addHandler(logging.StreamHandler(sys.stdout))
    return module_logger
logger = _get_logger()

def parse_args(argv=None):
    """Parse the command-line arguments / hyperparameters of the training script.

    The directory arguments default to the matching SageMaker environment variables
    (SM_OUTPUT_DATA_DIR, SM_MODEL_DIR, SM_CHANNEL_TRAIN, SM_CHANNEL_TEST) when they
    are set and to None otherwise, so the script can also be started outside a
    SageMaker container by passing the directories explicitly.

    Args:
        argv: Optional list of argument strings. None (the default) makes argparse
            read sys.argv[1:].

    Returns:
        argparse.Namespace with output_data_dir, model_dir, train_dir, test_dir,
        item, target, metric and quality.
    """
    parser = argparse.ArgumentParser()

    # os.environ.get (rather than os.environ[...]) avoids a KeyError at parser
    # construction time when a variable is missing but the value is given on the CLI.
    parser.add_argument('--output_data_dir', type = str, default = os.environ.get('SM_OUTPUT_DATA_DIR'))
    parser.add_argument("--model_dir", type = str, default = os.environ.get("SM_MODEL_DIR"))
    parser.add_argument("--train_dir", type = str, default = os.environ.get("SM_CHANNEL_TRAIN"))
    parser.add_argument("--test_dir", type = str, default = os.environ.get("SM_CHANNEL_TEST"))
    parser.add_argument('--item', type = str, default = 'FCPOc3')
    parser.add_argument('--target', type = str, default = 'y')
    parser.add_argument('--metric', type = str, default = 'MAPE')
    parser.add_argument('--quality', type = str, default = 'low_quality')

    return parser.parse_args(argv)

if __name__ == "__main__":
    # Training entry point: read the train/test CSVs from the input channels, fit an
    # AutoGluon TimeSeriesPredictor, then write the leaderboard and the forecast for
    # the selected item into the output data directory.

    ###################################
    ## Command-line arguments / hyperparameters ##
    ###################################

    logger.info(f"### start training code")
    args = parse_args()

    logger.info("### Argument Info ###")
    logger.info(f"args.output_data_dir: {args.output_data_dir}")
    logger.info(f"args.model_dir: {args.model_dir}")
    logger.info(f"args.train_dir: {args.train_dir}")
    logger.info(f"args.test_dir: {args.test_dir}")
    logger.info(f"args.item: {args.item}")
    logger.info(f"args.target: {args.target}")
    logger.info(f"args.metric: {args.metric}")
    logger.info(f"args.quality: {args.quality}")

    output_data_dir = args.output_data_dir
    model_dir = args.model_dir
    train_dir = args.train_dir
    test_dir = args.test_dir
    item = args.item
    target = args.target
    metric = args.metric
    quality = args.quality
    logger.info(f"{os.listdir(train_dir)}")

    logger.info("### Reading input data")
    df_train = pd.read_csv(os.path.join(train_dir, 'train_fold1.csv'))
    df_test = pd.read_csv(os.path.join(test_dir, 'test_fold1.csv'))

    logger.info("### Convert TimeSeriesDataFrame")
    # Plain column assignment replaces the column (and its dtype); assigning through
    # .loc[:, "ds"] may write in place and keep the original object dtype on newer pandas.
    df_train["ds"] = pd.to_datetime(df_train["ds"])
    df_test["ds"] = pd.to_datetime(df_test["ds"])
    tdf_train = TimeSeriesDataFrame.from_data_frame(
        df_train,
        id_column="ric",
        timestamp_column="ds",
    )
    tdf_test = TimeSeriesDataFrame.from_data_frame(
        df_test,
        id_column="ric",
        timestamp_column="ds",
    )

    # Target series of the selected item; reused for logging, the forecast horizon
    # and the final merge.
    train_series = tdf_train.loc[item][target]
    test_series = tdf_test.loc[item][target]

    logger.info("### Show the range of date for training and test")
    # logging treats extra positional arguments as %-format arguments, so the values
    # are interpolated into the message instead of being passed print()-style.
    logger.info(f"Item:\t{item}")
    logger.info(f"Target:\t{target}")
    logger.info(f"Train:\t{train_series.index.min()} ~ {train_series.index.max()}")
    logger.info(f"Test:\t{test_series.index.min()} ~ {test_series.index.max()}")
    logger.info(f"The number of test data: {len(test_series)}")

    logger.info("### Training AutoGluon Model")
    predictor = TimeSeriesPredictor(
        path = model_dir,
        target = target,
        # Forecast horizon = number of test rows available for the selected item.
        prediction_length = len(test_series),
        eval_metric = metric,
    )
    predictor.fit(
        train_data = tdf_train,
        presets = quality
    )
    logger.info("Saving model to {}".format(model_dir))

    # Ideally a validation dataset would be supplied as an input and used for the
    # leaderboard and the prediction. That is not the case here: for this cycle the
    # test data is produced up front and fed in instead.
    predictor_leaderboard = predictor.leaderboard(tdf_test, silent = True)
    predictor_leaderboard = predictor_leaderboard.sort_values(by = ['score_val', 'score_test'],
                                                              ascending = False)
    predictor_leaderboard.to_csv(os.path.join(output_data_dir, 'leaderboard.csv'), index = False)

    logger.info(f"predictor_leaderboard sample: head(2) \n {predictor_leaderboard.head(2)}")
    prediction_ag_model = predictor.predict(tdf_train)
    logger.info(f"prediction_ag_model sample: head(2) \n {prediction_ag_model.head(2)}")

    # Align the forecast with the actual test values of the requested item/target
    # (taken from the arguments rather than hard-coded) and store it next to the leaderboard.
    prediction_result = pd.merge(test_series, prediction_ag_model.loc[item],
                                 left_index = True, right_index = True, how = 'left')
    prediction_result.to_csv(os.path.join(output_data_dir, 'prediction_result.csv'))

Overwriting src/v1.1/train.py


In [8]:
# List the variables currently held in the %store database.
%store

Stored variables and their in-db values:
bucket                                  -> 'palm-oil-price-forecast'
preproc_data_dir                        -> 's3://palm-oil-price-forecast/golden-data/2022/12/
preprocessing_code                      -> 'src/v1.1/preprocessing.py'
project_prefix                          -> 'palm-oil-price-forecast'
stage_data_uri                          -> 's3://palm-oil-price-forecast/staged-data'
test_data_uri                           -> 's3://palm-oil-price-forecast/golden-data/2022/12/
test_preproc_dir_artifact1              -> 's3://palm-oil-price-forecast/golden-data/2022/12/
test_preproc_dir_artifact2              -> 's3://palm-oil-price-forecast/golden-data/2022/12/
test_preproc_dir_artifact3              -> 's3://palm-oil-price-forecast/golden-data/2022/12/
test_preproc_dir_artifact4              -> 's3://palm-oil-price-forecast/golden-data/2022/12/
test_preproc_dir_artifact5              -> 's3://palm-oil-price-forecast/golden-data/2022/12/
train_

In [9]:
# Path of the training script written by the %%writefile cell above, saved to the
# %store database (presumably so other notebooks can restore it with `%store -r`).
training_code = 'src/v1.1/train.py'
%store training_code

Stored 'training_code' (str)


In [8]:
# !aws s3 ls s3://palm-oil-price-forecast/golden-data/2022/11/24 --recursive

In [10]:
# Peek at the preprocessed training data; train_preproc_dir_artifact1 comes from `%store -r`.
# NOTE(review): the displayed 'Unnamed: 0' column suggests the CSV was written with its
# index — confirm whether that column is intended to reach the training script.
df_train = pd.read_csv(train_preproc_dir_artifact1)
df_train.head()

Unnamed: 0,ds,high,low,open,y,ric
0,2014-07-02,38.959999,38.48,38.869999,38.470001,BOc1
1,2014-07-03,38.66,38.34,38.470001,38.560001,BOc1
2,2014-07-04,38.59,38.219999,38.495001,38.460001,BOc1
3,2014-07-07,38.52,38.099998,38.52,38.360001,BOc1
4,2014-07-08,38.41,37.77,38.41,37.799999,BOc1


In [11]:
# Peek at the preprocessed test data; test_preproc_dir_artifact1 comes from `%store -r`.
df_test = pd.read_csv(test_preproc_dir_artifact1)
df_test.head()

Unnamed: 0,ds,high,low,open,y,ric
0,2022-11-23,75.010002,73.239998,73.839996,74.889999,BOc1
1,2022-11-24,75.165001,73.825001,74.375,74.689999,BOc1
2,2022-11-25,75.32,74.410004,74.910004,74.489998,BOc1
3,2022-11-28,76.309998,73.120003,74.410004,76.059998,BOc1
4,2022-11-29,77.07,75.330002,76.040001,76.25,BOc1


In [12]:
# Last rows of the test data, to check the final date and the last item (ric) in the file.
df_test.tail()

Unnamed: 0,ds,high,low,open,y,ric
2216,2022-12-16,776.0,764.0,770.0,770.0,Wc3
2217,2022-12-19,772.5,752.75,766.25,761.75,Wc3
2218,2022-12-20,778.5,756.5,763.25,770.625,Wc3
2219,2022-12-21,784.5,760.25,760.25,779.5,Wc3
2220,2022-12-22,788.25,771.25,779.75,773.0,Wc3


# 1. 모델 빌딩 파이프라인의 스텝(Step) 생성
## 1) 모델 빌딩 파이프라인 변수 생성
파이프라인에서 사용할 파이프라인 파라미터를 정의합니다. 파이프라인을 스케줄하고 실행할 때 파라미터를 이용하여 실행 조건을 커스터마이징할 수 있습니다. 파라미터를 이용하면 파이프라인을 실행할 때마다 파이프라인 정의를 수정하지 않아도 됩니다.

지원되는 파라미터 타입은 다음과 같습니다:

- ParameterString - 파이썬 타입에서 str
- ParameterInteger - 파이썬 타입에서 int
- ParameterFloat - 파이썬 타입에서 float

이들 파라미터를 정의할 때 디폴트 값을 지정할 수 있으며, 파이프라인 실행 시 재지정할 수도 있습니다. 지정하는 디폴트 값은 파라미터 타입과 일치해야 합니다.

본 노트북에서 사용하는 파라미터는 다음과 같습니다.

- train_instance_type (`TrainingInstanceType`) - 학습 작업에서 사용할 ml.* 인스턴스 타입
- train_instance_count (`TrainInstanceCount`) - 학습 작업에서 사용할 인스턴스 개수
- input_train_data (`InputTrainData`) - 학습 데이터에 대한 S3 URI
- input_test_data (`InputTestData`) - 테스트 데이터에 대한 S3 URI

파이프라인의 각 스텝에서 사용할 변수를 파라미터 변수로 정의합니다.

In [13]:
from sagemaker.workflow.parameters import (ParameterInteger,
                                           ParameterString,
                                          )

# Pipeline parameters: each one carries a default and can be overridden when an
# execution is started, without editing the pipeline definition.

# Compute resources for the training job.
train_instance_type = ParameterString(name="TrainingInstanceType", default_value="ml.m5.xlarge")
train_instance_count = ParameterInteger(name="TrainInstanceCount", default_value=1)

# S3 locations of the preprocessed train / test CSV files (restored via %store -r).
input_train_data = ParameterString(name="InputTrainData", default_value=train_preproc_dir_artifact1)
input_test_data = ParameterString(name="InputTestData", default_value=test_preproc_dir_artifact1)

## 2) 파라미터 세팅


In [14]:
from sagemaker.mxnet import MXNet

# Training outputs are partitioned by the current date in Korea. A timezone-aware "now"
# is used so the date does not depend on the notebook host's local clock
# (dt.today() + 9 hours is only correct when the host runs in UTC).
KST = dt.now(timezone('Asia/Seoul'))
estimator_output_path = f"s3://{BUCKET_NAME_USECASE}/{S3_PATH_TRAIN}/{KST.strftime('%Y/%m/%d')}"
print("estimator_output_path: ", estimator_output_path)

# Estimator that runs src/v1.1/train.py inside the MXNet framework container.
# Instance type and count are pipeline parameters, so they can be overridden per execution.
mxnet_estimator = MXNet(
    base_job_name = 'Palm_oil_forecast-Train-autogluon060', # prefix
    role = sagemaker.get_execution_role(),
    entry_point = 'train.py',
    source_dir = "src/v1.1", # presumably also holds the requirements file that installs AutoGluon — confirm
    code_location = estimator_output_path,
    output_path = estimator_output_path,
    instance_type = train_instance_type,
    instance_count = train_instance_count,
    framework_version = '1.9.0',
    py_version = 'py38',
    # Forwarded to train.py as --item / --target / --metric / --quality.
    hyperparameters={"item" :'FCPOc3',
                     "target" : 'y',
                     "metric" : 'MAPE',
                     "quality" : 'fast_training'
                    },

)

estimator_output_path:  s3://palm-oil-price-forecast/trained-model/2022/12/23


## 3) 모델 훈련 스텝 생성
스텝 생성 시 위에서 생성한 Estimator와, 입력 데이터로서 전처리된 데이터가 존재하는 S3 경로를 제공합니다.

```python
step_train = TrainingStep(
    name="Palm_oil_forecast-Autogluon052-Train",
    estimator=xgb_train,
    inputs={
        "train": TrainingInput(
            s3_data=step_process.properties.ProcessingOutputConfig.Outputs[
                "train"
            ].S3Output.S3Uri,
            content_type="text/csv"
        ),
    },
)
```

In [15]:
from sagemaker.inputs import TrainingInput
from sagemaker.workflow.steps import TrainingStep

# Channel name -> pipeline parameter holding the S3 URI of that channel's CSV data.
csv_channels = {
    "train": input_train_data,
    "test": input_test_data,
}

# Training step: runs the estimator with one text/csv input channel per entry above.
step_train = TrainingStep(
    name = 'Palm_oil_forecast-Train-autogluon060',
    estimator = mxnet_estimator,
    inputs = {
        channel: TrainingInput(s3_data = s3_uri, content_type = "text/csv")
        for channel, s3_uri in csv_channels.items()
    },
)

# 2. 파이프라인 실행

In [16]:
from sagemaker.workflow.pipeline import Pipeline

# Assemble the pipeline: named after the project prefix, exposing the four
# parameters defined earlier and containing the single training step.
pipeline_name = project_prefix
pipeline_parameters = [
    train_instance_type,
    train_instance_count,
    input_train_data,
    input_test_data,
]
pipeline = Pipeline(
    name = pipeline_name,
    parameters = pipeline_parameters,
    steps = [step_train],
)

In [17]:
import json

# Render the pipeline definition as a dict to check parameters and step arguments before running.
definition = json.loads(pipeline.definition())
definition

The input argument instance_type of function (sagemaker.image_uris.retrieve) is a pipeline variable (<class 'sagemaker.workflow.parameters.ParameterString'>), which is not allowed. The default_value of this Parameter object will be used to override it. Please make sure the default_value is valid.


{'Version': '2020-12-01',
 'Metadata': {},
 'Parameters': [{'Name': 'TrainingInstanceType',
   'Type': 'String',
   'DefaultValue': 'ml.m5.xlarge'},
  {'Name': 'TrainInstanceCount', 'Type': 'Integer', 'DefaultValue': 1},
  {'Name': 'InputTrainData',
   'Type': 'String',
   'DefaultValue': 's3://palm-oil-price-forecast/golden-data/2022/12/23/train_fold1.csv'},
  {'Name': 'InputTestData',
   'Type': 'String',
   'DefaultValue': 's3://palm-oil-price-forecast/golden-data/2022/12/23/test_fold1.csv'}],
 'PipelineExperimentConfig': {'ExperimentName': {'Get': 'Execution.PipelineName'},
  'TrialName': {'Get': 'Execution.PipelineExecutionId'}},
 'Steps': [{'Name': 'Palm_oil_forecast-Train-autogluon060',
   'Type': 'Training',
   'Arguments': {'AlgorithmSpecification': {'TrainingInputMode': 'File',
     'TrainingImage': '763104351884.dkr.ecr.ap-northeast-2.amazonaws.com/mxnet-training:1.9.0-cpu-py38',
     'EnableSageMakerMetricsTimeSeries': True},
    'OutputDataConfig': {'S3OutputPath': 's3://p

에러 주의
- 학습 데이터에 결측치(NaN)가 있으면 다음 에러가 발생합니다: `ValueError: TimeSeriesPredictor does not yet support missing values. Please make sure that the provided data contains no NaNs.`


In [18]:
%%time
start = time.time()
pipeline.upsert(role_arn=sagemaker.get_execution_role())
execution = pipeline.start()
execution.wait()
end = time.time()

The input argument instance_type of function (sagemaker.image_uris.retrieve) is a pipeline variable (<class 'sagemaker.workflow.parameters.ParameterString'>), which is not allowed. The default_value of this Parameter object will be used to override it. Please make sure the default_value is valid.


WaiterError: Waiter PipelineExecutionComplete failed: Waiter encountered a terminal failure state: For expression "PipelineExecutionStatus" we matched expected path: "Failed"

In [19]:
# Wall-clock duration of the pipeline run, reported in seconds and in minutes.
elapsed_sec = end - start
elapsed_min = elapsed_sec / 60
print(f"training 시간 : {elapsed_sec:.1f} sec")
print(f"training 시간 : {elapsed_min:.1f} min")

NameError: name 'end' is not defined

   
- [2022년 11월 25일] training 시간 : 513.4sec(8.6min)

In [18]:
# Status and metadata of the pipeline execution started above.
execution.describe()

{'PipelineArn': 'arn:aws:sagemaker:ap-northeast-2:276114397529:pipeline/palm-oil-price-forecast',
 'PipelineExecutionArn': 'arn:aws:sagemaker:ap-northeast-2:276114397529:pipeline/palm-oil-price-forecast/execution/5c2k5eti372r',
 'PipelineExecutionDisplayName': 'execution-1669625818711',
 'PipelineExecutionStatus': 'Succeeded',
 'PipelineExperimentConfig': {'ExperimentName': 'palm-oil-price-forecast',
  'TrialName': '5c2k5eti372r'},
 'CreationTime': datetime.datetime(2022, 11, 28, 8, 56, 58, 646000, tzinfo=tzlocal()),
 'LastModifiedTime': datetime.datetime(2022, 11, 28, 9, 5, 14, 734000, tzinfo=tzlocal()),
 'CreatedBy': {},
 'LastModifiedBy': {},
 'ResponseMetadata': {'RequestId': '14828b58-346f-4a9c-a42a-d92850e3e460',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '14828b58-346f-4a9c-a42a-d92850e3e460',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '520',
   'date': 'Mon, 28 Nov 2022 09:05:30 GMT'},
  'RetryAttempts': 0}}

In [28]:
# Per-step results of the execution; the training step's metadata carries the training job ARN.
response = execution.list_steps()
response

[{'StepName': 'Palm_oil_forecast-Train-autogluon060',
  'StartTime': datetime.datetime(2022, 11, 28, 8, 56, 59, 993000, tzinfo=tzlocal()),
  'EndTime': datetime.datetime(2022, 11, 28, 9, 5, 14, 414000, tzinfo=tzlocal()),
  'StepStatus': 'Succeeded',
  'AttemptCount': 0,
  'Metadata': {'TrainingJob': {'Arn': 'arn:aws:sagemaker:ap-northeast-2:276114397529:training-job/pipelines-5c2k5eti372r-Palm-oil-forecast-Tr-IcBejQCKws'}}}]

In [29]:
# The training job name is the last path segment of the job ARN reported by the first step.
proc_arn = response[0]['Metadata']['TrainingJob']['Arn']
train_job_name = proc_arn.split('/')[-1]
# NOTE(review): `response` is rebound here from the list of steps to the describe_training_job
# result, so re-running this cell without first re-running the list_steps cell will fail on response[0].
response = sm_client.describe_training_job(TrainingJobName = train_job_name)
response['TrainingJobName']

'pipelines-5c2k5eti372r-Palm-oil-forecast-Tr-IcBejQCKws'

In [32]:
# This is a pipeline property placeholder (the cell displays a Properties object), not a
# concrete S3 URI; the actual artifact path is looked up from the training job below.
step_train.properties.ModelArtifacts.S3ModelArtifacts

<sagemaker.workflow.properties.Properties at 0x7f4e9441d940>

# 3. 모델 아티펙트 경로 추출

In [21]:
def get_train_artifact(execution, client):
    """Return the S3 URI of the model artifact produced by a pipeline execution.

    The execution's steps are scanned for the first one whose metadata contains a
    training job ARN, so the lookup does not depend on the training step being
    first in the list.

    Args:
        execution: Object exposing ``list_steps()`` that returns step dicts
            (the pipeline execution started in this notebook).
        client: Object exposing ``describe_training_job(TrainingJobName=...)``
            (the SageMaker client).

    Returns:
        str: ``ModelArtifacts.S3ModelArtifacts`` of the training job.

    Raises:
        LookupError: If no step in the execution carries training job metadata.
    """
    for step in execution.list_steps():
        training_job = (step.get('Metadata') or {}).get('TrainingJob') or {}
        job_arn = training_job.get('Arn')
        if job_arn:
            # The job name is the last path segment of the ARN.
            train_job_name = job_arn.split('/')[-1]
            description = client.describe_training_job(TrainingJobName = train_job_name)
            return description['ModelArtifacts']['S3ModelArtifacts']
    raise LookupError("No step with TrainingJob metadata found in the pipeline execution")

In [22]:
# Model artifact URI of the training job, plus the sibling output.tar.gz in the same S3 folder.
# NOTE(review): output.tar.gz presumably holds the files train.py writes to its output data
# directory (leaderboard.csv, prediction_result.csv) — confirm.
train_model_uri = get_train_artifact(execution, sm_client)
leaderboard_uri = train_model_uri.replace('model.tar.gz','output.tar.gz')
print("train_model_uri: \n", train_model_uri)
print("\nleaderboard_uri: \n", leaderboard_uri)

train_model_uri: 
 s3://palm-oil-price-forecast/trained-model/2022/11/28/pipelines-5c2k5eti372r-Palm-oil-forecast-Tr-IcBejQCKws/output/model.tar.gz

leaderboard_uri: 
 s3://palm-oil-price-forecast/trained-model/2022/11/28/pipelines-5c2k5eti372r-Palm-oil-forecast-Tr-IcBejQCKws/output/output.tar.gz


In [23]:
# Save both URIs to the %store database (presumably for use by later notebooks).
%store train_model_uri
%store leaderboard_uri

Stored 'train_model_uri' (str)
Stored 'leaderboard_uri' (str)
