# SageMaker 모델 빌드 파이프라인을 이용한 모델 빌드 오케스트레이션
Amazon SageMaker Model Building Pipeline은 머신러닝 워크플로우를 개발하는 데이터 과학자와 엔지니어에게 SageMaker 작업과 재현 가능한 머신러닝 파이프라인을 오케스트레이션하는 기능을 제공합니다. 또한 커스텀 빌드된 모델을 실시간 추론 환경이나 배치 변환을 통한 추론 실행 환경으로 배포하거나, 생성된 아티팩트의 계보(lineage)를 추적하는 기능을 제공합니다. 이 기능들을 통해 모델 아티팩트를 배포하고, 업무 환경에서의 워크플로우를 배포/모니터링하고, 간단한 인터페이스를 통해 아티팩트의 계보를 추적하고, 머신러닝 애플리케이션 개발의 베스트 프랙티스를 도입하여, 보다 안정적인 머신러닝 애플리케이션 운영 환경을 구현할 수 있습니다.

SageMaker Pipeline 서비스는 JSON 선언으로 구현된 SageMaker Pipeline DSL(Domain Specific Language, 도메인 특화 언어)을 지원합니다. 이 DSL은 파이프라인 파라미터와 SageMaker 작업 단계의 DAG(Directed Acyclic Graph)를 정의합니다. SageMaker Python SDK를 이용하면 이 파이프라인 DSL을 보다 간편하게 생성할 수 있습니다.

# 0. 사용 코드들 1.0

### 0-1. preprocessing

In [7]:
%%writefile src/v1.0/preprocessing.py

import argparse
import os
import requests
import tempfile
import subprocess, sys

import pandas as pd
import numpy as np
from glob import glob
import copy
from collections import OrderedDict
from pathlib import Path
import joblib

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder

import logging
import logging.handlers

import json
import base64
import boto3
from botocore.client import Config
from botocore.exceptions import ClientError

import time
from datetime import datetime as dt
import datetime
from pytz import timezone
from dateutil.relativedelta import *

###############################
######### Global settings #####
###############################
# Current local time shifted forward by nine hours; used to build dated S3 keys.
# NOTE(review): this equals Korea Standard Time only when the host clock runs in UTC - confirm.
KST = dt.today() + relativedelta(hours=9)
# RIC item codes that the preprocessing script keeps (rows with any other RIC are dropped).
ric_list = ['BOc1', 'BOc2', 'BOc3','BOPLKL','BRRTSc1', 'BRRTSc2', 'BRRTSc3', 'CAD=', 'EUR=', 'JPY=', 'KRW=', 'MYR=', 'GBP=', 'INR=','Cc1', 'Cc2', 'Cc3','CCMc1', 'CCMc2', 'CCMc3',
            'CLc1', 'CLc2', 'CLc3','CNY=','COMc1', 'COMc2','COMc3','CTc1', 'CTc2', 'CTc3', 'DJCI', 'DJCIBR', 'DJCICL', 'DJCICN', 'DJCIEN', 'DJCIGR', 'DJCIIA', 'DJCING', 
            'DJCISO', 'DJCIWH', 'DJT','FCHI','FCPOc1', 'FCPOc2', 'FCPOc3','FGVHKL',
            'FKLIc1', 'FKLIc2', 'FKLIc3','FTSE','GCc1', 'GCc2', 'GCc3','GDAXI','GENMKL','HSI','IOIBKL','IXIC','JNIc1','JNIc2','JNIc3','KCc1', 'KCc2', 'KCc3','KLKKKL','KLSE','KQ11', 'KS11',
            'KWc1', 'KWc2', 'KWc3','LCOc1', 'LCOc2', 'LCOc3','LWBc1', 'LWBc2', 'LWBc3','MCCc1', 'MCCc2', 'MCCc3','MXSCKL','Oc1', 'Oc2', 'Oc3','PEPTKL','RRc1', 'RRc2', 'RRc3','RSc1', 'RSc2', 'RSc3',
            'Sc1', 'Sc2', 'Sc3','SIMEKL','SOPSKL','SSEC', 'THPBKL', 'Wc1', 'Wc2', 'Wc3'
           ]

# Column-name lists; neither is referenced by the code visible in this script
# (the main section defines its own local list) - presumably kept for reference.
col_names_asis = ['ds','high','low','open','ric']
col_names_tobe = ['ds','high','low','open','y']

###############################
######### util 함수 설정 ##########
###############################
def _get_logger():
    '''
    Return this module's logger, wiring up stdout output on first use.

    When neither the module logger nor any of its ancestors has a handler yet,
    the logger level is set to DEBUG and a stdout StreamHandler is attached to
    the root logger. Otherwise the logger is returned untouched, which avoids
    duplicated output lines:
    # https://stackoverflow.com/questions/17745914/python-logging-module-is-printing-lines-multiple-times
    '''
    module_logger = logging.getLogger(__name__)
    if module_logger.hasHandlers():
        # Already configured somewhere up the hierarchy; do not add another handler.
        return module_logger

    module_logger.setLevel(logging.DEBUG)
    root_logger = logging.getLogger()
    root_logger.addHandler(logging.StreamHandler(sys.stdout))
    module_logger.handler_set = True
    return module_logger
logger = _get_logger()

def download_object(file_name):
    """Download one S3 object into the local ``test`` directory.

    The object is read from the bucket named by the module-level
    ``BUCKET_NAME_USECASE`` and stored as ``test/<key with '/' replaced by '_'>``.

    Args:
        file_name: S3 object key to download.

    Returns:
        The string ``"Success"`` when the download completes, otherwise the
        exception instance that was raised. Errors are returned rather than
        raised so that a batch of downloads can report per-key outcomes.
    """
    try:
        s3_client = boto3.client("s3")
        download_path = Path('test') / file_name.replace('/','_')
        # The target directory must exist before the file is written;
        # create it on demand so the first download does not fail.
        download_path.parent.mkdir(parents=True, exist_ok=True)
        s3_client.download_file(
            BUCKET_NAME_USECASE,
            file_name,
            str(download_path)
        )
        return "Success"
    except Exception as e:
        # Deliberate best-effort: hand the failure back to the caller as a value.
        return e

def download_parallel_multiprocessing(path_list):
    """Download many S3 keys in parallel worker processes.

    Each key in ``path_list`` is submitted to ``download_object``; results are
    yielded in completion order, not submission order.

    Args:
        path_list: Iterable of S3 object keys.

    Yields:
        ``(key, outcome)`` tuples where ``outcome`` is the value returned by
        ``download_object`` or, if the worker itself failed, the exception.
    """
    # Imported here because the module header does not import concurrent.futures.
    from concurrent import futures

    with futures.ProcessPoolExecutor() as executor:
        future_to_key = {executor.submit(download_object, key): key for key in path_list}
        for future in futures.as_completed(future_to_key):
            key = future_to_key[future]
            exception = future.exception()
            if exception is None:
                yield key, future.result()
            else:
                yield key, exception
                                
def get_list_in_s3(key_id : str,
                   secret_key_id : str,
                   bucket_name : str,
                   s3_path : str) -> list:
    """List every object stored under a prefix of an S3 bucket.

    Args:
        key_id: AWS access key id used to create the S3 client.
        secret_key_id: AWS secret access key used to create the S3 client.
        bucket_name: Bucket to list.
        s3_path: Key prefix restricting the listing.

    Returns:
        A list with the ``Contents`` entries (dicts) of all result pages;
        empty when nothing matches the prefix.
    """
    # Use the credentials passed by the caller instead of module-level globals.
    s3 = boto3.client('s3',
                      aws_access_key_id = key_id,
                      aws_secret_access_key = secret_key_id,
                      region_name = 'ap-northeast-2')
    paginator = s3.get_paginator('list_objects_v2')
    pages = paginator.paginate(Bucket = bucket_name,
                               Prefix = s3_path)
    contents_list = []
    for page in pages:
        # A page carries no 'Contents' key when the prefix matches no object.
        contents_list.extend(page.get('Contents', []))
    return contents_list

def get_file_folders(s3_client, bucket_name, prefix=""):
    """Collect all object keys under a prefix, split into files and folders.

    Args:
        s3_client: Object exposing ``list_objects_v2(**kwargs)`` (an S3 client).
        bucket_name: Bucket to list.
        prefix: Key prefix restricting the listing.

    Returns:
        ``(file_names, folders)``: keys ending in ``/`` go to ``folders``,
        every other key to ``file_names``.
    """
    file_names = []
    folders = []

    request_kwargs = {
        "Bucket": bucket_name,
        "Prefix": prefix
    }

    while True:
        response = s3_client.list_objects_v2(**request_kwargs)

        # 'Contents' is absent when the page holds no objects.
        for result in response.get("Contents") or []:
            key = result.get("Key")
            if key.endswith("/"):
                folders.append(key)
            else:
                file_names.append(key)

        next_token = response.get("NextContinuationToken")
        if not next_token:
            break
        # Send the token with the next request so the listing advances
        # to the following page instead of repeating the first one.
        request_kwargs["ContinuationToken"] = next_token

    return file_names, folders


def download_files(s3_client, bucket_name, local_path, file_names, folders):
    """Mirror a set of S3 keys below a local directory.

    Args:
        s3_client: Object exposing ``download_file(bucket, key, filename)``.
        bucket_name: Bucket the keys belong to.
        local_path: Local root directory for the downloaded tree.
        file_names: Object keys to download; each is stored at ``local_path/<key>``.
        folders: Folder keys to create as (possibly empty) local directories.
    """
    root = Path(local_path)

    # Create the folder skeleton first so empty folders are mirrored as well.
    for folder_key in folders:
        (root / folder_key).mkdir(parents=True, exist_ok=True)

    for object_key in file_names:
        destination = root / object_key
        destination.parent.mkdir(parents=True, exist_ok=True)
        s3_client.download_file(bucket_name, object_key, str(destination))
        
def get_dataframe(base_preproc_input_dir, file_name_prefix ):
    '''
    Load every ``<file_name_prefix>*.csv`` file found directly in
    ``base_preproc_input_dir`` and return them concatenated as one DataFrame.
    The first CSV column is used as the index. Raises ValueError when no file
    matches.
    '''
    pattern = '{}/{}*.csv'.format(base_preproc_input_dir, file_name_prefix)
    input_files = glob(pattern)
    logger.info(f"input_files: \n {input_files}")

    if not input_files:
        message = ('There are no files in {}.\n' +
                   'This usually indicates that the channel ({}) was incorrectly specified,\n' +
                   'the data specification in S3 was incorrectly specified or the role specified\n' +
                   'does not have permission to access the data.')
        raise ValueError(message.format(base_preproc_input_dir, "train"))

    frames = []
    for csv_path in input_files:
        frames.append(pd.read_csv(csv_path, index_col=0))
    df = pd.concat(frames)

    logger.info(f"dataframe shape \n {df.shape}")
    logger.info(f"dataset sample \n {df.head(2)}")

    return df

def get_secret():
    """Fetch the ``prod/sagemaker`` secret from AWS Secrets Manager.

    See https://docs.aws.amazon.com/secretsmanager/latest/apireference/API_GetSecretValue.html

    Returns:
        The secret as ``str`` when it is stored as ``SecretString``, otherwise
        the base64-decoded ``SecretBinary`` as ``bytes``.

    Raises:
        botocore.exceptions.ClientError: for any Secrets Manager failure, e.g.
            DecryptionFailureException, InternalServiceErrorException,
            InvalidParameterException, InvalidRequestException or
            ResourceNotFoundException.
    """
    secret_name = "prod/sagemaker"
    region_name = "ap-northeast-2"

    # Create a Secrets Manager client
    session = boto3.session.Session()
    client = session.client(
        service_name='secretsmanager',
        region_name=region_name
    )

    # Errors are deliberately not caught: every failure must abort the job
    # instead of silently producing None for the caller to trip over.
    get_secret_value_response = client.get_secret_value(SecretId=secret_name)

    if 'SecretString' in get_secret_value_response:
        return get_secret_value_response['SecretString']
    return base64.b64decode(get_secret_value_response['SecretBinary'])
        
def fill_missing_dates(df_in : pd.DataFrame,
                       freq : str
                      ) -> pd.DataFrame :
    """Insert rows for dates missing between the first and last ``ds`` value.

    Args:
        df_in: Frame with a ``ds`` column holding dates (strings, datetimes or
            integers such as ``20230102``). The input is not modified.
        freq: pandas frequency string for the complete date range (e.g. ``'B'``).

    Returns:
        A new frame with one row per date of the range, ``ds`` as the first
        column (datetime dtype) and NaN in the other columns of inserted rows.
    """
    df = df_in.copy()
    # Integers must go through str first; to_datetime would otherwise read
    # them as epoch offsets instead of YYYYMMDD-style dates.
    if pd.api.types.is_integer_dtype(df["ds"]):
        df["ds"] = df["ds"].astype(str)
    # Replace the column outright so the dtype really becomes datetime64
    # (assigning through .loc[:, col] may keep the old dtype).
    df["ds"] = pd.to_datetime(df["ds"])
    full_range = pd.date_range(start = df["ds"].min(),
                               end = df["ds"].max(),
                               freq = freq)
    df = df.set_index("ds").reindex(full_range).rename_axis("ds").reset_index()
    return df

def fill_missing_price_value(df: pd.DataFrame, col: str, limit_linear : int = 20 ) -> pd.DataFrame :
    """Fill gaps in a price column by linear interpolation and clamp negatives to 0.

    Args:
        df: Frame holding the column; it is modified in place.
        col: Name of the column to fill; values are cast to float.
        limit_linear: Maximum number of consecutive NaNs to fill.

    Returns:
        The same ``df`` object, with ``col`` replaced by the filled values.
    """
    series = df[col].astype(float)
    series = series.interpolate(method="linear", limit=limit_linear, limit_direction="both")
    # Negative values are replaced by 0; NaNs left over by the limit stay NaN.
    df[col] = series.clip(lower=0)
    return df

def scaling_value(df : pd.DataFrame,
                  col_name : str,
                  ric,
                  s3_resource,
                  BUCKET_NAME_USECASE,
                  S3_PATH_GOLDEN) -> np.ndarray:
    """Min-max scale one column and upload the fitted scaler to S3.

    Args:
        df: Frame holding the column to scale; it is not modified.
        col_name: Name of the column to scale.
        ric: Item identifier, used in the uploaded object's file name.
        s3_resource: Object exposing ``put_object(Body=, Bucket=, Key=)``.
        BUCKET_NAME_USECASE: Destination bucket for the scaler file.
        S3_PATH_GOLDEN: Key prefix under which the scaler file is stored.

    Returns:
        The scaled values as a 2-D array with a single column (the
        ``reshape(-1, 1)`` layout handed to the scaler).
    """
    column_values = df[col_name].values.reshape(-1,1)
    scaler = MinMaxScaler()
    series = scaler.fit_transform(column_values)

    # Serialise the fitted scaler through a temporary file and upload its bytes;
    # the key is dated with the module-level KST timestamp.
    with tempfile.TemporaryFile() as fp:
        joblib.dump(scaler, fp)
        fp.seek(0)
        s3_resource.put_object(Body = fp.read(),
                               Bucket = BUCKET_NAME_USECASE,
                               Key = f"{S3_PATH_GOLDEN}/{KST.strftime('%Y/%m/%d')}/scaler-files/{ric}_{col_name}_scaler.pkl")
    return series

def convert_type(raw, cols, type_target):
    '''
    Return a copy of ``raw`` in which every column listed in ``cols`` has been
    cast to ``type_target``; ``raw`` itself is left unchanged.
    '''
    dtype_by_column = {column: type_target for column in cols}
    return raw.astype(dtype_by_column)

if __name__=='__main__':
    ################################
    ###### Parse command-line args ##
    ################################
    # Default split date: one month before "now + 9 hours".
    split_date_default = dt.today() + relativedelta(hours = 9) - relativedelta(months=1)

    parser = argparse.ArgumentParser()
    parser.add_argument('--base_output_dir', type=str, default="/opt/ml/processing/output")
    parser.add_argument('--base_preproc_input_dir', type=str, default="/opt/ml/processing/input")
    parser.add_argument('--split_date', type=str, default=split_date_default.strftime('%Y-%m-%d'))
    parser.add_argument('--label_column', type=str, default="ric")
    parser.add_argument("--scaler_switch", type = str, default = 1, help = '1이면 Scaling ON, 0이면 Scaling OFF')

    # parse arguments
    args = parser.parse_args()

    logger.info("######### Argument Info ####################################")
    logger.info(f"args.base_output_dir: {args.base_output_dir}")
    logger.info(f"args.base_preproc_input_dir: {args.base_preproc_input_dir}")
    logger.info(f"args.label_column: {args.label_column}")
    logger.info(f"args.split_date: {args.split_date}")
    logger.info(f"args.scaler_switch: {args.scaler_switch}")

    base_output_dir = args.base_output_dir
    base_preproc_input_dir = args.base_preproc_input_dir
    label_column = args.label_column
    split_date = args.split_date
    scaler_switch = int(args.scaler_switch)

    ############################################
    ###### Load key values from Secrets Manager
    ############################################
    logger.info(f"\n### Loading the key value using Secret Manager")

    keychain = json.loads(get_secret())
    ACCESS_KEY_ID = keychain['ACCESS_KEY_ID_ent']
    ACCESS_SECRET_KEY = keychain['ACCESS_SECRET_KEY_ent']

    BUCKET_NAME_USECASE = keychain['BUCKET_NAME_USECASE_ent']
    S3_PATH_STAGE = keychain['S3_PATH_STAGE']
    S3_PATH_GOLDEN = keychain['S3_PATH_GOLDEN']
    S3_PATH_TRAIN = keychain['S3_PATH_TRAIN']
    S3_PATH_log = keychain['S3_PATH_LOG']

    boto_session = boto3.Session(ACCESS_KEY_ID, ACCESS_SECRET_KEY)
    region = boto_session.region_name
    s3_resource = boto_session.resource('s3')
    s3_client = boto_session.client('s3')

    ############################################
    ###### 1. Data integration
    ############################################
    total_start = time.time()
    start = time.time()
    # Make sure every output sub-directory exists before anything is written.
    stage_dir = f"{base_output_dir}/stage"
    train_dir = f"{base_output_dir}/train"
    test_dir = f"{base_output_dir}/test"
    for out_dir in (stage_dir, train_dir, test_dir):
        os.makedirs(out_dir, exist_ok=True)

    logger.info(f"\n### Data Integration")
    path_list = []
    for (path, _dirs, files) in os.walk(base_preproc_input_dir):
        for filename in files:
            ext = os.path.splitext(filename)[-1]
            if ext == '.csv':
                path_list.append("%s/%s" % (path, filename))

    logger.info(f"The number for data : {len(path_list)}")
    if not path_list:
        raise ValueError(f"No .csv files found under {base_preproc_input_dir}")

    # Read everything first and concatenate once (avoids re-copying the
    # accumulated frame for every file).
    df_sum = pd.concat([pd.read_csv(file, encoding='utf-8') for file in path_list])
    df_sum = df_sum.sort_values(by='Date').reset_index(drop=True)
    df_sum.to_csv(f"{stage_dir}/stage.csv", index = False)
    end = time.time()

    logger.info(f"Data Integration is done")
    logger.info(f"Runtime : {end - start:.1f} sec({((end - start)/60):.1f} min)")
    logger.info(f"The number for data : {len(path_list)}")
    logger.info(f"Integrated data sample: head(2) \n {df_sum.head(2)}")
    logger.info(f"Integrated data sample: tail(2) \n {df_sum.tail(2)}")

    #################################
    ####   2. First preprocessing step
    ####   item selection, column selection, type conversion
    #################################
    start = time.time()
    logger.info(f"\n ### RIC Item selection")
    df_sum = df_sum[df_sum['RIC'].isin(ric_list)].reset_index()
    logger.info(f"The number for data after RIC Item selection : {df_sum.shape}")

    logger.info(f"\n ### Column selection")
    # .copy() so the casts below act on an independent frame, not on a slice.
    df_sum = df_sum[['Date','HIGH', 'LOW', 'OPEN', 'CLOSE','RIC']].copy()
    logger.info(f"The number for data after Column selection : {df_sum.shape}")
    logger.info(f"\n ### type conversion")
    # Replace the columns outright so the new dtypes are actually applied.
    df_sum["Date"] = pd.to_datetime(df_sum["Date"])
    for price_col in ["HIGH", "LOW", "OPEN", "CLOSE"]:
        df_sum[price_col] = df_sum[price_col].astype(np.float32)

    ####################################################
    ####   3. Build the per-item frames for AutoGluon timeseries
    ####################################################
    logger.info(f"\n ### Autogluon timeseriesdataframe Conversion")
    df_list = OrderedDict()
    for name in ric_list:
        df_tmp = df_sum[df_sum['RIC'] == name]
        df_tmp = df_tmp.drop('RIC', axis=1)
        df_list[name] = df_tmp[df_tmp['Date'] >= '2014-07-02'].reset_index(drop = True)

    ####################################################
    ############   4. Rename columns, fill missing values
    ####################################################
    logger.info(f"\n ### Rename columns")
    # Positional rename: Date, HIGH, LOW, OPEN, CLOSE -> ds, high, low, open, y
    col_names = ['ds','high','low','open','y']
    for name in df_list:
        df_list[name].columns = col_names

    logger.info(f"\n ### Fill missing value (Date)")
    for name, value in df_list.items():
        df_list[name] = fill_missing_dates(value, 'B')

    logger.info(f"\n ### Fill missing value (Price)")
    for name, value in df_list.items():
        # Chain on the returned frame so this does not depend on the helper
        # mutating its argument in place.
        for col in ['y', 'high', 'low', 'open']:
            value = fill_missing_price_value(value, col)
        df_list[name] = value

    ####################################################
    #################   5. Scaling  ###################
    ####################################################
    if scaler_switch == 1:
        logger.info(f"\n ### Scaling")
        scale_dir = f"{base_output_dir}/scaler-files"
        os.makedirs(scale_dir, exist_ok=True)
        for name, value in df_list.items():
            for col in ['y','high','open','low']:
                scaled = scaling_value(value, col, name, s3_client, BUCKET_NAME_USECASE, S3_PATH_GOLDEN)
                # The scaler returns a single-column 2-D array; flatten it for the column.
                value[col] = np.ravel(scaled)
            df_list[name] = value
    else:
        logger.info(f"\n ### No Scaling")
    end = time.time()
    logger.info(f"\n### All Date Transform is done")
    print(f"All Date Transform Run time : {end - start:.1f} sec({((end - start)/60):.1f} min)")

    #################################################
    #####   6. Split into train / test sets and save
    #################################################
    logger.info(f"\n ### Split train, test dataset")
    df_golden = pd.concat([value.assign(ric = name) for name, value in df_list.items()])
    df_golden = df_golden.reset_index(drop = True)

    # Rows strictly before split_date go to train, the rest to test.
    df_train = df_golden[df_golden['ds'] < split_date]
    df_train.to_csv(f"{train_dir}/train.csv", index = False)

    df_test = df_golden[df_golden['ds'] >= split_date]
    df_test.to_csv(f"{test_dir}/test.csv", index = False)

    logger.info(f"\n ### Final result for train dataset ")
    logger.info(f"\n ####preprocessed train shape \n {df_train.shape}")
    logger.info(f"preprocessed train sample: head(2) \n {df_train.head(2)}")
    logger.info(f"preprocessed train sample: tail(2) \n {df_train.tail(2)}")

    logger.info(f"\n ####preprocessed test shape \n {df_test.shape}")
    logger.info(f"preprocessed test sample: head(2) \n {df_test.head(2)}")
    logger.info(f"preprocessed test sample: tail(2) \n {df_test.tail(2)}")

    logger.info(f"\n### End All of data preprocessing")
    total_end = time.time()
    print(f"Run time 시간 : {total_end - total_start:.1f} sec({((total_end - total_start)/60):.1f} min)\n")
    

Overwriting src/v1.0/preprocessing.py


### 0-2. train 

In [8]:
%%writefile src/v1.0/train.py

import os
import sys
import pickle

import argparse
import pandas as pd
import json

from autogluon.timeseries import TimeSeriesPredictor, TimeSeriesDataFrame
import joblib # from sklearn.externals import joblib

import logging
import logging.handlers

from dateutil.relativedelta import *
from datetime import datetime as dt

KST = dt.today() + relativedelta(hours=9)

###############################
######### util 함수 설정 ##########
###############################
def _get_logger():
    '''
    Return this module's logger, wiring up stdout output on first use.

    When neither the module logger nor any of its ancestors has a handler yet,
    the logger level is set to DEBUG and a stdout StreamHandler is attached to
    the root logger. Otherwise the logger is returned untouched, which avoids
    duplicated output lines:
    # https://stackoverflow.com/questions/17745914/python-logging-module-is-printing-lines-multiple-times
    '''
    module_logger = logging.getLogger(__name__)
    if module_logger.hasHandlers():
        # Already configured somewhere up the hierarchy; do not add another handler.
        return module_logger

    module_logger.setLevel(logging.DEBUG)
    root_logger = logging.getLogger()
    root_logger.addHandler(logging.StreamHandler(sys.stdout))
    module_logger.handler_set = True
    return module_logger
logger = _get_logger()


if __name__ == "__main__":
    ###################################
    ## Command-line args / hyperparameters
    ###################################

    logger.info(f"### start training code")
    parser = argparse.ArgumentParser()
    # Directory defaults are read from the SM_* environment variables.
    parser.add_argument('--output_dir', type = str, default = os.environ.get('SM_OUTPUT_DIR'))
    parser.add_argument('--output_data_dir', type = str, default = os.environ.get('SM_OUTPUT_DATA_DIR'))
    parser.add_argument('--model_dir', type = str, default = os.environ.get('SM_MODEL_DIR'))
    parser.add_argument('--train_dir', type = str, default = os.environ.get('SM_CHANNEL_TRAIN'))
    parser.add_argument('--test_dir', type = str, default = os.environ.get('SM_CHANNEL_TEST'))
    parser.add_argument('--item', type = str, default = 'FCPOc3')
    parser.add_argument('--target', type = str, default = 'y')
    parser.add_argument('--metric', type = str, default = 'MAPE')
    parser.add_argument('--quality', type = str, default = 'low_quality')
    args = parser.parse_args()

    logger.info("### Argument Info ###")
    logger.info(f"args.output_dir: {args.output_dir}")
    logger.info(f"args.output_data_dir: {args.output_data_dir}")
    logger.info(f"args.model_dir: {args.model_dir}")
    logger.info(f"args.train_dir: {args.train_dir}")
    logger.info(f"args.test_dir: {args.test_dir}")
    logger.info(f"args.item: {args.item}")
    logger.info(f"args.target: {args.target}")
    logger.info(f"args.metric: {args.metric}")
    logger.info(f"args.quality: {args.quality}")

    output_dir = args.output_dir
    output_data_dir = args.output_data_dir
    model_dir = args.model_dir
    train_dir = args.train_dir
    test_dir = args.test_dir
    item = args.item
    target = args.target
    metric = args.metric
    quality = args.quality

    logger.info("### Reading input data")
    df_train = pd.read_csv(os.path.join(train_dir, 'train.csv'))
    df_test = pd.read_csv(os.path.join(test_dir, 'test.csv'))

    logger.info("### Convert TimeSeriesDataFrame")
    # Replace the column outright so it really becomes a datetime column.
    df_train["ds"] = pd.to_datetime(df_train["ds"])
    df_test["ds"] = pd.to_datetime(df_test["ds"])
    tdf_train = TimeSeriesDataFrame.from_data_frame(
        df_train,
        id_column="ric",
        timestamp_column="ds",
    )
    tdf_test = TimeSeriesDataFrame.from_data_frame(
        df_test,
        id_column="ric",
        timestamp_column="ds",
    )

    logger.info("### Show the range of date for training and test")
    train_target = tdf_train.loc[item][target]
    test_target = tdf_test.loc[item][target]
    # Logger calls take a format string plus arguments (unlike print).
    logger.info("Item:\t%s", item)
    logger.info("Target:\t%s", target)
    logger.info("Train:\t%s ~ %s", train_target.index.min(), train_target.index.max())
    logger.info("Test:\t%s ~ %s", test_target.index.min(), test_target.index.max())
    logger.info("The number of test data: %s", len(test_target))

    logger.info("### Training AutoGluon Model")
    # The forecast horizon equals the number of test rows of the reference item.
    predictor = TimeSeriesPredictor(
        path = model_dir,
        target = target,
        prediction_length = len(test_target),
        eval_metric = metric,
    )
    predictor.fit(
        train_data = tdf_train,
        presets = quality
    )
    logger.info("Saving model to {}".format(model_dir))

    # Ideally a separate validation dataset would be passed in for the
    # leaderboard and the prediction. That is not the case in this cycle:
    # the test data is used to produce the leaderboard.
    predictor_leaderboard = predictor.leaderboard(tdf_test, silent = True)
    predictor_leaderboard.to_csv(os.path.join(output_data_dir,'leaderboard.csv'), index = False)

    # Forecast from the training series and log a sample of the result.
    predictions = predictor.predict(tdf_train)
    logger.info(f"predictions sample: head \n {predictions.head()}")

Overwriting src/v1.0/train.py


### 0-3. validation

In [9]:
%%writefile src/v1.0/model_validation.py

import glob
import os
import pandas as pd
import time
from datetime import datetime as dt
import argparse
import json
import boto3
from io import StringIO, BytesIO
import joblib
import sys
import subprocess
import logging
import logging.handlers

import tarfile


###############################
######### util 함수 설정 ##########
###############################
def _get_logger():
    '''
    Return this module's logger, wiring up stdout output on first use.

    When neither the module logger nor any of its ancestors has a handler yet,
    the logger level is set to DEBUG and a stdout StreamHandler is attached to
    the root logger. Otherwise the logger is returned untouched.
    '''
    module_logger = logging.getLogger(__name__)
    if module_logger.hasHandlers():
        # Already configured somewhere up the hierarchy; do not add another handler.
        return module_logger

    module_logger.setLevel(logging.DEBUG)
    root_logger = logging.getLogger()
    root_logger.addHandler(logging.StreamHandler(sys.stdout))
    module_logger.handler_set = True
    return module_logger
logger = _get_logger()

def get_secret():
    """Fetch the ``prod/sagemaker`` secret from AWS Secrets Manager.

    Returns:
        The secret as ``str`` when it is stored as ``SecretString``, otherwise
        the base64-decoded ``SecretBinary`` as ``bytes``.

    Raises:
        botocore.exceptions.ClientError: for any Secrets Manager failure, e.g.
            DecryptionFailureException, InternalServiceErrorException,
            InvalidParameterException, InvalidRequestException or
            ResourceNotFoundException.
    """
    # Imported here because this module's header does not import base64.
    import base64

    secret_name = "prod/sagemaker"
    region_name = "ap-northeast-2"

    session = boto3.session.Session()
    client = session.client(
        service_name='secretsmanager',
        region_name=region_name
    )

    # Errors are deliberately not caught: every failure must abort the job
    # instead of silently producing None for the caller to trip over.
    get_secret_value_response = client.get_secret_value(SecretId=secret_name)

    if 'SecretString' in get_secret_value_response:
        return get_secret_value_response['SecretString']
    return base64.b64decode(get_secret_value_response['SecretBinary'])

def convert_series_to_description(leaderboard : pd.Series):
    """Build a comma-separated summary of the leaderboard row labelled ``0``.

    The ``model``, ``score_test`` and ``score_val`` entries of that row are
    rendered with ``to_string()`` and every whitespace-separated token is
    joined with commas, e.g. ``model,<name>,score_test,<x>,score_val,<y>``.
    NOTE(review): the two-axis ``.loc`` lookup implies a DataFrame argument
    despite the ``pd.Series`` annotation - confirm.
    """
    summary_fields = ['model','score_test','score_val']
    row_zero = leaderboard.loc[0, summary_fields]
    tokens = row_zero.to_string().split()
    return ','.join(tokens)

def get_bucket_key_from_uri(uri):
    """Split a URI such as ``s3://bucket/path/to/key`` into ``(bucket, key)``.

    Only the first ``//`` separates the scheme, so keys that themselves
    contain ``//`` are returned intact.

    Args:
        uri: URI of the form ``<scheme>://<bucket>/<key>``.

    Returns:
        ``(bucket, key)``; ``key`` is ``""`` when the URI names only a bucket.

    Raises:
        ValueError: if ``uri`` contains no ``//`` separator.
    """
    _scheme, separator, remainder = uri.partition('//')
    if not separator:
        raise ValueError(f"Not a '<scheme>://bucket/key' URI: {uri!r}")
    uri_bucket, _, uri_file_path = remainder.partition('/')
    return uri_bucket, uri_file_path

if __name__=='__main__':
    ################################
    ###### Parse command-line args ##
    ################################
    parser = argparse.ArgumentParser()
    parser.add_argument('--base_input_path', type=str, default="/opt/ml/processing/input")
    parser.add_argument('--s3_model_uri', type=str, default="/opt/ml/processing/model")
    parser.add_argument('--model_package_group_name', type=str, default='palm-oil-price-forecast')
    args = parser.parse_args()

    logger.info("######### Argument Info ####################################")
    logger.info(f"args.base_input_path: {args.base_input_path}")
    logger.info(f"args.s3_model_uri: {args.s3_model_uri}")
    logger.info(f"args.model_package_group_name: {args.model_package_group_name}")

    base_input_path = args.base_input_path
    s3_model_uri = args.s3_model_uri
    model_package_group_name = args.model_package_group_name

    ############################################
    ###### Load key values from Secrets Manager
    ############################################
    logger.info(f"\n### Loading Key value from Secret Manager")

    keychain = json.loads(get_secret())
    ACCESS_KEY_ID = keychain['ACCESS_KEY_ID_ent']
    ACCESS_SECRET_KEY = keychain['ACCESS_SECRET_KEY_ent']

    BUCKET_NAME_USECASE = keychain['BUCKET_NAME_USECASE_ent']
    S3_PATH_STAGE = keychain['S3_PATH_STAGE']
    S3_PATH_GOLDEN = keychain['S3_PATH_GOLDEN']
    S3_PATH_TRAIN = keychain['S3_PATH_TRAIN']
    S3_PATH_log = keychain['S3_PATH_LOG']
    boto3_session = boto3.Session(ACCESS_KEY_ID, ACCESS_SECRET_KEY)

    region = boto3_session.region_name

    s3_resource = boto3_session.resource('s3')
    s3_client = boto3_session.client('s3')
    sm_client = boto3.client('sagemaker',
                             aws_access_key_id = ACCESS_KEY_ID,
                             aws_secret_access_key = ACCESS_SECRET_KEY,
                             region_name = 'ap-northeast-2')

    ############################################
    ##### Fetch the model / leaderboard archive
    ############################################
    logger.info(f"\n### Loading Model, Leaderboard zip files ")
    logger.info(f"\n#### Extract output.tar.gz and Read a Leaderboard ")
    # Added 22.11.29: the training step exposes the URI of model.tar.gz but not
    # of output.tar.gz, so the latter is derived by swapping the file name.
    leaderboard_uri = s3_model_uri.replace('model.tar.gz','output.tar.gz')
    logger.info(f"\n#### output.tar.gz uri : {leaderboard_uri}")
    output_bucket, output_key = get_bucket_key_from_uri(leaderboard_uri)
    output_obj = s3_client.get_object(Bucket = output_bucket, Key = output_key)

    logger.info("\n######### Model zip file extraction ####################################")
    # Stream-mode ('r|gz') read straight from the S3 response body.
    with tarfile.open(fileobj=output_obj['Body'], mode='r|gz') as file:
        file.extractall(base_input_path)
    logger.info(f"file list in {base_input_path}: {os.listdir(base_input_path)}")

    # Best row first; the index is reset so that label 0 (the row used for the
    # package description) is the top-ranked row of the sorted leaderboard.
    leaderboard = pd.read_csv(f'{base_input_path}/leaderboard.csv').sort_values(
        by = ['score_val', 'score_test'],
        ascending = False,
    ).reset_index(drop = True)
    logger.info(f"leaderboard train sample: head(5) \n {leaderboard.head()}")
    logger.info(f"\n#### Set  ")
    modelpackage_inference_specification =  {
        "InferenceSpecification": {
            "Containers": [
                {
                    "Image": '763104351884.dkr.ecr.ap-northeast-2.amazonaws.com/autogluon-inference:0.4-cpu-py38',
                    "ModelDataUrl": s3_model_uri
                }
            ],
            "SupportedContentTypes": [ "text/csv" ],
            "SupportedResponseMIMETypes": [ "text/csv" ],
        }
    }

    # A run passes when at least one model has score_val above the threshold.
    score_val_threshold = -0.13
    if (leaderboard['score_val'] > score_val_threshold).any():
        logger.info(f"\n#### Pass the first performance filtering")
        approval_status = "PendingManualApproval"
        arn_message = '### Passed ModelPackage Version ARN : {}'
    else:
        logger.info(f"\n#### None of them passed the filtering")
        approval_status = "Rejected"
        arn_message = '### Rejected ModelPackage Version ARN : {}'

    # The package is registered in both cases; only the approval status differs.
    create_model_package_input_dict = {
        "ModelPackageGroupName" : model_package_group_name,
        "ModelPackageDescription" : convert_series_to_description(leaderboard),
        "ModelApprovalStatus" : approval_status
    }
    create_model_package_input_dict.update(modelpackage_inference_specification)
    create_model_package_response = sm_client.create_model_package(**create_model_package_input_dict)
    model_package_arn = create_model_package_response["ModelPackageArn"]
    logger.info(arn_message.format(model_package_arn))

Overwriting src/v1.0/model_validation.py


### 0-4. prediction

In [10]:
%%writefile src/v1.0/prediction.py

import argparse
import os
import requests
import tempfile
import subprocess, sys
import json

import glob
import pandas as pd
import joblib
import pickle
import tarfile
from io import StringIO, BytesIO

import logging
import logging.handlers

import time
from datetime import datetime as dt

import boto3


###############################
######### util 함수 설정 ##########
###############################
def _get_logger():
    '''
    로깅을 위해 파이썬 로거를 사용
    # https://stackoverflow.com/questions/17745914/python-logging-module-is-printing-lines-multiple-times
    '''
    loglevel = logging.DEBUG
    l = logging.getLogger(__name__)
    if not l.hasHandlers():
        l.setLevel(loglevel)
        logging.getLogger().addHandler(logging.StreamHandler(sys.stdout))        
        l.handler_set = True
    return l  
logger = _get_logger()


def get_bucket_key_from_uri(uri):
    '''
    Split an S3 URI such as "s3://bucket/path/to/object" into bucket and key.

    Args:
        uri: URI containing a "//" separator after the scheme.

    Returns:
        Tuple ``(bucket, key)``. ``key`` is an empty string when the URI has no
        path after the bucket name.

    Raises:
        IndexError: if ``uri`` contains no "//" separator.
    '''
    # Split only on the first "//" so that a key which itself contains "//"
    # is not truncated.
    uri_aws_path = uri.split('//', 1)[1]
    uri_bucket, _, uri_file_path = uri_aws_path.partition('/')
    return uri_bucket, uri_file_path

def get_secret():
    '''
    Fetch the "prod/sagemaker" secret from AWS Secrets Manager (ap-northeast-2).

    See https://docs.aws.amazon.com/secretsmanager/latest/apireference/API_GetSecretValue.html

    Returns:
        The ``SecretString`` value when the response contains one, otherwise
        the base64-decoded ``SecretBinary`` payload as bytes.

    Raises:
        Any exception raised by ``get_secret_value`` propagates unchanged
        (e.g. DecryptionFailureException, InternalServiceErrorException,
        InvalidParameterException, InvalidRequestException,
        ResourceNotFoundException, access-denied errors).
    '''
    secret_name = "prod/sagemaker"
    region_name = "ap-northeast-2"

    # Create a Secrets Manager client
    session = boto3.session.Session()
    client = session.client(
        service_name='secretsmanager',
        region_name=region_name
    )

    # No try/except on purpose: every failure must reach the caller. The old
    # handler re-raised only five error codes and silently returned None for
    # all others, and it referenced ClientError, which this script never imports.
    get_secret_value_response = client.get_secret_value(SecretId=secret_name)

    if 'SecretString' in get_secret_value_response:
        return get_secret_value_response['SecretString']

    # base64 is not imported at the top of this script, so import it here.
    import base64
    return base64.b64decode(get_secret_value_response['SecretBinary'])
        
if __name__=='__main__':
    # AutoGluon is installed at start-up, so its import has to stay below the pip call.
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'autogluon==0.6.0'])
    from autogluon.timeseries import TimeSeriesDataFrame, TimeSeriesPredictor

    ############################################
    ###### Load settings from Secrets Manager ##
    ############################################
    logger.info(f"\n### Loading Key value from Secret Manager")
    
    keychain = json.loads(get_secret())
    ACCESS_KEY_ID = keychain['ACCESS_KEY_ID_ent']
    ACCESS_SECRET_KEY = keychain['ACCESS_SECRET_KEY_ent']

    BUCKET_NAME_USECASE = keychain['BUCKET_NAME_USECASE_ent']
    S3_PATH_STAGE = keychain['S3_PATH_STAGE']
    S3_PATH_GOLDEN = keychain['S3_PATH_GOLDEN']
    S3_PATH_TRAIN = keychain['S3_PATH_TRAIN']
    S3_PATH_log = keychain['S3_PATH_LOG']
    boto3_session = boto3.Session(ACCESS_KEY_ID, ACCESS_SECRET_KEY)

    region = boto3_session.region_name

    s3_resource = boto3_session.resource('s3')
    s3_client = boto3_session.client('s3')
    sm_client = boto3.client('sagemaker',
                             aws_access_key_id = ACCESS_KEY_ID,
                             aws_secret_access_key = ACCESS_SECRET_KEY,
                             region_name = 'ap-northeast-2')
    
    ################################
    ###### Parse CLI arguments #####
    ################################
    parser = argparse.ArgumentParser()
    parser.add_argument('--base_input_dir', type=str, default="/opt/ml/processing/input", help='train,testset 불러오는곳')
    parser.add_argument('--output_dir', type = str, default = "/opt/ml/processing/output", help='예측 결과값이 저장되는 곳, test dataset과 prediction 결과가 merge되서 저장된다.')
    parser.add_argument('--model_package_group_name', type=str, default='palm-oil-price-forecast')   
    args = parser.parse_args()     
    logger.info("\n######### Argument Info ####################################")
    logger.info(f"args.base_input_dir: {args.base_input_dir}")
    logger.info(f"args.output_dir: {args.output_dir}")
    logger.info(f"args.model_package_group_name: {args.model_package_group_name}")

    base_input_dir = args.base_input_dir
    output_dir = args.output_dir
    model_package_group_name = args.model_package_group_name
    model_dir = '/opt/ml/model'
    
    ##########################################################
    ###### Find the approved model URI and top model name ####
    ##########################################################
    logger.info("\n######### Finding suitable model uri ####################################")
    logger.info(f"Model Group name: {model_package_group_name}")
    # NOTE(review): only the first page returned by list_model_packages is
    # inspected; confirm the approved package is always on it.
    model_registry_list = sm_client.list_model_packages(ModelPackageGroupName = model_package_group_name)['ModelPackageSummaryList']
    mr_arn = next(
        (model['ModelPackageArn'] for model in model_registry_list
         if model['ModelPackageGroupName'] == model_package_group_name
         and model['ModelApprovalStatus'] == 'Approved'),
        None,
    )
    if mr_arn is None:
        # Fail with a clear message instead of a NameError on mr_arn below.
        raise RuntimeError(
            f"No model package with status 'Approved' found in group '{model_package_group_name}'"
        )
    describe_model = sm_client.describe_model_package(ModelPackageName=mr_arn)
    s3_model_uri = describe_model['InferenceSpecification']['Containers'][0]['ModelDataUrl']
    # The second comma-separated field of the description is used as the model name.
    top_model_name = describe_model['ModelPackageDescription'].split(',')[1]

    logger.info(f"Found suitable model uri: {s3_model_uri}")
    logger.info(f"And top model name: {top_model_name}")
    
    logger.info("\n#########Download suitable model file  ####################################")
    model_bucket, model_key = get_bucket_key_from_uri(s3_model_uri)  
    model_obj = s3_client.get_object(Bucket = model_bucket, Key = model_key)
    
    ##########################################################
    ###### Extract the model archive, build TimeSeriesDataFrames
    ##########################################################
    logger.info("\n######### Model zip file extraction ####################################")
    # NOTE(review): extractall() and pickle.load() below trust the archive
    # content; the model artifact must come from a trusted location only.
    with tarfile.open(fileobj=model_obj['Body'], mode='r|gz') as file:
        file.extractall(model_dir)    
    logger.info(f"list in /opt/ml/model: {os.listdir(model_dir)}")        
    
    logger.info("\n######### Convert df_test dataframe into TimeSeriesDataFrame  ###########")        
    df_train = pd.read_csv(os.path.join(f'{base_input_dir}/train/train.csv'))
    df_train["ds"] = pd.to_datetime(df_train["ds"])
    tdf_train = TimeSeriesDataFrame.from_data_frame(
        df_train,
        id_column="ric",
        timestamp_column="ds",
    )
    df_test = pd.read_csv(f"{base_input_dir}/test/test.csv")
    df_test["ds"] = pd.to_datetime(df_test["ds"])
    tdf_test = TimeSeriesDataFrame.from_data_frame(
        df_test,
        id_column="ric",
        timestamp_column="ds",
    )
    # The first message used to be labelled "df_test" although it prints the training frame.
    logger.info(f"df_train sample: tail(2) \n {tdf_train.tail(2)}")
    logger.info(f"df_test sample: head(2) \n {tdf_test.head(2)}")
    
    ################################
    ###### Run the prediction ######
    ################################
    logger.info("\n######### Start prediction  ###########")        
    # Context manager so the file handle is closed after loading.
    with open(f"{model_dir}/models/trainer.pkl", 'rb') as trainer_file:
        loaded_trainer = pickle.load(trainer_file)
    logger.info(f"loaded_trainer: {loaded_trainer}")
    prediction_ag_model = loaded_trainer.predict(data = tdf_train,
                                                 model = top_model_name)
    logger.info(f"prediction_ag_model sample: head(2) \n {prediction_ag_model.head(2)}")

    # Left-join the actual 'y' values of item FCPOc3 from the test set with its forecast.
    prediction_result = pd.merge(tdf_test.loc['FCPOc3']['y'], prediction_ag_model.loc['FCPOc3'],
                                 left_index = True, right_index = True, how = 'left')
    prediction_result.to_csv(f'{output_dir}/prediction_result.csv')

Overwriting src/v1.0/prediction.py


# 0. 사용 코드들 1.1

### 0-1. preprocessing

In [1]:
%%writefile src/v1.1/preprocessing.py

import argparse
import os
import requests
import tempfile
import subprocess, sys

import pandas as pd
import numpy as np
from glob import glob
import copy
from collections import OrderedDict
from pathlib import Path
import joblib

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder

import logging
import logging.handlers

import json
import base64
import boto3
from botocore.client import Config
from botocore.exceptions import ClientError

import time
from datetime import datetime as dt
import datetime
from pytz import timezone
from dateutil.relativedelta import *

###############################
######### Global settings ##########
###############################
# dt.today() is a naive local timestamp; adding 9 hours yields Korea Standard
# Time only if the host clock runs on UTC -- NOTE(review): confirm.
KST = dt.today() + relativedelta(hours=9)
# Values of the 'RIC' column that the main script keeps; rows with any other RIC are dropped.
ric_list = ['BOc1', 'BOc2', 'BOc3','BOPLKL','BRRTSc1', 'BRRTSc2', 'BRRTSc3', 'CAD=', 'EUR=', 'JPY=', 'KRW=', 'MYR=', 'GBP=', 'INR=','Cc1', 'Cc2', 'Cc3','CCMc1', 'CCMc2', 'CCMc3',
            'CLc1', 'CLc2', 'CLc3','CNY=','COMc1', 'COMc2','COMc3','CTc1', 'CTc2', 'CTc3', 'DJCI', 'DJCIBR', 'DJCICL', 'DJCICN', 'DJCIEN', 'DJCIGR', 'DJCIIA', 'DJCING', 
            'DJCISO', 'DJCIWH', 'DJT','FCHI','FCPOc1', 'FCPOc2', 'FCPOc3','FGVHKL',
            'FKLIc1', 'FKLIc2', 'FKLIc3','FTSE','GCc1', 'GCc2', 'GCc3','GDAXI','GENMKL','HSI','IOIBKL','IXIC','JNIc1','JNIc2','JNIc3','KCc1', 'KCc2', 'KCc3','KLKKKL','KLSE','KQ11', 'KS11',
            'KWc1', 'KWc2', 'KWc3','LCOc1', 'LCOc2', 'LCOc3','LWBc1', 'LWBc2', 'LWBc3','MCCc1', 'MCCc2', 'MCCc3','MXSCKL','Oc1', 'Oc2', 'Oc3','PEPTKL','RRc1', 'RRc2', 'RRc3','RSc1', 'RSc2', 'RSc3',
            'Sc1', 'Sc2', 'Sc3','SIMEKL','SOPSKL','SSEC', 'THPBKL', 'Wc1', 'Wc2', 'Wc3'
           ]

# Column-name lists; not referenced by the visible part of this script
# (the main section defines its own col_names) -- NOTE(review): possibly unused.
col_names_asis = ['ds','high','low','open','ric']
col_names_tobe = ['ds','high','low','open','y']

###############################
######### util 함수 설정 ##########
###############################
def _get_logger():
    '''
    로깅을 위해 파이썬 로거를 사용
    # https://stackoverflow.com/questions/17745914/python-logging-module-is-printing-lines-multiple-times
    '''
    loglevel = logging.DEBUG
    l = logging.getLogger(__name__)
    if not l.hasHandlers():
        l.setLevel(loglevel)
        logging.getLogger().addHandler(logging.StreamHandler(sys.stdout))        
        l.handler_set = True
    return l  
logger = _get_logger()

def download_object(file_name):
    '''
    Download one object from the use-case bucket into the local 'test' folder.

    The local file name is the object key with every '/' replaced by '_'.
    The bucket name is read from the module-level BUCKET_NAME_USECASE.

    Returns:
        "Success" when the download finished, otherwise the exception that was
        raised (it is returned, not re-raised).
    '''
    try:
        client = boto3.client("s3")
        flat_name = file_name.replace('/','_')
        target = Path('test') / flat_name
        client.download_file(BUCKET_NAME_USECASE, file_name, str(target))
    except Exception as err:
        return err
    else:
        return "Success"

def download_parallel_multiprocessing(path_list):
    '''
    Download many S3 objects in parallel worker processes.

    Args:
        path_list: iterable of object keys, each passed to download_object().

    Yields:
        ``(key, outcome)`` pairs in completion order, where ``outcome`` is the
        value returned by download_object() or, if the worker itself failed,
        the exception raised inside it.
    '''
    # Imported locally: the script's import section does not provide
    # concurrent.futures, so the old code failed with a NameError.
    from concurrent import futures

    with futures.ProcessPoolExecutor() as executor:
        future_to_key = {executor.submit(download_object, key): key for key in path_list}
        for future in futures.as_completed(future_to_key):
            key = future_to_key[future]
            exception = future.exception()
            if exception is None:
                yield key, future.result()
            else:
                yield key, exception
                                
def get_list_in_s3(key_id : str,
                   secret_key_id : str,
                   bucket_name : str,
                   s3_path : str) -> list:
    '''
    List every object stored under a prefix of an S3 bucket.

    Args:
        key_id: AWS access key id used to create the S3 client.
        secret_key_id: AWS secret access key used to create the S3 client.
        bucket_name: bucket to list.
        s3_path: key prefix to list.

    Returns:
        List of the object entries found in the 'Contents' of all result
        pages; empty when nothing matches the prefix.
    '''
    # Use the credentials passed in. The old code ignored both parameters and
    # read globals that only exist when the file runs as a script.
    s3 = boto3.client('s3',
                      aws_access_key_id = key_id,
                      aws_secret_access_key = secret_key_id,
                      region_name = 'ap-northeast-2')
    paginator = s3.get_paginator('list_objects_v2')
    pages = paginator.paginate(Bucket = bucket_name,
                               Prefix = s3_path)
    contents_list = []
    for page in pages:
        # A page without matches has no 'Contents' key.
        contents_list.extend(page.get('Contents', []))
    return contents_list

def get_file_folders(s3_client, bucket_name, prefix=""):
    '''
    Collect all keys under a prefix, separated into files and folders.

    Args:
        s3_client: object exposing ``list_objects_v2(**kwargs)``.
        bucket_name: bucket to list.
        prefix: key prefix to list (default: whole bucket).

    Returns:
        Tuple ``(file_names, folders)``; keys ending in "/" are reported as
        folders, every other key as a file. Both lists are empty when the
        listing returns no 'Contents'.
    '''
    file_names = []
    folders = []

    request_kwargs = {
        "Bucket": bucket_name,
        "Prefix": prefix
    }

    while True:
        response = s3_client.list_objects_v2(**request_kwargs)

        # 'Contents' is absent when nothing matches the prefix.
        for result in response.get("Contents") or []:
            key = result.get("Key")
            if key.endswith("/"):
                folders.append(key)
            else:
                file_names.append(key)

        next_token = response.get("NextContinuationToken")
        if not next_token:
            break
        # The token must be sent with the next request. The old code built the
        # updated kwargs but called with the defaults, re-reading page one forever.
        request_kwargs["ContinuationToken"] = next_token

    return file_names, folders


def download_files(s3_client, bucket_name, local_path, file_names, folders):
    '''
    Mirror the given S3 folders and files below a local directory.

    Every entry of ``folders`` is created as a directory under ``local_path``;
    every key in ``file_names`` is downloaded from ``bucket_name`` to the same
    relative path, creating parent directories as needed.
    '''
    root = Path(local_path)

    for folder_key in folders:
        (root / folder_key).mkdir(parents=True, exist_ok=True)

    for object_key in file_names:
        destination = root / object_key
        destination.parent.mkdir(parents=True, exist_ok=True)
        s3_client.download_file(bucket_name, object_key, str(destination))
        
def get_dataframe(base_preproc_input_dir, file_name_prefix ):    
    '''
    Load every CSV whose name starts with ``file_name_prefix`` from
    ``base_preproc_input_dir`` and return them concatenated as one DataFrame.

    Each file is read with its first column as the index.

    Raises:
        ValueError: if no matching CSV file exists.
    '''
    search_pattern = '{}/{}*.csv'.format(base_preproc_input_dir, file_name_prefix)
    input_files = glob(search_pattern)
    logger.info(f"input_files: \n {input_files}")    

    if not input_files:
        message_template = ('There are no files in {}.\n' +
                            'This usually indicates that the channel ({}) was incorrectly specified,\n' +
                            'the data specification in S3 was incorrectly specified or the role specified\n' +
                            'does not have permission to access the data.')
        raise ValueError(message_template.format(base_preproc_input_dir, "train"))

    frames = []
    for csv_path in input_files:
        frames.append(pd.read_csv(csv_path, index_col=0))
    df = pd.concat(frames)

    logger.info(f"dataframe shape \n {df.shape}")    
    logger.info(f"dataset sample \n {df.head(2)}")        

    return df

def get_secret():
    '''
    Fetch the "prod/sagemaker" secret from AWS Secrets Manager (ap-northeast-2).

    See https://docs.aws.amazon.com/secretsmanager/latest/apireference/API_GetSecretValue.html

    Returns:
        The ``SecretString`` value when the response contains one, otherwise
        the base64-decoded ``SecretBinary`` payload as bytes.

    Raises:
        Any exception raised by ``get_secret_value`` propagates unchanged
        (e.g. DecryptionFailureException, InternalServiceErrorException,
        InvalidParameterException, InvalidRequestException,
        ResourceNotFoundException, access-denied errors).
    '''
    secret_name = "prod/sagemaker"
    region_name = "ap-northeast-2"

    # Create a Secrets Manager client
    session = boto3.session.Session()
    client = session.client(
        service_name='secretsmanager',
        region_name=region_name
    )

    # No try/except on purpose: every failure must reach the caller. The old
    # handler re-raised only five error codes and silently returned None for
    # all others, which later surfaced as a confusing json.loads(None) error.
    get_secret_value_response = client.get_secret_value(SecretId=secret_name)

    if 'SecretString' in get_secret_value_response:
        return get_secret_value_response['SecretString']
    return base64.b64decode(get_secret_value_response['SecretBinary'])
        
def fill_missing_dates(df_in : pd.DataFrame,
                       freq : str
                      ) -> pd.DataFrame : 
    '''
    Re-index a frame onto a gap-free date range.

    Args:
        df_in: frame with a "ds" column holding dates (datetime values, date
            strings, or int64 values such as 20230102). It is not modified.
        freq: pandas frequency string for the generated range (e.g. 'B', 'D').

    Returns:
        New frame whose "ds" column covers every ``freq`` step between the
        minimum and maximum input date; rows for dates that were missing hold
        NaN in all other columns.
    '''
    df = df_in.copy()
    # Assign the column directly instead of ``df.loc[:, "ds"] = ...``: the
    # .loc form writes in place on recent pandas and can keep the old dtype.
    if df["ds"].dtype == np.int64:
        df["ds"] = df["ds"].astype(str)
    df["ds"] = pd.to_datetime(df["ds"])
    full_range = pd.date_range(start = df["ds"].min(),
                               end = df["ds"].max(),
                               freq = freq)
    df = df.set_index("ds").reindex(full_range).rename_axis("ds").reset_index()
    return df

def fill_missing_price_value(df: pd.DataFrame, col: str, limit_linear : int = 20 ) -> pd.DataFrame :
    '''
    Fill gaps in one price column by linear interpolation, in place.

    The column is cast to float, interpolated linearly in both directions
    (at most ``limit_linear`` consecutive gaps are filled) and negative values
    are replaced by 0. The same ``df`` object is modified and returned.
    '''
    interpolated = df.loc[:, col].astype(float).interpolate(
        method="linear",
        limit=limit_linear,
        limit_direction="both",
    )
    non_negative = []
    for price in interpolated:
        # NaN compares False and is therefore kept unchanged.
        non_negative.append(0 if price < 0 else price)
    df[col] = non_negative
    return df

def scaling_value(df : pd.DataFrame,
                  col_name : str,
                  ric,
                  s3_resource,
                  BUCKET_NAME_USECASE,
                  S3_PATH_GOLDEN) -> np.ndarray:
    '''
    Min-max scale one column and store the fitted scaler in S3.

    Args:
        df: frame containing ``col_name``.
        col_name: column to scale.
        ric: item identifier, used in the scaler's object key.
        s3_resource: object exposing ``put_object(Body=, Bucket=, Key=)``;
            the main script passes its S3 client here.
        BUCKET_NAME_USECASE: bucket receiving the scaler file.
        S3_PATH_GOLDEN: key prefix; the scaler is stored under
            ``<prefix>/<YYYY/MM/DD of KST>/scaler-files/<ric>_<col_name>_scaler.pkl``.

    Returns:
        The scaled values as a 2-D array with a single column (the previous
        ``tuple`` annotation was wrong).
    '''
    # MinMaxScaler expects 2-D input, hence the reshape to one column.
    series = df[col_name].values.reshape(-1,1)
    scaler = MinMaxScaler()
    series = scaler.fit_transform(series)
    # Serialise the fitted scaler through a temporary file and upload its bytes.
    with tempfile.TemporaryFile() as fp:
        joblib.dump(scaler, fp)
        fp.seek(0)
        s3_resource.put_object(Body = fp.read(),
                               Bucket = BUCKET_NAME_USECASE,
                               Key = f"{S3_PATH_GOLDEN}/{KST.strftime('%Y/%m/%d')}/scaler-files/{ric}_{col_name}_scaler.pkl")
    return series

def convert_type(raw, cols, type_target):
    '''
    Return a copy of ``raw`` in which every column listed in ``cols`` has been
    cast to ``type_target``. The input frame is left untouched.
    '''
    dtype_by_column = {column: type_target for column in cols}
    return raw.astype(dtype_by_column)

if __name__=='__main__':
    ################################
    ###### Parse CLI arguments #####
    ################################
    
    parser = argparse.ArgumentParser()
    parser.add_argument('--base_output_dir', type=str, default="/opt/ml/processing/output")
    parser.add_argument('--base_preproc_input_dir', type=str, default="/opt/ml/processing/input")   
    parser.add_argument('--split_date', type=str, default=KST.strftime('%Y-%m-%d'))
    parser.add_argument('--num_fold', type=str, default='5')       

    parser.add_argument('--label_column', type=str, default="ric") 
    parser.add_argument("--scaler_switch", type = str, default = 1, help = '1이면 Scaling ON, 0이면 Scaling OFF')
        
    # parse arguments
    args = parser.parse_args()     

    logger.info("######### Argument Info ####################################")
    logger.info(f"args.base_output_dir: {args.base_output_dir}")
    logger.info(f"args.base_preproc_input_dir: {args.base_preproc_input_dir}")    
    logger.info(f"args.label_column: {args.label_column}")        
    logger.info(f"args.split_date: {args.split_date}")   
    logger.info(f"args.scaler_switch: {args.scaler_switch}")
    logger.info(f"args.num_fold: {args.num_fold}")
    
    base_output_dir = args.base_output_dir
    base_preproc_input_dir = args.base_preproc_input_dir
    label_column = args.label_column
    split_date = args.split_date
    num_fold = int(args.num_fold)
    scaler_switch = int(args.scaler_switch)
    
    ############################################
    ###### Load settings from Secrets Manager ##
    ############################################
    logger.info(f"\n### Loading the key value using Secret Manager")

    keychain = json.loads(get_secret())
    ACCESS_KEY_ID = keychain['ACCESS_KEY_ID_ent']
    ACCESS_SECRET_KEY = keychain['ACCESS_SECRET_KEY_ent']

    BUCKET_NAME_USECASE = keychain['BUCKET_NAME_USECASE_ent']
    S3_PATH_STAGE = keychain['S3_PATH_STAGE']
    S3_PATH_GOLDEN = keychain['S3_PATH_GOLDEN']
    S3_PATH_TRAIN = keychain['S3_PATH_TRAIN']
    S3_PATH_log = keychain['S3_PATH_LOG']

    boto_session = boto3.Session(ACCESS_KEY_ID, ACCESS_SECRET_KEY)
    region = boto_session.region_name
    s3_resource = boto_session.resource('s3')
    s3_client = boto_session.client('s3')
    ############################################
    ###### 1. Data integration  ################
    ############################################
    total_start = time.time()
    start = time.time()
    stage_dir = f'{base_output_dir}/stage'
    train_dir = f'{base_output_dir}/train'
    test_dir = f'{base_output_dir}/test'
    # to_csv() does not create directories, so make sure every target exists.
    for out_dir in (stage_dir, train_dir, test_dir):
        os.makedirs(out_dir, exist_ok=True)
    logger.info(f"\n### Data Integration")
    path_list = []

    # Collect every CSV file below the input directory.
    for (path, dir, files) in os.walk(base_preproc_input_dir):
        for filename in files:
            ext = os.path.splitext(filename)[-1]
            if ext == '.csv':
                path_list.append("%s/%s" % (path, filename))
                
    logger.info(f"The number for data : {len(path_list)}")
    if not path_list:
        raise ValueError(f"No .csv files found under {base_preproc_input_dir}")
    # Read everything first and concatenate once instead of growing a frame per file.
    df_sum = pd.concat([pd.read_csv(file, encoding='utf-8') for file in path_list])
    df_sum = df_sum.sort_values(by='Date').reset_index(drop=True)
    df_sum.to_csv(f"{stage_dir}/stage_integrated.csv", index = False)
    end = time.time()
    
    logger.info(f"Data Integration is done")
    logger.info(f"Runtime : {end - start:.1f} sec({((end - start)/60):.1f} min)")
    logger.info(f"The number for data : {len(path_list)}")
    logger.info(f"Integrated data sample: head(2) \n {df_sum.head(2)}")
    logger.info(f"Integrated data sample: tail(2) \n {df_sum.tail(2)}")
    
    ##########################################################
    ####   2. First preprocessing step                    ####
    ####   item selection, column selection, type casts   ####
    ##########################################################
    start = time.time()
    logger.info(f"\n ### RIC Item selection")    
    df_sum = df_sum[df_sum['RIC'].isin(ric_list)].reset_index(drop=True)
    logger.info(f"The number for data after RIC Item selection : {df_sum.shape}")

    logger.info(f"\n ### Column selection")    
    # .copy() so the conversions below work on an independent frame.
    df_sum = df_sum[['Date','HIGH', 'LOW', 'OPEN', 'CLOSE','RIC']].copy()
    logger.info(f"The number for data after Column selection : {df_sum.shape}")
    logger.info(f"\n ### type conversion")    
    # Direct column assignment / astype: ``df.loc[:, col] = ...`` writes in
    # place on recent pandas and may keep the previous dtype.
    df_sum["Date"] = pd.to_datetime(df_sum["Date"])
    df_sum = df_sum.astype({price_col: np.float32 for price_col in ('HIGH', 'LOW', 'OPEN', 'CLOSE')})
    
    ####################################################
    ####   3. Build one frame per item (RIC)        ####
    ####################################################
    logger.info(f"\n ### Autogluon timeseriesdataframe Conversion")        
    df_list = OrderedDict()
    for name in ric_list:
        df_tmp = df_sum[df_sum['RIC'] == name]
        df_tmp = df_tmp.drop('RIC', axis=1)
        df_list[name] = df_tmp[df_tmp['Date'] >= '2014-07-02'].reset_index(drop = True)
        
    ####################################################
    ####   4. Rename columns, fill missing values   ####
    ####################################################
    logger.info(f"\n ### Rename columns")        
    col_names = ['ds','high','low','open','y']
    for name in df_list:
        df_list[name].columns = col_names

    logger.info(f"\n ### Fill missing value (Date)")        
    for name, value in df_list.items():
        df_list[name]  = fill_missing_dates(value, 'B')
    
    logger.info(f"\n ### Fill missing value (Price)")        
    for name, value in df_list.items():
        # fill_missing_price_value updates the frame in place and returns it.
        for col in ['y', 'high', 'low', 'open']:
            value = fill_missing_price_value(value, col)
        df_list[name] = value
        
    ####################################################
    #################   5. Scaling  ####################
    ####################################################
    if int(scaler_switch) == 1:
        logger.info(f"\n ### Scaling")            
        scale_dir = f"{base_output_dir}/scaler-files"
        os.makedirs(scale_dir, exist_ok=True)
        for name, value in df_list.items():
            for col in ['y','high','open','low']:
                # scaling_value returns a single-column 2-D array; flatten it for the column.
                value[col] = np.ravel(scaling_value(value, col, name, s3_client, BUCKET_NAME_USECASE, S3_PATH_GOLDEN))
            df_list[name] = value
    else:
        logger.info(f"\n ### No Scaling")
    end = time.time()
    logger.info(f"\n### All Date Transform is done")
    print(f"All Date Transform Run time : {end - start:.1f} sec({((end - start)/60):.1f} min)")

    #################################################
    #####   6. Split into train / test and save  ####
    #################################################
    logger.info(f"\n ### Split train, test dataset")            
    df_golden = pd.concat(
        [value.assign(ric = name) for name, value in df_list.items()]
    ).reset_index(drop = True)
    
    # Base split at the requested date.
    # NOTE(review): the base split is written under the "fold1" names and is
    # then overwritten by the first loop iteration below whenever num_fold >= 1.
    # File names are kept as they were; confirm whether "fold0" was intended.
    df_train = df_golden[df_golden['ds'] < split_date]
    df_train.to_csv(f"{train_dir}/train_fold1.csv", index = False)
    df_test = df_golden[df_golden['ds'] >= split_date]
    df_test.to_csv(f"{test_dir}/test_fold1.csv", index = False)

    # Each fold moves the split date back by 30 days and splits the previous
    # fold's training data again (plain variables replace the former exec() calls).
    previous_train = df_train
    for cnt in range(num_fold):
        split_date = (dt.strptime(split_date, '%Y-%m-%d') - relativedelta(days=30)).strftime('%Y-%m-%d')
    
        logger.info(f"df_train_fold{cnt+1} = df_train_fold{cnt}[df_train_fold{cnt}['ds'] < {split_date}]")
        fold_train = previous_train[previous_train['ds'] < split_date]
        fold_train.to_csv(f"{train_dir}/train_fold{cnt+1}.csv", index = False)

        logger.info(f"df_test_fold{cnt+1} = df_train_fold{cnt}[df_train_fold{cnt}['ds'] >= {split_date}]")
        fold_test = previous_train[previous_train['ds'] >= split_date]
        fold_test.to_csv(f"{test_dir}/test_fold{cnt+1}.csv", index = False)

        previous_train = fold_train
    
    # Summary of the base split. df_train / df_test were never defined before,
    # so this section used to end the job with a NameError.
    logger.info(f"\n ### Final result for train dataset ")
    logger.info(f"\n ####preprocessed train shape \n {df_train.shape}")        
    logger.info(f"preprocessed train sample: head(2) \n {df_train.head(2)}")
    logger.info(f"preprocessed train sample: tail(2) \n {df_train.tail(2)}")
    
    logger.info(f"\n ####preprocessed test shape \n {df_test.shape}")            
    logger.info(f"preprocessed test sample: head(2) \n {df_test.head(2)}")
    logger.info(f"preprocessed test sample: tail(2) \n {df_test.tail(2)}")

    logger.info(f"\n### End All of data preprocessing")
    total_end = time.time()
    print(f"Run time 시간 : {total_end - total_start:.1f} sec({((total_end - total_start)/60):.1f} min)\n")
    

Overwriting src/v1.0/preprocessing.py


### 0-2. train 

In [2]:
%%writefile src/v1.1/train.py

import os
import sys
import pickle

import argparse
import pandas as pd
import json

from autogluon.timeseries import TimeSeriesPredictor, TimeSeriesDataFrame
import joblib # from sklearn.externals import joblib

import logging
import logging.handlers

from dateutil.relativedelta import *
from datetime import datetime as dt

# dt.today() is a naive local timestamp; adding 9 hours yields Korea Standard
# Time only if the host clock runs on UTC -- NOTE(review): confirm.
KST = dt.today() + relativedelta(hours=9)

###############################
######### util 함수 설정 ##########
###############################
def _get_logger():
    '''
    로깅을 위해 파이썬 로거를 사용
    # https://stackoverflow.com/questions/17745914/python-logging-module-is-printing-lines-multiple-times
    '''
    loglevel = logging.DEBUG
    l = logging.getLogger(__name__)
    if not l.hasHandlers():
        l.setLevel(loglevel)
        logging.getLogger().addHandler(logging.StreamHandler(sys.stdout))        
        l.handler_set = True
    return l  
logger = _get_logger()


if __name__ == "__main__":
    ##############################################
    ## Command-line arguments / hyperparameters ##
    ##############################################

    logger.info(f"### start training code")    
    parser = argparse.ArgumentParser()
    # Directory defaults come from the SM_* environment variables.
    parser.add_argument('--output_dir', type = str, default = os.environ.get('SM_OUTPUT_DIR'))
    parser.add_argument('--output_data_dir', type = str, default = os.environ.get('SM_OUTPUT_DATA_DIR'))
    parser.add_argument('--model_dir', type = str, default = os.environ.get('SM_MODEL_DIR'))
    parser.add_argument('--train_dir', type = str, default = os.environ.get('SM_CHANNEL_TRAIN'))
    parser.add_argument('--test_dir', type = str, default = os.environ.get('SM_CHANNEL_TEST'))
    parser.add_argument('--item', type = str, default = 'FCPOc3')
    parser.add_argument('--target', type = str, default = 'y')
    parser.add_argument('--metric', type = str, default = 'MAPE')    
    parser.add_argument('--quality', type = str, default = 'low_quality')
    args = parser.parse_args()     

    logger.info("### Argument Info ###")
    logger.info(f"args.output_dir: {args.output_dir}")
    logger.info(f"args.output_data_dir: {args.output_data_dir}")    
    logger.info(f"args.model_dir: {args.model_dir}")        
    logger.info(f"args.train_dir: {args.train_dir}")   
    logger.info(f"args.test_dir: {args.test_dir}")   
    logger.info(f"args.item: {args.item}")   
    logger.info(f"args.target: {args.target}")    
    logger.info(f"args.metric: {args.metric}")   
    logger.info(f"args.quality: {args.quality}")   
    
    output_dir = args.output_dir
    output_data_dir = args.output_data_dir
    model_dir = args.model_dir
    train_dir = args.train_dir
    test_dir = args.test_dir
    item = args.item
    target = args.target
    metric = args.metric
    quality = args.quality
    
    logger.info("### Reading input data")
    df_train= pd.read_csv(os.path.join(train_dir, 'train.csv'))
    df_test = pd.read_csv(os.path.join(test_dir, 'test.csv'))        
    
    logger.info("### Convert TimeSeriesDataFrame")
    # Direct column assignment so the column really becomes datetime on every pandas version.
    df_train["ds"] = pd.to_datetime(df_train["ds"])
    df_test["ds"] = pd.to_datetime(df_test["ds"])
    tdf_train = TimeSeriesDataFrame.from_data_frame(
        df_train,
        id_column="ric",
        timestamp_column="ds",
    )
    tdf_test = TimeSeriesDataFrame.from_data_frame(
        df_test,
        id_column="ric",
        timestamp_column="ds",
    )

    # Target series of the selected item in both data sets.
    train_target = tdf_train.loc[item][target]
    test_target = tdf_test.loc[item][target]

    logger.info("### Show the range of date for training and test")    
    # logging formats with %-style placeholders; the old calls passed extra
    # positional arguments without placeholders, so the messages were lost
    # to a formatting error.
    logger.info('Item:\t%s', item)
    logger.info('Target:\t%s', target)
    logger.info('Train:\t%s ~ %s', train_target.index.min(), train_target.index.max())
    logger.info('Test:\t%s ~ %s', test_target.index.min(), test_target.index.max())
    logger.info('The number of test data: %s', len(test_target))
    
    logger.info("### Training AutoGluon Model")    
    # The forecast horizon equals the number of test rows of the selected item.
    predictor = TimeSeriesPredictor(
        path = model_dir,
        target = target,
        prediction_length = len(test_target),
        eval_metric = metric,
    )
    predictor.fit(
        train_data = tdf_train,
        presets = quality
    )    
    logger.info("Saving model to {}".format(model_dir))
    
    # Ideally a separate validation set would be passed in for the leaderboard
    # and prediction. In this cycle the test data itself is used instead.
    predictor_leaderboard = predictor.leaderboard(tdf_test, silent = True)
    predictor_leaderboard.to_csv(os.path.join(output_data_dir,'leaderboard.csv'), index = False)

Overwriting src/v1.0/train.py


### 0-3. validation

In [3]:
%%writefile src/v1.1/model_validation.py

import glob
import os
import pandas as pd
import time
from datetime import datetime as dt
import argparse
import json
import boto3
from io import StringIO, BytesIO
import joblib
import sys
import subprocess
import logging
import logging.handlers

import tarfile


###############################
######### util 함수 설정 ##########
###############################
def _get_logger():
    """Return this module's logger, configuring it the first time it is needed.

    When no handler is reachable from the logger yet, its level is set to
    DEBUG and a stdout ``StreamHandler`` is attached to the root logger.
    If a handler already exists, the logger is returned untouched.
    """
    module_logger = logging.getLogger(__name__)
    if module_logger.hasHandlers():
        # A handler is already reachable; adding another one would emit
        # every record twice.
        return module_logger

    module_logger.setLevel(logging.DEBUG)
    root_logger = logging.getLogger()
    root_logger.addHandler(logging.StreamHandler(sys.stdout))
    module_logger.handler_set = True
    return module_logger


logger = _get_logger()

def get_secret(secret_name="prod/sagemaker", region_name="ap-northeast-2"):
    """Fetch a secret value from AWS Secrets Manager.

    Args:
        secret_name: Id of the secret to read.
        region_name: AWS region of the Secrets Manager endpoint.

    Returns:
        The ``SecretString`` (str) when the response contains one, otherwise
        the base64-decoded ``SecretBinary`` (bytes).

    Raises:
        botocore.exceptions.ClientError: for any failure reported by
            Secrets Manager. Every error code propagates; previously only a
            fixed list of codes was re-raised and any other code made the
            function silently return ``None``.
    """
    # Imported here because this script's module-level imports do not
    # include base64.
    import base64

    client = boto3.session.Session().client(
        service_name='secretsmanager',
        region_name=region_name,
    )
    # No try/except: every ClientError is meant to reach the caller.
    get_secret_value_response = client.get_secret_value(SecretId=secret_name)

    if 'SecretString' in get_secret_value_response:
        return get_secret_value_response['SecretString']
    return base64.b64decode(get_secret_value_response['SecretBinary'])

def convert_series_to_description(leaderboard: pd.DataFrame):
    """Build a comma-separated description from the first leaderboard row.

    Args:
        leaderboard: Leaderboard table with at least the columns ``model``,
            ``score_test`` and ``score_val``. The row in first *position* is
            described, so the caller's sort order decides which model that is.

    Returns:
        A string of the form ``model,<name>,score_test,<v>,score_val,<v>``.
    """
    # Positional access: after sort_values() the index labels are shuffled,
    # so label 0 is not necessarily the top row.
    top_row = leaderboard.iloc[0][['model', 'score_test', 'score_val']]
    return ','.join(top_row.to_string().split())

def get_bucket_key_from_uri(uri):
    """Split a URI such as ``s3://bucket/path/to/key`` into bucket and key.

    Args:
        uri: URI containing ``//`` followed by ``<bucket>/<key>``.

    Returns:
        Tuple ``(bucket, key)``; ``key`` is ``''`` when the URI has no path.

    Raises:
        ValueError: if ``uri`` contains no ``//`` separator.
    """
    # Split only on the first '//' so keys that contain '//' stay intact.
    _, scheme_sep, location = uri.partition('//')
    if not scheme_sep:
        raise ValueError(f"Expected a URI like 's3://bucket/key', got: {uri!r}")
    uri_bucket, _, uri_file_path = location.partition('/')
    return uri_bucket, uri_file_path

if __name__ == '__main__':
    # Entry point of the model-validation step: download the training output
    # archive, read the leaderboard, and register the trained model in the
    # model package group as pending approval or rejected.

    ################################
    ###### Parse CLI arguments #####
    ################################
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('--base_input_path', type=str, default="/opt/ml/processing/input")
    arg_parser.add_argument('--s3_model_uri', type=str, default="/opt/ml/processing/model")
    arg_parser.add_argument('--model_package_group_name', type=str, default='palm-oil-price-forecast')
    args = arg_parser.parse_args()

    base_input_path = args.base_input_path
    s3_model_uri = args.s3_model_uri
    model_package_group_name = args.model_package_group_name

    logger.info("######### Argument Info ####################################")
    logger.info(f"args.base_input_path: {base_input_path}")
    logger.info(f"args.s3_model_uri: {s3_model_uri}")
    logger.info(f"args.model_package_group_name: {model_package_group_name}")

    ############################################
    ###### Load settings from Secrets Manager ##
    ############################################
    logger.info("\n### Loading Key value from Secret Manager")

    keychain = json.loads(get_secret())
    ACCESS_KEY_ID = keychain['ACCESS_KEY_ID_ent']
    ACCESS_SECRET_KEY = keychain['ACCESS_SECRET_KEY_ent']

    BUCKET_NAME_USECASE = keychain['BUCKET_NAME_USECASE_ent']
    S3_PATH_STAGE = keychain['S3_PATH_STAGE']
    S3_PATH_GOLDEN = keychain['S3_PATH_GOLDEN']
    S3_PATH_TRAIN = keychain['S3_PATH_TRAIN']
    S3_PATH_log = keychain['S3_PATH_LOG']

    boto3_session = boto3.Session(
        aws_access_key_id=ACCESS_KEY_ID,
        aws_secret_access_key=ACCESS_SECRET_KEY,
    )
    region = boto3_session.region_name

    s3_resource = boto3_session.resource('s3')
    s3_client = boto3_session.client('s3')
    sm_client = boto3.client(
        'sagemaker',
        aws_access_key_id=ACCESS_KEY_ID,
        aws_secret_access_key=ACCESS_SECRET_KEY,
        region_name='ap-northeast-2',
    )

    ############################################
    ##### Fetch the model / leaderboard files ##
    ############################################
    logger.info("\n### Loading Model, Leaderboard zip files ")
    logger.info("\n#### Extract output.tar.gz and Read a Leaderboard ")
    # Added 2022-11-29: the training step exposes the URI of model.tar.gz but
    # not of output.tar.gz, so the latter is derived by swapping the file
    # name in the model URI.
    leaderboard_uri = s3_model_uri.replace('model.tar.gz', 'output.tar.gz')
    logger.info(f"\n#### output.tar.gz uri : {leaderboard_uri}")
    output_bucket, output_key = get_bucket_key_from_uri(leaderboard_uri)
    output_obj = s3_client.get_object(Bucket=output_bucket, Key=output_key)

    logger.info("\n######### Model zip file extraction ####################################")
    # Stream the gzip archive straight from the S3 response body.
    with tarfile.open(fileobj=output_obj['Body'], mode='r|gz') as archive:
        archive.extractall(base_input_path)
    logger.info(f"file list in {base_input_path}: {os.listdir(base_input_path)}")

    leaderboard_csv = f'{base_input_path}/leaderboard.csv'
    leaderboard = pd.read_csv(leaderboard_csv).sort_values(
        by=['score_val', 'score_test'],
        ascending=False,
    )
    logger.info(f"leaderboard train sample: head(5) \n {leaderboard.head()}")
    logger.info("\n#### Set  ")

    inference_container = {
        "Image": '763104351884.dkr.ecr.ap-northeast-2.amazonaws.com/autogluon-inference:0.4-cpu-py38',
        "ModelDataUrl": s3_model_uri,
    }
    modelpackage_inference_specification = {
        "InferenceSpecification": {
            "Containers": [inference_container],
            "SupportedContentTypes": ["text/csv"],
            "SupportedResponseMIMETypes": ["text/csv"],
        }
    }

    # A model package is registered in both cases; only the approval status
    # and the log messages depend on whether any model clears the threshold.
    passed_filter = bool((leaderboard['score_val'] > -0.13).any())
    if passed_filter:
        filter_log = "\n#### Pass the first performance filtering"
        approval_status = "PendingManualApproval"
        arn_log_template = '### Passed ModelPackage Version ARN : {}'
    else:
        filter_log = "\n#### None of them passed the filtering"
        approval_status = "Rejected"
        arn_log_template = '### Rejected ModelPackage Version ARN : {}'

    logger.info(filter_log)
    create_model_package_input_dict = {
        "ModelPackageGroupName": model_package_group_name,
        "ModelPackageDescription": convert_series_to_description(leaderboard),
        "ModelApprovalStatus": approval_status,
        **modelpackage_inference_specification,
    }
    create_model_package_response = sm_client.create_model_package(**create_model_package_input_dict)
    model_package_arn = create_model_package_response["ModelPackageArn"]
    logger.info(arn_log_template.format(model_package_arn))

Overwriting src/v1.0/model_validation.py


### 0-4. prediction

In [4]:
%%writefile src/v1.1/prediction.py

import argparse
import os
import requests
import tempfile
import subprocess, sys
import json

import glob
import pandas as pd
import joblib
import pickle
import tarfile
from io import StringIO, BytesIO

import logging
import logging.handlers

import time
from datetime import datetime as dt

import boto3


###############################
######### util 함수 설정 ##########
###############################
def _get_logger():
    """Return this module's logger, configuring it the first time it is needed.

    When no handler is reachable from the logger yet, its level is set to
    DEBUG and a stdout ``StreamHandler`` is attached to the root logger.
    If a handler already exists, the logger is returned untouched.

    Background on duplicated log lines:
    https://stackoverflow.com/questions/17745914/python-logging-module-is-printing-lines-multiple-times
    """
    module_logger = logging.getLogger(__name__)
    if module_logger.hasHandlers():
        # A handler is already reachable; adding another one would emit
        # every record twice.
        return module_logger

    module_logger.setLevel(logging.DEBUG)
    root_logger = logging.getLogger()
    root_logger.addHandler(logging.StreamHandler(sys.stdout))
    module_logger.handler_set = True
    return module_logger


logger = _get_logger()


def get_bucket_key_from_uri(uri):
    """Split a URI such as ``s3://bucket/path/to/key`` into bucket and key.

    Args:
        uri: URI containing ``//`` followed by ``<bucket>/<key>``.

    Returns:
        Tuple ``(bucket, key)``; ``key`` is ``''`` when the URI has no path.

    Raises:
        ValueError: if ``uri`` contains no ``//`` separator.
    """
    # Split only on the first '//' so keys that contain '//' stay intact.
    _, scheme_sep, location = uri.partition('//')
    if not scheme_sep:
        raise ValueError(f"Expected a URI like 's3://bucket/key', got: {uri!r}")
    uri_bucket, _, uri_file_path = location.partition('/')
    return uri_bucket, uri_file_path

def get_secret(secret_name="prod/sagemaker", region_name="ap-northeast-2"):
    """Fetch a secret value from AWS Secrets Manager.

    See https://docs.aws.amazon.com/secretsmanager/latest/apireference/API_GetSecretValue.html

    Args:
        secret_name: Id of the secret to read.
        region_name: AWS region of the Secrets Manager endpoint.

    Returns:
        The ``SecretString`` (str) when the response contains one, otherwise
        the base64-decoded ``SecretBinary`` (bytes).

    Raises:
        botocore.exceptions.ClientError: for any failure reported by
            Secrets Manager. Every error code propagates; previously only a
            fixed list of codes was re-raised and any other code made the
            function silently return ``None``.
    """
    # Imported here because this script's module-level imports do not
    # include base64.
    import base64

    # Create a Secrets Manager client
    client = boto3.session.Session().client(
        service_name='secretsmanager',
        region_name=region_name,
    )
    # No try/except: every ClientError is meant to reach the caller.
    get_secret_value_response = client.get_secret_value(SecretId=secret_name)

    if 'SecretString' in get_secret_value_response:
        return get_secret_value_response['SecretString']
    return base64.b64decode(get_secret_value_response['SecretBinary'])
        
if __name__ == '__main__':
    # Entry point of the prediction step: look up the approved model in the
    # model registry, download and unpack it, forecast from the training
    # series, and write the forecast joined with the test-set actuals.

    # Install the pinned AutoGluon release before importing it.
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'autogluon==0.6.0'])
    from autogluon.timeseries import TimeSeriesDataFrame, TimeSeriesPredictor

    ############################################
    ###### Load settings from Secrets Manager ##
    ############################################
    logger.info("\n### Loading Key value from Secret Manager")

    keychain = json.loads(get_secret())
    ACCESS_KEY_ID = keychain['ACCESS_KEY_ID_ent']
    ACCESS_SECRET_KEY = keychain['ACCESS_SECRET_KEY_ent']

    BUCKET_NAME_USECASE = keychain['BUCKET_NAME_USECASE_ent']
    S3_PATH_STAGE = keychain['S3_PATH_STAGE']
    S3_PATH_GOLDEN = keychain['S3_PATH_GOLDEN']
    S3_PATH_TRAIN = keychain['S3_PATH_TRAIN']
    S3_PATH_log = keychain['S3_PATH_LOG']
    boto3_session = boto3.Session(ACCESS_KEY_ID, ACCESS_SECRET_KEY)

    region = boto3_session.region_name

    s3_resource = boto3_session.resource('s3')
    s3_client = boto3_session.client('s3')
    sm_client = boto3.client('sagemaker',
                             aws_access_key_id=ACCESS_KEY_ID,
                             aws_secret_access_key=ACCESS_SECRET_KEY,
                             region_name='ap-northeast-2')

    ################################
    ###### Parse CLI arguments #####
    ################################
    parser = argparse.ArgumentParser()
    parser.add_argument('--base_input_dir', type=str, default="/opt/ml/processing/input", help='train,testset 불러오는곳')
    parser.add_argument('--output_dir', type=str, default="/opt/ml/processing/output", help='예측 결과값이 저장되는 곳, test dataset과 prediction 결과가 merge되서 저장된다.')
    parser.add_argument('--model_package_group_name', type=str, default='palm-oil-price-forecast')
    args = parser.parse_args()
    logger.info("\n######### Argument Info ####################################")
    logger.info(f"args.base_input_dir: {args.base_input_dir}")
    logger.info(f"args.output_dir: {args.output_dir}")
    logger.info(f"args.model_package_group_name: {args.model_package_group_name}")

    base_input_dir = args.base_input_dir
    output_dir = args.output_dir
    model_package_group_name = args.model_package_group_name
    model_dir = '/opt/ml/model'

    ##########################################################
    ###### Find the approved model URI and top model name ####
    ##########################################################
    logger.info("\n######### Finding suitable model uri ####################################")
    logger.info(f"Model Group name: {model_package_group_name}")
    # Filter and sort on the server so an approved package is not missed
    # because it falls outside the first page of mixed-status results.
    model_registry_list = sm_client.list_model_packages(
        ModelPackageGroupName=model_package_group_name,
        ModelApprovalStatus='Approved',
        SortBy='CreationTime',
        SortOrder='Descending',
    )['ModelPackageSummaryList']
    mr_arn = next(
        (
            package['ModelPackageArn']
            for package in model_registry_list
            if package['ModelPackageGroupName'] == model_package_group_name
            and package['ModelApprovalStatus'] == 'Approved'
        ),
        None,
    )
    if mr_arn is None:
        # Previously this case surfaced later as a NameError on mr_arn.
        raise RuntimeError(
            f"No approved model package found in group {model_package_group_name!r}"
        )
    describe_model = sm_client.describe_model_package(ModelPackageName=mr_arn)
    s3_model_uri = describe_model['InferenceSpecification']['Containers'][0]['ModelDataUrl']
    # The description is comma-separated; the model name is its second field.
    top_model_name = describe_model['ModelPackageDescription'].split(',')[1]

    logger.info(f"Found suitable model uri: {s3_model_uri}")
    logger.info(f"And top model name: {top_model_name}")

    logger.info("\n#########Download suitable model file  ####################################")
    model_bucket, model_key = get_bucket_key_from_uri(s3_model_uri)
    model_obj = s3_client.get_object(Bucket=model_bucket, Key=model_key)

    ##########################################################
    ###### Unpack the model and build TimeSeriesDataFrames ###
    ##########################################################
    logger.info("\n######### Model zip file extraction ####################################")
    # Stream the gzip archive straight from the S3 response body.
    with tarfile.open(fileobj=model_obj['Body'], mode='r|gz') as file:
        file.extractall(model_dir)
    logger.info(f"list in /opt/ml/model: {os.listdir(model_dir)}")

    logger.info("\n######### Convert df_test dataframe into TimeSeriesDataFrame  ###########")
    df_train = pd.read_csv(os.path.join(base_input_dir, 'train', 'train.csv'))
    df_train.loc[:, "ds"] = pd.to_datetime(df_train.loc[:, "ds"])
    tdf_train = TimeSeriesDataFrame.from_data_frame(
        df_train,
        id_column="ric",
        timestamp_column="ds",
    )
    df_test = pd.read_csv(os.path.join(base_input_dir, 'test', 'test.csv'))
    df_test.loc[:, "ds"] = pd.to_datetime(df_test.loc[:, "ds"])
    tdf_test = TimeSeriesDataFrame.from_data_frame(
        df_test,
        id_column="ric",
        timestamp_column="ds",
    )
    # Label corrected: this line shows the tail of the training data.
    logger.info(f"df_train sample: tail(2) \n {tdf_train.tail(2)}")
    logger.info(f"df_test sample: head(2) \n {tdf_test.head(2)}")

    ################################
    ###### Run the prediction ######
    ################################
    logger.info("\n######### Start prediction  ###########")
    # NOTE(security): unpickling executes arbitrary code, so this is only safe
    # while the model artifact comes from a trusted registry/bucket.
    with open(f"{model_dir}/models/trainer.pkl", 'rb') as trainer_file:
        loaded_trainer = pickle.load(trainer_file)
    logger.info(f"loaded_trainer: {loaded_trainer}")
    prediction_ag_model = loaded_trainer.predict(data=tdf_train,
                                                 model=top_model_name)
    logger.info(f"prediction_ag_model sample: head(2) \n {prediction_ag_model.head(2)}")

    # Join the test-set actuals of item 'FCPOc3' with its forecast by timestamp.
    prediction_result = pd.merge(tdf_test.loc['FCPOc3']['y'], prediction_ag_model.loc['FCPOc3'],
                                 left_index=True, right_index=True, how='left')
    prediction_result.to_csv(f'{output_dir}/prediction_result.csv')

Overwriting src/v1.0/prediction.py


# 1. 환경설정


## 1.1 라이브러리 및 변수 로딩

In [1]:
import argparse
import os
import requests
import tempfile
import subprocess, sys

import pandas as pd
import numpy as np
from glob import glob
import copy
from collections import OrderedDict
from pathlib import Path
import joblib

import logging
import logging.handlers

import json
import base64
import boto3
import sagemaker
from botocore.client import Config
from botocore.exceptions import ClientError

import time
from datetime import datetime as dt
import datetime
from pytz import timezone
from dateutil.relativedelta import *

from sagemaker import get_execution_role
from sagemaker.mxnet import MXNet
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker import get_execution_role
from sagemaker.image_uris import retrieve
from sagemaker.processing import ScriptProcessor, ProcessingInput, ProcessingOutput
from sagemaker.workflow.steps import ProcessingStep, TrainingStep
from sagemaker.inputs import TrainingInput

In [2]:
# Current time in Korea (KST = UTC+9), kept as a naive datetime.
# It is derived from UTC explicitly; dt.today() returns host-local time, so
# adding nine hours to it was only correct on hosts whose clock is UTC.
KST = (dt.now(datetime.timezone.utc) + datetime.timedelta(hours=9)).replace(tzinfo=None)
print(f"Start job time: {KST}")

Start job time: 2022-12-20 21:00:44.486344


In [3]:
# Version of the pipeline scripts; it selects the src/v<version>/ directory
# the script paths are built from.
code_version = '1.0'
print("Code version:", code_version)

Code version: 1.0


In [4]:
def get_secret(secret_name="prod/sagemaker", region_name="ap-northeast-2"):
    """Fetch a secret value from AWS Secrets Manager.

    See https://docs.aws.amazon.com/secretsmanager/latest/apireference/API_GetSecretValue.html

    Args:
        secret_name: Id of the secret to read.
        region_name: AWS region of the Secrets Manager endpoint.

    Returns:
        The ``SecretString`` (str) when the response contains one, otherwise
        the base64-decoded ``SecretBinary`` (bytes).

    Raises:
        botocore.exceptions.ClientError: for any failure reported by
            Secrets Manager. Every error code propagates; previously only a
            fixed list of codes was re-raised and any other code made the
            function silently return ``None``.
    """
    # Create a Secrets Manager client
    client = boto3.session.Session().client(
        service_name='secretsmanager',
        region_name=region_name,
    )
    # No try/except: every ClientError is meant to reach the caller.
    get_secret_value_response = client.get_secret_value(SecretId=secret_name)

    if 'SecretString' in get_secret_value_response:
        return get_secret_value_response['SecretString']
    return base64.b64decode(get_secret_value_response['SecretBinary'])

# Project settings (credentials, bucket name, S3 prefixes) are stored as one
# JSON document in Secrets Manager.
keychain = json.loads(get_secret())

# Credentials
ACCESS_KEY_ID = keychain['ACCESS_KEY_ID_ent']
ACCESS_SECRET_KEY = keychain['ACCESS_SECRET_KEY_ent']

# Bucket name and S3 key prefixes
BUCKET_NAME_USECASE = keychain['BUCKET_NAME_USECASE_ent']
S3_PATH_STAGE = keychain['S3_PATH_STAGE']
S3_PATH_GOLDEN = keychain['S3_PATH_GOLDEN']
S3_PATH_TRAIN = keychain['S3_PATH_TRAIN']
S3_PATH_log = keychain['S3_PATH_LOG']
S3_PATH_FORECAST = keychain['S3_PATH_FORECAST']

# The same key pair backs both the boto3 session and the SageMaker client.
aws_credentials = {
    'aws_access_key_id': ACCESS_KEY_ID,
    'aws_secret_access_key': ACCESS_SECRET_KEY,
}

boto3_session = boto3.Session(**aws_credentials)
sm_session = sagemaker.Session(boto_session=boto3_session)
region = boto3_session.region_name

# S3 handles
s3_resource = boto3_session.resource('s3')
bucket = s3_resource.Bucket(BUCKET_NAME_USECASE)
s3_client = boto3_session.client('s3')

# SageMaker client, created directly with an explicit region
sm_client = boto3.client(
    'sagemaker',
    region_name='ap-northeast-2',
    **aws_credentials,
)

노트북에 저장된 변수를 확인

In [5]:
# S3 locations for training artifacts and forecasts, partitioned by the
# current KST date (yyyy/mm/dd).
date_partition = KST.strftime('%Y/%m/%d')
estimator_output_path = "s3://{}/{}/{}".format(BUCKET_NAME_USECASE, S3_PATH_TRAIN, date_partition)
prediction_output_path = "s3://{}/{}/{}".format(BUCKET_NAME_USECASE, S3_PATH_FORECAST, date_partition)
for label, path in (
    ("estimator_output_path: ", estimator_output_path),
    ("prediction_output_path: ", prediction_output_path),
):
    print(label, path)

estimator_output_path:  s3://palm-oil-price-forecast/trained-model/2022/12/20
prediction_output_path:  s3://palm-oil-price-forecast/forecasted-data/2022/12/20


In [9]:
# stage_data_uri = f"{base}/{keychain['S3_PATH_STAGE']}"
# train_data_uri = f"{preproc_data_dir}/train.csv"
# test_data_uri = f"{preproc_data_dir}/test.csv"

In [10]:
# %store stage_data_uri
# %store train_data_uri
# %store test_data_uri

In [11]:
# print(stage_data_uri)
# print(train_data_uri)
# print(test_data_uri)
# print(train_model_uri)
# print(leaderboard_uri)
# print(project_prefix)

In [12]:
# # 한국 시간
# KST = dt.today() + relativedelta(hours=9)
# print(f"Start job time: {KST}")
# # 프로젝트 변수
# project_prefix = bucket
# base = f"s3://{bucket}"

# # 전처리 결과 데이터 위치(Golden data path)
# preproc_data_dir = f"{base}/{keychain['S3_PATH_GOLDEN']}/{KST.strftime('%Y/%m/%d')}"

# # stage_data_uri= f"{preproc_data_dir}/stage.csv"
# stage_data_uri = f"{base}/{keychain['S3_PATH_STAGE']}"
# train_data_uri = f"{preproc_data_dir}/train.csv"
# test_data_uri = f"{preproc_data_dir}/test.csv"

In [13]:
%store

Stored variables and their in-db values:
bucket                       -> 'palm-oil-price-forecast'
preproc_data_dir             -> 's3://palm-oil-price-forecast/golden-data/2022/12/
project_prefix               -> 'palm-oil-price-forecast'
stage_data_uri               -> 's3://palm-oil-price-forecast/staged-data'
test_data_uri                -> 's3://palm-oil-price-forecast/golden-data/2022/12/
train_data_uri               -> 's3://palm-oil-price-forecast/golden-data/2022/12/


노트북에 저장된 변수를 로딩

In [14]:
# %store stage_data_uri)
# %store train_data_uri)
# %store test_data_uri)
# %store train_model_uri)
# %store leaderboard_uri)
# %store project_prefix)

In [15]:
# %store -r

# 2. 모델 빌딩 파이프라인의 스텝(Step) 생성

## 2.1 모델 빌딩 파이프라인 변수 생성

In [19]:
# Paths of the step scripts for the selected code version.
code_dir = 'src/v' + str(code_version)
preprocessing_code = code_dir + '/preprocessing.py'
training_code = code_dir + '/train.py'
model_validation_code = code_dir + '/model_validation.py'
model_prediction_code = code_dir + '/prediction.py'

# print(a, b) puts a single space between the label and the value.
print('code_version:', code_version)
for script_label, script_path in (
    ('preprocessing_code:', preprocessing_code),
    ('training_code:', training_code),
    ('model_validation_code:', model_validation_code),
    ('model_prediction_code:', model_prediction_code),
):
    print(script_label, script_path)

code_version: 1.0
preprocessing_code: src/v1.0/preprocessing.py
training_code: src/v1.0/train.py
model_validation_code: src/v1.0/model_validation.py
model_prediction_code: src/v1.0/prediction.py


In [16]:
from sagemaker.workflow.parameters import (
    ParameterInteger,
    ParameterString,
    ParameterFloat,
)
if code_version == '1.0':
    ###################
    # 0) Variables  ###
    ###################
    project_prefix = BUCKET_NAME_USECASE  # project name, used as prefix for step/job names
    base = f"s3://{BUCKET_NAME_USECASE}"
    # Korean time (UTC+9) as a naive datetime. It is derived from UTC
    # explicitly; dt.today() is host-local time, so adding nine hours to it
    # was only correct on hosts whose clock is UTC.
    KST = (dt.now(datetime.timezone.utc) + datetime.timedelta(hours=9)).replace(tzinfo=None)

    stage_data_dir = f"{base}/{keychain['S3_PATH_STAGE']}"  # staging data path
    preproc_data_dir = f"{base}/{keychain['S3_PATH_GOLDEN']}/{KST.strftime('%Y/%m/%d')}"  # golden data path
    train_data_uri = f"{preproc_data_dir}/train.csv"
    test_data_uri = f"{preproc_data_dir}/test.csv"
    train_data_dir = f"{base}/{keychain['S3_PATH_TRAIN']}/{KST.strftime('%Y/%m/%d')}"
    # Fixed: the two file names were swapped. The model validation script
    # derives the leaderboard archive by replacing 'model.tar.gz' with
    # 'output.tar.gz', i.e. the model is in model.tar.gz and the leaderboard
    # in output.tar.gz.
    # NOTE(review): the original comment says the model path should be taken
    # from the previous step; these static defaults are presumably placeholders.
    train_model_uri = f"{train_data_dir}/output/model.tar.gz"
    leaderboard_uri = f"{train_data_dir}/output/output.tar.gz"

    print(f"Start job time: {KST}")
    print(f"code verison: {code_version}")

    ###################
    ## 1) Pipeline parameters for data preprocessing
    ###################
    processing_instance_type = ParameterString(
        name="ProcessingInstanceType",
        default_value="ml.m5.xlarge"
    )
    processing_instance_count = ParameterInteger(
        name="ProcessingInstanceCount",
        default_value=1
    )
    input_stage_data = ParameterString(
        name="InputStageData",
        default_value=stage_data_dir,
    )
    ###################
    ## 2) Pipeline parameters for model training
    ###################
    train_instance_type = ParameterString(
        name="TrainingInstanceType",
        default_value="ml.m5.xlarge"
    )
    train_instance_count = ParameterInteger(
        name="TrainInstanceCount",
        default_value=1
    )
    input_train_data = ParameterString(
        name="InputTrainData",
        default_value=train_data_uri,
    )
    input_test_data = ParameterString(
        name="InputTestData",
        default_value=test_data_uri,
    )
    ###################
    ## 3) Pipeline parameters for model validation
    ###################
    model_validation_instance_type = ParameterString(
        name="ModelValidationInstanceType",
        default_value='ml.c5.xlarge'
    )
    model_validation_instance_count = ParameterInteger(
        name="ModelValidationInstanceCount",
        default_value=1
    )
    # These two parameters are created here but are not in the parameter list
    # given to the Pipeline constructor in this notebook.
    input_model_data = ParameterString(
        name="InputModelData",
        default_value=train_model_uri,
    )
    input_leaderboard_data = ParameterString(
        name="InputLeaderboardData",
        default_value=leaderboard_uri,
    )
    ###################
    ## 4) Pipeline parameters for prediction
    ###################
    # The spelling "Predction" is kept on purpose: the parameter name is part
    # of the pipeline definition.
    prediction_instance_type = ParameterString(
        name="PredctionInstanceType",
        default_value="ml.m5.xlarge"
    )
    prediction_instance_count = ParameterInteger(
        name="PredctionInstanceCount",
        default_value=1
    )

Start job time: 2022-12-20 21:23:26.227101
code verison: 1.0


In [17]:
# from sagemaker.workflow.steps import CacheConfig

# cache_config = CacheConfig(enable_caching=True, 
#                            expire_after="7d")

## 2.3 프로세서 단계 정의

In [17]:
role = sagemaker.get_execution_role()
skframework_version = "0.23-1"
# Container image for the prediction step (MXNet 1.9.0 training image).
# image_uris.retrieve() does not accept pipeline variables (the SDK warns and
# falls back to the parameter's default), so the concrete default instance
# type is passed explicitly.
image_uri = retrieve(framework='mxnet',
                     region='ap-northeast-2',
                     version='1.9.0',
                     py_version='py38',
                     image_scope='training',
                     instance_type=prediction_instance_type.default_value)

###########################
# 1) Data preprocessing ###
###########################
skprocessor_preprocessing = SKLearnProcessor(
    framework_version=skframework_version,
    instance_type=processing_instance_type,
    instance_count=processing_instance_count,
    base_job_name=f"{project_prefix}-preprocessing",
    role=role,
)

#######################
# 2) Model training ###
#######################
# NOTE(review): the entry point resolves to src/train.py, while the notebook
# writes the training script under src/v<code_version>/ and `training_code`
# is not used here. Confirm which file is meant to run.
mxnet_estimator = MXNet(
    base_job_name=f"{project_prefix}-training-autogluon060",  # prefix
    entry_point='train.py',
    source_dir="src",
    code_location=estimator_output_path,
    output_path=estimator_output_path,
    instance_type=train_instance_type,
    instance_count=train_instance_count,
    framework_version='1.9.0',
    py_version='py38',
    role=role,
)
#########################
# 3) Model validation ###
#########################
skprocessor_model_validation = SKLearnProcessor(
    framework_version=skframework_version,
    instance_type=model_validation_instance_type,
    instance_count=model_validation_instance_count,
    base_job_name=f"{project_prefix}-model_validation",
    role=role,
)
#########################
# 4) Model prediction ###
#########################
script_processor_prediction = ScriptProcessor(
    command=['python3'],
    image_uri=image_uri,
    instance_type=prediction_instance_type,
    instance_count=prediction_instance_count,
    base_job_name=f"{project_prefix}-prediction",
    role=role,
)

The input argument instance_type of function (sagemaker.image_uris.retrieve) is a pipeline variable (<class 'sagemaker.workflow.parameters.ParameterString'>), which is not allowed. The default_value of this Parameter object will be used to override it. Please make sure the default_value is valid.
The input argument instance_type of function (sagemaker.image_uris.retrieve) is a pipeline variable (<class 'sagemaker.workflow.parameters.ParameterString'>), which is not allowed. The default_value of this Parameter object will be used to override it. Please make sure the default_value is valid.
The input argument instance_type of function (sagemaker.image_uris.retrieve) is a pipeline variable (<class 'sagemaker.workflow.parameters.ParameterString'>), which is not allowed. The default_value of this Parameter object will be used to override it. Please make sure the default_value is valid.


## 2.4 파이프라인 스텝 단계 정의

In [22]:
if code_version == '1.0':
    split_date = '2022-10-31'

    ###########################
    # 1) Data preprocessing ###
    ###########################
    # The three outputs (stage/train/test) all go to the same golden-data prefix.
    step_preprocessing = ProcessingStep(
        name=f"{project_prefix}-processing",
        processor=skprocessor_preprocessing,
        inputs=[
            ProcessingInput(source=stage_data_dir,
                            destination='/opt/ml/processing/input'),
        ],
        outputs=[
            ProcessingOutput(output_name=output_name,
                             source=f'/opt/ml/processing/output/{output_name}',
                             destination=preproc_data_dir)
            for output_name in ("stage", "train", "test")
        ],
        job_arguments=["--split_date", split_date],
        code=preprocessing_code,
    )

    def _preprocessing_output_uri(output_name):
        """Return the S3 URI property of one named preprocessing output."""
        return step_preprocessing.properties.ProcessingOutputConfig.Outputs[
            output_name
        ].S3Output.S3Uri

    #######################
    # 2) Model training ###
    #######################
    step_train = TrainingStep(
        name=f"{project_prefix}-train-autogluon060",
        estimator=mxnet_estimator,
        inputs={
            channel: TrainingInput(
                s3_data=_preprocessing_output_uri(channel),
                content_type="text/csv",
            )
            for channel in ("train", "test")
        },
    )

    #########################
    # 3) Model validation ###
    #########################
    # Reference: https://docs.aws.amazon.com/sagemaker/latest/APIReference/API_DescribeTrainingJob.html#API_DescribeTrainingJob_ResponseSyntax
    # The validation script receives the S3 URI of the trained model artifact.
    trained_model_uri = step_train.properties.ModelArtifacts.S3ModelArtifacts
    step_model_validaion = ProcessingStep(
        name=f"{project_prefix}-model_validation",
        processor=skprocessor_model_validation,
        job_arguments=["--s3_model_uri", trained_model_uri],
        code=model_validation_code,
    )

    #########################
    # 4) Model prediction ###
    #########################
    step_prediction = ProcessingStep(
        name=f"{project_prefix}-prediction",
        processor=script_processor_prediction,
        inputs=[
            ProcessingInput(source=_preprocessing_output_uri(channel),
                            destination=f"/opt/ml/processing/input/{channel}")
            for channel in ("train", "test")
        ],
        outputs=[
            ProcessingOutput(output_name="prediction_data",
                             source="/opt/ml/processing/output",
                             destination=prediction_output_path),
        ],
        code=model_prediction_code,
    )

## 2.5 최종 파이프라인 정의 및 실행

### 최종 파이프라인 정의
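1. Processing: staged data에서 데이터를 추출하여 통합한 뒤 전처리를 진행한다.
2. Training: 전처리된 train/test 데이터로 AutoGluon 시계열 모델을 학습하고 leaderboard를 저장한다.
3. Model Validation: leaderboard의 `score_val`을 기준으로 모델을 Model Registry에 `PendingManualApproval` 또는 `Rejected` 상태로 등록한다.
4. Model Prediction(Inference): Model Registry에서 `Approved` 상태인 모델을 내려받아 예측을 수행한다.
     - 굳이 Endpoint를 생성할 필요가 없다.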
     
참조하자: [MLOps Pipeline](https://aws.amazon.com/ko/blogs/machine-learning/deploy-an-mlops-solution-that-hosts-your-model-endpoints-in-aws-lambda/)

In [24]:
from sagemaker.workflow.pipeline import Pipeline

if code_version == '1.0':
    # Pipeline-level parameters, grouped by the step they are named after.
    pipeline_parameters = [
        # 1) preprocessing parameters
        processing_instance_type,
        processing_instance_count,
        input_stage_data,
        # 2) training parameters
        train_instance_type,
        train_instance_count,
        input_train_data,
        input_test_data,
        # 3) model validation parameters
        model_validation_instance_type,
        model_validation_instance_count,
        # 4) prediction (inference) parameters
        prediction_instance_type,
        prediction_instance_count,
    ]

    # Steps that make up the pipeline, in the order they were defined.
    pipeline_steps = [
        step_preprocessing,
        step_train,
        step_model_validaion,
        step_prediction,
    ]

    # The pipeline is named after the project prefix.
    pipeline_name = project_prefix
    pipeline = Pipeline(
        name=pipeline_name,
        parameters=pipeline_parameters,
        steps=pipeline_steps,
    )


### 파이프라인 정의 확인

In [25]:
import json

# Render the pipeline definition as a JSON string, parse it into Python
# objects, and leave it as the cell's last expression so it is displayed.
definition_json = pipeline.definition()
definition = json.loads(definition_json)
definition

The input argument instance_type of function (sagemaker.image_uris.retrieve) is a pipeline variable (<class 'sagemaker.workflow.parameters.ParameterString'>), which is not allowed. The default_value of this Parameter object will be used to override it. Please make sure the default_value is valid.


{'Version': '2020-12-01',
 'Metadata': {},
 'Parameters': [{'Name': 'ProcessingInstanceType',
   'Type': 'String',
   'DefaultValue': 'ml.m5.xlarge'},
  {'Name': 'ProcessingInstanceCount', 'Type': 'Integer', 'DefaultValue': 1},
  {'Name': 'InputStageData',
   'Type': 'String',
   'DefaultValue': 's3://palm-oil-price-forecast/staged-data'},
  {'Name': 'TrainingInstanceType',
   'Type': 'String',
   'DefaultValue': 'ml.m5.xlarge'},
  {'Name': 'TrainInstanceCount', 'Type': 'Integer', 'DefaultValue': 1},
  {'Name': 'InputTrainData',
   'Type': 'String',
   'DefaultValue': 's3://palm-oil-price-forecast/golden-data/2022/12/20/train.csv'},
  {'Name': 'InputTestData',
   'Type': 'String',
   'DefaultValue': 's3://palm-oil-price-forecast/golden-data/2022/12/20/test.csv'},
  {'Name': 'ModelValidationInstanceType',
   'Type': 'String',
   'DefaultValue': 'ml.c5.xlarge'},
  {'Name': 'ModelValidationInstanceCount',
   'Type': 'Integer',
   'DefaultValue': 1},
  {'Name': 'PredctionInstanceType',
   

### 파이프라인 정의를 제출하고 실행하기
- 첫 번째 셀(주석 처리됨)은 실행 요청만 하고 완료를 기다리지 않으며, 두 번째 셀은 `execution.wait`를 호출해 실행이 끝날 때까지 기다린다.

In [23]:
# # Async IO in Python
# pipeline.upsert(role_arn=role)
# execution = pipeline.start()

In [None]:
%%time
start = time.time()
pipeline.upsert(role_arn = sagemaker.get_execution_role())
execution = pipeline.start()
#실행이 완료될 때까지 기다린다.
execution.wait() 
end = time.time()

The input argument instance_type of function (sagemaker.image_uris.retrieve) is a pipeline variable (<class 'sagemaker.workflow.parameters.ParameterString'>), which is not allowed. The default_value of this Parameter object will be used to override it. Please make sure the default_value is valid.


In [28]:
# Report the wall-clock time between `start` and `end`, first in seconds and
# then in minutes, each rounded to one decimal place.
elapsed_sec = end - start
elapsed_min = elapsed_sec / 60
print(f"데이터 전처리~모델 검증시간 : {elapsed_sec:.1f} sec")
print(f"데이터 전처리~모델 검증시간 : {elapsed_min:.1f} min")

데이터 전처리~모델 검증시간 : 1749.8 sec
데이터 전처리~모델 검증시간 : 29.2 min


[22년 11월 30일 1차 테스트]    
- 데이터 전처리~모델 검증 시간(초) : 1688.9 sec
- 데이터 전처리~모델 검증 시간(분) : 28.1 min   

[22년 11월 30일 2차 테스트]    
- 데이터 전처리~모델 검증 시간(초) : 1719.9 sec
- 데이터 전처리~모델 검증 시간(분) : 28.7 min

[22년 12월 20일 1차 테스트]    
- 데이터 전처리~모델 검증 시간(초) : 1749.8 sec
- 데이터 전처리~모델 검증 시간(분) : 29.2 min

### Debug Model Validation 

In [None]:
# Call describe() on the pipeline execution; as the cell's last expression,
# the returned value is displayed.
execution.describe()

In [None]:
# Call list_steps() on the pipeline execution and display the result, to see
# the per-step entries of this run.
execution.list_steps()

In [None]:
# Display train_response.
# NOTE(review): the only assignment of train_response visible in this section
# is in a later cell (the describe_training_job call), so that cell presumably
# has to be run before this one - confirm.
train_response

In [None]:
# Derive a URI from the model artifact location by substituting 'output' for
# 'model'.
# NOTE(review): assuming the value is a str, replace() rewrites EVERY
# occurrence of 'model' in the URI (bucket and prefix included), not only the
# file name - confirm this is intended.
train_response['ModelArtifacts']['S3ModelArtifacts'].replace('model','output')

In [None]:
# Look up the training job behind this pipeline execution and fetch its full
# description.
#
# The training step is selected by its metadata (it is the step that carries a
# 'TrainingJob' entry) rather than by a fixed position in list_steps(): the
# position depends on the order in which the steps were started, and the
# prediction step can start before or after training.
training_steps = [
    step
    for step in execution.list_steps()
    if 'TrainingJob' in (step.get('Metadata') or {})
]
if not training_steps:
    raise RuntimeError(
        "No step with TrainingJob metadata was found in this pipeline execution"
    )
train_arn = training_steps[0]['Metadata']['TrainingJob']['Arn']
train_job_name = train_arn.split('/')[-1]  # training job name = last segment of the ARN
train_response = sm_client.describe_training_job(TrainingJobName = train_job_name)
train_response

# 3. Future Works
참고사항: split_date를 설정하여 앞으로 며칠을 예측할지 정할 수 있다. (현재 분기점은 '2022-10-31' 기준으로 되어 있다.)

- 1 iter 때 Prediction까지 진행하고 Model Registry에 PendingManualApproval 상태의 모델이 저장되면, 예측값과 실제값을 QuickSight에서 보여주고 승인 시 PendingManualApproval -> Approved 상태로 전환
    => 사내에서는 보안 이슈로 인해서 data 업로드가 어려움
- Model Registry 내 model 상태가 업데이트 되면, Deployment 파이프라인 수행
- CodeCommit을 통한 소스코드 관리
    => 이거 첫번째로 찍먹해보자

- Multi-model을 적용하는 파이프라인
    => 

In [None]:
# Load prediction_result.csv from the prediction output location.
# NOTE(review): prediction_output_path is defined elsewhere in the notebook;
# if it is an S3 URI, pandas presumably needs s3fs installed to read it -
# confirm. The name `x` is reassigned by the plotting cells that follow.
x = pd.read_csv(prediction_output_path+'/prediction_result.csv')

In [None]:
import matplotlib.pyplot as plt
import numpy as np

plt.style.use('_mpl-gallery')

# Synthetic demo data: two noisy linear series over the same x grid.
# The seed is fixed so the noise, and therefore the figure, is reproducible.
np.random.seed(1)
x = np.linspace(0, 8, 16)
noise_y1 = np.random.uniform(0.0, 0.5, len(x))
noise_y2 = np.random.uniform(0.0, 0.5, len(x))
y1 = 3 + 4 * x / 8 + noise_y1
y2 = 1 + 2 * x / 8 + noise_y2

# Shade the area between the two series and draw their midpoint on top.
fig, ax = plt.subplots()
ax.fill_between(x, y1, y2, alpha=.5, linewidth=0)
midline = (y1 + y2) / 2
ax.plot(x, midline, linewidth=2)

# Same limits and tick positions on both axes.
axis_limits = (0, 8)
ax.set(
    xlim=axis_limits, xticks=np.arange(1, 8),
    ylim=axis_limits, yticks=np.arange(1, 8),
)

plt.show()

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Base signal: y = x^2 sampled at N points on [0, 10].
N = 1000
x = np.linspace(0, 10, N)
y = x ** 2
ones = np.ones(N)

# Offsets added to / subtracted from y, widest band first.
vals = [30, 20, 10]

fig, ax = plt.subplots()

# Draw one band per offset; each successive band is given a higher alpha.
n_bands = len(vals)
for rank, half_width in enumerate(vals, start=1):
    band_alpha = 0.5 * rank / n_bands
    offset = ones * half_width
    ax.fill_between(x, y + offset, y - offset, color='red', alpha=band_alpha)

# Draw the original signal on top of the bands.
ax.plot(x, y, color='red')

plt.show()