In [1]:
# Run flags for this notebook.
# DEFAULT_BUCKET: when True, the data-preparation cell uses the SageMaker
# session's default bucket; when False it uses the bucket name stored in the secret.
DEFAULT_BUCKET = False
# LOCAL_MODE: not referenced by any cell visible in this part of the notebook.
LOCAL_MODE = False

# 0. 환경설정

In [2]:
# ! pip list | grep sagemaker

In [3]:
# ! pip install --upgrade pip sagemaker

In [4]:
# !pip install --upgrade sagemaker-experiments

In [2]:
import sagemaker
import os
import pandas as pd
from datetime import datetime as dt
from dateutil.relativedelta import *
import time
import boto3
import json

In [3]:
def get_secret():
    """Fetch the ``prod/sagemaker`` secret from AWS Secrets Manager.

    Returns:
        The ``SecretString`` value (str) when the response contains one,
        otherwise the base64-decoded ``SecretBinary`` value (bytes).

    Raises:
        botocore.exceptions.ClientError: for every failed ``GetSecretValue``
            call. The previous version only re-raised five specific error
            codes and silently returned ``None`` for any other one, which made
            the following ``json.loads`` fail with an unrelated ``TypeError``.
    """
    # See https://docs.aws.amazon.com/secretsmanager/latest/apireference/API_GetSecretValue.html
    # Imported here because the notebook's import cell does not import base64.
    import base64

    secret_name = "prod/sagemaker"
    region_name = "ap-northeast-2"

    # Create a Secrets Manager client
    client = boto3.session.Session().client(
        service_name='secretsmanager',
        region_name=region_name
    )

    # No try/except: any ClientError propagates unchanged to the caller.
    get_secret_value_response = client.get_secret_value(SecretId=secret_name)

    if 'SecretString' in get_secret_value_response:
        return get_secret_value_response['SecretString']
    return base64.b64decode(get_secret_value_response['SecretBinary'])

# Load the secret (a JSON document) and unpack credentials and S3 locations from it.
keychain = json.loads(get_secret())
ACCESS_KEY_ID = keychain['ACCESS_KEY_ID_ent']
ACCESS_SECRET_KEY = keychain['ACCESS_SECRET_KEY_ent']

# Bucket name and the S3 path prefixes for each data stage.
BUCKET_NAME_USECASE = keychain['BUCKET_NAME_USECASE_ent']
S3_PATH_STAGE = keychain['S3_PATH_STAGE']
S3_PATH_GOLDEN = keychain['S3_PATH_GOLDEN']
S3_PATH_TRAIN = keychain['S3_PATH_TRAIN']
S3_PATH_log = keychain['S3_PATH_LOG']
S3_PATH_FORECAST = keychain['S3_PATH_FORECAST']

# boto3 / SageMaker sessions built from the access key pair (positional args are
# aws_access_key_id, aws_secret_access_key).
# NOTE(review): no region is passed here, so `region` depends on the environment's
# AWS configuration, while sm_client below is pinned to 'ap-northeast-2' — confirm they agree.
boto3_session = boto3.Session(ACCESS_KEY_ID, ACCESS_SECRET_KEY)
sm_session = sagemaker.Session(boto_session = boto3_session)
region = boto3_session.region_name

s3_resource = boto3_session.resource('s3')
# `bucket` is an S3 Bucket resource at this point; the data-preparation cell
# further down reassigns the same name to a plain bucket-name string.
bucket = s3_resource.Bucket(BUCKET_NAME_USECASE)
s3_client = boto3_session.client('s3')
# SageMaker client created directly from the key pair rather than from boto3_session.
sm_client = boto3.client('sagemaker',
                         aws_access_key_id = ACCESS_KEY_ID,
                         aws_secret_access_key = ACCESS_SECRET_KEY,
                         region_name = 'ap-northeast-2')

# 1. 데이터 준비

In [4]:
# Choose the working bucket name (a string): the SageMaker default bucket when
# DEFAULT_BUCKET is set, otherwise the use-case bucket named in the secret.
# Note that `sagemaker_session` is only defined in the DEFAULT_BUCKET branch.
if DEFAULT_BUCKET:
    sagemaker_session = sagemaker.session.Session()
    bucket = sagemaker_session.default_bucket()
else:
    bucket = keychain['BUCKET_NAME_USECASE_ent']
print("bucket: ", bucket)

bucket:  palm-oil-price-forecast


## 1) 변수 설정

In [5]:
# Korea time: local "today" shifted by +9 hours.
# NOTE(review): this equals KST only if the notebook host clock is UTC — confirm.
KST = dt.today() + relativedelta(hours=9)
print(f"Start job time: {KST}")
# Project variables
project_prefix = bucket
base = f"s3://{bucket}"

# Location of the preprocessed result data (golden data path), partitioned by
# the KST date as YYYY/MM/DD.
preproc_data_dir = f"{base}/{keychain['S3_PATH_GOLDEN']}/{KST.strftime('%Y/%m/%d')}"

# stage_data_uri= f"{preproc_data_dir}/stage.csv"
# The staged data URI is a prefix (no date partition, no file name).
stage_data_uri = f"{base}/{keychain['S3_PATH_STAGE']}"
train_data_uri = f"{preproc_data_dir}/train.csv"
test_data_uri = f"{preproc_data_dir}/test.csv"


Start job time: 2022-12-09 07:49:03.790647


## 2) 변수 저장

In [6]:
%store -z

In [7]:
%store project_prefix
%store preproc_data_dir

%store stage_data_uri
%store train_data_uri
%store test_data_uri

%store bucket

Stored 'project_prefix' (str)
Stored 'preproc_data_dir' (str)
Stored 'stage_data_uri' (str)
Stored 'train_data_uri' (str)
Stored 'test_data_uri' (str)
Stored 'bucket' (str)


# 2. 데이터 전처리

## 1) 전처리에 사용할 데이터를 확인


In [8]:
%store -r

In [9]:
%store

Stored variables and their in-db values:
bucket                       -> 'palm-oil-price-forecast'
preproc_data_dir             -> 's3://palm-oil-price-forecast/golden-data/2022/12/
project_prefix               -> 'palm-oil-price-forecast'
stage_data_uri               -> 's3://palm-oil-price-forecast/staged-data'
test_data_uri                -> 's3://palm-oil-price-forecast/golden-data/2022/12/
train_data_uri               -> 's3://palm-oil-price-forecast/golden-data/2022/12/


### (1) 전처리용 python script
- 인자값: https://engineer-mole.tistory.com/213

In [10]:
%%writefile src/preprocessing.py

import argparse
import os
import requests
import tempfile
import subprocess, sys

import pandas as pd
import numpy as np
from glob import glob
import copy
from collections import OrderedDict
from pathlib import Path
import joblib

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder

import logging
import logging.handlers

import json
import base64
import boto3
from botocore.client import Config
from botocore.exceptions import ClientError

import time
from datetime import datetime as dt
import datetime
from pytz import timezone
from dateutil.relativedelta import *

###############################
######### Global variables ####
###############################
# "Now" shifted by +9 hours; used to build the dated S3 key of the scaler files.
# NOTE(review): this equals Korea Standard Time only if the host clock is UTC — confirm.
KST = dt.today() + relativedelta(hours=9)
# RIC codes kept by the preprocessing step: the main block drops every row whose
# 'RIC' value is not in this list and builds one time series per entry.
ric_list = ['BOc1', 'BOc2', 'BOc3','BOPLKL','BRRTSc1', 'BRRTSc2', 'BRRTSc3', 'CAD=', 'EUR=', 'JPY=', 'KRW=', 'MYR=', 'GBP=', 'INR=','Cc1', 'Cc2', 'Cc3','CCMc1', 'CCMc2', 'CCMc3',
            'CLc1', 'CLc2', 'CLc3','CNY=','COMc1', 'COMc2','COMc3','CTc1', 'CTc2', 'CTc3', 'DJCI', 'DJCIBR', 'DJCICL', 'DJCICN', 'DJCIEN', 'DJCIGR', 'DJCIIA', 'DJCING', 
            'DJCISO', 'DJCIWH', 'DJT','FCHI','FCPOc1', 'FCPOc2', 'FCPOc3','FGVHKL',
            'FKLIc1', 'FKLIc2', 'FKLIc3','FTSE','GCc1', 'GCc2', 'GCc3','GDAXI','GENMKL','HSI','IOIBKL','IXIC','JNIc1','JNIc2','JNIc3','KCc1', 'KCc2', 'KCc3','KLKKKL','KLSE','KQ11', 'KS11',
            'KWc1', 'KWc2', 'KWc3','LCOc1', 'LCOc2', 'LCOc3','LWBc1', 'LWBc2', 'LWBc3','MCCc1', 'MCCc2', 'MCCc3','MXSCKL','Oc1', 'Oc2', 'Oc3','PEPTKL','RRc1', 'RRc2', 'RRc3','RSc1', 'RSc2', 'RSc3',
            'Sc1', 'Sc2', 'Sc3','SIMEKL','SOPSKL','SSEC', 'THPBKL', 'Wc1', 'Wc2', 'Wc3'
           ]

# Column-name lists; neither is referenced by the code visible in this script
# (the main block defines its own `col_names`).
col_names_asis = ['ds','high','low','open','ric']
col_names_tobe = ['ds','high','low','open','y']

###############################
######### util 함수 설정 ##########
###############################
def _get_logger():
    '''
    Return this module's logger, configuring it on first use.

    When neither the logger nor any of its ancestors has a handler yet, the
    logger level is set to DEBUG and a stdout StreamHandler is attached to the
    ROOT logger; later calls then find a handler and skip the setup, which
    avoids duplicated log lines.
    # https://stackoverflow.com/questions/17745914/python-logging-module-is-printing-lines-multiple-times
    '''
    module_logger = logging.getLogger(__name__)
    if module_logger.hasHandlers():
        # Already configured (here or by the hosting process): reuse as-is.
        return module_logger

    module_logger.setLevel(logging.DEBUG)
    stdout_handler = logging.StreamHandler(sys.stdout)
    logging.getLogger().addHandler(stdout_handler)
    module_logger.handler_set = True
    return module_logger
logger = _get_logger()

def download_object(file_name):
    """Download one S3 object into the local ``test`` directory.

    The object key has every ``/`` replaced by ``_`` to form the local file
    name. The bucket is read from the module-level ``BUCKET_NAME_USECASE``.

    Returns:
        The string "Success", or the caught exception instance when anything
        fails (the exception is returned, not raised).
    """
    try:
        client = boto3.client("s3")
        local_target = Path('test') / file_name.replace('/','_')
        client.download_file(BUCKET_NAME_USECASE, file_name, str(local_target))
    except Exception as err:
        return err
    else:
        return "Success"

def download_parallel_multiprocessing(path_list):
    """Download many S3 objects in parallel worker processes.

    Generator: each key in ``path_list`` is submitted to ``download_object``
    in a process pool, and results are yielded in completion order.

    Args:
        path_list: iterable of S3 object keys.

    Yields:
        ``(key, outcome)`` tuples, where ``outcome`` is whatever
        ``download_object`` returned, or the exception raised by the worker.
    """
    # Imported locally: the module-level imports provide neither `futures` nor
    # `ProcessPoolExecutor`, so the original body failed with NameError.
    from concurrent import futures

    keys = list(path_list)
    if not keys:
        # Nothing to download: do not spin up a process pool.
        return

    with futures.ProcessPoolExecutor() as executor:
        future_to_key = {executor.submit(download_object, key): key for key in keys}
        for future in futures.as_completed(future_to_key):
            key = future_to_key[future]
            exception = future.exception()
            if exception is None:
                yield key, future.result()
            else:
                yield key, exception
                                
def get_list_in_s3(key_id : str,
                   secret_key_id : str,
                   bucket_name : str,
                   s3_path : str) -> list:
    """List every object under an S3 prefix.

    Args:
        key_id: AWS access key id used to create the S3 client.
        secret_key_id: AWS secret access key used to create the S3 client.
        bucket_name: bucket to list.
        s3_path: key prefix to list under.

    Returns:
        A list with the ``Contents`` entries (dicts) of every result page;
        empty when nothing matches the prefix.
    """
    # Use the credentials passed in; the previous version ignored both
    # parameters and read the ACCESS_KEY_ID / ACCESS_SECRET_KEY globals instead.
    s3 = boto3.client('s3',
                      aws_access_key_id = key_id,
                      aws_secret_access_key = secret_key_id,
                      region_name = 'ap-northeast-2')
    paginator = s3.get_paginator('list_objects_v2')
    pages = paginator.paginate(Bucket = bucket_name,
                               Prefix = s3_path)

    contents_list = []
    for page in pages:
        # A page carries no 'Contents' key when no object matches the prefix.
        contents_list.extend(page.get('Contents', []))
    return contents_list

def get_file_folders(s3_client, bucket_name, prefix=""):
    """Split the keys under an S3 prefix into file keys and folder keys.

    Args:
        s3_client: object exposing ``list_objects_v2(**kwargs)``.
        bucket_name: bucket to list.
        prefix: key prefix to list under (default: whole bucket).

    Returns:
        ``(file_names, folders)``: two lists of keys; a key ending in ``/`` is
        treated as a folder, every other key as a file.
    """
    file_names = []
    folders = []

    request_kwargs = {
        "Bucket": bucket_name,
        "Prefix": prefix
    }

    while True:
        # The previous version built the kwargs with the continuation token
        # but then sent the token-less defaults, so a truncated listing
        # re-fetched the first page forever.
        response = s3_client.list_objects_v2(**request_kwargs)

        # 'Contents' is absent when nothing matches; treat that as no keys.
        for result in response.get("Contents") or []:
            key = result.get("Key")
            if key.endswith("/"):
                folders.append(key)
            else:
                file_names.append(key)

        next_token = response.get("NextContinuationToken")
        if not next_token:
            break
        request_kwargs["ContinuationToken"] = next_token

    return file_names, folders


def download_files(s3_client, bucket_name, local_path, file_names, folders):
    """Mirror a set of S3 keys below ``local_path``.

    Every folder key becomes a local directory, and every file key is
    downloaded to ``local_path/<key>`` after its parent directory is created.

    Args:
        s3_client: object exposing ``download_file(bucket, key, filename)``.
        bucket_name: source bucket.
        local_path: local root directory.
        file_names: object keys to download.
        folders: folder keys to create locally.
    """
    root = Path(local_path)

    for folder_key in folders:
        (root / folder_key).mkdir(parents=True, exist_ok=True)

    for object_key in file_names:
        destination = root / object_key
        # The key may contain sub-directories that are not listed in `folders`.
        destination.parent.mkdir(parents=True, exist_ok=True)
        s3_client.download_file(bucket_name, object_key, str(destination))
        
def get_dataframe(base_preproc_input_dir, file_name_prefix ):    
    '''
    Read every csv file in `base_preproc_input_dir` whose name starts with
    `file_name_prefix` (first column used as the index) and return them
    concatenated into one DataFrame.

    Raises ValueError when no file matches.
    '''
    pattern = f"{base_preproc_input_dir}/{file_name_prefix}*.csv"
    input_files = glob(pattern)
    logger.info(f"input_files: \n {input_files}")

    if not input_files:
        message_template = ('There are no files in {}.\n' +
                            'This usually indicates that the channel ({}) was incorrectly specified,\n' +
                            'the data specification in S3 was incorrectly specified or the role specified\n' +
                            'does not have permission to access the data.')
        raise ValueError(message_template.format(base_preproc_input_dir, "train"))

    df = pd.concat([pd.read_csv(csv_path, index_col=0) for csv_path in input_files])

    logger.info(f"dataframe shape \n {df.shape}")
    logger.info(f"dataset sample \n {df.head(2)}")

    return df

def get_secret():
    """Fetch the ``prod/sagemaker`` secret from AWS Secrets Manager.

    Returns:
        The ``SecretString`` value (str) when the response contains one,
        otherwise the base64-decoded ``SecretBinary`` value (bytes).

    Raises:
        botocore.exceptions.ClientError: for every failed ``GetSecretValue``
            call. The previous version only re-raised five specific error
            codes and silently returned ``None`` for any other one, which made
            the following ``json.loads`` fail with an unrelated ``TypeError``.
    """
    # See https://docs.aws.amazon.com/secretsmanager/latest/apireference/API_GetSecretValue.html
    secret_name = "prod/sagemaker"
    region_name = "ap-northeast-2"

    # Create a Secrets Manager client
    client = boto3.session.Session().client(
        service_name='secretsmanager',
        region_name=region_name
    )

    # No try/except: any ClientError propagates unchanged to the caller.
    get_secret_value_response = client.get_secret_value(SecretId=secret_name)

    if 'SecretString' in get_secret_value_response:
        return get_secret_value_response['SecretString']
    return base64.b64decode(get_secret_value_response['SecretBinary'])
        
def fill_missing_dates(df_in : pd.DataFrame,
                       freq : str
                      ) -> pd.DataFrame : 
    """Re-index a time series onto a complete date range.

    Args:
        df_in: frame with a ``ds`` column holding dates (datetime, date
            strings, or int64 values such as 20221201). It is not modified.
        freq: pandas frequency string for the target range (e.g. ``'B'``).

    Returns:
        A new frame with one row per date between the first and last ``ds`` at
        frequency ``freq``. Dates missing from the input get NaN in the other
        columns; input rows whose date is not on the range are dropped by the
        reindex.
    """
    df = df_in.copy()
    # Assign whole columns with df[...] rather than df.loc[:, ...]: recent
    # pandas versions write .loc[:, col] values into the existing column in
    # place, so the dtype conversion could be lost.
    if df["ds"].dtype == np.int64:
        df["ds"] = df["ds"].astype(str)
    df["ds"] = pd.to_datetime(df["ds"])

    full_range = pd.date_range(start = df["ds"].min(),
                               end = df["ds"].max(),
                               freq = freq)
    return df.set_index("ds").reindex(full_range).rename_axis("ds").reset_index()

def fill_missing_price_value(df: pd.DataFrame, col: str, limit_linear : int = 20 ) -> pd.DataFrame :
    """Interpolate gaps in one price column and floor negative values at 0.

    The column is cast to float and linearly interpolated in both directions
    (at most ``limit_linear`` consecutive NaNs per direction), and every value
    below zero is replaced by 0.

    Note: ``df`` is modified in place and the same object is returned.
    """
    prices = df.loc[:, col].astype(float).interpolate(
        method="linear", limit=limit_linear, limit_direction="both"
    )
    # mask() leaves NaN untouched, because NaN < 0 is False.
    df[col] = prices.mask(prices < 0, 0).tolist()
    return df

def scaling_value(df : pd.DataFrame,
                  col_name : str,
                  ric,
                  s3_resource,
                  BUCKET_NAME_USECASE,
                  S3_PATH_GOLDEN) -> tuple:
    """Min-max scale one column and upload the fitted scaler to S3.

    Args:
        df: frame holding the column to scale.
        col_name: name of the column to scale.
        ric: identifier used in the scaler file name.
        s3_resource: object exposing ``put_object(Body=, Bucket=, Key=)``.
        BUCKET_NAME_USECASE: destination bucket.
        S3_PATH_GOLDEN: key prefix; the scaler is stored under
            ``<prefix>/<KST date as YYYY/MM/DD>/scaler-files/``.

    Returns:
        The array returned by ``MinMaxScaler.transform`` for the column
        reshaped to ``(-1, 1)``.
    """
    column_2d = df[col_name].values.reshape(-1,1)
    scaler = MinMaxScaler()
    scaler.fit(column_2d)
    scaled = scaler.transform(column_2d)

    scaler_key = f"{S3_PATH_GOLDEN}/{KST.strftime('%Y/%m/%d')}/scaler-files/{ric}_{col_name}_scaler.pkl"
    # Serialise the scaler to a temporary file, rewind, and upload its bytes.
    with tempfile.TemporaryFile() as buffer:
        joblib.dump(scaler, buffer)
        buffer.seek(0)
        payload = buffer.read()
        s3_resource.put_object(Body = payload,
                               Bucket = BUCKET_NAME_USECASE,
                               Key = scaler_key)
    return scaled

def convert_type(raw, cols, type_target):
    '''
    Return a copy of `raw` in which every column listed in `cols` has been
    cast to `type_target`; `raw` itself is left unchanged.
    '''
    dtype_by_column = {column: type_target for column in cols}
    return raw.astype(dtype_by_column)

if __name__=='__main__':
    # Script entry point. Steps:
    #   1. merge every csv found under the input directory into one staged file,
    #   2. keep the RICs in `ric_list` and the price columns, convert dtypes,
    #   3. build one frame per RIC, 4. rename columns and fill gaps,
    #   5. optionally min-max scale, 6. split into train/test by date and save.
    ################################
    ###### Parse CLI arguments #####
    ################################
    # Default split date: one month before "today + 9 hours".
    split_date_default = dt.today() + relativedelta(hours = 9) - relativedelta(months=1)

    parser = argparse.ArgumentParser()
    parser.add_argument('--base_output_dir', type=str, default="/opt/ml/processing/output")
    parser.add_argument('--base_preproc_input_dir', type=str, default="/opt/ml/processing/input")
    parser.add_argument('--split_date', type=str, default=split_date_default.strftime('%Y-%m-%d'))
    parser.add_argument('--label_column', type=str, default="ric")
    parser.add_argument("--scaler_switch", type = str, default = 1, help = '1이면 Scaling ON, 0이면 Scaling OFF')

    # parse arguments
    args = parser.parse_args()

    logger.info("######### Argument Info ####################################")
    logger.info(f"args.base_output_dir: {args.base_output_dir}")
    logger.info(f"args.base_preproc_input_dir: {args.base_preproc_input_dir}")
    logger.info(f"args.label_column: {args.label_column}")
    logger.info(f"args.split_date: {args.split_date}")
    logger.info(f"args.scaler_switch: {args.scaler_switch}")

    base_output_dir = args.base_output_dir
    base_preproc_input_dir = args.base_preproc_input_dir
    label_column = args.label_column
    split_date = args.split_date
    scaler_switch = int(args.scaler_switch)

    ############################################
    ###### Load keys from Secrets Manager ######
    ############################################
    logger.info(f"\n### Loading the key value using Secret Manager")

    keychain = json.loads(get_secret())
    # These names are module globals; helper functions in this file
    # (e.g. download_object) read BUCKET_NAME_USECASE from here.
    ACCESS_KEY_ID = keychain['ACCESS_KEY_ID_ent']
    ACCESS_SECRET_KEY = keychain['ACCESS_SECRET_KEY_ent']

    BUCKET_NAME_USECASE = keychain['BUCKET_NAME_USECASE_ent']
    S3_PATH_STAGE = keychain['S3_PATH_STAGE']
    S3_PATH_GOLDEN = keychain['S3_PATH_GOLDEN']
    S3_PATH_TRAIN = keychain['S3_PATH_TRAIN']
    S3_PATH_log = keychain['S3_PATH_LOG']

    boto_session = boto3.Session(ACCESS_KEY_ID, ACCESS_SECRET_KEY)
    region = boto_session.region_name
    s3_resource = boto_session.resource('s3')
    s3_client = boto_session.client('s3')

    ############################################
    ###### 1. Data integration           #######
    ############################################
    total_start = time.time()
    start = time.time()
    stage_dir = f'{base_output_dir}/stage'
    # to_csv does not create missing directories, so make sure it exists.
    os.makedirs(stage_dir, exist_ok=True)
    logger.info(f"\n### Data Integration")

    # Collect every csv file below the input directory (recursively).
    path_list = []
    for (walk_root, _subdirs, walk_files) in os.walk(base_preproc_input_dir):
        for filename in walk_files:
            if os.path.splitext(filename)[-1] == '.csv':
                path_list.append("%s/%s" % (walk_root, filename))

    logger.info(f"The number for data : {len(path_list)}")
    if not path_list:
        # Fail with a clear message instead of a KeyError on the 'Date' sort below.
        raise ValueError(f"No csv files found under {base_preproc_input_dir}")

    # Read everything first and concatenate once (concat inside the loop
    # re-copies the accumulated frame on every iteration).
    df_sum = pd.concat([pd.read_csv(csv_path, encoding='utf-8') for csv_path in path_list])
    df_sum = df_sum.sort_values(by='Date').reset_index(drop=True)
    df_sum.to_csv(f"{stage_dir}/stage_integrated.csv", index = False)
    end = time.time()

    logger.info(f"Data Integration is done")
    logger.info(f"Runtime : {end - start:.1f} sec({((end - start)/60):.1f} min)")
    logger.info(f"The number for data : {len(path_list)}")
    logger.info(f"Integrated data sample: head(2) \n {df_sum.head(2)}")
    logger.info(f"Integrated data sample: tail(2) \n {df_sum.tail(2)}")

    ###########################################################
    ####   2. First preprocessing stage                    ####
    ####   item selection, column selection, type casting  ####
    ###########################################################
    start = time.time()
    logger.info(f"\n ### RIC Item selection")
    df_sum = df_sum[df_sum['RIC'].isin(ric_list)].reset_index()
    logger.info(f"The number for data after RIC Item selection : {df_sum.shape}")

    logger.info(f"\n ### Column selection")
    # .copy() so the dtype conversions below act on an independent frame.
    df_sum = df_sum[['Date','HIGH', 'LOW', 'OPEN', 'CLOSE','RIC']].copy()
    logger.info(f"The number for data after Column selection : {df_sum.shape}")
    logger.info(f"\n ### type conversion")
    # Whole-column assignment / astype instead of `.loc[:, col] = ...`: recent
    # pandas versions write .loc[:, col] values into the existing column in
    # place, which can leave the original dtype untouched.
    df_sum["Date"] = pd.to_datetime(df_sum["Date"])
    df_sum = df_sum.astype({price_col: np.float32 for price_col in ("HIGH", "LOW", "OPEN", "CLOSE")})

    ####################################################
    ####   3. Build one time-series frame per RIC   ####
    ####################################################
    logger.info(f"\n ### Autogluon timeseriesdataframe Conversion")
    df_list = OrderedDict()
    for name in ric_list:
        df_tmp = df_sum[df_sum['RIC'] == name]
        df_tmp = df_tmp.drop('RIC', axis=1)
        # Only rows dated 2014-07-02 or later are kept.
        df_list[name] = df_tmp[df_tmp['Date'] >= '2014-07-02'].reset_index(drop = True)

    ####################################################
    ####   4. Rename columns, fill missing values   ####
    ####################################################
    logger.info(f"\n ### Rename columns")
    # Positional rename: Date, HIGH, LOW, OPEN, CLOSE -> ds, high, low, open, y.
    col_names = ['ds','high','low','open','y']
    for name, value in df_list.items():
        df_list[name].columns = col_names

    logger.info(f"\n ### Fill missing value (Date)")
    for name, value in df_list.items():
        # 'B' = business-day frequency.
        df_list[name]  = fill_missing_dates(value, 'B')

    logger.info(f"\n ### Fill missing value (Price)")
    for name, value in df_list.items():
        for col in ['y', 'high', 'low', 'open']:
            value = fill_missing_price_value(value, col)
        df_list[name] = value

    ####################################################
    #################   5. Scaling  ####################
    ####################################################
    if scaler_switch == 1:
        logger.info(f"\n ### Scaling")
        scale_dir = f"{base_output_dir}/scaler-files"
        os.makedirs(scale_dir, exist_ok=True)
        for name, value in df_list.items():
            for col in ['y','high','open','low']:
                # scaling_value also uploads the fitted scaler to S3; its
                # result is flattened to 1-D before being stored as a column.
                scaled = scaling_value(value, col, name, s3_client, BUCKET_NAME_USECASE, S3_PATH_GOLDEN)
                value[col] = np.ravel(scaled)
            df_list[name] = value
    else:
        logger.info(f"\n ### No Scaling")
    end = time.time()
    logger.info(f"\n### All Date Transform is done")
    print(f"All Date Transform Run time : {end - start:.1f} sec({((end - start)/60):.1f} min)")

    #################################################
    #####   6. Split into train / test and save  ####
    #################################################
    logger.info(f"\n ### Split train, test dataset")
    # Tag every per-RIC frame with its RIC and stack them in ric_list order.
    df_golden = pd.concat([value.assign(ric = name) for name, value in df_list.items()])
    df_golden = df_golden.reset_index(drop = True)

    # Rows before split_date form the train set, the rest the test set.
    train_dir = f"{base_output_dir}/train"
    test_dir = f"{base_output_dir}/test"
    os.makedirs(train_dir, exist_ok=True)
    os.makedirs(test_dir, exist_ok=True)

    df_train = df_golden[df_golden['ds'] < split_date]
    df_train.to_csv(f"{train_dir}/train.csv", index = False)

    df_test = df_golden[df_golden['ds'] >= split_date]
    df_test.to_csv(f"{test_dir}/test.csv", index = False)

    logger.info(f"\n ### Final result for train dataset ")
    logger.info(f"\n ####preprocessed train shape \n {df_train.shape}")
    logger.info(f"preprocessed train sample: head(2) \n {df_train.head(2)}")
    logger.info(f"preprocessed train sample: tail(2) \n {df_train.tail(2)}")

    logger.info(f"\n ####preprocessed test shape \n {df_test.shape}")
    logger.info(f"preprocessed test sample: head(2) \n {df_test.head(2)}")
    logger.info(f"preprocessed test sample: tail(2) \n {df_test.tail(2)}")

    logger.info(f"\n### End All of data preprocessing")
    total_end = time.time()
    print(f"Run time 시간 : {total_end - total_start:.1f} sec({((total_end - total_start)/60):.1f} min)\n")
    

Overwriting src/preprocessing.py


In [19]:
%%writefile src/preprocessing_fold.py

import argparse
import os
import requests
import tempfile
import subprocess, sys

import pandas as pd
import numpy as np
from glob import glob
import copy
from collections import OrderedDict
from pathlib import Path
import joblib

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder

import logging
import logging.handlers

import json
import base64
import boto3
from botocore.client import Config
from botocore.exceptions import ClientError

import time
from datetime import datetime as dt
import datetime
from pytz import timezone
from dateutil.relativedelta import *

###############################
######### Global variables ####
###############################
# "Now" shifted by +9 hours; used to build the dated S3 key of the scaler files.
# NOTE(review): this equals Korea Standard Time only if the host clock is UTC — confirm.
KST = dt.today() + relativedelta(hours=9)
# List of RIC codes.
# NOTE(review): the code of this script that consumes the list is not visible
# here; presumably it selects the instruments to preprocess — confirm.
ric_list = ['BOc1', 'BOc2', 'BOc3','BOPLKL','BRRTSc1', 'BRRTSc2', 'BRRTSc3', 'CAD=', 'EUR=', 'JPY=', 'KRW=', 'MYR=', 'GBP=', 'INR=','Cc1', 'Cc2', 'Cc3','CCMc1', 'CCMc2', 'CCMc3',
            'CLc1', 'CLc2', 'CLc3','CNY=','COMc1', 'COMc2','COMc3','CTc1', 'CTc2', 'CTc3', 'DJCI', 'DJCIBR', 'DJCICL', 'DJCICN', 'DJCIEN', 'DJCIGR', 'DJCIIA', 'DJCING', 
            'DJCISO', 'DJCIWH', 'DJT','FCHI','FCPOc1', 'FCPOc2', 'FCPOc3','FGVHKL',
            'FKLIc1', 'FKLIc2', 'FKLIc3','FTSE','GCc1', 'GCc2', 'GCc3','GDAXI','GENMKL','HSI','IOIBKL','IXIC','JNIc1','JNIc2','JNIc3','KCc1', 'KCc2', 'KCc3','KLKKKL','KLSE','KQ11', 'KS11',
            'KWc1', 'KWc2', 'KWc3','LCOc1', 'LCOc2', 'LCOc3','LWBc1', 'LWBc2', 'LWBc3','MCCc1', 'MCCc2', 'MCCc3','MXSCKL','Oc1', 'Oc2', 'Oc3','PEPTKL','RRc1', 'RRc2', 'RRc3','RSc1', 'RSc2', 'RSc3',
            'Sc1', 'Sc2', 'Sc3','SIMEKL','SOPSKL','SSEC', 'THPBKL', 'Wc1', 'Wc2', 'Wc3'
           ]

# Column-name lists; neither is referenced by the part of this script visible here.
col_names_asis = ['ds','high','low','open','ric']
col_names_tobe = ['ds','high','low','open','y']

###############################
######### util 함수 설정 ##########
###############################
def _get_logger():
    '''
    Return this module's logger, configuring it on first use.

    When neither the logger nor any of its ancestors has a handler yet, the
    logger level is set to DEBUG and a stdout StreamHandler is attached to the
    ROOT logger; later calls then find a handler and skip the setup, which
    avoids duplicated log lines.
    # https://stackoverflow.com/questions/17745914/python-logging-module-is-printing-lines-multiple-times
    '''
    module_logger = logging.getLogger(__name__)
    if module_logger.hasHandlers():
        # Already configured (here or by the hosting process): reuse as-is.
        return module_logger

    module_logger.setLevel(logging.DEBUG)
    stdout_handler = logging.StreamHandler(sys.stdout)
    logging.getLogger().addHandler(stdout_handler)
    module_logger.handler_set = True
    return module_logger
logger = _get_logger()

def download_object(file_name):
    """Download one S3 object into the local ``test`` directory.

    The object key has every ``/`` replaced by ``_`` to form the local file
    name. The bucket is read from the module-level ``BUCKET_NAME_USECASE``.

    Returns:
        The string "Success", or the caught exception instance when anything
        fails (the exception is returned, not raised).
    """
    try:
        client = boto3.client("s3")
        local_target = Path('test') / file_name.replace('/','_')
        client.download_file(BUCKET_NAME_USECASE, file_name, str(local_target))
    except Exception as err:
        return err
    else:
        return "Success"

def download_parallel_multiprocessing(path_list):
    """Download many S3 objects in parallel worker processes.

    Generator: each key in ``path_list`` is submitted to ``download_object``
    in a process pool, and results are yielded in completion order.

    Args:
        path_list: iterable of S3 object keys.

    Yields:
        ``(key, outcome)`` tuples, where ``outcome`` is whatever
        ``download_object`` returned, or the exception raised by the worker.
    """
    # Imported locally: the module-level imports provide neither `futures` nor
    # `ProcessPoolExecutor`, so the original body failed with NameError.
    from concurrent import futures

    keys = list(path_list)
    if not keys:
        # Nothing to download: do not spin up a process pool.
        return

    with futures.ProcessPoolExecutor() as executor:
        future_to_key = {executor.submit(download_object, key): key for key in keys}
        for future in futures.as_completed(future_to_key):
            key = future_to_key[future]
            exception = future.exception()
            if exception is None:
                yield key, future.result()
            else:
                yield key, exception
                                
def get_list_in_s3(key_id : str,
                   secret_key_id : str,
                   bucket_name : str,
                   s3_path : str) -> list:
    """List every object under an S3 prefix.

    Args:
        key_id: AWS access key id used to create the S3 client.
        secret_key_id: AWS secret access key used to create the S3 client.
        bucket_name: bucket to list.
        s3_path: key prefix to list under.

    Returns:
        A list with the ``Contents`` entries (dicts) of every result page;
        empty when nothing matches the prefix.
    """
    # Use the credentials passed in; the previous version ignored both
    # parameters and read the ACCESS_KEY_ID / ACCESS_SECRET_KEY globals instead.
    s3 = boto3.client('s3',
                      aws_access_key_id = key_id,
                      aws_secret_access_key = secret_key_id,
                      region_name = 'ap-northeast-2')
    paginator = s3.get_paginator('list_objects_v2')
    pages = paginator.paginate(Bucket = bucket_name,
                               Prefix = s3_path)

    contents_list = []
    for page in pages:
        # A page carries no 'Contents' key when no object matches the prefix.
        contents_list.extend(page.get('Contents', []))
    return contents_list

def get_file_folders(s3_client, bucket_name, prefix=""):
    """Split the keys under an S3 prefix into file keys and folder keys.

    Args:
        s3_client: object exposing ``list_objects_v2(**kwargs)``.
        bucket_name: bucket to list.
        prefix: key prefix to list under (default: whole bucket).

    Returns:
        ``(file_names, folders)``: two lists of keys; a key ending in ``/`` is
        treated as a folder, every other key as a file.
    """
    file_names = []
    folders = []

    request_kwargs = {
        "Bucket": bucket_name,
        "Prefix": prefix
    }

    while True:
        # The previous version built the kwargs with the continuation token
        # but then sent the token-less defaults, so a truncated listing
        # re-fetched the first page forever.
        response = s3_client.list_objects_v2(**request_kwargs)

        # 'Contents' is absent when nothing matches; treat that as no keys.
        for result in response.get("Contents") or []:
            key = result.get("Key")
            if key.endswith("/"):
                folders.append(key)
            else:
                file_names.append(key)

        next_token = response.get("NextContinuationToken")
        if not next_token:
            break
        request_kwargs["ContinuationToken"] = next_token

    return file_names, folders


def download_files(s3_client, bucket_name, local_path, file_names, folders):
    """Mirror a set of S3 keys below ``local_path``.

    Every folder key becomes a local directory, and every file key is
    downloaded to ``local_path/<key>`` after its parent directory is created.

    Args:
        s3_client: object exposing ``download_file(bucket, key, filename)``.
        bucket_name: source bucket.
        local_path: local root directory.
        file_names: object keys to download.
        folders: folder keys to create locally.
    """
    root = Path(local_path)

    for folder_key in folders:
        (root / folder_key).mkdir(parents=True, exist_ok=True)

    for object_key in file_names:
        destination = root / object_key
        # The key may contain sub-directories that are not listed in `folders`.
        destination.parent.mkdir(parents=True, exist_ok=True)
        s3_client.download_file(bucket_name, object_key, str(destination))
        
def get_dataframe(base_preproc_input_dir, file_name_prefix ):    
    '''
    Read every csv file in `base_preproc_input_dir` whose name starts with
    `file_name_prefix` (first column used as the index) and return them
    concatenated into one DataFrame.

    Raises ValueError when no file matches.
    '''
    pattern = f"{base_preproc_input_dir}/{file_name_prefix}*.csv"
    input_files = glob(pattern)
    logger.info(f"input_files: \n {input_files}")

    if not input_files:
        message_template = ('There are no files in {}.\n' +
                            'This usually indicates that the channel ({}) was incorrectly specified,\n' +
                            'the data specification in S3 was incorrectly specified or the role specified\n' +
                            'does not have permission to access the data.')
        raise ValueError(message_template.format(base_preproc_input_dir, "train"))

    df = pd.concat([pd.read_csv(csv_path, index_col=0) for csv_path in input_files])

    logger.info(f"dataframe shape \n {df.shape}")
    logger.info(f"dataset sample \n {df.head(2)}")

    return df

def get_secret():
    """Fetch the ``prod/sagemaker`` secret from AWS Secrets Manager.

    Returns:
        The ``SecretString`` value (str) when the response contains one,
        otherwise the base64-decoded ``SecretBinary`` value (bytes).

    Raises:
        botocore.exceptions.ClientError: for every failed ``GetSecretValue``
            call. The previous version only re-raised five specific error
            codes and silently returned ``None`` for any other one, which made
            the following ``json.loads`` fail with an unrelated ``TypeError``.
    """
    # See https://docs.aws.amazon.com/secretsmanager/latest/apireference/API_GetSecretValue.html
    secret_name = "prod/sagemaker"
    region_name = "ap-northeast-2"

    # Create a Secrets Manager client
    client = boto3.session.Session().client(
        service_name='secretsmanager',
        region_name=region_name
    )

    # No try/except: any ClientError propagates unchanged to the caller.
    get_secret_value_response = client.get_secret_value(SecretId=secret_name)

    if 'SecretString' in get_secret_value_response:
        return get_secret_value_response['SecretString']
    return base64.b64decode(get_secret_value_response['SecretBinary'])
        
def fill_missing_dates(df_in : pd.DataFrame,
                       freq : str
                      ) -> pd.DataFrame : 
    """Re-index a time series onto a complete date range.

    Args:
        df_in: frame with a ``ds`` column holding dates (datetime, date
            strings, or int64 values such as 20221201). It is not modified.
        freq: pandas frequency string for the target range (e.g. ``'B'``).

    Returns:
        A new frame with one row per date between the first and last ``ds`` at
        frequency ``freq``. Dates missing from the input get NaN in the other
        columns; input rows whose date is not on the range are dropped by the
        reindex.
    """
    df = df_in.copy()
    # Assign whole columns with df[...] rather than df.loc[:, ...]: recent
    # pandas versions write .loc[:, col] values into the existing column in
    # place, so the dtype conversion could be lost.
    if df["ds"].dtype == np.int64:
        df["ds"] = df["ds"].astype(str)
    df["ds"] = pd.to_datetime(df["ds"])

    full_range = pd.date_range(start = df["ds"].min(),
                               end = df["ds"].max(),
                               freq = freq)
    return df.set_index("ds").reindex(full_range).rename_axis("ds").reset_index()

def fill_missing_price_value(df: pd.DataFrame, col: str, limit_linear : int = 20 ) -> pd.DataFrame :
    """Interpolate gaps in one price column and floor negative values at 0.

    The column is cast to float and linearly interpolated in both directions
    (at most ``limit_linear`` consecutive NaNs per direction), and every value
    below zero is replaced by 0.

    Note: ``df`` is modified in place and the same object is returned.
    """
    prices = df.loc[:, col].astype(float).interpolate(
        method="linear", limit=limit_linear, limit_direction="both"
    )
    # mask() leaves NaN untouched, because NaN < 0 is False.
    df[col] = prices.mask(prices < 0, 0).tolist()
    return df

def scaling_value(df : pd.DataFrame,
                  col_name : str,
                  ric,
                  s3_resource,
                  BUCKET_NAME_USECASE,
                  S3_PATH_GOLDEN) -> tuple:
    """Min-max scale one column and upload the fitted scaler to S3.

    Args:
        df: frame holding the column to scale.
        col_name: name of the column to scale.
        ric: identifier used in the scaler file name.
        s3_resource: object exposing ``put_object(Body=, Bucket=, Key=)``.
        BUCKET_NAME_USECASE: destination bucket.
        S3_PATH_GOLDEN: key prefix; the scaler is stored under
            ``<prefix>/<KST date as YYYY/MM/DD>/scaler-files/``.

    Returns:
        The array returned by ``MinMaxScaler.transform`` for the column
        reshaped to ``(-1, 1)``.
    """
    column_2d = df[col_name].values.reshape(-1,1)
    scaler = MinMaxScaler()
    scaler.fit(column_2d)
    scaled = scaler.transform(column_2d)

    scaler_key = f"{S3_PATH_GOLDEN}/{KST.strftime('%Y/%m/%d')}/scaler-files/{ric}_{col_name}_scaler.pkl"
    # Serialise the scaler to a temporary file, rewind, and upload its bytes.
    with tempfile.TemporaryFile() as buffer:
        joblib.dump(scaler, buffer)
        buffer.seek(0)
        payload = buffer.read()
        s3_resource.put_object(Body = payload,
                               Bucket = BUCKET_NAME_USECASE,
                               Key = scaler_key)
    return scaled

def convert_type(raw, cols, type_target):
    '''
    Return a copy of ``raw`` in which every column listed in ``cols`` has been
    cast to ``type_target``. The input frame itself is left untouched.
    '''
    dtype_by_column = {column: type_target for column in cols}
    return raw.copy().astype(dtype_by_column)

if __name__=='__main__':
    ################################
    ###### Parse CLI arguments #####
    ################################
    # Default split date: one month before dt.today() shifted forward by 9 hours.
    split_date_default = dt.today() + relativedelta(hours = 9) - relativedelta(months=1)
    
    parser = argparse.ArgumentParser()
    parser.add_argument('--base_output_dir', type=str, default="/opt/ml/processing/output")
    parser.add_argument('--base_preproc_input_dir', type=str, default="/opt/ml/processing/input")   
    parser.add_argument('--split_date', type=str, default=split_date_default.strftime('%Y-%m-%d'))       
    parser.add_argument('--label_column', type=str, default="ric") 
    # Parsed as int so the declared type matches the integer default.
    parser.add_argument("--scaler_switch", type = int, default = 1, help = '1이면 Scaling ON, 0이면 Scaling OFF')
        
    # parse arguments
    args = parser.parse_args()     

    logger.info("######### Argument Info ####################################")
    logger.info(f"args.base_output_dir: {args.base_output_dir}")
    logger.info(f"args.base_preproc_input_dir: {args.base_preproc_input_dir}")    
    logger.info(f"args.label_column: {args.label_column}")        
    logger.info(f"args.split_date: {args.split_date}")   
    logger.info(f"args.scaler_switch: {args.scaler_switch}")   
    
    base_output_dir = args.base_output_dir
    base_preproc_input_dir = args.base_preproc_input_dir
    label_column = args.label_column
    split_date = args.split_date    
    scaler_switch = int(args.scaler_switch)

    # Create every output folder up front so the to_csv calls below cannot
    # fail on a missing directory.
    stage_dir = f'{base_output_dir}/stage'
    train_dir = f'{base_output_dir}/train'
    test_dir = f'{base_output_dir}/test'
    for out_dir in (stage_dir, train_dir, test_dir):
        os.makedirs(out_dir, exist_ok=True)

    ############################################
    ###### Load keys from Secrets Manager ######
    ############################################
    logger.info(f"\n### Loading the key value using Secret Manager")

    keychain = json.loads(get_secret())
    ACCESS_KEY_ID = keychain['ACCESS_KEY_ID_ent']
    ACCESS_SECRET_KEY = keychain['ACCESS_SECRET_KEY_ent']

    BUCKET_NAME_USECASE = keychain['BUCKET_NAME_USECASE_ent']
    S3_PATH_STAGE = keychain['S3_PATH_STAGE']
    S3_PATH_GOLDEN = keychain['S3_PATH_GOLDEN']
    S3_PATH_TRAIN = keychain['S3_PATH_TRAIN']
    S3_PATH_log = keychain['S3_PATH_LOG']

    boto_session = boto3.Session(ACCESS_KEY_ID, ACCESS_SECRET_KEY)
    region = boto_session.region_name
    s3_resource = boto_session.resource('s3')
    s3_client = boto_session.client('s3')

    ############################################
    ###### 1. Data integration           #######
    ############################################
    total_start = time.time()
    start = time.time()
    logger.info(f"\n### Data Integration")

    # Collect every .csv file below the input folder.
    path_list = []
    for (path, _subdirs, files) in os.walk(base_preproc_input_dir):
        for filename in files:
            if os.path.splitext(filename)[-1] == '.csv':
                path_list.append("%s/%s" % (path, filename))

    logger.info(f"The number for data : {len(path_list)}")
    if not path_list:
        raise FileNotFoundError(f"No .csv files found under {base_preproc_input_dir}")

    # Read everything first and concatenate once; concatenating inside the
    # loop re-copies the accumulated frame for every file.
    df_sum = pd.concat([pd.read_csv(file, encoding='utf-8') for file in path_list])
    df_sum = df_sum.sort_values(by='Date').reset_index(drop=True)
    df_sum.to_csv(f"{stage_dir}/stage_integrated.csv", index = False)
    end = time.time()
    
    logger.info(f"Data Integration is done")
    logger.info(f"Runtime : {end - start:.1f} sec({((end - start)/60):.1f} min)")
    logger.info(f"The number for data : {len(path_list)}")
    logger.info(f"Integrated data sample: head(2) \n {df_sum.head(2)}")
    logger.info(f"Integrated data sample: tail(2) \n {df_sum.tail(2)}")
    
    ############################################################
    ####   2. First preprocessing stage                     ####
    ####   item selection, column selection, type casting   ####
    ############################################################
    start = time.time()
    logger.info(f"\n ### RIC Item selection")    
    # ric_list is defined elsewhere in this script.
    df_sum = df_sum[df_sum['RIC'].isin(ric_list)].reset_index(drop=True)
    logger.info(f"The number for data after RIC Item selection : {df_sum.shape}")

    logger.info(f"\n ### Column selection")    
    # .copy() detaches the selection so the assignments below act on an
    # independent frame.
    df_sum = df_sum[['Date','HIGH', 'LOW', 'OPEN', 'CLOSE','RIC']].copy()
    logger.info(f"The number for data after Column selection : {df_sum.shape}")

    logger.info(f"\n ### type conversion")    
    # Plain column assignment so the new dtype actually replaces the old one.
    df_sum['Date'] = pd.to_datetime(df_sum['Date'])
    for price_col in ('HIGH', 'LOW', 'OPEN', 'CLOSE'):
        df_sum[price_col] = df_sum[price_col].astype(np.float32)

    ####################################################
    ####   3. Build one frame per RIC item           ####
    ####################################################
    logger.info(f"\n ### Autogluon timeseriesdataframe Conversion")        
    df_list = OrderedDict()
    for name in ric_list:
        df_tmp = df_sum[df_sum['RIC'] == name].drop('RIC', axis=1)
        # Only rows dated 2014-07-02 or later are kept.
        df_list[name] = df_tmp[df_tmp['Date'] >= '2014-07-02'].reset_index(drop = True)

    ####################################################
    ####   4. Rename columns, fill missing values    ####
    ####################################################
    logger.info(f"\n ### Rename columns")        
    col_names = ['ds','high','low','open','y']
    for name, value in df_list.items():
        value.columns = col_names

    logger.info(f"\n ### Fill missing value (Date)")        
    for name, value in df_list.items():
        df_list[name] = fill_missing_dates(value, 'B')
    
    logger.info(f"\n ### Fill missing value (Price)")        
    for name, value in df_list.items():
        # Chain the calls explicitly instead of relying on in-place mutation.
        for price_col in ('y', 'high', 'low', 'open'):
            value = fill_missing_price_value(value, price_col)
        df_list[name] = value
        
    ####################################################
    #################   5. Scaling  ###################
    ####################################################
    if scaler_switch == 1:
        logger.info(f"\n ### Scaling")            
        scale_dir = f"{base_output_dir}/scaler-files"
        os.makedirs(scale_dir, exist_ok=True)
        for name, value in df_list.items():
            for col in ['y','high','open','low']:
                # scaling_value returns a single-column 2-D array; flatten it
                # before storing it back as a column.
                value[col] = np.ravel(scaling_value(value, col, name, s3_client, BUCKET_NAME_USECASE, S3_PATH_GOLDEN))
            df_list[name] = value
    else:
        logger.info(f"\n ### No Scaling")
    end = time.time()
    logger.info(f"\n### All Date Transform is done")
    print(f"All Date Transform Run time : {end - start:.1f} sec({((end - start)/60):.1f} min)")

    #####################################################
    #####   6. Split into train / test folds and save ###
    #####################################################
    logger.info(f"\n ### Split train, test dataset")            
    df_golden = pd.concat(
        [value.assign(ric = name) for name, value in df_list.items()]
    ).reset_index(drop = True)

    # Fold 1 splits the full data at split_date.
    fold_split = pd.to_datetime(split_date)
    df_train_fold1 = df_golden[df_golden['ds'] < fold_split]
    df_test_fold1 = df_golden[df_golden['ds'] >= fold_split]
    df_train_fold1.to_csv(f"{train_dir}/train_fold1.csv", index = False)
    df_test_fold1.to_csv(f"{test_dir}/test_fold1.csv", index = False)
    logger.info(f"fold1: split at {fold_split.strftime('%Y-%m-%d')}, "
                f"train rows {len(df_train_fold1)}, test rows {len(df_test_fold1)}")

    # Folds 2-5: each fold moves the split 30 days earlier and splits the
    # previous fold's training data. Without moving the split date every
    # later fold would equal fold 1 and have an empty test set.
    fold_step = pd.Timedelta(days=30)
    prev_train = df_train_fold1
    for fold_no in range(2, 6):
        fold_split = fold_split - fold_step
        fold_train = prev_train[prev_train['ds'] < fold_split]
        fold_test = prev_train[prev_train['ds'] >= fold_split]
        fold_train.to_csv(f"{train_dir}/train_fold{fold_no}.csv", index = False)
        fold_test.to_csv(f"{test_dir}/test_fold{fold_no}.csv", index = False)
        logger.info(f"fold{fold_no}: split at {fold_split.strftime('%Y-%m-%d')}, "
                    f"train rows {len(fold_train)}, test rows {len(fold_test)}")
        prev_train = fold_train
    
    # Report on fold 1 (the split at the requested split_date).
    logger.info(f"\n ### Final result for train dataset ")
    logger.info(f"\n ####preprocessed train shape \n {df_train_fold1.shape}")        
    logger.info(f"preprocessed train sample: head(2) \n {df_train_fold1.head(2)}")
    logger.info(f"preprocessed train sample: tail(2) \n {df_train_fold1.tail(2)}")
    
    logger.info(f"\n ####preprocessed test shape \n {df_test_fold1.shape}")            
    logger.info(f"preprocessed test sample: head(2) \n {df_test_fold1.head(2)}")
    logger.info(f"preprocessed test sample: tail(2) \n {df_test_fold1.tail(2)}")

    logger.info(f"\n### End All of data preprocessing")
    total_end = time.time()
    print(f"Run time 시간 : {total_end - total_start:.1f} sec({((total_end - total_start)/60):.1f} min)\n")
    

Writing src/preprocessing_fold.py


In [12]:
(dt.now() - relativedelta(days=60)).strftime('%Y-%m-%d')

'2022-10-09'

In [15]:
split_date

In [14]:
# split_date = '2022-10-31'
split_date = (dt.now() - relativedelta(days=30)).strftime('%Y-%m-%d') 


In [15]:
# Path of the preprocessing script; it is passed as `code=` when the
# processing step is defined further down, and persisted with %store.
# NOTE(review): the %%writefile cell above reported
# "Writing src/preprocessing_fold.py" — confirm that 'src/preprocessing.py'
# (and not the fold variant) is the script intended here.
preprocessing_code = 'src/preprocessing.py'
%store preprocessing_code

Stored 'preprocessing_code' (str)


## 2) 전처리 로직 로컬에서 실행

### (1) SageMaker Processing의 Docker Container와 같은 환경 구성
도커 컨테이너의 출력 폴더와 비슷한 환경 기술
- 로컬 경로 : opt/ml/processing/output
- 도커 경로 : /opt/ml/processing/output

In [46]:
if LOCAL_MODE:
    # Local mirrors of the container folders (relative paths, no leading "/").
    base_preproc_input_dir = 'opt/ml/processing/input'            # staged input data
    base_output_dir = 'opt/ml/processing/output'                  # root of all outputs
    base_preproc_output_stage_dir = f'{base_output_dir}/stage'    # integrated stage data
    base_preproc_output_train_dir = f'{base_output_dir}/train'    # train dataset
    base_preproc_output_test_dir =  f'{base_output_dir}/test'     # test dataset

    # Create the folders in the same order as they are declared above.
    for local_dir in (
        base_preproc_input_dir,
        base_output_dir,
        base_preproc_output_stage_dir,
        base_preproc_output_train_dir,
        base_preproc_output_test_dir,
    ):
        os.makedirs(local_dir, exist_ok=True)



In [48]:
%%time
!aws s3 cp 's3://palm-oil-price-forecast/staged-data/' 'opt/ml/processing/input2' --recursive --quiet

CPU times: user 535 ms, sys: 131 ms, total: 666 ms
Wall time: 48.6 s


In [49]:
%%time
!python src/preprocessing.py --base_preproc_input_dir 'opt/ml/processing/input2' \
                                 --base_output_dir 'opt/ml/processing/output' \
                                 --split_date '2022-10-31' \
                                 # --scaler_switch 0 \

######### Argument Info ####################################
args.base_output_dir: opt/ml/processing/output
args.base_preproc_input_dir: opt/ml/processing/input2
args.label_column: ric
args.split_date: 2022-10-31
args.scaler_switch: 0

### Loading the key value using Secret Manager

### Data Integration
The number for data : 12531
Data Integration is done
Runtime : 236.7 sec(3.9 min)
The number for data : 12531
Integrated data sample: head(2) 
          Date    HIGH     LOW    OPEN   CLOSE  VOLUME  RIC
0  1980-01-01  107.94  107.94  107.94  107.94     NaN  SPX
1  1980-01-01     NaN     NaN     NaN  879.38     NaN  HSI
Integrated data sample: tail(2) 
               Date      HIGH        LOW  ...      CLOSE        VOLUME   RIC
896249  2022-12-06  1473.520  1467.1300  ...  1471.5500  2.252947e+08  KLSE
896250  2022-12-06  3224.822  3195.0788  ...  3212.5334  3.735903e+10  SSEC

[2 rows x 7 columns]

 ### RIC Item selection
The number for data after RIC Item selection : (877162, 8)

 ### 

### (2) 전처리된 데이터 확인

In [214]:
# NOTE(review): the LOCAL_MODE guard is bypassed with `if 1:`, yet the three
# *_dir variables are only created by the LOCAL_MODE setup cell — confirm this
# cell is meant to run when LOCAL_MODE is False.
if 1:#LOCAL_MODE:
    preprocessed_stage_path = f"{base_preproc_output_stage_dir}/stage.csv"
    preprocessed_train_path = f"{base_preproc_output_train_dir}/train.csv"
    preprocessed_test_path = f"{base_preproc_output_test_dir}/test.csv"

    # Load the three preprocessed datasets in stage, train, test order.
    preprocessed_stage_df, preprocessed_train_df, preprocessed_test_df = [
        pd.read_csv(csv_path)
        for csv_path in (preprocessed_stage_path, preprocessed_train_path, preprocessed_test_path)
    ]

In [182]:
if LOCAL_MODE:
    # (heading, frame, name of the item column) for each dataset to summarise.
    ric_reports = (
        ("##Stage Data Set: ##", preprocessed_stage_df, 'RIC'),
        ("\n##Train Data Set: ##", preprocessed_train_df, 'ric'),
        ("\n##Test Data Set: ##", preprocessed_test_df, 'ric'),
    )
    for heading, frame, ric_col in ric_reports:
        print(heading)
        print(frame[[ric_col]].value_counts())

### (3) 실험: Fold 나누기

In [215]:
preprocessed_stage_df

Unnamed: 0,ds,high,low,open,y,ric
0,2014-07-02,38.959999,38.480000,38.869999,38.470001,BOc1
1,2014-07-03,38.660000,38.340000,38.470001,38.560001,BOc1
2,2014-07-04,38.590000,38.219999,38.495001,38.460001,BOc1
3,2014-07-07,38.520000,38.099998,38.520000,38.360001,BOc1
4,2014-07-08,38.410000,37.770000,38.410000,37.799999,BOc1
...,...,...,...,...,...,...
219403,2022-10-24,885.250000,863.750000,882.500000,868.500000,Wc3
219404,2022-10-25,874.000000,857.750000,871.500000,863.250000,Wc3
219405,2022-10-26,874.750000,854.750000,867.250000,871.500000,Wc3
219406,2022-10-27,885.750000,863.250000,872.500000,868.000000,Wc3


In [228]:
# Fold experiment: start 60 days back, then move the split 30 days earlier for
# each further fold, always splitting the previous fold's training data.
# A dedicated variable is used so this experiment does not overwrite the
# notebook-level `split_date` that the processing step passes as --split_date.
fold_split_date = (dt.now() - relativedelta(days=60)).strftime('%Y-%m-%d')
print(fold_split_date)

# 'ds' is compared against a 'YYYY-MM-DD' string.
df_train00 = preprocessed_train_df[preprocessed_train_df['ds'] < fold_split_date]
df_valid00 = preprocessed_train_df[preprocessed_train_df['ds'] >= fold_split_date]

for cnt in range(4):
    fold_split_date = (dt.strptime(fold_split_date, '%Y-%m-%d') - relativedelta(days=30)).strftime('%Y-%m-%d')
    prev_name = f"df_train{str(cnt).zfill(2)}"
    train_name = f"df_train{str(cnt+1).zfill(2)}"
    valid_name = f"df_valid{str(cnt+1).zfill(2)}"
    prev_fold = globals()[prev_name]

    print(fold_split_date)
    print(f"{train_name} = {prev_name}[{prev_name}['ds'] < '{fold_split_date}']")
    # Later cells read df_trainNN / df_validNN by name, so the folds are bound
    # as notebook globals — without building code strings for exec.
    globals()[train_name] = prev_fold[prev_fold['ds'] < fold_split_date]
    print(f"{valid_name} = {prev_name}[{prev_name}['ds'] >= '{fold_split_date}']")
    globals()[valid_name] = prev_fold[prev_fold['ds'] >= fold_split_date]
    print()

2022-10-09
2022-09-09
df_train01 = df_train00[df_train00['ds'] < 'split_date]
df_valid01 = df_train00[df_train00['ds'] >= split_date]

2022-08-10
df_train02 = df_train01[df_train01['ds'] < 'split_date]
df_valid02 = df_train01[df_train01['ds'] >= split_date]

2022-07-11
df_train03 = df_train02[df_train02['ds'] < 'split_date]
df_valid03 = df_train02[df_train02['ds'] >= split_date]

2022-06-11
df_train04 = df_train03[df_train03['ds'] < 'split_date]
df_valid04 = df_train03[df_train03['ds'] >= split_date]



In [224]:
print(df_train01.head(2))
print(df_train01.tail(2))
print()
print(len(df_train01))

           ds       high    low       open          y   ric
0  2014-07-02  38.959999  38.48  38.869999  38.470001  BOc1
1  2014-07-03  38.660000  38.34  38.470001  38.560001  BOc1
                ds    high     low    open       y  ric
219370  2022-09-07  885.25  824.25  831.25  857.50  Wc3
219371  2022-09-08  869.50  833.25  850.75  843.75  Wc3

215772


In [225]:
# Last row, then row count, of every training fold.
train_folds = (df_train00, df_train01, df_train02, df_train03, df_train04)
for fold in train_folds:
    print(fold.tail(1))
print()
for fold in train_folds:
    print(len(fold))

                ds   high     low    open       y  ric
219392  2022-10-07  918.0  895.75  904.75  903.25  Wc3
                ds   high     low    open       y  ric
219371  2022-09-08  869.5  833.25  850.75  843.75  Wc3
                ds   high    low   open      y  ric
219349  2022-08-09  838.0  810.5  822.0  816.5  Wc3
                ds   high    low   open      y  ric
219327  2022-07-08  909.5  854.0  854.0  906.5  Wc3
                ds     high      low     open       y  ric
219307  2022-06-10  1107.25  1078.75  1099.75  1098.0  Wc3

217893
215772
213550
211328
209308


In [227]:
# First row, then row count, of every validation fold.
valid_folds = (df_valid00, df_valid01, df_valid02, df_valid03, df_valid04)
for fold in valid_folds:
    print(fold.head(1))
print()
for fold in valid_folds:
    print(len(fold))

              ds       high        low       open          y   ric
2158  2022-10-10  69.910004  69.800003  69.800003  69.910004  BOc1
              ds  high   low  open      y   ric
2137  2022-09-09  70.0  70.0  70.0  70.25  BOc1
              ds       high        low       open          y   ric
2115  2022-08-10  71.440002  69.809998  69.809998  70.559998  BOc1
              ds       high   low       open          y   ric
2093  2022-07-11  66.510002  65.0  65.910004  65.209999  BOc1
              ds   high        low       open          y   ric
2073  2022-06-13  81.07  78.720001  80.919998  79.510002  BOc1

1515
2121
2222
2222
2020


In [168]:
df_valid01['ric'].value_counts()

BOc1      22
KLKKKL    22
LWBc1     22
LCOc3     22
LCOc2     22
          ..
DJCIBR    22
DJCI      22
CTc3      22
CTc2      22
Wc3       22
Name: ric, Length: 101, dtype: int64

In [167]:
df_valid00['ric'].value_counts()

BOc1      15
KLKKKL    15
LWBc1     15
LCOc3     15
LCOc2     15
          ..
DJCIBR    15
DJCI      15
CTc3      15
CTc2      15
Wc3       15
Name: ric, Length: 101, dtype: int64

In [153]:
print(preprocessed_test_df.head(2))
print(preprocessed_test_df.tail(2))
print()
print(len(preprocessed_test_df))

           ds       high        low   open          y   ric
0  2022-10-31  74.279999  72.500000  72.50  73.239998  BOc1
1  2022-11-01  74.570000  72.739998  73.18  73.290001  BOc1
              ds    high     low   open       y  ric
2725  2022-12-05  779.75  745.75  775.0  751.75  Wc3
2726  2022-12-06  754.50  735.00  751.0  741.75  Wc3

2727


## 3) 모델 빌딩 파이프라인 의 스텝(Step) 생성
### 3.1) 모델 빌딩 파이프라인 변수 생성
파이프라인에서 사용할 파이프라인 파라미터를 정의합니다. 파이프라인을 스케줄하고 실행할 때 파라미터를 이용하여 실행조건을 커스터마이징할 수 있습니다. 파라미터를 이용하면 파이프라인 실행시마다 매번 파이프라인 정의를 수정하지 않아도 됩니다.

지원되는 파라미터 타입은 다음과 같습니다:

- ParameterString - 파이썬 타입에서 str
- ParameterInteger - 파이썬 타입에서 int
- ParameterFloat - 파이썬 타입에서 float
이들 파라미터를 정의할 때 디폴트 값을 지정할 수 있으며 파이프라인 실행시 재지정할 수도 있습니다. 지정하는 디폴트 값은 파라미터 타입과 일치하여야 합니다.

파이프라인의 각 스텝에서 사용할 변수를 파라미터 변수로서 정의 합니다.

In [21]:
from sagemaker.workflow.parameters import ParameterInteger, ParameterString

# Pipeline parameters: each carries a default and can be overridden per execution.
processing_instance_count = ParameterInteger(name="ProcessingInstanceCount", default_value=1)
processing_instance_type = ParameterString(name="ProcessingInstanceType", default_value="ml.m5.xlarge")
# S3 location of the staged input data (stage_data_uri is defined in an earlier cell).
input_stage_data = ParameterString(name="InputStageData", default_value=stage_data_uri)

### 3.2) 전처리 스텝 프로세서 정의
전처리의 내장 SKLearnProcessor 를 통해서 sklearn_processor 오브젝트를 생성 합니다.

In [None]:
from sagemaker.sklearn.processing import SKLearnProcessor

framework_version = "0.23-1"
processor_role = sagemaker.get_execution_role()

# instance_type / instance_count are pipeline parameters; the SDK warning shown
# in this cell's output says the parameter's default_value is used when the
# container image is looked up.
sklearn_processor = SKLearnProcessor(
    framework_version=framework_version,
    instance_type=processing_instance_type,
    instance_count=processing_instance_count,
    base_job_name="Palm_oil_forecast-Data_transform",
    role=processor_role,
)

The input argument instance_type of function (sagemaker.image_uris.retrieve) is a pipeline variable (<class 'sagemaker.workflow.parameters.ParameterString'>), which is not allowed. The default_value of this Parameter object will be used to override it. Please make sure the default_value is valid.


### 3.3) 전처리 스텝 단계 정의
처리 단계에서는 아래와 같은 주요 인자가 있습니다.
단계 이름
- processor 기술: 위에서 생성한 processor 오브젝트를 제공
- inputs: S3의 경로를 기술하고, 도커 안에서의 다운로드 폴더(destination)를 기술합니다.
- outputs: 처리 결과가 저장될 도커 안에서의 폴더 경로를 기술합니다.

도커안의 결과 파일이 저장 후에 자동으로 S3로 업로딩을 합니다.
- job_arguments: 사용자 정의의 인자를 기술 합니다.
- code: 전처리 코드의 경로를 기술 합니다.
처리 단계의 상세한 사항은 여기를 보세요. --> 처리 단계, Processing Step

In [44]:
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.workflow.steps import ProcessingStep

# Processing step: downloads the staged data into the container, runs the
# preprocessing script, and uploads the stage/train/test output folders.
step_process = ProcessingStep(
    name = "Palm_oil_forecast-Processing",
    processor = sklearn_processor,
    inputs = [
        # Use the InputStageData pipeline parameter (default: stage_data_uri)
        # so overriding it at execution time actually changes the input;
        # passing stage_data_uri directly left the parameter without effect.
        ProcessingInput(source = input_stage_data,
                        destination = '/opt/ml/processing/input'),
    ],
    outputs = [
        # All three outputs are uploaded under the same S3 prefix.
        ProcessingOutput(output_name = "stage",
                         source = '/opt/ml/processing/output/stage',
                         destination = preproc_data_dir),
        ProcessingOutput(output_name = "train",
                         source = '/opt/ml/processing/output/train',
                         destination = preproc_data_dir),
        ProcessingOutput(output_name = "test",
                         source = '/opt/ml/processing/output/test',
                         destination = preproc_data_dir),
    ],
    # split_date is a plain string fixed when the pipeline definition is built.
    job_arguments=["--split_date", split_date],    
    code = preprocessing_code
)

## 4) 파라미터, 단계, 조건을 조합하여 최종 파이프라인 정의 및 실행
이제 지금까지 생성한 단계들을 하나의 파이프라인으로 조합하고 실행하도록 하겠습니다.

파이프라인은 name, parameters, steps 속성이 필수적으로 필요합니다. 여기서 파이프라인의 이름은 (account, region) 조합에 대하여 유일(unique)해야 합니다.

주의:

- 정의에 사용한 모든 파라미터가 존재해야 합니다.
- 파이프라인으로 전달된 단계(step)들은 실행순서와는 무관합니다. SageMaker Pipeline은 단계가 실행되고 완료될 수 있도록 의존관계를 해석합니다.
- [알림] 정의한 steps가 복수개이면 복수개를 기술합니다. 만약에 step 간에 의존성이 있으면, 명시적으로 기술하지 않아도 같이 실행됩니다.

In [None]:
from sagemaker.workflow.pipeline import Pipeline

pipeline_name = project_prefix

# Every parameter referenced by the steps has to be listed here.
pipeline_parameters = [
    processing_instance_type,
    processing_instance_count,
    input_stage_data,
]
pipeline_steps = [step_process]

pipeline = Pipeline(
    name=pipeline_name,
    parameters=pipeline_parameters,
    steps=pipeline_steps,
)

In [None]:
import json

definition = json.loads(pipeline.definition())
definition

{'Version': '2020-12-01',
 'Metadata': {},
 'Parameters': [{'Name': 'ProcessingInstanceType',
   'Type': 'String',
   'DefaultValue': 'ml.m5.xlarge'},
  {'Name': 'ProcessingInstanceCount', 'Type': 'Integer', 'DefaultValue': 1},
  {'Name': 'InputStageData',
   'Type': 'String',
   'DefaultValue': 's3://palm-oil-price-forecast/staged-data'}],
 'PipelineExperimentConfig': {'ExperimentName': {'Get': 'Execution.PipelineName'},
  'TrialName': {'Get': 'Execution.PipelineExecutionId'}},
 'Steps': [{'Name': 'Palm_oil_forecast-Processing',
   'Type': 'Processing',
   'Arguments': {'ProcessingResources': {'ClusterConfig': {'InstanceType': {'Get': 'Parameters.ProcessingInstanceType'},
      'InstanceCount': {'Get': 'Parameters.ProcessingInstanceCount'},
      'VolumeSizeInGB': 30}},
    'AppSpecification': {'ImageUri': '366743142698.dkr.ecr.ap-northeast-2.amazonaws.com/sagemaker-scikit-learn:0.23-1-cpu-py3',
     'ContainerArguments': ['--split_date',
      '2022-10-31',
      'scaler_switch',
   

In [None]:
%%time
start = time.time()

try:
    pipeline.upsert(role_arn = sagemaker.get_execution_role())
    execution = pipeline.start()
    # wait() raises when the execution ends in a failed state (see the
    # WaiterError captured in this cell's output).
    execution.wait() 
finally:
    # Always record the end time so the timing cell below works — and reports
    # the elapsed time of THIS run — even when the execution failed.
    end = time.time()

WaiterError: Waiter PipelineExecutionComplete failed: Waiter encountered a terminal failure state: For expression "PipelineExecutionStatus" we matched expected path: "Failed"

In [27]:
print(f"preprocessing 시간 : {end - start:.1f} sec")
print(f"preprocessing 시간 : {((end - start)/60):.1f} min")

preprocessing 시간 : 935.4 sec
preprocessing 시간 : 15.6 min


In [28]:
execution.describe()

{'PipelineArn': 'arn:aws:sagemaker:ap-northeast-2:276114397529:pipeline/palm-oil-price-forecast',
 'PipelineExecutionArn': 'arn:aws:sagemaker:ap-northeast-2:276114397529:pipeline/palm-oil-price-forecast/execution/otrsacpxffbz',
 'PipelineExecutionDisplayName': 'execution-1670401588923',
 'PipelineExecutionStatus': 'Succeeded',
 'PipelineExperimentConfig': {'ExperimentName': 'palm-oil-price-forecast',
  'TrialName': 'otrsacpxffbz'},
 'CreationTime': datetime.datetime(2022, 12, 7, 8, 26, 28, 861000, tzinfo=tzlocal()),
 'LastModifiedTime': datetime.datetime(2022, 12, 7, 8, 41, 52, 799000, tzinfo=tzlocal()),
 'CreatedBy': {},
 'LastModifiedBy': {},
 'ResponseMetadata': {'RequestId': '89db4cad-ed61-4aa1-aa8c-7229a8a20028',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '89db4cad-ed61-4aa1-aa8c-7229a8a20028',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '520',
   'date': 'Wed, 07 Dec 2022 08:42:02 GMT'},
  'RetryAttempts': 0}}

# 3. 전처리 결과파일 경로 추출

In [29]:
def get_proc_artifact(execution, client, kind=0):
    '''
    Return the S3 URI of one output of the pipeline's processing job.

    execution: object whose list_steps() returns step dicts carrying
               Metadata -> ProcessingJob -> Arn
    client:    object exposing describe_processing_job(ProcessingJobName=...)
    kind:      index into the job's output list
               kind: 0 --> stage
               kind: 1 --> train
               kind: 2 --> test
    '''
    steps = execution.list_steps()
    # The last list entry is used; per the original author's note this is the
    # step that was executed first.
    job_arn = steps[-1]['Metadata']['ProcessingJob']['Arn']
    # The job name is the part of the ARN after the final "/".
    job_name = job_arn.rsplit('/', 1)[-1]
    job_description = client.describe_processing_job(ProcessingJobName = job_name)
    job_outputs = job_description['ProcessingOutputConfig']['Outputs']
    return job_outputs[kind]['S3Output']['S3Uri']

In [30]:
# Build the S3 URI of each preprocessed file from the processing job's output
# prefixes (output index 0 = stage, 1 = train, 2 = test).
preprocessed_stage_uri = get_proc_artifact(execution, sm_client, kind=0) + '/stage.csv'
preprocessed_train_uri = get_proc_artifact(execution, sm_client, kind=1) + '/train.csv'
preprocessed_test_uri = get_proc_artifact(execution, sm_client, kind=2) + '/test.csv'

# The first label used to read "train_preproc_dir_artifact" although it
# prints the stage URI.
print("stage_preproc_dir_artifact: ", preprocessed_stage_uri)
print("\ntrain_preproc_dir_artifact: ", preprocessed_train_uri)
print("\ntest_preproc__dir_artifact: ", preprocessed_test_uri)

train_preproc_dir_artifact:  s3://palm-oil-price-forecast/golden-data/2022/12/07/stage.csv

train_preproc_dir_artifact:  s3://palm-oil-price-forecast/golden-data/2022/12/07/train.csv

test_preproc__dir_artifact:  s3://palm-oil-price-forecast/golden-data/2022/12/07/test.csv


In [31]:
%store preprocessed_stage_uri
%store preprocessed_train_uri
%store preprocessed_test_uri

Stored 'preprocessed_stage_uri' (str)
Stored 'preprocessed_train_uri' (str)
Stored 'preprocessed_test_uri' (str)
