In [1]:
LOCAL_MODE = False

# 0. 환경설정

In [3]:
import argparse
import os
import requests
import tempfile
import subprocess, sys

import pandas as pd
import numpy as np
from glob import glob
import copy
from collections import OrderedDict
from pathlib import Path
import joblib

import logging
import logging.handlers

import json
import base64
import boto3
import sagemaker
from botocore.client import Config
from botocore.exceptions import ClientError

import time
from datetime import datetime as dt
import datetime
from pytz import timezone
from dateutil.relativedelta import *

In [4]:
# 한국 시간
KST = dt.today() + relativedelta(hours=9)
KST_aday_before = KST - relativedelta(days=1) 
yyyy, mm, dd = str(KST_aday_before.year), str(KST_aday_before.month).zfill(2), str(KST_aday_before.day).zfill(2)
print(f"Start job time: {KST}")

Start job time: 2023-03-21 13:13:44.426338


In [99]:
def get_secret():
    secret_name = "dev/ForecastPalmOilPrice"
    region_name = "ap-northeast-2"
    
    # Create a Secrets Manager client
    session = boto3.session.Session()
    client = session.client(
        service_name='secretsmanager',
        region_name=region_name,
    )

    try:
        get_secret_value_response = client.get_secret_value(
            SecretId=secret_name
        )
    except ClientError as e:
        if e.response['Error']['Code'] == 'DecryptionFailureException': # Secrets Manager can't decrypt the protected secret text using the provided KMS key.
            raise e
        elif e.response['Error']['Code'] == 'InternalServiceErrorException': # An error occurred on the server side.
            raise e
        elif e.response['Error']['Code'] == 'InvalidParameterException': # You provided an invalid value for a parameter.
            raise e
        elif e.response['Error']['Code'] == 'InvalidRequestException': # You provided a parameter value that is not valid for the current state of the resource.
            raise e
        elif e.response['Error']['Code'] == 'ResourceNotFoundException': # We can't find the resource that you asked for.
            raise e
    else:
        if 'SecretString' in get_secret_value_response:
            secret = get_secret_value_response['SecretString']
            return secret
        else:
            decoded_binary_secret = base64.b64decode(get_secret_value_response['SecretBinary'])
            return decoded_binary_secret

keychain = json.loads(get_secret())
ACCESS_KEY_ID = keychain['AWS_ACCESS_KEY_ID']
ACCESS_SECRET_KEY = keychain['AWS_ACCESS_SECRET_KEY']

BUCKET_NAME_USECASE = keychain['PROJECT_BUCKET_NAME']
DATALAKE_BUCKET_NAME = keychain['DATALAKE_BUCKET_NAME']

S3_PATH_REUTER = keychain['S3_PATH_REUTER']
S3_PATH_WWO = keychain['S3_PATH_WWO']
S3_PATH_STAGE = keychain['S3_PATH_STAGE']
S3_PATH_GOLDEN = keychain['S3_PATH_GOLDEN']
S3_PATH_TRAIN = keychain['S3_PATH_TRAIN']
S3_PATH_FORECAST = keychain['S3_PATH_PREDICTION']

region = 'ap-northeast-2'
boto3_session = boto3.Session(aws_access_key_id = ACCESS_KEY_ID,
                              aws_secret_access_key = ACCESS_SECRET_KEY,
                              region_name = region)
sm_session = sagemaker.Session(boto_session = boto3_session)

s3_resource = boto3_session.resource('s3')
palmoil_bucket = s3_resource.Bucket(BUCKET_NAME_USECASE)
datalake_bucket = s3_resource.Bucket(DATALAKE_BUCKET_NAME)

sm_client = boto3_session.client('sagemaker')
qs_client = boto3_session.client('quicksight')
s3_client = boto3_session.client('s3')
sts_client = boto3_session.client("sts")

In [15]:
%%writefile src/v1.2/prediction-autogluon.py

import argparse
import os
import requests
import tempfile
import subprocess, sys
import json

import glob
import pandas as pd
import joblib
import pickle
import tarfile
from io import StringIO, BytesIO

import logging
import logging.handlers

import time
from datetime import datetime as dt

import boto3


###############################
######### util 함수 설정 ##########
###############################
def _get_logger():
    loglevel = logging.DEBUG
    l = logging.getLogger(__name__)
    if not l.hasHandlers():
        l.setLevel(loglevel)
        logging.getLogger().addHandler(logging.StreamHandler(sys.stdout))        
        l.handler_set = True
    return l  
logger = _get_logger()

def get_bucket_key_from_uri(uri):
    uri_aws_path = uri.split('//')[1]
    uri_bucket = uri_aws_path.rsplit('/')[0]
    uri_file_path = '/'.join(uri_aws_path.rsplit('/')[1:])
    return uri_bucket, uri_file_path

def get_secret():
    secret_name = "dev/ForecastPalmOilPrice"
    region_name = "ap-northeast-2"
    
    # Create a Secrets Manager client
    session = boto3.session.Session()
    client = session.client(
        service_name='secretsmanager',
        region_name=region_name,
    )

    try:
        get_secret_value_response = client.get_secret_value(
            SecretId=secret_name
        )
    except ClientError as e:
        if e.response['Error']['Code'] == 'DecryptionFailureException': # Secrets Manager can't decrypt the protected secret text using the provided KMS key.
            raise e
        elif e.response['Error']['Code'] == 'InternalServiceErrorException': # An error occurred on the server side.
            raise e
        elif e.response['Error']['Code'] == 'InvalidParameterException': # You provided an invalid value for a parameter.
            raise e
        elif e.response['Error']['Code'] == 'InvalidRequestException': # You provided a parameter value that is not valid for the current state of the resource.
            raise e
        elif e.response['Error']['Code'] == 'ResourceNotFoundException': # We can't find the resource that you asked for.
            raise e
    else:
        if 'SecretString' in get_secret_value_response:
            secret = get_secret_value_response['SecretString']
            return secret
        else:
            decoded_binary_secret = base64.b64decode(get_secret_value_response['SecretBinary'])
            return decoded_binary_secret
        
def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--output_dir', type = str, default = "/opt/ml/processing/output", help='예측 결과값이 저장되는 곳, Inference 결과가 저장된다.')
    parser.add_argument('--ml_algorithm_name', type=str, default = 'Autogluon')  
    parser.add_argument('--model_package_group_name', type=str, default = BUCKET_NAME_USECASE)  
    parser.add_argument('--qs_data_name', type=str, default = 'forecast result')    

    return parser.parse_args()
        
if __name__=='__main__':
    ########################################### 
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'autogluon==0.6.1'])
    from autogluon.timeseries import TimeSeriesDataFrame, TimeSeriesPredictor
    
    ############################################
    ###### Secret Manager에서 키값 가져오기  #######
    ########################################### 
    logger.info(f"\n### Loading Key value from Secret Manager")
    keychain = json.loads(get_secret())
    ACCESS_KEY_ID = keychain['AWS_ACCESS_KEY_ID']
    ACCESS_SECRET_KEY = keychain['AWS_ACCESS_SECRET_KEY']
    BUCKET_NAME_USECASE = keychain['PROJECT_BUCKET_NAME']
    DATALAKE_BUCKET_NAME = keychain['DATALAKE_BUCKET_NAME']
    S3_PATH_REUTER = keychain['S3_PATH_REUTER']
    S3_PATH_WWO = keychain['S3_PATH_WWO']
    S3_PATH_STAGE = keychain['S3_PATH_STAGE']
    S3_PATH_GOLDEN = keychain['S3_PATH_GOLDEN']
    S3_PATH_TRAIN = keychain['S3_PATH_TRAIN']
    S3_PATH_FORECAST = keychain['S3_PATH_PREDICTION']
    
    boto3_session = boto3.Session(aws_access_key_id = ACCESS_KEY_ID,
                                  aws_secret_access_key = ACCESS_SECRET_KEY,
                                  region_name = 'ap-northeast-2')
    
    s3_client = boto3_session.client('s3')
    sm_client = boto3_session.client('sagemaker')
    qs_client = boto3_session.client('quicksight')

    sts_client = boto3_session.client("sts")
    user_account_id = sts_client.get_caller_identity()["Account"]
    ######################################
    ## 커맨드 인자, Hyperparameters 처리 ##
    ######################################  
    logger.info("######### Argument Info ####################################")
    logger.info("### start training code")    
    logger.info("### Argument Info ###")
    args = parse_args()             
    logger.info(f"args.output_dir: {args.output_dir}")
    logger.info(f"args.ml_algorithm_name: {args.ml_algorithm_name}")
    logger.info(f"args.model_package_group_name: {args.model_package_group_name}")
    logger.info(f"args.qs_data_name: {args.qs_data_name}")
# prediction_output_path = f"s3://crude-palm-oil-prices-forecast/predicted-data/2023/03/19/1679292475.0/result"

    output_dir = args.output_dir
    ml_algorithm_name = args.ml_algorithm_name
    model_package_group_name = args.model_package_group_name
    qs_data_name = args.qs_data_name

    model_dir = f'{output_dir}/model'
    os.makedirs(model_dir, exist_ok=True)
    
    result_dir = f'{output_dir}/result'
    os.makedirs(result_dir, exist_ok=True)

    manifest_base_path = f'{output_dir}/manifest'
    os.makedirs(manifest_base_path, exist_ok=True)
    ##########################################################
    ###### 적합한 모델의 URI 찾고, 탑 성능 모델 이름 가져오기 ##########
    #########################################################
    logger.info("\n######### Finding suitable model uri ####################################")
    model_registry_list = sm_client.list_model_packages(ModelPackageGroupName = model_package_group_name)['ModelPackageSummaryList']
    for model in model_registry_list:
        if (model['ModelPackageGroupName'] == model_package_group_name and
            model['ModelApprovalStatus'] == 'Approved' and
            model['ModelPackageDescription'] == ml_algorithm_name):
            mr_arn = model['ModelPackageArn']
            break
    model_spec = sm_client.describe_model_package(ModelPackageName=mr_arn)
    train_data_dir = model_spec['CustomerMetadataProperties']['train_data']
    test_data_dir = model_spec['CustomerMetadataProperties']['test_data']
    
    s3_model_uri = model_spec['InferenceSpecification']['Containers'][0]['ModelDataUrl']
    champion_model = model_spec['CustomerMetadataProperties']['champion_model']

    logger.info(f"Found suitable model uri: {s3_model_uri}")
    logger.info(f"And top model name: {champion_model}")
    
    logger.info("\n#########Download suitable model file  ####################################")
    model_bucket, model_key = get_bucket_key_from_uri(s3_model_uri)  
    logger.info(f"\nmodel_bucket: {model_bucket}, model_key: {model_key}  ####################################")
    model_obj = s3_client.get_object(Bucket = model_bucket, Key = model_key)
    
    ##########################################################
    ###### 모델 압축 풀고 TimeseriesDataFrame으로 변환 ##########
    #########################################################
    logger.info("\n######### Model zip file extraction ####################################")
    with tarfile.open(fileobj=model_obj['Body'], mode='r|gz') as file:
        file.extractall(output_dir)
    logger.info(f"list in {model_dir}: {os.listdir(model_dir)}")
    logger.info("\n######### Convert df_test dataframe into TimeSeriesDataFrame  ###########")        
    df_train = pd.read_csv(os.path.join(train_data_dir, 'train_fold1.csv'))
    df_test = pd.read_csv(os.path.join(test_data_dir, 'test_fold1.csv'))
    sum_df = pd.concat([df_train, df_test]).reset_index(drop = True)
    sum_df.loc[:, "ds"] = pd.to_datetime(sum_df.loc[:, "ds"])
    
    tdf_train = TimeSeriesDataFrame.from_data_frame(
        sum_df,
        id_column="ric",
        timestamp_column="ds",
    )
    logger.info(f"sum_df sample: head(2) \n {sum_df.head(2)}")
    logger.info(f"sum_df sample: tail(2) \n {sum_df.tail(2)}")

    ################################
    ###### Prediction 시작 ##########
    ###############################
    logger.info("\n######### Start prediction  ###########")        
    loaded_trainer = pickle.load(open(f"{model_dir}/models/trainer.pkl", 'rb'))
    logger.info(f"loaded_trainer: {loaded_trainer}")
    prediction_ag_model = loaded_trainer.predict(data = tdf_train,
                                                 model = champion_model)
    logger.info(f"prediction_ag_model sample: head(2) \n {prediction_ag_model.head(2)}")
    prediction_result = prediction_ag_model.loc['FCPOc3']
    logger.info(f"prediction_ag_model sample: head(2) \n {prediction_result.head(2)}")
    prediction_result.to_csv(f'{result_dir}/prediction_result.csv')

Overwriting src/v1.2/prediction-autogluon.py


In [23]:
%%writefile src/v1.2/visualization.py

import argparse
import os
import requests
import tempfile
import subprocess, sys
import json

import glob
import pandas as pd
import joblib
import pickle
import tarfile
from io import StringIO, BytesIO

import logging
import logging.handlers

import time
import calendar
from datetime import datetime as dt

import boto3


###############################
######### util 함수 설정 ##########
###############################
def _get_logger():
    loglevel = logging.DEBUG
    l = logging.getLogger(__name__)
    if not l.hasHandlers():
        l.setLevel(loglevel)
        logging.getLogger().addHandler(logging.StreamHandler(sys.stdout))        
        l.handler_set = True
    return l  
logger = _get_logger()

def get_secret():
    secret_name = "dev/ForecastPalmOilPrice"
    region_name = "ap-northeast-2"
    
    # Create a Secrets Manager client
    session = boto3.session.Session()
    client = session.client(
        service_name='secretsmanager',
        region_name=region_name,
    )

    try:
        get_secret_value_response = client.get_secret_value(
            SecretId=secret_name
        )
    except ClientError as e:
        if e.response['Error']['Code'] == 'DecryptionFailureException': # Secrets Manager can't decrypt the protected secret text using the provided KMS key.
            raise e
        elif e.response['Error']['Code'] == 'InternalServiceErrorException': # An error occurred on the server side.
            raise e
        elif e.response['Error']['Code'] == 'InvalidParameterException': # You provided an invalid value for a parameter.
            raise e
        elif e.response['Error']['Code'] == 'InvalidRequestException': # You provided a parameter value that is not valid for the current state of the resource.
            raise e
        elif e.response['Error']['Code'] == 'ResourceNotFoundException': # We can't find the resource that you asked for.
            raise e
    else:
        if 'SecretString' in get_secret_value_response:
            secret = get_secret_value_response['SecretString']
            return secret
        else:
            decoded_binary_secret = base64.b64decode(get_secret_value_response['SecretBinary'])
            return decoded_binary_secret
        

def register_manifest(source_path,
                      target_path,
                      s3_client,
                      BUCKET_NAME_USECASE):
    template_json = {"fileLocations": [{"URIPrefixes": []}],
                     "globalUploadSettings": {
                         "format": "CSV",
                         "delimiter": ","
                     }}
    paginator = s3_client.get_paginator('list_objects_v2')
    response_iterator = paginator.paginate(Bucket = BUCKET_NAME_USECASE,
                                           Prefix = source_path.split(BUCKET_NAME_USECASE+'/')[1]
                                          )
    for page in response_iterator:
        logger.info(f"\n#### page {page}")
        for content in page['Contents']:
            template_json['fileLocations'][0]['URIPrefixes'].append(f's3://{BUCKET_NAME_USECASE}/'+content['Key'])
    with open(f'./manifest_testing.manifest', 'w') as f:
        json.dump(template_json, f, indent=2)

    res = s3_client.upload_file('./manifest_testing.manifest',
                                BUCKET_NAME_USECASE,
                                f"{target_path.split(BUCKET_NAME_USECASE+'/')[1]}/visual_validation.manifest")
    return f"{target_path.split(BUCKET_NAME_USECASE+'/')[1]}/visual_validation.manifest"
    

def refresh_of_spice_datasets(user_account_id,
                              qs_data_name,
                              manifest_file_path,
                              BUCKET_NAME_USECASE,
                              qs_client):
    
    ds_list = qs_client.list_data_sources(AwsAccountId=user_account_id)
    datasource_ids = [summary["DataSourceId"] for summary in ds_list["DataSources"] if qs_data_name in summary["Name"]]    
    for datasource_id in datasource_ids:
        response = qs_client.update_data_source(
            AwsAccountId=user_account_id,
            DataSourceId=datasource_id,
            Name=qs_data_name,
            DataSourceParameters={
                'S3Parameters': {
                    'ManifestFileLocation': {
                        'Bucket': BUCKET_NAME_USECASE,
                        'Key':  manifest_file_path
                    },
                },
            })
        logger.info(f"datasource_id:{datasource_id} 의 manifest를 업데이트: {response}")
    
    res = qs_client.list_data_sets(AwsAccountId = user_account_id)
    datasets_ids = [summary["DataSetId"] for summary in res["DataSetSummaries"] if qs_data_name in summary["Name"]]
    ingestion_ids = []

    for dataset_id in datasets_ids:
        try:
            ingestion_id = str(calendar.timegm(time.gmtime()))
            qs_client.create_ingestion(DataSetId = dataset_id,
                                       IngestionId = ingestion_id,
                                       AwsAccountId = user_account_id)
            ingestion_ids.append(ingestion_id)
        except Exception as e:
            logger.info(e)
            pass
    for ingestion_id, dataset_id in zip(ingestion_ids, datasets_ids):
        while True:
            response = qs_client.describe_ingestion(DataSetId = dataset_id,
                                                    IngestionId = ingestion_id,
                                                    AwsAccountId = user_account_id)
            if response['Ingestion']['IngestionStatus'] in ('INITIALIZED', 'QUEUED', 'RUNNING'):
                time.sleep(5)     #change sleep time according to your dataset size
            elif response['Ingestion']['IngestionStatus'] == 'COMPLETED':
                print("refresh completed. RowsIngested {0}, RowsDropped {1}, IngestionTimeInSeconds {2}, IngestionSizeInBytes {3}".format(
                    response['Ingestion']['RowInfo']['RowsIngested'],
                    response['Ingestion']['RowInfo']['RowsDropped'],
                    response['Ingestion']['IngestionTimeInSeconds'],
                    response['Ingestion']['IngestionSizeInBytes']))
                break
            else:
                logger.info("refresh failed for {0}! - status {1}".format(dataset_id,
                                                                          response['Ingestion']['IngestionStatus']))
                break
    return response
        
def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--source_path", type=str, help='prediction_data')
    parser.add_argument("--qs_data_name", type=str, default='forecast_result')
    parser.add_argument('--model_package_group_name', type=str, default = BUCKET_NAME_USECASE)  
    return parser.parse_args()
        
if __name__=='__main__':
    ############################################
    ###### Secret Manager에서 키값 가져오기  #######
    ########################################### 
    logger.info(f"\n### Loading Key value from Secret Manager")
    keychain = json.loads(get_secret())
    ACCESS_KEY_ID = keychain['AWS_ACCESS_KEY_ID']
    ACCESS_SECRET_KEY = keychain['AWS_ACCESS_SECRET_KEY']
    BUCKET_NAME_USECASE = keychain['PROJECT_BUCKET_NAME']
    DATALAKE_BUCKET_NAME = keychain['DATALAKE_BUCKET_NAME']
    S3_PATH_REUTER = keychain['S3_PATH_REUTER']
    S3_PATH_WWO = keychain['S3_PATH_WWO']
    S3_PATH_STAGE = keychain['S3_PATH_STAGE']
    S3_PATH_GOLDEN = keychain['S3_PATH_GOLDEN']
    S3_PATH_TRAIN = keychain['S3_PATH_TRAIN']
    S3_PATH_FORECAST = keychain['S3_PATH_PREDICTION']
    
    boto3_session = boto3.Session(aws_access_key_id = ACCESS_KEY_ID,
                                  aws_secret_access_key = ACCESS_SECRET_KEY,
                                  region_name = 'ap-northeast-2')
    
    s3_client = boto3_session.client('s3')
    sm_client = boto3_session.client('sagemaker')
    qs_client = boto3_session.client('quicksight')

    sts_client = boto3_session.client("sts")
    user_account_id = sts_client.get_caller_identity()["Account"]
    ######################################
    ## 커맨드 인자, Hyperparameters 처리 ##
    ######################################  
    logger.info("######### Argument Info ####################################")
    logger.info("### start training code")    
    logger.info("### Argument Info ###")
    args = parse_args()             
    logger.info(f"args.source_path: {args.source_path}")
    logger.info(f"args.qs_data_name: {args.qs_data_name}")
    logger.info(f"args.model_package_group_name: {args.model_package_group_name}")
 
    source_path = args.source_path
    qs_data_name = args.qs_data_name    
    model_package_group_name = args.model_package_group_name
    
    target_path = source_path.rsplit('/',1)[0]+'/manifest'
    logger.info(f"\n#### target_path : {target_path}")

    logger.info(f"\n#### register_manifest")
    manifest_file_path = register_manifest(source_path, 
                                           target_path,
                                           s3_client,
                                           BUCKET_NAME_USECASE)
    logger.info(f'### manifest_file_path : {manifest_file_path}')
    logger.info(f"\n#### refresh_of_spice_datasets")
    res = refresh_of_spice_datasets(user_account_id,
                                    qs_data_name,
                                    manifest_file_path,
                                    BUCKET_NAME_USECASE,
                                    qs_client)
    logger.info(f'### refresh_of_spice_datasets : {res}')

Overwriting src/v1.2/visualization.py


In [24]:
!aws s3 cp 'src/v1.2' 's3://crude-palm-oil-prices-forecast/src' --recursive --exclude ".ipynb_checkpoints*"

upload: src/v1.2/prediction-autogluon.py to s3://crude-palm-oil-prices-forecast/src/prediction-autogluon.py
upload: src/v1.2/model_validation.py to s3://crude-palm-oil-prices-forecast/src/model_validation.py
upload: src/v1.2/train.py to s3://crude-palm-oil-prices-forecast/src/train.py
upload: src/v1.2/prediction.py to s3://crude-palm-oil-prices-forecast/src/prediction.py
upload: src/v1.2/visualization.py to s3://crude-palm-oil-prices-forecast/src/visualization.py
upload: src/v1.2/preprocessing.py to s3://crude-palm-oil-prices-forecast/src/preprocessing.py


In [25]:
prediction_code = 's3://crude-palm-oil-prices-forecast/src/prediction-autogluon.py'
visualization_code = 's3://crude-palm-oil-prices-forecast/src/visualization.py'
%store prediction_code
%store visualization_code

Stored 'prediction_code' (str)
Stored 'visualization_code' (str)


In [9]:
%store

Stored variables and their in-db values:
prediction_code                -> 's3://crude-palm-oil-prices-forecast/src/predictio
visualization_code             -> 's3://crude-palm-oil-prices-forecast/src/visualiza


In [10]:
%store -r

# 1. 모델 빌딩 파이프라인 의 스텝(Step) 생성
## 1) 모델 빌딩 파이프라인 변수 생성
파이프라인에서 사용할 파이프라인 파라미터를 정의합니다. 파이프라인을 스케줄하고 실행할 때 파라미터를 이용하여 실행조건을 커스마이징할 수 있습니다. 파라미터를 이용하면 파이프라인 실행시마다 매번 파이프라인 정의를 수정하지 않아도 됩니다.

지원되는 파라미터 타입은 다음과 같습니다:

- ParameterString - 파이썬 타입에서 str
- ParameterInteger - 파이썬 타입에서 int
- ParameterFloat - 파이썬 타입에서 float
이들 파라미터를 정의할 때 디폴트 값을 지정할 수 있으며 파이프라인 실행시 재지정할 수도 있습니다. 지정하는 디폴트 값은 파라미터 타입과 일치하여야 합니다.

본 노트북에서 사용하는 파라미터는 다음과 같습니다.

- processing_instance_type - 프로세싱 작업에서 사용할 ml.* 인스턴스 타입
- processing_instance_count - 프로세싱 작업에서 사용할 인스턴스 개수
- training_instance_type - 학습작업에서 사용할 ml.* 인스턴스 타입
- model_approval_status - 학습된 모델을 CI/CD를 목적으로 등록할 때의 승인 상태 (디폴트는 "PendingManualApproval")
- input_data - 입력데이터에 대한 S3 버킷 URI
파이프라인의 각 스텝에서 사용할 변수를 파라미터 변수로서 정의 합니다.

# 2. 파이프라인 정의 및 실행

In [26]:
from sagemaker.workflow.parameters import (ParameterInteger,
                                           ParameterString,
                                          )
prediction_instance_type = ParameterString(
    name = "PredctionInstanceType",
    default_value = "ml.m5.xlarge"
)
prediction_instance_count = ParameterInteger(
    name = "PredctionInstanceCount",
    default_value = 1
)
visualization_instance_type = ParameterString(
    name = "VisualizationInstanceType",
    default_value = "ml.m5.xlarge"
)
visualization_instance_count = ParameterInteger(
    name = "VisualizationInstanceCount",
    default_value = 1
)

In [12]:
prediction_input_path = f"s3://{BUCKET_NAME_USECASE}/{S3_PATH_FORECAST}/{yyyy}/{mm}/{dd}/1679292475.0/result"
manifest_base_path = f"s3://{BUCKET_NAME_USECASE}/{S3_PATH_FORECAST}/{yyyy}/{mm}/{dd}/1679292475.0/manifest"
print(prediction_input_path)
print(manifest_base_path)

s3://crude-palm-oil-prices-forecast/predicted-data/2023/03/19/1679292475.0/result
s3://crude-palm-oil-prices-forecast/predicted-data/2023/03/19/1679292475.0/manifest


## 1) 스텝정의

### (2) ScriptProcessor 진행

In [27]:
from sagemaker import get_execution_role
from sagemaker.image_uris import retrieve
from sagemaker.processing import ScriptProcessor, ProcessingInput, ProcessingOutput
from sagemaker.sklearn.processing import SKLearnProcessor

role = sagemaker.get_execution_role()
image_uri = retrieve(framework='mxnet',
                     region='ap-northeast-2',
                     version='1.9.0',
                     py_version='py38',
                     image_scope='training',
                     instance_type="ml.m5.xlarge")

script_processor_prediction = ScriptProcessor(
    command=['python3'],
    image_uri=image_uri,
    instance_type = prediction_instance_type,
    instance_count = prediction_instance_count,
    base_job_name = f"{BUCKET_NAME_USECASE}(Prediction)",
    role = role,
)


skframework_version = "1.0-1"#"0.23-1"
skprocessor_visualization = SKLearnProcessor(
    framework_version = skframework_version,
    instance_type = "ml.t3.medium",
    instance_count = 1,
    base_job_name = f"{BUCKET_NAME_USECASE}(Visualization)",
    role = role,
)

In [22]:
prediction_output_path = f"s3://crude-palm-oil-prices-forecast/predicted-data/2023/03/19/1679292475.0/result"

In [28]:
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.workflow.steps import ProcessingStep

step_prediction = ProcessingStep(
    name = f"{BUCKET_NAME_USECASE}-Prediction",
    processor = script_processor_prediction,
    inputs=[],
    outputs=[
        ProcessingOutput(output_name = "prediction_data",
                         source = "/opt/ml/processing/output/result",
                         destination = prediction_output_path)
        ],
    job_arguments = ["--model_package_group_name", BUCKET_NAME_USECASE,
                     "--ml_algorithm_name", 'Autogluon'],
    code = prediction_code
)

step_visualization = ProcessingStep(
    name = f"{BUCKET_NAME_USECASE}-Visualization",
    processor = skprocessor_visualization,
    job_arguments = ["--source_path", step_prediction.properties.ProcessingOutputConfig.Outputs["prediction_data"].S3Output.S3Uri,
                     "--qs_data_name", 'forecast result',
                     "--model_package_group_name", BUCKET_NAME_USECASE], 
    code = visualization_code,
)



### 1) 스텝 정의

### 1) 파이프라인 실행

In [29]:
from sagemaker.workflow.pipeline import Pipeline
from sagemaker.workflow.steps import ProcessingStep

pipeline = Pipeline(name = BUCKET_NAME_USECASE,
                    parameters = [
                        visualization_instance_type,        
                        visualization_instance_count,
                        prediction_instance_type,        
                        prediction_instance_count,
                    ],
                    steps=[step_visualization,
                          step_prediction]
)

In [30]:
import json

definition = json.loads(pipeline.definition())
definition

{'Version': '2020-12-01',
 'Metadata': {},
 'Parameters': [{'Name': 'VisualizationInstanceType',
   'Type': 'String',
   'DefaultValue': 'ml.m5.xlarge'},
  {'Name': 'VisualizationInstanceCount', 'Type': 'Integer', 'DefaultValue': 1},
  {'Name': 'PredctionInstanceType',
   'Type': 'String',
   'DefaultValue': 'ml.m5.xlarge'},
  {'Name': 'PredctionInstanceCount', 'Type': 'Integer', 'DefaultValue': 1}],
 'PipelineExperimentConfig': {'ExperimentName': {'Get': 'Execution.PipelineName'},
  'TrialName': {'Get': 'Execution.PipelineExecutionId'}},
 'Steps': [{'Name': 'crude-palm-oil-prices-forecast-Visualization',
   'Type': 'Processing',
   'Arguments': {'ProcessingResources': {'ClusterConfig': {'InstanceType': 'ml.t3.medium',
      'InstanceCount': 1,
      'VolumeSizeInGB': 30}},
    'AppSpecification': {'ImageUri': '366743142698.dkr.ecr.ap-northeast-2.amazonaws.com/sagemaker-scikit-learn:1.0-1-cpu-py3',
     'ContainerArguments': ['--source_path',
      {'Get': "Steps.crude-palm-oil-prices-

In [None]:
%%time
start = time.time()
pipeline.upsert(role_arn=sagemaker.get_execution_role())
execution = pipeline.start()
execution.wait() #실행이 완료될 때까지 기다린다.
end = time.time()

In [56]:
print(f"visualization 시간 : {end - start:.1f} sec")
print(f"visualization 시간 : {((end - start)/60):.1f} min")

visualization 시간 : 696.5 sec
visualization 시간 : 11.6 min


[2022년 11월 29일]
- prediction 시간 : 423.1 sec
- prediction 시간 : 7.1 min

In [57]:
execution.describe()

{'PipelineArn': 'arn:aws:sagemaker:ap-northeast-2:108594546720:pipeline/crude-palm-oil-prices-forecast',
 'PipelineExecutionArn': 'arn:aws:sagemaker:ap-northeast-2:108594546720:pipeline/crude-palm-oil-prices-forecast/execution/03tvfcaaprlz',
 'PipelineExecutionDisplayName': 'execution-1677737728133',
 'PipelineExecutionStatus': 'Succeeded',
 'PipelineExperimentConfig': {'ExperimentName': 'crude-palm-oil-prices-forecast',
  'TrialName': '03tvfcaaprlz'},
 'CreationTime': datetime.datetime(2023, 3, 2, 6, 15, 27, 687000, tzinfo=tzlocal()),
 'LastModifiedTime': datetime.datetime(2023, 3, 2, 6, 26, 57, 655000, tzinfo=tzlocal()),
 'CreatedBy': {},
 'LastModifiedBy': {},
 'ResponseMetadata': {'RequestId': '78e05f6b-51a4-40ce-9736-816ef89db047',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '78e05f6b-51a4-40ce-9736-816ef89db047',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '541',
   'date': 'Thu, 02 Mar 2023 06:28:05 GMT'},
  'RetryAttempts': 0}}

In [58]:
execution.list_steps()

[{'StepName': 'crude-palm-oil-prices-forecast-Visualization',
  'StartTime': datetime.datetime(2023, 3, 2, 6, 15, 29, 846000, tzinfo=tzlocal()),
  'EndTime': datetime.datetime(2023, 3, 2, 6, 26, 57, 79000, tzinfo=tzlocal()),
  'StepStatus': 'Succeeded',
  'AttemptCount': 0,
  'Metadata': {'ProcessingJob': {'Arn': 'arn:aws:sagemaker:ap-northeast-2:108594546720:processing-job/pipelines-03tvfcaaprlz-crude-palm-oil-price-dbgzcq8j0f'}}}]

In [40]:
manifest_file_path = 'predicted-data/2023/03/19/1679292475.0/manifest'

In [140]:
qs_client.list_data_sets(AwsAccountId='108594546720')

{'ResponseMetadata': {'RequestId': '9fdcb5f0-d1c2-490e-ae28-427785bd6006',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'date': 'Thu, 23 Mar 2023 12:26:55 GMT',
   'content-type': 'application/json',
   'content-length': '2831',
   'connection': 'keep-alive',
   'x-amzn-requestid': '9fdcb5f0-d1c2-490e-ae28-427785bd6006'},
  'RetryAttempts': 0},
 'Status': 200,
 'DataSetSummaries': [{'Arn': 'arn:aws:quicksight:ap-northeast-2:108594546720:dataset/1cdca429-9aa2-4623-8ee0-d48452d5090e',
   'DataSetId': '1cdca429-9aa2-4623-8ee0-d48452d5090e',
   'Name': 'Sales Pipeline',
   'CreatedTime': datetime.datetime(2023, 2, 1, 1, 30, 9, 892000, tzinfo=tzlocal()),
   'LastUpdatedTime': datetime.datetime(2023, 2, 1, 1, 30, 25, 864000, tzinfo=tzlocal()),
   'ImportMode': 'SPICE',
   'RowLevelPermissionTagConfigurationApplied': False,
   'ColumnLevelPermissionRulesApplied': False},
  {'Arn': 'arn:aws:quicksight:ap-northeast-2:108594546720:dataset/2269eae0-2254-494a-a998-b94b7c4686e9',
   'DataSetId': '226

In [141]:
qs_client.list_data_sources(AwsAccountId='108594546720')

{'ResponseMetadata': {'RequestId': '9db99fe2-204d-4da4-af3c-6f68aeb9b7c7',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'date': 'Thu, 23 Mar 2023 12:27:16 GMT',
   'content-type': 'application/json',
   'content-length': '12030',
   'connection': 'keep-alive',
   'x-amzn-requestid': '9db99fe2-204d-4da4-af3c-6f68aeb9b7c7'},
  'RetryAttempts': 0},
 'Status': 200,
 'DataSources': [{'Arn': 'arn:aws:quicksight:ap-northeast-2:108594546720:datasource/8c2f563f-9a63-4faf-80d2-c1fc61a9a8cd',
   'DataSourceId': '8c2f563f-9a63-4faf-80d2-c1fc61a9a8cd',
   'Name': 'Sales Pipeline',
   'Type': 'S3',
   'Status': 'UPDATE_SUCCESSFUL',
   'CreatedTime': datetime.datetime(2023, 2, 1, 1, 30, 8, 776000, tzinfo=tzlocal()),
   'LastUpdatedTime': datetime.datetime(2023, 2, 1, 1, 30, 8, 886000, tzinfo=tzlocal()),
   'DataSourceParameters': {'S3Parameters': {'ManifestFileLocation': {'Bucket': 'spaceneedle-samplefiles.prod.ap-northeast-2',
      'Key': 'sales/manifest.json'}}}},
  {'Arn': 'arn:aws:quicksight:ap-no

In [78]:
template_json = {"fileLocations": [{"URIPrefixes": []}],
                 "globalUploadSettings": {
                     "format": "CSV",
                     "delimiter": ","
                 }}
source_path = 's3://crude-palm-oil-prices-forecast/predicted-data/2023/03/22/1679559317.0/result'
target_path = source_path.rsplit('/',1)[0]+'/manifest'

paginator = s3_client.get_paginator('list_objects_v2')
response_iterator = paginator.paginate(Bucket = BUCKET_NAME_USECASE,
                                       Prefix = source_path.split(BUCKET_NAME_USECASE+'/')[1]
                                      )
for page in response_iterator:
    for content in page['Contents']:
        template_json['fileLocations'][0]['URIPrefixes'].append(f's3://{BUCKET_NAME_USECASE}/'+content['Key'])
    with open(f'./manifest_testing.manifest', 'w') as f:
        json.dump(template_json, f, indent=2)

    res = s3_client.upload_file('./manifest_testing.manifest',
                                BUCKET_NAME_USECASE,
                                f"{target_path.split(BUCKET_NAME_USECASE+'/')[1]}/visual_validation.manifest")
    manifest_file_path = f"{target_path.split(BUCKET_NAME_USECASE+'/')[1]}/visual_validation.manifest"

In [46]:
!aws s3 ls 's3://crude-palm-oil-prices-forecast/predicted-data/2023/03/19/1679292475.0/manifest/visual_validation.manifest'

2023-03-20 09:07:32        260 visual_validation.manifest


In [83]:
def refresh_of_spice_datasets(user_account_id,
                              qs_data_name,
                              manifest_file_path,
                              BUCKET_NAME_USECASE,
                              qs_client):
    
    ds_list = qs_client.list_data_sources(AwsAccountId=user_account_id)
    datasource_ids = [summary["DataSourceId"] for summary in ds_list["DataSources"] if qs_data_name in summary["Name"]]    
    for datasource_id in datasource_ids:
        response = qs_client.update_data_source(
            AwsAccountId=user_account_id,
            DataSourceId=datasource_id,
            Name=qs_data_name,
            DataSourceParameters={
                'S3Parameters': {
                    'ManifestFileLocation': {
                        'Bucket': BUCKET_NAME_USECASE,
                        'Key':  manifest_file_path
                    },
                },
            })
        print(f"datasource_id:{datasource_id} 의 manifest를 업데이트: {response}")
    
    res = qs_client.list_data_sets(AwsAccountId = user_account_id)
    datasets_ids = [summary["DataSetId"] for summary in res["DataSetSummaries"] if qs_data_name in summary["Name"]]
    ingestion_ids = []

    for dataset_id in datasets_ids:
        try:
            ingestion_id = str(calendar.timegm(time.gmtime()))
            qs_client.create_ingestion(DataSetId = dataset_id,
                                       IngestionId = ingestion_id,
                                       AwsAccountId = user_account_id)
            ingestion_ids.append(ingestion_id)
        except Exception as e:
            print(e)
            pass
    for ingestion_id, dataset_id in zip(ingestion_ids, datasets_ids):
        while True:
            response = qs_client.describe_ingestion(DataSetId = dataset_id,
                                                    IngestionId = ingestion_id,
                                                    AwsAccountId = user_account_id)
            if response['Ingestion']['IngestionStatus'] in ('INITIALIZED', 'QUEUED', 'RUNNING'):
                time.sleep(5)     #change sleep time according to your dataset size
            elif response['Ingestion']['IngestionStatus'] == 'COMPLETED':
                print("refresh completed. RowsIngested {0}, RowsDropped {1}, IngestionTimeInSeconds {2}, IngestionSizeInBytes {3}".format(
                    response['Ingestion']['RowInfo']['RowsIngested'],
                    response['Ingestion']['RowInfo']['RowsDropped'],
                    response['Ingestion']['IngestionTimeInSeconds'],
                    response['Ingestion']['IngestionSizeInBytes']))
                break
            else:
                print("refresh failed for {0}! - status {1}".format(dataset_id,
                                                                          response['Ingestion']['IngestionStatus']))
                break
    return response

In [107]:
def register_manifest(source_path,
                      target_path,
                      s3_client,
                      BUCKET_NAME_USECASE):
    template_json = {"fileLocations": [{"URIPrefixes": []}],
                     "globalUploadSettings": {
                         "format": "CSV",
                         "delimiter": ","
                     }}
    paginator = s3_client.get_paginator('list_objects_v2')
    response_iterator = paginator.paginate(Bucket = BUCKET_NAME_USECASE,
                                           Prefix = source_path.split(BUCKET_NAME_USECASE+'/')[1]
                                          )
    for page in response_iterator:
        for content in page['Contents']:
            template_json['fileLocations'][0]['URIPrefixes'].append(f's3://{BUCKET_NAME_USECASE}/'+content['Key'])
    with open(f'./manifest_testing.manifest', 'w') as f:
        json.dump(template_json, f, indent=2)

    res = s3_client.upload_file('./manifest_testing.manifest',
                                BUCKET_NAME_USECASE,
                                f"{target_path.split(BUCKET_NAME_USECASE+'/')[1]}/visual_validation.manifest")
    return f"{target_path.split(BUCKET_NAME_USECASE+'/')[1]}/visual_validation.manifest"
    
def refresh_of_spice_datasets(user_account_id,
                              qs_data_name,
                              manifest_file_path,
                              BUCKET_NAME_USECASE,
                              qs_client):
    
    ds_list = qs_client.list_data_sources(AwsAccountId='108594546720')
    datasource_ids = [summary["DataSourceId"] for summary in ds_list["DataSources"] if qs_data_name in summary["Name"]]    
    for datasource_id in datasource_ids:
        response = qs_client.update_data_source(
            AwsAccountId=user_account_id,
            DataSourceId=datasource_id,
            Name=qs_data_name,
            DataSourceParameters={
                'S3Parameters': {
                    'ManifestFileLocation': {
                        'Bucket': BUCKET_NAME_USECASE,
                        'Key':  manifest_file_path
                    },
                },
            })
        print(f"datasource_id:{datasource_id} 의 manifest를 업데이트: {response}")
    
    res = qs_client.list_data_sets(AwsAccountId = user_account_id)
    datasets_ids = [summary["DataSetId"] for summary in res["DataSetSummaries"] if qs_data_name in summary["Name"]]
    ingestion_ids = []

    for dataset_id in datasets_ids:
        try:
            ingestion_id = str(calendar.timegm(time.gmtime()))
            qs_client.create_ingestion(DataSetId = dataset_id,
                                       IngestionId = ingestion_id,
                                       AwsAccountId = user_account_id)
            ingestion_ids.append(ingestion_id)
        except Exception as e:
            print(e)
            pass
    for ingestion_id, dataset_id in zip(ingestion_ids, datasets_ids):
        while True:
            response = qs_client.describe_ingestion(DataSetId = dataset_id,
                                                    IngestionId = ingestion_id,
                                                    AwsAccountId = user_account_id)
            if response['Ingestion']['IngestionStatus'] in ('INITIALIZED', 'QUEUED', 'RUNNING'):
                time.sleep(5)     #change sleep time according to your dataset size
            elif response['Ingestion']['IngestionStatus'] == 'COMPLETED':
                print("refresh completed. RowsIngested {0}, RowsDropped {1}, IngestionTimeInSeconds {2}, IngestionSizeInBytes {3}".format(
                    response['Ingestion']['RowInfo']['RowsIngested'],
                    response['Ingestion']['RowInfo']['RowsDropped'],
                    response['Ingestion']['IngestionTimeInSeconds'],
                    response['Ingestion']['IngestionSizeInBytes']))
                break
            else:
                print("refresh failed for {0}! - status {1}".format(dataset_id,
                                                                          response['Ingestion']['IngestionStatus']))
                break
    return response

In [None]:
user_account_id = '108594546720'
qs_data_name = 'forecast_result'
datasource_id = 'dcb10d8e-7dd7-432d-9af1-05d31e836cc1'

source_path = 's3://crude-palm-oil-prices-forecast/predicted-data/2023/03/22/1679559317.0/result'
target_path = source_path.rsplit('/',1)[0]+'/manifest'
# source_path = 

In [101]:
source_path = 's3://crude-palm-oil-prices-forecast/trained-model/2023/03/22/1679551713.0/manifest'
target_path = 's3://crude-palm-oil-prices-forecast/trained-model/2023/03/22/1679551713.0/prediction'

In [104]:
manifest_file_path = register_manifest(source_path,
                  target_path,
                  s3_client,
                  BUCKET_NAME_USECASE)

In [108]:
refresh_of_spice_datasets(user_account_id,
                              qs_data_name,
                              manifest_file_path,
                              BUCKET_NAME_USECASE,
                              qs_client)

datasource_id:dcb10d8e-7dd7-432d-9af1-05d31e836cc1 의 manifest를 업데이트: {'ResponseMetadata': {'RequestId': 'b16d7d85-e878-437b-b055-3892bd2041c2', 'HTTPStatusCode': 202, 'HTTPHeaders': {'date': 'Thu, 23 Mar 2023 06:00:29 GMT', 'content-type': 'application/json', 'content-length': '245', 'connection': 'keep-alive', 'x-amzn-requestid': 'b16d7d85-e878-437b-b055-3892bd2041c2'}, 'RetryAttempts': 0}, 'Status': 202, 'Arn': 'arn:aws:quicksight:ap-northeast-2:108594546720:datasource/dcb10d8e-7dd7-432d-9af1-05d31e836cc1', 'DataSourceId': 'dcb10d8e-7dd7-432d-9af1-05d31e836cc1', 'UpdateStatus': 'UPDATE_IN_PROGRESS', 'RequestId': 'b16d7d85-e878-437b-b055-3892bd2041c2'}
refresh completed. RowsIngested 14, RowsDropped 0, IngestionTimeInSeconds 17, IngestionSizeInBytes 1848


{'ResponseMetadata': {'RequestId': '755dfab4-be22-4b31-9d02-a5118fb78bef',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'date': 'Thu, 23 Mar 2023 06:00:51 GMT',
   'content-type': 'application/json',
   'content-length': '584',
   'connection': 'keep-alive',
   'x-amzn-requestid': '755dfab4-be22-4b31-9d02-a5118fb78bef'},
  'RetryAttempts': 0},
 'Status': 200,
 'Ingestion': {'Arn': 'arn:aws:quicksight:ap-northeast-2:108594546720:dataset/2269eae0-2254-494a-a998-b94b7c4686e9/ingestion/1679551229',
  'IngestionId': '1679551229',
  'IngestionStatus': 'COMPLETED',
  'ErrorInfo': {},
  'RowInfo': {'RowsIngested': 14, 'RowsDropped': 0, 'TotalRowsInDataset': 14},
  'CreatedTime': datetime.datetime(2023, 3, 23, 6, 0, 29, 769000, tzinfo=tzlocal()),
  'IngestionTimeInSeconds': 17,
  'IngestionSizeInBytes': 1848,
  'RequestSource': 'MANUAL',
  'RequestType': 'FULL_REFRESH'},
 'RequestId': '755dfab4-be22-4b31-9d02-a5118fb78bef'}

In [118]:
response = qs_client.create_data_source(
    AwsAccountId=user_account_id,
    DataSourceId='5308e376-ab57-440f-b38d-acf9eb635113',
    Name='final_forecast_result',
    Type='S3',
    DataSourceParameters={
        'S3Parameters': {
            'ManifestFileLocation': {
                'Bucket': BUCKET_NAME_USECASE,
                'Key': manifest_file_path
            }}}
    )

In [119]:
response

{'ResponseMetadata': {'RequestId': 'cc62258f-03e3-44f0-9537-1cecbecbe324',
  'HTTPStatusCode': 202,
  'HTTPHeaders': {'date': 'Thu, 23 Mar 2023 06:11:00 GMT',
   'content-type': 'application/json',
   'content-length': '249',
   'connection': 'keep-alive',
   'x-amzn-requestid': 'cc62258f-03e3-44f0-9537-1cecbecbe324'},
  'RetryAttempts': 0},
 'Status': 202,
 'Arn': 'arn:aws:quicksight:ap-northeast-2:108594546720:datasource/5308e376-ab57-440f-b38d-acf9eb635113',
 'DataSourceId': '5308e376-ab57-440f-b38d-acf9eb635113',
 'CreationStatus': 'CREATION_IN_PROGRESS',
 'RequestId': 'cc62258f-03e3-44f0-9537-1cecbecbe324'}

In [117]:
while True:
    response = qs_client.create_data_source(DataSourceId = dataset_id,
                                            IngestionId = ingestion_id,
                                            AwsAccountId = user_account_id)
    if response['Ingestion']['IngestionStatus'] in ('INITIALIZED', 'QUEUED', 'RUNNING'):
        time.sleep(5)     #change sleep time according to your dataset size
    elif response['Ingestion']['IngestionStatus'] == 'COMPLETED':
        print("refresh completed. RowsIngested {0}, RowsDropped {1}, IngestionTimeInSeconds {2}, IngestionSizeInBytes {3}".format(
            response['Ingestion']['RowInfo']['RowsIngested'],
            response['Ingestion']['RowInfo']['RowsDropped'],
            response['Ingestion']['IngestionTimeInSeconds'],
            response['Ingestion']['IngestionSizeInBytes']))
        break
    else:
        print("refresh failed for {0}! - status {1}".format(dataset_id,
                                                                  response['Ingestion']['IngestionStatus']))
        break

{'ResponseMetadata': {'RequestId': '985c791d-a9be-46cb-9540-dc7137654174',
  'HTTPStatusCode': 202,
  'HTTPHeaders': {'date': 'Thu, 23 Mar 2023 06:08:40 GMT',
   'content-type': 'application/json',
   'content-length': '189',
   'connection': 'keep-alive',
   'x-amzn-requestid': '985c791d-a9be-46cb-9540-dc7137654174'},
  'RetryAttempts': 0},
 'Status': 202,
 'Arn': 'arn:aws:quicksight:ap-northeast-2:108594546720:datasource/string',
 'DataSourceId': 'string',
 'CreationStatus': 'CREATION_IN_PROGRESS',
 'RequestId': '985c791d-a9be-46cb-9540-dc7137654174'}

In [111]:
sagemaker.get_execuation_role()

'arn:aws:iam::108594546720:role/service-role/AmazonSageMaker-ExecutionRole-20220901T154875'

In [120]:
response

{'ResponseMetadata': {'RequestId': 'cc62258f-03e3-44f0-9537-1cecbecbe324',
  'HTTPStatusCode': 202,
  'HTTPHeaders': {'date': 'Thu, 23 Mar 2023 06:11:00 GMT',
   'content-type': 'application/json',
   'content-length': '249',
   'connection': 'keep-alive',
   'x-amzn-requestid': 'cc62258f-03e3-44f0-9537-1cecbecbe324'},
  'RetryAttempts': 0},
 'Status': 202,
 'Arn': 'arn:aws:quicksight:ap-northeast-2:108594546720:datasource/5308e376-ab57-440f-b38d-acf9eb635113',
 'DataSourceId': '5308e376-ab57-440f-b38d-acf9eb635113',
 'CreationStatus': 'CREATION_IN_PROGRESS',
 'RequestId': 'cc62258f-03e3-44f0-9537-1cecbecbe324'}

In [130]:
user_account_id = '108594546720'
qs_data_name = 'forecast_result'
datasource_id = 'dcb10d8e-7dd7-432d-9af1-05d31e836cc1'

ds_list = qs_client.list_data_sources(AwsAccountId=user_account_id)
datasource_ids = [summary["DataSourceId"] for summary in ds_list["DataSources"] if qs_data_name in summary["Name"]]    
for datasource_id in datasource_ids:
    while True:
        response = qs_client.update_data_source(
            AwsAccountId=user_account_id,
            DataSourceId=datasource_id,
            Name=qs_data_name,
            DataSourceParameters={
                'S3Parameters': {
                    'ManifestFileLocation': {
                        'Bucket': BUCKET_NAME_USECASE,
                        'Key':  manifest_file_path
                    },
                },
            }
        )
        if response['UpdateStatus'] in ('UPDATE_IN_PROGRESS', 'CREATION_IN_PROGRESS'):
            time.sleep(5)     #change sleep time according to your dataset size
            print('wait 5s')
        elif response['UpdateStatus'] == 'UPDATE_SUCCESSFUL':
            print("refresh completed. RowsIngested {0}, RowsDropped {1}, IngestionTimeInSeconds {2}, IngestionSizeInBytes {3}".format(
                response['Ingestion']['RowInfo']['RowsIngested'],
                response['Ingestion']['RowInfo']['RowsDropped'],
                response['Ingestion']['IngestionTimeInSeconds'],
                response['Ingestion']['IngestionSizeInBytes']))
            break
        else:
            print("refresh failed for {0}! - status {1}".format(dataset_id,
                                                                          response['Ingestion']['IngestionStatus']))
            break
    print(f"datasource_id:{datasource_id} 의 manifest를 업데이트: {response}")

wait 5s
wait 5s
wait 5s
wait 5s
wait 5s
wait 5s
wait 5s
wait 5s
wait 5s


KeyboardInterrupt: 

In [None]:
'CREATION_IN_PROGRESS'|'CREATION_SUCCESSFUL'|'CREATION_FAILED'|'UPDATE_IN_PROGRESS'|'UPDATE_SUCCESSFUL'|'UPDATE_FAILED'|'DELETED',

In [121]:
test = {
   "ResponseMetadata":{
      "RequestId":"cf098a88-9f8a-4d85-a7a1-5fbb0fd7be41",
      "HTTPStatusCode":202,
      "HTTPHeaders":{
         "date":"Thu, 23 Mar 2023 05:32:19 GMT",
         "content-type":"application/json",
         "content-length":"245",
         "connection":"keep-alive",
         "x-amzn-requestid":"cf098a88-9f8a-4d85-a7a1-5fbb0fd7be41"
      },
      "RetryAttempts":0
   },
   "Status":202,
   "Arn":"arn:aws:quicksight:ap-northeast-2:108594546720:datasource/dcb10d8e-7dd7-432d-9af1-05d31e836cc1",
   "DataSourceId":"dcb10d8e-7dd7-432d-9af1-05d31e836cc1",
   "UpdateStatus":"UPDATE_IN_PROGRESS",
   "RequestId":"cf098a88-9f8a-4d85-a7a1-5fbb0fd7be41"
}

In [122]:
test['UpdateStatus']

'UPDATE_IN_PROGRESS'

In [132]:
manifest_file_path

'trained-model/2023/03/22/1679551713.0/prediction/visual_validation.manifest'

In [131]:
user_account_id = '108594546720'
qs_data_name = 'forecast_result'
datasource_id = 'dcb10d8e-7dd7-432d-9af1-05d31e836cc1'

refresh_of_spice_datasets(user_account_id, qs_data_name, manifest_file_path, BUCKET_NAME_USECASE, qs_client)

datasource_id:5308e376-ab57-440f-b38d-acf9eb635113 의 manifest를 업데이트: {'ResponseMetadata': {'RequestId': '4a2ffa37-c67d-4825-9637-fb1ea0982db5', 'HTTPStatusCode': 202, 'HTTPHeaders': {'date': 'Thu, 23 Mar 2023 07:46:27 GMT', 'content-type': 'application/json', 'content-length': '245', 'connection': 'keep-alive', 'x-amzn-requestid': '4a2ffa37-c67d-4825-9637-fb1ea0982db5'}, 'RetryAttempts': 0}, 'Status': 202, 'Arn': 'arn:aws:quicksight:ap-northeast-2:108594546720:datasource/5308e376-ab57-440f-b38d-acf9eb635113', 'DataSourceId': '5308e376-ab57-440f-b38d-acf9eb635113', 'UpdateStatus': 'UPDATE_IN_PROGRESS', 'RequestId': '4a2ffa37-c67d-4825-9637-fb1ea0982db5'}
datasource_id:dcb10d8e-7dd7-432d-9af1-05d31e836cc1 의 manifest를 업데이트: {'ResponseMetadata': {'RequestId': 'e5eea517-c6e7-4636-9828-ec3038f5ebf2', 'HTTPStatusCode': 202, 'HTTPHeaders': {'date': 'Thu, 23 Mar 2023 07:46:28 GMT', 'content-type': 'application/json', 'content-length': '245', 'connection': 'keep-alive', 'x-amzn-requestid': 'e5ee

{'ResponseMetadata': {'RequestId': 'b4be940c-8803-46ce-9f72-b350e076b5be',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'date': 'Thu, 23 Mar 2023 07:46:50 GMT',
   'content-type': 'application/json',
   'content-length': '584',
   'connection': 'keep-alive',
   'x-amzn-requestid': 'b4be940c-8803-46ce-9f72-b350e076b5be'},
  'RetryAttempts': 0},
 'Status': 200,
 'Ingestion': {'Arn': 'arn:aws:quicksight:ap-northeast-2:108594546720:dataset/2269eae0-2254-494a-a998-b94b7c4686e9/ingestion/1679557588',
  'IngestionId': '1679557588',
  'IngestionStatus': 'COMPLETED',
  'ErrorInfo': {},
  'RowInfo': {'RowsIngested': 14, 'RowsDropped': 0, 'TotalRowsInDataset': 14},
  'CreatedTime': datetime.datetime(2023, 3, 23, 7, 46, 28, 628000, tzinfo=tzlocal()),
  'IngestionTimeInSeconds': 17,
  'IngestionSizeInBytes': 1848,
  'RequestSource': 'MANUAL',
  'RequestType': 'FULL_REFRESH'},
 'RequestId': 'b4be940c-8803-46ce-9f72-b350e076b5be'}

In [59]:
for datasource_id in datasource_ids:
    print(datasource_id)
    response = qs_client.update_data_source(
        AwsAccountId='108594546720',
        DataSourceId='dcb10d8e-7dd7-432d-9af1-05d31e836cc1',
        Name=qs_data_name,
        DataSourceParameters={
            'S3Parameters': {
                'ManifestFileLocation': {
                    'Bucket': 'crude-palm-oil-prices-forecast',
                    'Key':  'predicted-data/2023/03/19/1679292475.0/manifest/visual_validation.manifest'
                },
        }})
    print(f"datasource_id:{datasource_id} 의 manifest를 업데이트: {response}")


dcb10d8e-7dd7-432d-9af1-05d31e836cc1
datasource_id:dcb10d8e-7dd7-432d-9af1-05d31e836cc1 의 manifest를 업데이트: {'ResponseMetadata': {'RequestId': '0ab328aa-ba5c-406a-b63a-7f9674543971', 'HTTPStatusCode': 202, 'HTTPHeaders': {'date': 'Wed, 22 Mar 2023 23:16:09 GMT', 'content-type': 'application/json', 'content-length': '245', 'connection': 'keep-alive', 'x-amzn-requestid': '0ab328aa-ba5c-406a-b63a-7f9674543971'}, 'RetryAttempts': 0}, 'Status': 202, 'Arn': 'arn:aws:quicksight:ap-northeast-2:108594546720:datasource/dcb10d8e-7dd7-432d-9af1-05d31e836cc1', 'DataSourceId': 'dcb10d8e-7dd7-432d-9af1-05d31e836cc1', 'UpdateStatus': 'UPDATE_IN_PROGRESS', 'RequestId': '0ab328aa-ba5c-406a-b63a-7f9674543971'}


In [137]:
print('hi')

hi


In [135]:
ds_list = qs_client.list_data_sources(AwsAccountId=user_account_id)
datasource_ids = [summary["DataSourceId"] for summary in ds_list["DataSources"] if qs_data_name in summary["Name"]]    
for datasource_id in datasource_ids:
    response = qs_client.update_data_source(
        AwsAccountId=user_account_id,
        DataSourceId=datasource_id,
        Name=qs_data_name,
        DataSourceParameters={
            'S3Parameters': {
                'ManifestFileLocation': {
                    'Bucket': 'crude-palm-oil-prices-forecast',
                    'Key':  'predicted-data/2023/03/20/1679390006.0/manifest/visual_validation.manifest'
                },
            },
        })
    print(f"datasource_id:{datasource_id} 의 manifest를 업데이트: {response}")

datasource_id:5308e376-ab57-440f-b38d-acf9eb635113 의 manifest를 업데이트: {'ResponseMetadata': {'RequestId': 'f8ec7816-ddaf-469c-b54c-7e058ef2c72f', 'HTTPStatusCode': 202, 'HTTPHeaders': {'date': 'Thu, 23 Mar 2023 11:24:23 GMT', 'content-type': 'application/json', 'content-length': '245', 'connection': 'keep-alive', 'x-amzn-requestid': 'f8ec7816-ddaf-469c-b54c-7e058ef2c72f'}, 'RetryAttempts': 0}, 'Status': 202, 'Arn': 'arn:aws:quicksight:ap-northeast-2:108594546720:datasource/5308e376-ab57-440f-b38d-acf9eb635113', 'DataSourceId': '5308e376-ab57-440f-b38d-acf9eb635113', 'UpdateStatus': 'UPDATE_IN_PROGRESS', 'RequestId': 'f8ec7816-ddaf-469c-b54c-7e058ef2c72f'}
datasource_id:dcb10d8e-7dd7-432d-9af1-05d31e836cc1 의 manifest를 업데이트: {'ResponseMetadata': {'RequestId': 'cfee156e-7c62-4c7f-a9c6-935171ef61bc', 'HTTPStatusCode': 202, 'HTTPHeaders': {'date': 'Thu, 23 Mar 2023 11:24:23 GMT', 'content-type': 'application/json', 'content-length': '245', 'connection': 'keep-alive', 'x-amzn-requestid': 'cfee

In [139]:
datasource_ids

['5308e376-ab57-440f-b38d-acf9eb635113',
 'dcb10d8e-7dd7-432d-9af1-05d31e836cc1',
 'string']

In [74]:
import calendar
user_account_id = '108594546720'
qs_data_name = 'forecast_result'
# datasource_id = 'dcb10d8e-7dd7-432d-9af1-05d31e836cc1'


res = qs_client.list_data_sets(AwsAccountId = user_account_id)
datasets_ids = [summary["DataSetId"] for summary in res["DataSetSummaries"] if qs_data_name in summary["Name"]]
ingestion_ids = []

for dataset_id in datasets_ids:
    try:
        ingestion_id = str(calendar.timegm(time.gmtime()))
        qs_client.create_ingestion(DataSetId = dataset_id,
                                   IngestionId = ingestion_id,
                                   AwsAccountId = user_account_id)
        ingestion_ids.append(ingestion_id)
    except Exception as e:
        print(e)
        pass
for ingestion_id, dataset_id in zip(ingestion_ids, datasets_ids):
    while True:
        response = qs_client.describe_ingestion(DataSetId = dataset_id,
                                                IngestionId = ingestion_id,
                                                AwsAccountId = user_account_id)
        if response['Ingestion']['IngestionStatus'] in ('INITIALIZED', 'QUEUED', 'RUNNING'):
            time.sleep(5)     #change sleep time according to your dataset size
        elif response['Ingestion']['IngestionStatus'] == 'COMPLETED':
            print("refresh completed. RowsIngested {0}, RowsDropped {1}, IngestionTimeInSeconds {2}, IngestionSizeInBytes {3}".format(
                response['Ingestion']['RowInfo']['RowsIngested'],
                response['Ingestion']['RowInfo']['RowsDropped'],
                response['Ingestion']['IngestionTimeInSeconds'],
                response['Ingestion']['IngestionSizeInBytes']))
            break
        else:
            print("refresh failed for {0}! - status {1}".format(dataset_id,
                                                                      response['Ingestion']['IngestionStatus']))
            break

refresh completed. RowsIngested 22, RowsDropped 0, IngestionTimeInSeconds 16, IngestionSizeInBytes 2904


In [None]:
model_approval_status = ParameterString(
    name="ModelApprovalStatus", default_value="PendingManualApproval"
)