## <B> Setup for SageMaker pipleline for training multi models </B>

## AutoReload

In [117]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## 1. Check execution role

In [124]:
from sagemaker import get_execution_role

In [125]:
strSageMakerRoleName = get_execution_role().rsplit('/', 1)[-1]
print (f"SageMaker Execution Role Name: {strSageMakerRoleName}")

SageMaker Execution Role Name: AmazonSageMaker-ExecutionRole-20221004T162466


## 2. Set default parameters

In [126]:
import boto3
import sagemaker
from utils.ssm import parameter_store

In [127]:
strRegionName=boto3.Session().region_name
pm = parameter_store(strRegionName)

In [128]:
strPrefix = "SM-PIPELINE-MULTI-MODELS-"

In [129]:
pm.put_params(key="PREFIX", value=strPrefix, overwrite=True)
pm.put_params(key="".join([strPrefix, "REGION"]), value=strRegionName, overwrite=True)
pm.put_params(key="".join([strPrefix, "DEFAULT-BUCKET"]), value=sagemaker.Session().default_bucket(), overwrite=True)
pm.put_params(key="".join([strPrefix, "SAGEMAKER-ROLE-ARN"]), value=get_execution_role(), overwrite=True)
pm.put_params(key="".join([strPrefix, "ACCOUNT-ID"]), value=boto3.client("sts").get_caller_identity().get("Account"), overwrite=True)

'Store suceess'

## 2. Datasets
> 가정: 각 모델별 인풋은 모두 다르다. <br>
> 모델별 데이터를 s3에 다른 이름으로 저장한다.

### 2.1 Create bucket for input datasets

In [130]:
from utils.s3 import s3_handler

In [131]:
s3 = s3_handler(region_name=pm.get_params(key=strPrefix + "REGION"))
strDataBucketName = f"{strPrefix.lower()}datasets"

This is a S3 handler with [ap-northeast-2] region.


In [137]:
s3.create_bucket(bucket_name=strDataBucketName)
pm.put_params(key="".join([strPrefix, "DATA-BUCKET"]), value=strDataBucketName, overwrite=True)

CREATE:[sm-pipeline-multi-models-datasets] Bucket was created successfully


'Store suceess'

In [136]:
s3.delete_bucket(bucket_name=c) ## delete bucket as well as objects

DELETE: [sm-pipeline-multi-models-datasets] Bucket was deleted successfully


True

### 2.2. Store datasets

In [144]:
import concurrent.futures
import functools

In [133]:
nModels = 90

In [142]:
def worker(strDataBucketName, args):
    nIdx, nSpare= args
    print ("worker", nIdx, nSpare, strDataBucketName)
    s3.copy_object(
        source_obj="amazon-reviews-pds/tsv/amazon_reviews_us_Electronics_v1_00.tsv.gz",
        target_bucket=strDataBucketName,
        target_obj=f"model-{nIdx+1}/amazon_reviews_us_Electronics_v1_00.tsv.gz"
    )
    return f"job-{nIdx} was completed"

In [143]:
%%time
function = functools.partial(worker, strDataBucketName) # 반복되는 것은 먼저 쓰기 
with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:
    #res = list(executor.map(function, [nIdx for nIdx in range(nModels)]))
    res = list(executor.map(function, [(nIdx, nSpare+1) for (nIdx, nSpare) in zip(range(nModels), range(nModels))]))

worker 0 1 sm-pipeline-multi-models-datasets
worker 1 2 sm-pipeline-multi-models-datasets
worker 2 3 sm-pipeline-multi-models-datasets
workerworker 4 5 sm-pipeline-multi-models-datasets
 3 4 sm-pipeline-multi-models-datasets
worker worker 6 7 sm-pipeline-multi-models-datasets
5 6 sm-pipeline-multi-models-datasets
worker 7 8 sm-pipeline-multi-models-datasets
worker 8worker 9 10 sm-pipeline-multi-models-datasets
 9 sm-pipeline-multi-models-datasets
worker 10 11 sm-pipeline-multi-models-datasets
worker 11 12 sm-pipeline-multi-models-datasets
worker 12 13 sm-pipeline-multi-models-datasets
workerworker 14 15 sm-pipeline-multi-models-datasets
 13 14 sm-pipeline-multi-models-datasets
worker 15worker  worker16 17 sm-pipeline-multi-models-datasets
16  17 18 sm-pipeline-multi-models-datasets
sm-pipeline-multi-models-datasetsworker 
18 worker worker 20 21 sm-pipeline-multi-models-datasetsworker 21 22 sm-pipeline-multi-models-datasets
1919
 sm-pipeline-multi-models-datasets
worker 22 23 sm-pipelin



CPU times: user 645 ms, sys: 54.1 ms, total: 699 ms
Wall time: 7min 38s
