### Set up

#### 1. Set up accounts and role

In [1]:
import sagemaker
import boto3
from datetime import datetime

# Session, account id and region are all resolved from the active AWS
# credentials/configuration instead of being hard-coded.
sagemaker_session = sagemaker.Session()
account_id = boto3.client('sts').get_caller_identity().get('Account')
region = boto3.session.Session().region_name

# Both IAM roles are addressed by ARN inside the caller's own account.
# role = sagemaker.get_execution_role()
role = f"arn:aws:iam::{account_id}:role/service-role/AmazonSageMaker-ExecutionRole-20190118T115449"
step_func_role = f"arn:aws:iam::{account_id}:role/AmazonSageMaker-StepFunctionsWorkflowExecutionRole"



#### 2. Set up image and instance type

In [2]:
# pytorch_custom_image_name="ppi-extractor:gpu-1.0.0-201910130520"
# Instance type handed to every training estimator further down. The
# spot/local configuration cell special-cases the value 'local'.
instance_type = "ml.p3.2xlarge" 

In [3]:
# docker_repo = "{}.dkr.ecr.{}.amazonaws.com/{}".format(account_id, region, pytorch_custom_image_name)

#### 3. Configure train, test and validation datasets

In [4]:
# S3 bucket from which the pretrained-model, output and code locations are built.
bucket = "aegovan-data"

In [5]:
# train = "s3://{}/counterfactuals/imdb/202304021657/24_0_0/train.json".format(bucket)
# val = "s3://{}/counterfactuals/imdb/202304021657/24_0_0/val.json".format(bucket)

# train = "s3://aegovan-data/counterfactuals/imdb/202304081856/9_0_0/train.json"
# val = "s3://aegovan-data/counterfactuals/imdb/202304081856/9_0_0/val.json"

# train = "s3://aegovan-data/counterfactuals/imdb/202304091143/original/train.json"
# val = "s3://aegovan-data/counterfactuals/imdb/202304091143/original/val.json"

# train = "s3://aegovan-data/counterfactuals/imdb/202304091342/0_0_0/train.json"
# val = "s3://aegovan-data/counterfactuals/imdb/202304091342/0_0_0/val.json"

# train = "s3://aegovan-data/counterfactuals/imdb/202304091853/9_0_0/train.json"
# val = "s3://aegovan-data/counterfactuals/imdb/202304091853/9_0_0/val.json"


# train = "s3://aegovan-data/counterfactuals/imdb/202304091926/0_0_0/train.json"
# val = "s3://aegovan-data/counterfactuals/imdb/202304091926/0_0_0/val.json"

# train = "s3://aegovan-data/counterfactuals/imdb/202304092106/23_0_0/train.json"
# val = "s3://aegovan-data/counterfactuals/imdb/202304092106/23_0_0/val.json"

# train = "s3://aegovan-data/counterfactuals/imdb/202304092153/9_0_0/train.json"
# val = "s3://aegovan-data/counterfactuals/imdb/202304092153/9_0_0/val.json"

# train = "s3://aegovan-data/counterfactuals/imdb/202304092153/0_0_0/train.json"
# val = "s3://aegovan-data/counterfactuals/imdb/202304092153/0_0_0/val.json"

#s3_data_prefix = "s3://aegovan-data/counterfactuals/imdb/2023041513/"
#s3_data_prefix = "s3://aegovan-data/counterfactuals/imdb/2023041618/"
# s3_data_prefix = "s3://aegovan-data/counterfactuals/imdb/2023042021/"

#s3_data_prefix = "s3://aegovan-data/counterfactuals/imdb/2023042312/"
#s3_data_prefix = "s3://aegovan-data/counterfactuals/imdb/2023042316/"

# Root S3 prefixes of the two available dataset variants. They are derived
# from `bucket`, like every other location in this cell, so that pointing the
# notebook at a different bucket only requires changing `bucket`.
s3_data_prefix_2k = "s3://{}/counterfactuals/imdb/2023042316/".format(bucket)
s3_data_prefix_5h = "s3://{}/counterfactuals/imdb/202306041250/".format(bucket)

# Dataset used for this run. `dataset_job_prefix` is embedded in the job and
# workflow names, so change these two lines together when switching dataset.
s3_data_prefix = s3_data_prefix_2k
dataset_job_prefix = "2K"

# Pretrained model artefacts, passed to training as the PRETRAINED_MODEL input.
pretrained_bert = "s3://{}/pretrained_models/bert-base-uncased/".format(bucket)
pretrained_roberta = "s3://{}/pretrained_models/roberta-base/".format(bucket)

# Where training output and the packaged source code are written.
s3_output_path = "s3://{}/sagemakerresults/".format(bucket)
s3_code_path = "s3://{}/counterfactuals_imdb_bert_code".format(bucket)

### Start training

In [6]:
# inputs = {
#     "train" : train,
#     "val" : val,
#     "PRETRAINED_MODEL" : pretrained_bert
# }

In [7]:
# Hyperparameters for the RoBERTa runs. The dict is handed unchanged to the
# estimator, so keys and value types are kept exactly as the training script
# receives them today (some numbers are strings, some are numeric).
RoBertNetworkFactoryhyperparameters = {
    # Factories are given as dotted import paths.
    "datasetfactory": "datasets.counterfact_imbd_dataset_factory.CounterfactImdbDatasetFactory",
    "modelfactory": "models.roberta_model_factory.RobertaModelFactory",
    "batch": "8",
    "gradientaccumulationsteps": "8",
    "epochs": "100",
    "log-level": "INFO",
    "learningrate": 1e-05,
    "earlystoppingpatience": 9,
    # "checkpointdir" : sm_localcheckpoint_dir,
    # Checkpoints once every n epochs
    "checkpointfreq": 2,
}

In [8]:
# Hyperparameters for the BERT runs. Same layout as the RoBERTa set, plus the
# `tokenisor_lower_case` flag; keys and value types are kept exactly as the
# training script receives them today.
BertNetworkFactoryhyperparameters = {
    # Factories are given as dotted import paths.
    "datasetfactory": "datasets.counterfact_imbd_dataset_factory.CounterfactImdbDatasetFactory",
    "modelfactory": "models.bert_model_factory.BertModelFactory",
    "tokenisor_lower_case": 1,
    "batch": "8",
    "gradientaccumulationsteps": "8",
    "epochs": "100",
    "log-level": "INFO",
    "learningrate": 1e-05,
    "earlystoppingpatience": 9,
    # "checkpointdir" : sm_localcheckpoint_dir,
    # Checkpoints once every n epochs
    "checkpointfreq": 2,
}

In [9]:
# Metric definitions passed to the estimator: each entry names a metric and
# gives the regex whose first capture group holds its value in the training
# log output.
# The patterns are raw strings: `\d` inside an ordinary string literal is an
# invalid escape sequence, which Python warns about and plans to reject.
metric_definitions = [
    {"Name": "TrainLoss",
     "Regex": r"###score: train_loss### (\d*[.]?\d*)"},
    {"Name": "ValidationLoss",
     "Regex": r"###score: val_loss### (\d*[.]?\d*)"},
    {"Name": "TrainAccuracyScore",
     "Regex": r"###score: train_ResultScorerAccuracy_score### (\d*[.]?\d*)"},
    {"Name": "ValidationAccuracyScore",
     "Regex": r"###score: val_ResultScorerAccuracy_score### (\d*[.]?\d*)"},
]

In [10]:
# Record which commit of the code this notebook was run against: the first
# line of `git log -1` (the commit hash) and then its fifth line (the commit
# subject in the output captured below).
!git log -1 | head -1
!git log -1 | head -5 | tail -1

commit 61a4018d0e40cd2dec618994e66c2bcf082331c8
    Bugfix - Roberta model use local


In [11]:
# Spot-training switch: set True if you need spot instance
use_spot = False

# Time limits in seconds: up to 5 days of training, plus 5 extra minutes of
# allowance on top of that when waiting for spot capacity.
train_max_run_secs = 5 * 24 * 60 * 60
spot_wait_sec = 5 * 60

# A wait budget is only set when spot instances are requested.
max_wait_time_secs = (train_max_run_secs + spot_wait_sec) if use_spot else None

# During local mode, no spot.., use smaller dataset
if instance_type == 'local':
    use_spot = False
    max_wait_time_secs = 0
    wait = True
    # Use smaller dataset to run locally
    # NOTE(review): `inputs_sample` is not defined in any cell visible in this
    # notebook, so this branch presumably raises NameError - confirm before
    # switching `instance_type` to 'local'.
    inputs = inputs_sample

In [12]:
# experiments = {
 
#       "counterfact-imdb" : {
#         "hp" :BertNetworkFactoryhyperparameters,
#         "inputs" : inputs
#     }
# }

In [13]:

# base_name = "counterfact-imdb"

# hyperparameters = experiments[base_name]["hp"]


In [14]:
#hyperparameters

In [15]:
#inputs

In [16]:
#job_base_name = f"{base_name}-{inputs['train'].split('/')[-2]}".replace("_", "-")

In [17]:
# NOTE: this rebinds the name `datetime` to the module, replacing the
# `datetime` class imported in the first cell.
import datetime

# Minute-resolution timestamp (YYYYMMDDHHMM) used to make job and workflow
# names unique per notebook run.
date_fmt = datetime.datetime.now().strftime("%Y%m%d%H%M")

In [18]:
from sagemaker.s3 import S3Downloader

def create_hp(s3_data_prefix, hyperparameters, pretrained_model_uri):
    """Build one experiment definition per ``train.json`` found under an S3 prefix.

    Args:
        s3_data_prefix: S3 URI prefix that is listed with ``S3Downloader.list``.
        hyperparameters: Hyperparameter dict attached to every experiment under
            ``"hp"``. The same object is shared by all experiments (no copy).
        pretrained_model_uri: Value used for the ``"PRETRAINED_MODEL"`` input
            of every experiment.

    Returns:
        A list of dicts, one per training file, each with:
          * ``"name"``   - the training file's parent directory name with
            underscores replaced by hyphens,
          * ``"hp"``     - ``hyperparameters``,
          * ``"inputs"`` - ``"train"``, ``"val"`` and ``"PRETRAINED_MODEL"`` URIs.
        The list is ordered by the last ``_``-separated token of the parent
        directory name (compared as strings); ties keep the listing order.
    """
    train_file_name = "train.json"
    val_file_name = "val.json"

    train_files = [
        uri
        for uri in S3Downloader.list(s3_data_prefix)
        if uri.endswith("/" + train_file_name)
    ]

    # sort by index so each loop contains all the rates
    train_files.sort(key=lambda uri: uri.split("/")[-2].split("_")[-1])

    experiments = []
    for train_uri in train_files:
        base_suffix = train_uri.split("/")[-2]

        # Swap only the trailing file name. A plain str.replace("train", "val")
        # would also rewrite "train" anywhere else in the URI (bucket name,
        # prefix, directory) and point at a non-existent validation file.
        val_uri = train_uri[: -len(train_file_name)] + val_file_name

        experiments.append(
            {
                "name": f"{base_suffix}".replace("_", "-"),
                "hp": hyperparameters,
                "inputs": {
                    "train": train_uri,
                    "val": val_uri,
                    "PRETRAINED_MODEL": pretrained_model_uri,
                },
            }
        )

    return experiments


# Build the experiment lists for both model families from the same dataset
# prefix. The prefix is normalised to end in exactly one "/" before listing.
experiments_hp_bert = create_hp(
    s3_data_prefix.rstrip("/") + "/",
    BertNetworkFactoryhyperparameters,
    pretrained_bert,
)
experiments_hp_roberta = create_hp(
    s3_data_prefix.rstrip("/") + "/",
    RoBertNetworkFactoryhyperparameters,
    pretrained_roberta,
)

In [19]:
# Choose which model family is submitted in this run. The job-name prefix
# carries a matching tag, so change both lines together.
experiments_hp = experiments_hp_roberta
job_name_prefix = f"imdb-rob-{dataset_job_prefix}"

In [20]:
# Display the selected experiment definitions for a visual check before the
# workflow is built from them.
experiments_hp

[{'name': '00-00-00-01',
  'hp': {'datasetfactory': 'datasets.counterfact_imbd_dataset_factory.CounterfactImdbDatasetFactory',
   'modelfactory': 'models.roberta_model_factory.RobertaModelFactory',
   'batch': '8',
   'gradientaccumulationsteps': '8',
   'epochs': '100',
   'log-level': 'INFO',
   'learningrate': 1e-05,
   'earlystoppingpatience': 9,
   'checkpointfreq': 2},
  'inputs': {'train': 's3://aegovan-data/counterfactuals/imdb/2023042316/00_00_00_01/train.json',
   'val': 's3://aegovan-data/counterfactuals/imdb/2023042316/00_00_00_01/val.json',
   'PRETRAINED_MODEL': 's3://aegovan-data/pretrained_models/roberta-base/'}},
 {'name': '09-00-00-01',
  'hp': {'datasetfactory': 'datasets.counterfact_imbd_dataset_factory.CounterfactImdbDatasetFactory',
   'modelfactory': 'models.roberta_model_factory.RobertaModelFactory',
   'batch': '8',
   'gradientaccumulationsteps': '8',
   'epochs': '100',
   'log-level': 'INFO',
   'learningrate': 1e-05,
   'earlystoppingpatience': 9,
   'check

In [21]:
from sagemaker.pytorch import PyTorch
# Explicit names instead of `from stepfunctions.steps import *`. The wildcard
# import appears to rebind `sagemaker` (and `states`) to stepfunctions
# submodules, which is the only way `sagemaker.TrainingStep` resolved. It also
# silently shadowed the SageMaker SDK package imported in the first cell.
from stepfunctions.steps import Chain, Parallel, TrainingStep, Wait
from stepfunctions.workflow import Workflow
import random

# --- One training step per experiment -------------------------------------
train_steps = []
variations = 5  # NOTE(review): not used anywhere in this cell.
for e in experiments_hp:

    job_name = job_name_prefix + "-" + date_fmt + "-" + e["name"].replace("_", "-")
    # SageMaker training job names are limited to 63 characters. Keep the tail
    # (timestamp + experiment name), and drop any leading hyphen the cut may
    # leave behind, because a job name must start with an alphanumeric character.
    job_name = job_name[-63:].lstrip("-")
    print(f"Running {job_name}")

    estimator = PyTorch(
        entry_point='main_train_pipeline.py',
        source_dir='../src',
        dependencies=['../src/datasets', '../src/models', '../src/utils', '../src/scorers'],
        # git_config= git_config,
        # image_name= docker_repo,
        role=role,
        framework_version="1.4.0",
        py_version='py3',
        instance_count=1,
        instance_type=instance_type,
        hyperparameters=e["hp"],
        output_path=s3_output_path,
        metric_definitions=metric_definitions,
        volume_size=30,
        code_location=s3_code_path,
        debugger_hook_config=False,
        use_spot_instances=use_spot,
        max_run=train_max_run_secs,
        max_wait=max_wait_time_secs,
        # checkpoint_s3_uri=s3_checkpoint,
        # checkpoint_local_path=sm_localcheckpoint_dir
    )

    step_train = TrainingStep(
        f"Train-{job_name}",
        estimator,
        job_name,
        data=e["inputs"],
    )

    train_steps.append(step_train)

# --- Group the steps into batches that run in parallel ---------------------
# At most `max_parallel` training steps share one Parallel state. Inside a
# batch each branch first waits 30s longer than the previous one, so the
# training steps of a batch start staggered rather than all at once.
parallel_steps = []
max_parallel = 7
for i in range(0, len(train_steps), max_parallel):
    p = Parallel(f"train-p-{i}")
    for pi, s in enumerate(train_steps[i: i + max_parallel]):
        w = (pi + 1) * 30
        # State names embed the batch offset and the delay, so they do not
        # collide between batches or between branches of one batch.
        p.add_branch(Chain([Wait(f"wait-{i}-{w}", seconds=w), s]))
    parallel_steps.append(p)

# The batches themselves are chained, i.e. executed one after another.
basic_path = Chain(parallel_steps)

workflow_prefix = job_name_prefix
basic_workflow = Workflow(
    name=f"{workflow_prefix}-{date_fmt}", definition=basic_path, role=step_func_role
)



Running imdb-rob-2K-202309101635-00-00-00-01
Running imdb-rob-2K-202309101635-09-00-00-01
Running imdb-rob-2K-202309101635-19-00-00-01
Running imdb-rob-2K-202309101635-28-00-00-01
Running imdb-rob-2K-202309101635-88-00-00-01
Running imdb-rob-2K-202309101635-00-00-00-02
Running imdb-rob-2K-202309101635-09-00-00-02
Running imdb-rob-2K-202309101635-18-00-00-02
Running imdb-rob-2K-202309101635-28-00-00-02
Running imdb-rob-2K-202309101635-88-00-00-02
Running imdb-rob-2K-202309101635-00-00-00-03
Running imdb-rob-2K-202309101635-09-00-00-03
Running imdb-rob-2K-202309101635-18-00-00-03
Running imdb-rob-2K-202309101635-28-00-00-03
Running imdb-rob-2K-202309101635-89-00-00-03
Running imdb-rob-2K-202309101635-00-00-00-04
Running imdb-rob-2K-202309101635-09-00-00-04
Running imdb-rob-2K-202309101635-18-00-00-04
Running imdb-rob-2K-202309101635-28-00-00-04
Running imdb-rob-2K-202309101635-88-00-00-04
Running imdb-rob-2K-202309101635-00-00-00-05
Running imdb-rob-2K-202309101635-09-00-00-05
Running im

In [22]:
# Render the workflow graph inline, to check the chain of parallel batches
# before the state machine is created.
basic_workflow.render_graph()

In [23]:
# Create the state machine from the workflow definition. The captured output
# below is the state machine ARN returned for this run.
basic_workflow.create()

'arn:aws:states:us-east-2:324346001917:stateMachine:imdb-rob-2K-202309101635'

In [24]:
# Start an execution of the workflow. Presumably this is the call that
# actually launches the training jobs defined in the workflow - confirm
# before running, since it incurs cost.
basic_workflow.execute()