In [15]:
import sagemaker.huggingface
import sagemaker
from transformers import AutoTokenizer

# Optional bucket override: S3 bucket used for uploaded data, models and logs.
# Leave as None to fall back to the session's default bucket (SageMaker
# creates that bucket automatically if it does not exist yet).
sagemaker_session_bucket = None

# Bootstrap session, needed only to look up the default bucket name.
sess = sagemaker.Session()
if sagemaker_session_bucket is None and sess is not None:
    # No explicit bucket was given, so use the default one.
    sagemaker_session_bucket = sess.default_bucket()

# IAM role the notebook is running under.
role = sagemaker.get_execution_role()

# Final session, pinned to the resolved bucket; later cells use this `sess`.
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

# Echo the resolved configuration as a sanity check.
print(f"sagemaker role arn: {role}")
print(f"sagemaker bucket: {sess.default_bucket()}")
print(f"sagemaker session region: {sess.boto_region_name}")

sagemaker role arn: arn:aws:iam::448807757624:role/service-role/AmazonSageMaker-ExecutionRole-20211202T101582
sagemaker bucket: sagemaker-us-east-2-448807757624
sagemaker session region: us-east-2


In [16]:
# Key prefix inside the session bucket that the input paths are built from.
s3_prefix = 'datasets/xsum_corrupted'

# Full S3 URIs for the test and train inputs under that prefix.
test_input_path = f's3://{sess.default_bucket()}/{s3_prefix}/test'
training_input_path = f's3://{sess.default_bucket()}/{s3_prefix}/train'

In [7]:
from sagemaker.huggingface import HuggingFace

# Hyperparameters handed to the training job (one entry per line).
hyperparameters = {
    'epochs': 1,
    'train_batch_size': 1,
    'model_name': 'facebook/bart-large',
    # presumably the in-container directory that SageMaker syncs to
    # checkpoint_s3_uri -- confirm against train_xsum.py
    'output_dir': '/opt/ml/checkpoints',
}

# Name used both as the estimator's base job name and as the S3 key prefix
# for this run's checkpoints.
job_name = "bart-large-spot-p3-16xlarge"

# S3 URI that checkpoints are uploaded to during training.
checkpoint_s3_uri = (
    f's3://{sess.default_bucket()}/{job_name}/checkpoints'
)

In [12]:
# Estimator describing the training job: runs scripts/train_xsum.py on a
# single ml.p3.16xlarge instance with the hyperparameters defined above.
huggingface_estimator = HuggingFace(
    entry_point='train_xsum.py',
    source_dir='./scripts',
    instance_type='ml.p3.16xlarge',
    instance_count=1,
    base_job_name=job_name,
    # Reuse the session configured earlier so that source code and model
    # artifacts land in the same bucket as the checkpoints, even when a
    # custom sagemaker_session_bucket is set.
    sagemaker_session=sess,
    checkpoint_s3_uri=checkpoint_s3_uri,
    # Spot training is currently disabled. To enable it, uncomment both
    # lines below; max_wait (seconds) must be equal to or greater than max_run.
    # use_spot_instances=True,
    # max_wait=50000,
    max_run=50000,  # expected max run in seconds
    role=role,
    transformers_version='4.6',
    pytorch_version='1.7',
    py_version='py36',
    hyperparameters=hyperparameters,
)

In [None]:
# Start the training job. Only the 'train' channel is supplied here;
# test_input_path is not passed to the job.
huggingface_estimator.fit(inputs={'train': training_input_path})

2021-12-06 15:40:23 Starting - Starting the training job...
2021-12-06 15:40:25 Starting - Launching requested ML instancesProfilerReport-1638805222: InProgress
.........
2021-12-06 15:42:06 Starting - Preparing the instances for training......
2021-12-06 15:43:24 Downloading - Downloading input data......
2021-12-06 15:44:07 Training - Downloading the training image............
2021-12-06 15:46:15 Training - Training image download completed. Training in progress.[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2021-12-06 15:46:16,022 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2021-12-06 15:46:16,101 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2021-12-06 15:46:17,520 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2021-12-06 15:46:18,084 sagemaker-traini