In [1]:
import boto3
import sagemaker
import pandas as pd

sess   = sagemaker.Session()
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()
region = boto3.Session().region_name

sm = boto3.Session().client(service_name='sagemaker', region_name=region)
iam = boto3.Session().client(service_name='iam', region_name=region)

In [2]:
%store -r processed_train_data_s3_uri

In [3]:
try:
    processed_train_data_s3_uri
except NameError:
    print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')
    print('[ERROR] Please run the notebooks in the PREPARE section before you continue.')
    print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')

In [4]:
print(processed_train_data_s3_uri)

s3://sagemaker-us-east-1-835319576252/sagemaker-scikit-learn-2020-12-18-20-01-09-904/output/bert-train


In [5]:
%store -r processed_validation_data_s3_uri

In [6]:
try:
    processed_validation_data_s3_uri
except NameError:
    print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')
    print('[ERROR] Please run the notebooks in the PREPARE section before you continue.')
    print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')

In [7]:
print(processed_validation_data_s3_uri)

s3://sagemaker-us-east-1-835319576252/sagemaker-scikit-learn-2020-12-18-20-01-09-904/output/bert-validation


In [8]:
%store -r processed_test_data_s3_uri

In [9]:
try:
    processed_test_data_s3_uri
except NameError:
    print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')
    print('[ERROR] Please run the notebooks in the PREPARE section before you continue.')
    print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')

In [10]:
print(processed_test_data_s3_uri)

s3://sagemaker-us-east-1-835319576252/sagemaker-scikit-learn-2020-12-18-20-01-09-904/output/bert-test


In [11]:
%store -r max_seq_length

In [12]:
print(max_seq_length)

64


# Specify the Dataset in S3
We are using the train, validation, and test splits created in the previous section.

In [13]:
print(processed_train_data_s3_uri)

!aws s3 ls $processed_train_data_s3_uri/

s3://sagemaker-us-east-1-835319576252/sagemaker-scikit-learn-2020-12-18-20-01-09-904/output/bert-train
2020-12-18 20:07:58     352211 part-algo-1-amazon_reviews_us_Digital_Ebook_Purchase_v1_01.tfrecord
2020-12-18 20:07:58      11710 part-algo-1-amazon_reviews_us_Digital_Video_Games_v1_00.tfrecord
2020-12-18 20:06:28      10676 part-algo-2-amazon_reviews_us_Digital_Software_v1_00.tfrecord


In [14]:
print(processed_validation_data_s3_uri)

!aws s3 ls $processed_validation_data_s3_uri/

s3://sagemaker-us-east-1-835319576252/sagemaker-scikit-learn-2020-12-18-20-01-09-904/output/bert-validation
2020-12-18 20:07:58      20135 part-algo-1-amazon_reviews_us_Digital_Ebook_Purchase_v1_01.tfrecord
2020-12-18 20:07:58        657 part-algo-1-amazon_reviews_us_Digital_Video_Games_v1_00.tfrecord
2020-12-18 20:06:28        630 part-algo-2-amazon_reviews_us_Digital_Software_v1_00.tfrecord


In [15]:
print(processed_test_data_s3_uri)

!aws s3 ls $processed_test_data_s3_uri/

s3://sagemaker-us-east-1-835319576252/sagemaker-scikit-learn-2020-12-18-20-01-09-904/output/bert-test
2020-12-18 20:07:59      19737 part-algo-1-amazon_reviews_us_Digital_Ebook_Purchase_v1_01.tfrecord
2020-12-18 20:07:59        648 part-algo-1-amazon_reviews_us_Digital_Video_Games_v1_00.tfrecord
2020-12-18 20:06:29        665 part-algo-2-amazon_reviews_us_Digital_Software_v1_00.tfrecord


# Specify S3 `Distribution Strategy`

In [16]:
from sagemaker.inputs import TrainingInput

s3_input_train_data = TrainingInput(s3_data=processed_train_data_s3_uri, 
                                         distribution='ShardedByS3Key') 
s3_input_validation_data = TrainingInput(s3_data=processed_validation_data_s3_uri, 
                                              distribution='ShardedByS3Key')
s3_input_test_data = TrainingInput(s3_data=processed_test_data_s3_uri, 
                                        distribution='ShardedByS3Key')

print(s3_input_train_data.config)
print(s3_input_validation_data.config)
print(s3_input_test_data.config)

{'DataSource': {'S3DataSource': {'S3DataType': 'S3Prefix', 'S3Uri': 's3://sagemaker-us-east-1-835319576252/sagemaker-scikit-learn-2020-12-18-20-01-09-904/output/bert-train', 'S3DataDistributionType': 'ShardedByS3Key'}}}
{'DataSource': {'S3DataSource': {'S3DataType': 'S3Prefix', 'S3Uri': 's3://sagemaker-us-east-1-835319576252/sagemaker-scikit-learn-2020-12-18-20-01-09-904/output/bert-validation', 'S3DataDistributionType': 'ShardedByS3Key'}}}
{'DataSource': {'S3DataSource': {'S3DataType': 'S3Prefix', 'S3Uri': 's3://sagemaker-us-east-1-835319576252/sagemaker-scikit-learn-2020-12-18-20-01-09-904/output/bert-test', 'S3DataDistributionType': 'ShardedByS3Key'}}}


# Setup Hyper-Parameters for Classification Layer

In [17]:
print(max_seq_length)

64


In [18]:
epochs=1
learning_rate=0.00001
epsilon=0.00000001
train_batch_size=128
validation_batch_size=128
test_batch_size=128
train_steps_per_epoch=100
validation_steps=100
test_steps=100
train_instance_count=1
train_instance_type='ml.c5.9xlarge'
train_volume_size=1024
use_xla=True
use_amp=True
freeze_bert_layer=False
enable_sagemaker_debugger=True
enable_checkpointing=False
enable_tensorboard=False
input_mode='Pipe'
run_validation=True
run_test=True
run_sample_predictions=True

In [19]:
metrics_definitions = [
     {'Name': 'train:loss', 'Regex': 'loss: ([0-9\\.]+)'},
     {'Name': 'train:accuracy', 'Regex': 'accuracy: ([0-9\\.]+)'},
     {'Name': 'validation:loss', 'Regex': 'val_loss: ([0-9\\.]+)'},
     {'Name': 'validation:accuracy', 'Regex': 'val_accuracy: ([0-9\\.]+)'},
]

# Setup Our BERT + TensorFlow Script to Run on SageMaker
Prepare our TensorFlow model to run on the managed SageMaker service

In [20]:
assume_role_policy_doc = {
  "Version": "2012-10-17",
  "Statement": [
    {
      "Effect": "Allow",
      "Principal": {
        "Service": "sagemaker.amazonaws.com"
      },
      "Action": "sts:AssumeRole"
    }
  ]
} 

In [21]:
import time

timestamp = int(time.time())

In [22]:
secure_iam_role_name = 'DSOAWS_Secure_Train_Role_{}'.format(timestamp)

In [23]:
import json
import time

from botocore.exceptions import ClientError

try:
    secure_iam_role = iam.create_role(
        RoleName=secure_iam_role_name,
        AssumeRolePolicyDocument=json.dumps(assume_role_policy_doc),
        Description='DSOAWS Secure Role'
    )
except ClientError as e:
    if e.response['Error']['Code'] == 'EntityAlreadyExists':
        iam_role = iam.get_role(RoleName=secure_iam_role_name)
#        print("Role already exists")
    else:
        print("Unexpected error: %s" % e)

print(secure_iam_role)

time.sleep(30)

{'Role': {'Path': '/', 'RoleName': 'DSOAWS_Secure_Train_Role_1608488239', 'RoleId': 'AROA4E7HNG26OO4XRMFGW', 'Arn': 'arn:aws:iam::835319576252:role/DSOAWS_Secure_Train_Role_1608488239', 'CreateDate': datetime.datetime(2020, 12, 20, 18, 17, 19, tzinfo=tzlocal()), 'AssumeRolePolicyDocument': {'Version': '2012-10-17', 'Statement': [{'Effect': 'Allow', 'Principal': {'Service': 'sagemaker.amazonaws.com'}, 'Action': 'sts:AssumeRole'}]}}, 'ResponseMetadata': {'RequestId': 'e28d803b-6af1-4c19-b376-de67a1d1f808', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': 'e28d803b-6af1-4c19-b376-de67a1d1f808', 'content-type': 'text/xml', 'content-length': '827', 'date': 'Sun, 20 Dec 2020 18:17:19 GMT'}, 'RetryAttempts': 0}}


In [24]:
iam_policy_allow_s3 = {
        'Version': '2012-10-17',
        'Statement': [{
            'Sid': '',
            'Effect': 'Allow',
            'Action': [
                's3:*'
            ],
            'Resource': [
                'arn:aws:s3:::{}'.format(bucket)
            ]
        }]
    }

In [25]:
policy_allow_s3_name='DSOAWS_Secure_Train_Allow_S3_{}'.format(timestamp)

In [26]:
import time

response = iam.put_role_policy(
    RoleName=secure_iam_role_name,
    PolicyName=policy_allow_s3_name,
    PolicyDocument=json.dumps(iam_policy_allow_s3)
)

print(response)

time.sleep(30)

{'ResponseMetadata': {'RequestId': 'ba2a161a-3b9b-4042-9323-587616fe17cf', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': 'ba2a161a-3b9b-4042-9323-587616fe17cf', 'content-type': 'text/xml', 'content-length': '206', 'date': 'Sun, 20 Dec 2020 18:17:49 GMT'}, 'RetryAttempts': 0}}


In [27]:
# Create the bucket policy
policy_deny_create_training_job = {
    "Version": "2008-10-17",
    "Statement": [
        {
            "Effect": "Deny",
            "Action": [
                "sagemaker:CreateTrainingJob",
            ],
            "Resource": [
                "*"
            ]
        }
    ]
}

In [28]:
policy_deny_create_training_job_name='DSOAWS_Secure_Train_Deny_CreateTrainingJob_Role_{}'.format(timestamp)



In [29]:
from sagemaker.tensorflow import TensorFlow

estimator = TensorFlow(entry_point='tf_bert_reviews.py',
                       source_dir='src',
                       role=secure_iam_role,
                       instance_count=train_instance_count,
                       instance_type=train_instance_type,
                       volume_size=train_volume_size,
#                        use_spot_instances=True,
#                        max_wait=7200, # Seconds to wait for spot instances to become available
                       py_version='py3',
                       framework_version='2.1.0',
                       hyperparameters={'epochs': epochs,
                                        'learning_rate': learning_rate,
                                        'epsilon': epsilon,
                                        'train_batch_size': train_batch_size,
                                        'validation_batch_size': validation_batch_size,
                                        'test_batch_size': test_batch_size,                                             
                                        'train_steps_per_epoch': train_steps_per_epoch,
                                        'validation_steps': validation_steps,
                                        'test_steps': test_steps,
                                        'use_xla': use_xla,
                                        'use_amp': use_amp,                                             
                                        'max_seq_length': max_seq_length,
                                        'freeze_bert_layer': freeze_bert_layer,
                                        'enable_sagemaker_debugger': enable_sagemaker_debugger,
                                        'enable_checkpointing': enable_checkpointing,
                                        'enable_tensorboard': enable_tensorboard,                                        
                                        'run_validation': run_validation,
                                        'run_test': run_test,
                                        'run_sample_predictions': run_sample_predictions},
                       input_mode=input_mode,
#                       max_run=7200, # number of seconds
                      )

# Verify `CreateTrainingJob: AccessDenied`

In [30]:
estimator.fit(inputs={'train': s3_input_train_data, 
                      'validation': s3_input_validation_data,
                      'test': s3_input_test_data
              },              
              wait=False)

ParamValidationError: Parameter validation failed:
Invalid type for parameter RoleName, value: {'Role': {'Path': '/', 'RoleName': 'DSOAWS_Secure_Train_Role_1608488239', 'RoleId': 'AROA4E7HNG26OO4XRMFGW', 'Arn': 'arn:aws:iam::835319576252:role/DSOAWS_Secure_Train_Role_1608488239', 'CreateDate': datetime.datetime(2020, 12, 20, 18, 17, 19, tzinfo=tzlocal()), 'AssumeRolePolicyDocument': {'Version': '2012-10-17', 'Statement': [{'Effect': 'Allow', 'Principal': {'Service': 'sagemaker.amazonaws.com'}, 'Action': 'sts:AssumeRole'}]}}, 'ResponseMetadata': {'RequestId': 'e28d803b-6af1-4c19-b376-de67a1d1f808', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': 'e28d803b-6af1-4c19-b376-de67a1d1f808', 'content-type': 'text/xml', 'content-length': '827', 'date': 'Sun, 20 Dec 2020 18:17:19 GMT'}, 'RetryAttempts': 0}}, type: <class 'dict'>, valid types: <class 'str'>

In [None]:
training_job_name = estimator.latest_training_job.name
print('Training Job Name:  {}'.format(training_job_name))

In [None]:
from IPython.core.display import display, HTML

display(HTML('<b>Review <a target="blank" href="https://console.aws.amazon.com/sagemaker/home?region={}#/jobs/{}">Training Job</a> After About 5 Minutes</b>'.format(region, training_job_name)))


In [None]:
from IPython.core.display import display, HTML

display(HTML('<b>Review <a target="blank" href="https://console.aws.amazon.com/cloudwatch/home?region={}#logStream:group=/aws/sagemaker/TrainingJobs;prefix={};streamFilter=typeLogStreamPrefix">CloudWatch Logs</a> After About 5 Minutes</b>'.format(region, training_job_name)))


In [None]:
from IPython.core.display import display, HTML

display(HTML('<b>Review <a target="blank" href="https://s3.console.aws.amazon.com/s3/buckets/{}/{}/?region={}&tab=overview">S3 Output Data</a> After The Training Job Has Completed</b>'.format(bucket, training_job_name, region)))


In [None]:
%%time

estimator.latest_training_job.wait(logs=False)

# TODO:  Remove Policies and Show Success

# Cleanup Policies and Roles

In [None]:
response = iam.delete_role_policy(
    RoleName=secure_iam_role_name,
    PolicyName=policy_deny_create_training_job_name
)
print(response)

time.sleep(30)

In [None]:
response = iam.delete_role_policy(
    RoleName=secure_iam_role_name,
    PolicyName=policy_allow_s3_name
)
print(response)

time.sleep(30)

In [None]:
iam.delete_role(RoleName=secure_iam_role_name)

time.sleep(30)

# [INFO] _Feel free to continue to the next workshop section while this notebook is running._

In [None]:
!aws s3 cp s3://$bucket/$training_job_name/output/model.tar.gz ./model.tar.gz

In [None]:
!mkdir -p ./model/
!tar -xvzf ./model.tar.gz -C ./model/

In [None]:
!saved_model_cli show --all --dir ./model/tensorflow/saved_model/0/

In [None]:
%%javascript

try {
    Jupyter.notebook.save_checkpoint();
    Jupyter.notebook.session.delete();
}
catch(err) {
    // NoOp
}