In [1]:
import boto3
import sagemaker
import pandas as pd

# SageMaker session, its default bucket, and the execution role of this notebook.
sess = sagemaker.Session()
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()

# Region of the default boto3 session; every client below is created for it.
region = boto3.Session().region_name

# Service clients used by the later cells (SageMaker, IAM, EC2).
sm = boto3.Session().client('sagemaker', region_name=region)
iam = boto3.Session().client('iam', region_name=region)
ec2 = boto3.Session().client('ec2', region_name=region)

In [2]:
# Restore the processed training-data S3 URI saved earlier with %store.
%store -r processed_train_data_s3_uri

In [3]:
# Merely referencing the name raises NameError when it was never restored.
try:
    processed_train_data_s3_uri
except NameError:
    print(
        '++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++',
        '[ERROR] Please run the notebooks in the PREPARE section before you continue.',
        '++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++',
        sep='\n',
    )

In [4]:
# Show the restored training-data location.
print(processed_train_data_s3_uri)

s3://sagemaker-us-east-1-835319576252/sagemaker-scikit-learn-2020-12-18-20-19-47-606/output/bert-train


In [5]:
# Restore the processed validation-data S3 URI saved earlier with %store.
%store -r processed_validation_data_s3_uri

In [6]:
# Merely referencing the name raises NameError when it was never restored.
try:
    processed_validation_data_s3_uri
except NameError:
    print(
        '++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++',
        '[ERROR] Please run the notebooks in the PREPARE section before you continue.',
        '++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++',
        sep='\n',
    )

In [7]:
# Show the restored validation-data location.
print(processed_validation_data_s3_uri)

s3://sagemaker-us-east-1-835319576252/sagemaker-scikit-learn-2020-12-18-20-19-47-606/output/bert-validation


In [8]:
# Restore the processed test-data S3 URI saved earlier with %store.
%store -r processed_test_data_s3_uri

In [9]:
# Merely referencing the name raises NameError when it was never restored.
try:
    processed_test_data_s3_uri
except NameError:
    print(
        '++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++',
        '[ERROR] Please run the notebooks in the PREPARE section before you continue.',
        '++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++',
        sep='\n',
    )

In [10]:
# Show the restored test-data location.
print(processed_test_data_s3_uri)

s3://sagemaker-us-east-1-835319576252/sagemaker-scikit-learn-2020-12-18-20-19-47-606/output/bert-test


In [11]:
# Restore max_seq_length saved earlier with %store; it is passed to the estimators as a hyperparameter.
%store -r max_seq_length

In [12]:
# Show the restored sequence length.
print(max_seq_length)

64


# Specify the Dataset in S3
We are using the train, validation, and test splits created in the previous section.

In [13]:
# Print the training prefix, then list the objects stored under it.
print(processed_train_data_s3_uri)

# IPython expands $processed_train_data_s3_uri before running the shell command.
!aws s3 ls $processed_train_data_s3_uri/

s3://sagemaker-us-east-1-835319576252/sagemaker-scikit-learn-2020-12-18-20-19-47-606/output/bert-train
2020-12-18 20:27:09     352955 part-algo-1-amazon_reviews_us_Digital_Ebook_Purchase_v1_01.tfrecord
2020-12-18 20:27:09      11967 part-algo-1-amazon_reviews_us_Digital_Video_Games_v1_00.tfrecord
2020-12-18 20:25:44      10907 part-algo-2-amazon_reviews_us_Digital_Software_v1_00.tfrecord


In [14]:
# Print the validation prefix, then list the objects stored under it.
print(processed_validation_data_s3_uri)

# IPython expands $processed_validation_data_s3_uri before running the shell command.
!aws s3 ls $processed_validation_data_s3_uri/

s3://sagemaker-us-east-1-835319576252/sagemaker-scikit-learn-2020-12-18-20-19-47-606/output/bert-validation
2020-12-18 20:27:09      19963 part-algo-1-amazon_reviews_us_Digital_Ebook_Purchase_v1_01.tfrecord
2020-12-18 20:27:09        648 part-algo-1-amazon_reviews_us_Digital_Video_Games_v1_00.tfrecord
2020-12-18 20:25:44        644 part-algo-2-amazon_reviews_us_Digital_Software_v1_00.tfrecord


In [15]:
# Print the test prefix, then list the objects stored under it.
print(processed_test_data_s3_uri)

# IPython expands $processed_test_data_s3_uri before running the shell command.
!aws s3 ls $processed_test_data_s3_uri/

s3://sagemaker-us-east-1-835319576252/sagemaker-scikit-learn-2020-12-18-20-19-47-606/output/bert-test
2020-12-18 20:27:10      19848 part-algo-1-amazon_reviews_us_Digital_Ebook_Purchase_v1_01.tfrecord
2020-12-18 20:27:10        720 part-algo-1-amazon_reviews_us_Digital_Video_Games_v1_00.tfrecord
2020-12-18 20:25:45        717 part-algo-2-amazon_reviews_us_Digital_Software_v1_00.tfrecord


# Specify S3 `Distribution Strategy`

In [16]:
from sagemaker.inputs import TrainingInput

# Wrap each processed split as a training input using the 'ShardedByS3Key' distribution.
s3_input_train_data = TrainingInput(
    s3_data=processed_train_data_s3_uri,
    distribution='ShardedByS3Key',
)
s3_input_validation_data = TrainingInput(
    s3_data=processed_validation_data_s3_uri,
    distribution='ShardedByS3Key',
)
s3_input_test_data = TrainingInput(
    s3_data=processed_test_data_s3_uri,
    distribution='ShardedByS3Key',
)

# Dump the generated channel configurations, one per line.
print(
    s3_input_train_data.config,
    s3_input_validation_data.config,
    s3_input_test_data.config,
    sep='\n',
)

{'DataSource': {'S3DataSource': {'S3DataType': 'S3Prefix', 'S3Uri': 's3://sagemaker-us-east-1-835319576252/sagemaker-scikit-learn-2020-12-18-20-19-47-606/output/bert-train', 'S3DataDistributionType': 'ShardedByS3Key'}}}
{'DataSource': {'S3DataSource': {'S3DataType': 'S3Prefix', 'S3Uri': 's3://sagemaker-us-east-1-835319576252/sagemaker-scikit-learn-2020-12-18-20-19-47-606/output/bert-validation', 'S3DataDistributionType': 'ShardedByS3Key'}}}
{'DataSource': {'S3DataSource': {'S3DataType': 'S3Prefix', 'S3Uri': 's3://sagemaker-us-east-1-835319576252/sagemaker-scikit-learn-2020-12-18-20-19-47-606/output/bert-test', 'S3DataDistributionType': 'ShardedByS3Key'}}}


# Setup Hyper-Parameters for Classification Layer

In [17]:
# Re-display the sequence length that is passed to training as a hyperparameter.
print(max_seq_length)

64


In [18]:
# Optimizer / schedule settings forwarded to the training script.
epochs = 1
learning_rate = 1e-05
epsilon = 1e-08

# Batch sizes and step counts for each data split.
train_batch_size = 128
validation_batch_size = 128
test_batch_size = 128
train_steps_per_epoch = 100
validation_steps = 100
test_steps = 100

# Training infrastructure.
train_instance_count = 1
train_instance_type = 'ml.c5.9xlarge'
train_volume_size = 1024

# Feature switches forwarded to the training script.
use_xla = True
use_amp = True
freeze_bert_layer = False
enable_sagemaker_debugger = True
enable_checkpointing = False
enable_tensorboard = False

# Input mode handed to the estimator ('Pipe' is the alternative that was tried).
# input_mode = 'Pipe'
input_mode = 'File'

# Optional phases of the training script.
run_validation = True
run_test = True
run_sample_predictions = True

In [19]:
# Metric name -> regex pairs; each regex captures one numeric value from a log line.
metrics_definitions = [
    {'Name': metric_name, 'Regex': metric_regex}
    for metric_name, metric_regex in (
        ('train:loss', 'loss: ([0-9\\.]+)'),
        ('train:accuracy', 'accuracy: ([0-9\\.]+)'),
        ('validation:loss', 'val_loss: ([0-9\\.]+)'),
        ('validation:accuracy', 'val_accuracy: ([0-9\\.]+)'),
    )
]

# Setup Our BERT + TensorFlow Script to Run on SageMaker
Prepare our TensorFlow model to run on the managed SageMaker service

In [20]:
# Trust policy: allows the SageMaker service principal to call sts:AssumeRole.
assume_role_policy_doc = {
    'Version': '2012-10-17',
    'Statement': [
        {
            'Effect': 'Allow',
            'Principal': {'Service': 'sagemaker.amazonaws.com'},
            'Action': 'sts:AssumeRole',
        },
    ],
}

In [21]:
import time

# Whole seconds since the epoch; used as a suffix in the role and policy names.
timestamp = int(time.time())

In [22]:
# Name of the IAM role created for this run, suffixed with the timestamp.
secure_iam_role_name = 'DSOAWS_Secure_Train_VPC_{}'.format(timestamp)

In [23]:
import json
import time

from botocore.exceptions import ClientError

# Create the IAM role trusted by SageMaker; if a role with this name already
# exists, fetch it instead so `secure_iam_role` is always bound afterwards.
try:
    secure_iam_role = iam.create_role(
        RoleName=secure_iam_role_name,
        AssumeRolePolicyDocument=json.dumps(assume_role_policy_doc),
        Description='DSOAWS Secure Role'
    )
except ClientError as e:
    if e.response['Error']['Code'] == 'EntityAlreadyExists':
        # Bind the same name the success path uses; the previous code assigned
        # `iam_role`, which made the print below fail with NameError.
        secure_iam_role = iam.get_role(RoleName=secure_iam_role_name)
    else:
        print("Unexpected error: %s" % e)
        # No role is available in this case, so surface the real error instead
        # of continuing into an unrelated NameError.
        raise

print(secure_iam_role)

# Fixed pause before the role is used (presumably to let IAM propagate the change).
time.sleep(30)

{'Role': {'Path': '/', 'RoleName': 'DSOAWS_Secure_Train_VPC_1608505738', 'RoleId': 'AROA4E7HNG26EDX6GORNM', 'Arn': 'arn:aws:iam::835319576252:role/DSOAWS_Secure_Train_VPC_1608505738', 'CreateDate': datetime.datetime(2020, 12, 20, 23, 8, 59, tzinfo=tzlocal()), 'AssumeRolePolicyDocument': {'Version': '2012-10-17', 'Statement': [{'Effect': 'Allow', 'Principal': {'Service': 'sagemaker.amazonaws.com'}, 'Action': 'sts:AssumeRole'}]}}, 'ResponseMetadata': {'RequestId': 'cf080bf9-cebb-4e25-bb5e-09ded5f8b72a', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': 'cf080bf9-cebb-4e25-bb5e-09ded5f8b72a', 'content-type': 'text/xml', 'content-length': '825', 'date': 'Sun, 20 Dec 2020 23:08:58 GMT'}, 'RetryAttempts': 0}}


# TODO:  Lock this down to VPC

In [24]:
# Inline policy that allows all S3 actions on the session's default bucket.
iam_policy_allow_s3 = {
    'Version': '2012-10-17',
    'Statement': [{
        'Sid': '',
        'Effect': 'Allow',
        'Action': [
            's3:*'
        ],
        'Resource': [
            # Bucket-level actions (e.g. s3:ListBucket) are matched against the bucket ARN.
            'arn:aws:s3:::{}'.format(bucket),
            # Object-level actions (e.g. s3:GetObject, s3:PutObject) are matched against
            # object ARNs, so the bucket ARN alone would not grant them.
            'arn:aws:s3:::{}/*'.format(bucket)
        ]
    }]
}

In [25]:
# Name of the inline S3 policy, suffixed with the same timestamp as the role.
policy_allow_s3_name='DSOAWS_Secure_Train_Allow_S3_{}'.format(timestamp)

In [26]:
import time

# Attach the S3 policy document to the role as an inline policy.
response = iam.put_role_policy(
    PolicyDocument=json.dumps(iam_policy_allow_s3),
    PolicyName=policy_allow_s3_name,
    RoleName=secure_iam_role_name,
)

print(response)

# Fixed pause before the next IAM change.
time.sleep(30)

{'ResponseMetadata': {'RequestId': '5c6f74a5-a570-4459-9d7d-c170f92a8feb', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': '5c6f74a5-a570-4459-9d7d-c170f92a8feb', 'content-type': 'text/xml', 'content-length': '206', 'date': 'Sun, 20 Dec 2020 23:09:28 GMT'}, 'RetryAttempts': 0}}


In [27]:
# different_vpc_id='blah'

In [28]:
# # Create the bucket policy
# policy_deny_create_training_job = {
#     "Version": "2008-10-17",
#     "Statement": [
#         {
#             "Effect": "Deny",
#             "Action": [
#                 "sagemaker:CreateTrainingJob",
#             ],
#             "Resource": [
#                 "*"
#             ],
#             "Condition": {
#                 "StringNotEquals": {
#                     "aws:sourceVpc": different_vpc_id
#                 }
#             }
#         }
#     ]
# }

# TODO:  Also show this way?

In [39]:
# Inline role policy (not a bucket policy): denies sagemaker:CreateTrainingJob on
# every resource when the Null test on both VPC condition keys evaluates to
# "true" (presumably: the request carries neither subnets nor security groups).
policy_deny_create_training_job = {
    'Version': '2008-10-17',
    'Statement': [
        {
            'Effect': 'Deny',
            'Action': ['sagemaker:CreateTrainingJob'],
            'Resource': ['*'],
            'Condition': {
                'Null': {
                    'sagemaker:VpcSubnets': 'true',
                    'sagemaker:VpcSecurityGroupIds': 'true',
                },
            },
        },
    ],
}

In [40]:
# Name of the inline deny policy, suffixed with the same timestamp as the role.
policy_deny_create_training_job_name='DSOAWS_Secure_Train_Deny_CreateTrainingJob_VPC_{}'.format(timestamp)

In [41]:
import time

# Attach the CreateTrainingJob deny policy to the role as an inline policy.
response = iam.put_role_policy(
    PolicyDocument=json.dumps(policy_deny_create_training_job),
    PolicyName=policy_deny_create_training_job_name,
    RoleName=secure_iam_role_name,
)

print(response)

# Fixed pause before the role is used.
time.sleep(30)

{'ResponseMetadata': {'RequestId': 'c47e0318-c08c-4caf-990b-deddf6f8c0e9', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': 'c47e0318-c08c-4caf-990b-deddf6f8c0e9', 'content-type': 'text/xml', 'content-length': '206', 'date': 'Sun, 20 Dec 2020 23:12:28 GMT'}, 'RetryAttempts': 0}}


In [32]:
from sagemaker.tensorflow import TensorFlow

# Estimator for the access-denied check: it runs under the restricted role and
# passes no subnets or security groups.
estimator = TensorFlow(
    entry_point='tf_bert_reviews.py',
    source_dir='src',
    # role=role,
    role=secure_iam_role_name,
    instance_count=train_instance_count,
    instance_type=train_instance_type,
    volume_size=train_volume_size,
    # use_spot_instances=True,
    # max_wait=7200,  # Seconds to wait for spot instances to become available
    py_version='py3',
    framework_version='2.1.0',
    # Values forwarded to the training script as hyperparameters.
    hyperparameters={
        'epochs': epochs,
        'learning_rate': learning_rate,
        'epsilon': epsilon,
        'train_batch_size': train_batch_size,
        'validation_batch_size': validation_batch_size,
        'test_batch_size': test_batch_size,
        'train_steps_per_epoch': train_steps_per_epoch,
        'validation_steps': validation_steps,
        'test_steps': test_steps,
        'use_xla': use_xla,
        'use_amp': use_amp,
        'max_seq_length': max_seq_length,
        'freeze_bert_layer': freeze_bert_layer,
        'enable_sagemaker_debugger': enable_sagemaker_debugger,
        'enable_checkpointing': enable_checkpointing,
        'enable_tensorboard': enable_tensorboard,
        'run_validation': run_validation,
        'run_test': run_test,
        'run_sample_predictions': run_sample_predictions,
    },
    input_mode=input_mode,
    # The loss/accuracy regexes were defined earlier in the notebook but never
    # passed to the estimator; wire them in so the job publishes those metrics.
    metric_definitions=metrics_definitions,
    # subnets=None,
    # security_group_ids=None,
    # max_run=7200,  # number of seconds
)

# Verify `CreateTrainingJob: AccessDenied`

In [33]:
# Submit the job with one channel per split; wait=False is passed so the call does not wait.
estimator.fit(
    inputs=dict(
        train=s3_input_train_data,
        validation=s3_input_validation_data,
        test=s3_input_test_data,
    ),
    wait=False,
)

In [34]:
# Keep the name of the job just submitted; later cells use it to build console links.
training_job_name = estimator.latest_training_job.name
print('Training Job Name:  {}'.format(training_job_name))

Training Job Name:  tensorflow-training-2020-12-20-21-47-03-460


In [35]:
# Import from the public IPython.display module; importing `display` from
# IPython.core.display is deprecated.
from IPython.display import display, HTML

# Link to the training job in the SageMaker console (target="_blank" opens a new tab).
display(HTML('<b>Review <a target="_blank" href="https://console.aws.amazon.com/sagemaker/home?region={}#/jobs/{}">Training Job</a> After About 5 Minutes</b>'.format(region, training_job_name)))


In [36]:
# Import from the public IPython.display module; importing `display` from
# IPython.core.display is deprecated.
from IPython.display import display, HTML

# Link to the job's CloudWatch log streams (target="_blank" opens a new tab).
display(HTML('<b>Review <a target="_blank" href="https://console.aws.amazon.com/cloudwatch/home?region={}#logStream:group=/aws/sagemaker/TrainingJobs;prefix={};streamFilter=typeLogStreamPrefix">CloudWatch Logs</a> After About 5 Minutes</b>'.format(region, training_job_name)))


In [37]:
# Import from the public IPython.display module; importing `display` from
# IPython.core.display is deprecated.
from IPython.display import display, HTML

# Link to the job's folder in the S3 console (target="_blank" opens a new tab).
display(HTML('<b>Review <a target="_blank" href="https://s3.console.aws.amazon.com/s3/buckets/{}/{}/?region={}&tab=overview">S3 Output Data</a> After The Training Job Has Completed</b>'.format(bucket, training_job_name, region)))


In [38]:
%%time

# Wait on the most recent training job; logs=False is passed to wait().
estimator.latest_training_job.wait(logs=False)


2020-12-20 21:47:04 Starting - Starting the training job
2020-12-20 21:47:07 Starting - Launching requested ML instances................
2020-12-20 21:48:34 Starting - Preparing the instances for training.........
2020-12-20 21:49:21 Downloading - Downloading input data
2020-12-20 21:49:30 Training - Downloading the training image..
2020-12-20 21:49:43 Training - Training image download completed. Training in progress...............................................................................................
2020-12-20 21:57:39 Uploading - Uploading generated training model...................
2020-12-20 21:59:20 Completed - Training job completed
CPU times: user 535 ms, sys: 29.7 ms, total: 565 ms
Wall time: 12min 19s


# Cleanup Policies and Roles

In [39]:
# Remove the inline CreateTrainingJob deny policy from the role.
response = iam.delete_role_policy(
    PolicyName=policy_deny_create_training_job_name,
    RoleName=secure_iam_role_name,
)
print(response)

# Fixed pause before the next IAM change.
time.sleep(30)

{'ResponseMetadata': {'RequestId': 'a258cd08-b006-4805-af07-af22d92eaa75', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': 'a258cd08-b006-4805-af07-af22d92eaa75', 'content-type': 'text/xml', 'content-length': '212', 'date': 'Sun, 20 Dec 2020 21:59:23 GMT'}, 'RetryAttempts': 0}}


In [40]:
# Remove the inline S3 allow policy from the role.
response = iam.delete_role_policy(
    PolicyName=policy_allow_s3_name,
    RoleName=secure_iam_role_name,
)
print(response)

# Fixed pause before the next IAM change.
time.sleep(30)

{'ResponseMetadata': {'RequestId': '63c5622b-9da5-4099-98da-5944276e9a80', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': '63c5622b-9da5-4099-98da-5944276e9a80', 'content-type': 'text/xml', 'content-length': '212', 'date': 'Sun, 20 Dec 2020 21:59:53 GMT'}, 'RetryAttempts': 0}}


In [41]:
# Delete the role itself; its two inline policies are removed by the preceding cells.
iam.delete_role(RoleName=secure_iam_role_name)

# Fixed pause after the deletion.
time.sleep(30)

In [31]:
import json
# Stays None when the metadata below cannot be read.
notebook_instance_name = None

# Read the instance metadata file to obtain the resource ARN and name. The region
# is taken from the fourth ':'-separated field of the ARN and overwrites `region`.
try:
    with open('/opt/ml/metadata/resource-metadata.json') as notebook_info:
        data = json.load(notebook_info)
        resource_arn = data['ResourceArn']
        region = resource_arn.split(':')[3]
        notebook_instance_name = data['ResourceName']
    print('Notebook Instance Name: {}'.format(notebook_instance_name))
except Exception as e:
    # Still best-effort, but no longer a bare `except:`: KeyboardInterrupt and
    # SystemExit propagate, and the underlying cause is reported.
    print('+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')
    print('[ERROR]: COULD NOT RETRIEVE THE NOTEBOOK INSTANCE METADATA.')
    print('+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')
    print('Cause: {!r}'.format(e))

Notebook Instance Name: dsoaws-vpc


In [32]:
# Look up this notebook instance by name; the response fields are read by the following cells.
response = sm.describe_notebook_instance(
        NotebookInstanceName=notebook_instance_name
)

print(response)

{'NotebookInstanceArn': 'arn:aws:sagemaker:us-east-1:835319576252:notebook-instance/dsoaws-vpc', 'NotebookInstanceName': 'dsoaws-vpc', 'NotebookInstanceStatus': 'InService', 'Url': 'dsoaws-vpc.notebook.us-east-1.sagemaker.aws', 'InstanceType': 'ml.c5.2xlarge', 'SubnetId': 'subnet-0b8d836c', 'SecurityGroups': ['sg-5383e807'], 'RoleArn': 'arn:aws:iam::835319576252:role/service-role/AmazonSageMaker-ExecutionRole-20191006T135881', 'NetworkInterfaceId': 'eni-0f47f6e01be9f3a8c', 'LastModifiedTime': datetime.datetime(2020, 12, 18, 15, 14, 4, 994000, tzinfo=tzlocal()), 'CreationTime': datetime.datetime(2020, 12, 18, 15, 11, 35, 616000, tzinfo=tzlocal()), 'DirectInternetAccess': 'Enabled', 'VolumeSizeInGB': 250, 'RootAccess': 'Enabled', 'ResponseMetadata': {'RequestId': '735c6dd6-1a92-4b07-8e0c-263e8f96f94e', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': '735c6dd6-1a92-4b07-8e0c-263e8f96f94e', 'content-type': 'application/x-amz-json-1.1', 'content-length': '605', 'date': 'Sun, 20 D

In [33]:
# Summarize the networking and role fields of the notebook instance, one per line.
print(
    'SubnetId: {}'.format(response['SubnetId']),
    'SecurityGroups: {}'.format(response['SecurityGroups']),
    'IAM Role: {}'.format(response['RoleArn']),
    'NetworkInterfaceId: {}'.format(response['NetworkInterfaceId']),
    'DirectInternetAccess: {}'.format(response['DirectInternetAccess']),
    sep='\n',
)

SubnetId: subnet-0b8d836c
SecurityGroups: ['sg-5383e807']
IAM Role: arn:aws:iam::835319576252:role/service-role/AmazonSageMaker-ExecutionRole-20191006T135881
NetworkInterfaceId: eni-0f47f6e01be9f3a8c
DirectInternetAccess: Enabled


In [34]:
# Subnet of the notebook instance; passed to the VPC-enabled estimator as its only subnet.
subnet_id=response['SubnetId']
print(subnet_id)

subnet-0b8d836c


In [35]:
# Security groups of the notebook instance; passed unchanged to the VPC-enabled estimator.
security_group_ids=response['SecurityGroups']
print(security_group_ids)

['sg-5383e807']


In [36]:
from pprint import pprint

# Fetch the VPC descriptions returned by describe_vpcs() for the client's region.
all_vpcs = ec2.describe_vpcs()['Vpcs']

# Number of VPCs returned.
print(len(all_vpcs))

# Full description of each VPC.
pprint(all_vpcs)


2
[{'CidrBlock': '10.71.0.0/16',
  'CidrBlockAssociationSet': [{'AssociationId': 'vpc-cidr-assoc-0f18f3a8a9b34427d',
                               'CidrBlock': '10.71.0.0/16',
                               'CidrBlockState': {'State': 'associated'}}],
  'DhcpOptionsId': 'dopt-5316d129',
  'InstanceTenancy': 'default',
  'IsDefault': False,
  'OwnerId': '835319576252',
  'State': 'available',
  'Tags': [{'Key': 'Name', 'Value': '10.71.0.0/16'},
           {'Key': 'aws:cloudformation:stack-name',
            'Value': 'cfregly-redshift'},
           {'Key': 'aws:cloudformation:stack-id',
            'Value': 'arn:aws:cloudformation:us-east-1:835319576252:stack/cfregly-redshift/575175b0-1c4c-11ea-8dcd-0eb5b83f8639'},
           {'Key': 'aws:cloudformation:logical-id', 'Value': 'VPC'}],
  'VpcId': 'vpc-011a72f256d2c6951'},
 {'CidrBlock': '172.31.0.0/16',
  'CidrBlockAssociationSet': [{'AssociationId': 'vpc-cidr-assoc-7ccbfc10',
                               'CidrBlock': '172.31.0.0/16',
 

In [37]:
# Resolve the VPC from the notebook instance's own subnet instead of taking
# whichever VPC describe_vpcs() happens to list last.
vpc_id = ec2.describe_subnets(SubnetIds=[subnet_id])['Subnets'][0]['VpcId']
print(vpc_id)

vpc-09a2f873


# Specify the VPC parameters and Verify Successful Training Job

In [49]:
from sagemaker.tensorflow import TensorFlow

# Estimator for the VPC run: it uses the notebook's execution role and passes the
# notebook instance's subnet and security groups.
estimator = TensorFlow(
    entry_point='tf_bert_reviews.py',
    source_dir='src',
    # role=secure_iam_role_name,
    role=role,
    instance_count=train_instance_count,
    instance_type=train_instance_type,
    volume_size=train_volume_size,
    # use_spot_instances=True,
    # max_wait=7200,  # Seconds to wait for spot instances to become available
    py_version='py3',
    framework_version='2.1.0',
    # Values forwarded to the training script as hyperparameters.
    hyperparameters={
        'epochs': epochs,
        'learning_rate': learning_rate,
        'epsilon': epsilon,
        'train_batch_size': train_batch_size,
        'validation_batch_size': validation_batch_size,
        'test_batch_size': test_batch_size,
        'train_steps_per_epoch': train_steps_per_epoch,
        'validation_steps': validation_steps,
        'test_steps': test_steps,
        'use_xla': use_xla,
        'use_amp': use_amp,
        'max_seq_length': max_seq_length,
        'freeze_bert_layer': freeze_bert_layer,
        'enable_sagemaker_debugger': enable_sagemaker_debugger,
        'enable_checkpointing': enable_checkpointing,
        'enable_tensorboard': enable_tensorboard,
        'run_validation': run_validation,
        'run_test': run_test,
        'run_sample_predictions': run_sample_predictions,
    },
    input_mode=input_mode,
    # The loss/accuracy regexes were defined earlier in the notebook but never
    # passed to the estimator; wire them in so the job publishes those metrics.
    metric_definitions=metrics_definitions,
    subnets=[
        subnet_id
    ],
    security_group_ids=security_group_ids,
    # max_run=7200,  # number of seconds
)

# Verify Training Starts OK

# TODO: Explain why the error below appears when this notebook runs on a Notebook Instance inside the VPC, but training works when the notebook runs outside of the VPC (i.e., outside of the VPC we are going through the public internet, which is not good).

```
UnexpectedStatusException: Error for Training job tensorflow-training-2020-12-20-23-13-52-444: Failed. Reason: ClientError: Data download failed:Please ensure that the subnet's route table has a route to an S3 VPC endpoint or a NAT device, and both the security groups and the subnet's network ACL allow outbound traffic to S3.                  
```

In [50]:
# Submit the VPC-configured job with one channel per split; wait=False is passed so the call does not wait.
estimator.fit(
    inputs=dict(
        train=s3_input_train_data,
        validation=s3_input_validation_data,
        test=s3_input_test_data,
    ),
    wait=False,
)

In [51]:
# Keep the name of the job just submitted; later cells use it to build console links and the S3 path.
training_job_name = estimator.latest_training_job.name
print('Training Job Name:  {}'.format(training_job_name))

Training Job Name:  tensorflow-training-2020-12-20-23-13-52-444


In [52]:
# Import from the public IPython.display module; importing `display` from
# IPython.core.display is deprecated.
from IPython.display import display, HTML

# Link to the training job in the SageMaker console (target="_blank" opens a new tab).
display(HTML('<b>Review <a target="_blank" href="https://console.aws.amazon.com/sagemaker/home?region={}#/jobs/{}">Training Job</a> After About 5 Minutes</b>'.format(region, training_job_name)))


In [53]:
# Import from the public IPython.display module; importing `display` from
# IPython.core.display is deprecated.
from IPython.display import display, HTML

# Link to the job's CloudWatch log streams (target="_blank" opens a new tab).
display(HTML('<b>Review <a target="_blank" href="https://console.aws.amazon.com/cloudwatch/home?region={}#logStream:group=/aws/sagemaker/TrainingJobs;prefix={};streamFilter=typeLogStreamPrefix">CloudWatch Logs</a> After About 5 Minutes</b>'.format(region, training_job_name)))


In [54]:
# Import from the public IPython.display module; importing `display` from
# IPython.core.display is deprecated.
from IPython.display import display, HTML

# Link to the job's folder in the S3 console (target="_blank" opens a new tab).
display(HTML('<b>Review <a target="_blank" href="https://s3.console.aws.amazon.com/s3/buckets/{}/{}/?region={}&tab=overview">S3 Output Data</a> After The Training Job Has Completed</b>'.format(bucket, training_job_name, region)))


In [55]:
%%time

# Wait on the most recent training job; logs=False is passed to wait().
estimator.latest_training_job.wait(logs=False)


2020-12-20 23:13:53 Starting - Starting the training job
2020-12-20 23:13:56 Starting - Launching requested ML instances.............
2020-12-20 23:15:08 Starting - Preparing the instances for training.......
2020-12-20 23:15:49 Downloading - Downloading input data...........
2020-12-20 23:16:47 Failed - Training job failed


UnexpectedStatusException: Error for Training job tensorflow-training-2020-12-20-23-13-52-444: Failed. Reason: ClientError: Data download failed:Please ensure that the subnet's route table has a route to an S3 VPC endpoint or a NAT device, and both the security groups and the subnet's network ACL allow outbound traffic to S3.

# Wait Until the ^^ Training Job ^^ Completes Above!

# [INFO] _Feel free to continue to the next workshop section while this notebook is running._

In [None]:
# Copy the job's model.tar.gz from the bucket into the current directory.
!aws s3 cp s3://$bucket/$training_job_name/output/model.tar.gz ./model.tar.gz

In [None]:
# Unpack the downloaded archive into ./model/ (created if missing).
!mkdir -p ./model/
!tar -xvzf ./model.tar.gz -C ./model/

In [None]:
# Run saved_model_cli over the extracted SavedModel directory with --all.
!saved_model_cli show --all --dir ./model/tensorflow/saved_model/0/

In [None]:
%%javascript
// Save a checkpoint of the notebook, then delete its kernel session.
Jupyter.notebook.save_checkpoint();
Jupyter.notebook.session.delete();