# Step 1
Carry over the S3 bucket stored from the first notebook and import dependencies

In [None]:
import time
import sagemaker
from sagemaker.tensorflow import TensorFlow
sess = sagemaker.Session()
role = sagemaker.get_execution_role()

%store -r
print ('data_bucket_name=',data_bucket_name)

# Step 2
Establish a SageMaker training job name.  Each job name must be unique, so a timestamp is appended.

In [None]:
# Job names must be unique, so the fixed 'har-tf-' prefix is followed by a
# second-resolution local timestamp (the prefix contains no % directives,
# so strftime copies it through unchanged).
job_name = time.strftime('har-tf-%Y-%m-%d-%H-%M-%S')
print(job_name)

# Step 3
These settings work as is for demonstration purposes.  Setting the epochs to a higher number can yield better results.  Each epoch here, with this dataset (w=128, s=32 from prior notebook) on an ml.m5.large, takes approximately 40 seconds.

In [None]:
# Hyperparameters passed to the TensorFlow estimator defined in Step 4.
hyperparameter_json = dict(
    epochs=5,
    batch_size=64,
)

# Step 4 Define the Tensorflow Environment
Create a TensorFlow estimator.  As delivered, this will create one instance of an ml.m5.large.  This will also automatically push the entry script to S3.  Consult the link for more options, such as specifying a fixed entry point script or training from a prior model save point instead of starting training from a naive position.

https://sagemaker.readthedocs.io/en/stable/api/training/estimators.html

In [None]:

# Gather every estimator setting in one mapping so the options are easy to
# scan and tweak, then unpack it into the TensorFlow estimator constructor.
estimator_settings = {
    'use_spot_instances': False,
    'enable_sagemaker_metrics': True,
    'entry_point': 'train_tf.py',
    # Optional: resume from a previously saved model instead of training
    # from scratch.
    # 'model_uri': 's3://bucket/folder/output/model.tar.gz',
    'role': role,
    'instance_count': 1,
    'instance_type': 'ml.m5.large',
    'framework_version': '1.12',
    'volume_size': 8,
    'py_version': 'py3',
    'script_mode': True,
    'hyperparameters': hyperparameter_json,
}

tf_estimator = TensorFlow(**estimator_settings)

# Step 5 Start a Tensorflow Training Job
This step will initiate a SageMaker Training job.  Note the parameter of wait=True.  This will cause this notebook to wait on the training job to complete.  In a production scenario, this step would NOT wait, but would be a single step in a state machine.  This step can take about 5-7 minutes to complete, as delivered.  Factors such as the number of epochs or other changes can affect runtime.

https://sagemaker.readthedocs.io/en/stable/api/training/estimators.html#sagemaker.estimator.EstimatorBase.fit

In [None]:
# Map each input channel name to its S3 prefix in the data bucket.
s3_root = 's3://' + data_bucket_name
input_channels = {
    'training': s3_root + '/train',
    'test': s3_root + '/test',
}

# wait=True blocks this cell until the training job has finished.
tf_estimator.fit(inputs=input_channels, wait=True, job_name=job_name)

# Step 6
This step will loop, waiting on the training job to complete, in a case when the prior fit() step specified wait=False.

https://docs.aws.amazon.com/sagemaker/latest/APIReference/API_DescribeTrainingJob.html

In [None]:
import boto3
import json
client = boto3.client('sagemaker')

# DescribeTrainingJob reports TrainingJobStatus as one of InProgress,
# Completed, Failed, Stopping or Stopped.  Only the three states below are
# final: 'Stopping' is still transitional, and values such as 'Interrupted' or
# 'MaxRuntimeExceeded' are secondary statuses that never appear in this field.
TRAINING_FINAL_STATUSES = ('Completed', 'Failed', 'Stopped')

response = client.describe_training_job(TrainingJobName=job_name)

# Poll every 15 seconds until the job reaches a final state.
while response['TrainingJobStatus'] not in TRAINING_FINAL_STATUSES:
    print(response['TrainingJobStatus'])
    time.sleep(15)
    response = client.describe_training_job(TrainingJobName=job_name)

print('job_name=', job_name)
# TrainingTimeInSeconds is optional in the response (for example when the job
# never started training), so read it defensively instead of raising KeyError.
print('TrainingTimeInSeconds=', response.get('TrainingTimeInSeconds'))
print('TrainingJobStatus=', response['TrainingJobStatus'])
print('S3ModelArtifacts=', response.get('ModelArtifacts', {}).get('S3ModelArtifacts'))

# Stop here rather than letting the deployment step run against a model
# artifact that a failed or stopped job did not produce.
if response['TrainingJobStatus'] != 'Completed':
    raise RuntimeError(
        'Training job {} ended with status {}: {}'.format(
            job_name,
            response['TrainingJobStatus'],
            response.get('FailureReason', 'no failure reason reported'),
        )
    )

# Step 7 Deploy Trained Model to an API Endpoint
When the training job is complete, harvest the model S3 location from the prior step and supply it as a parameter here.  This step deploys an API endpoint that serves model inference real-time.  This step can take about 5-7 minutes to complete.  Note the parameter of wait=True.  This will cause this notebook to wait on the API endpoint deployment to complete.  In a production scenario, this step would NOT wait, but would be a single step in a state machine.  

https://sagemaker.readthedocs.io/en/stable/frameworks/tensorflow/deploying_tensorflow_serving.html#deploying-directly-from-model-artifacts

In [None]:
from sagemaker.tensorflow import TensorFlowModel

# Model artifact location reported by the training job description fetched in
# the previous step; the endpoint is named after the training job.
model_artifact = response['ModelArtifacts']['S3ModelArtifacts']
tf_endpoint_name = 'ep-' + job_name

model = TensorFlowModel(
    model_data=model_artifact,
    role=role,
    framework_version='1.12',
)

# wait=True blocks this cell until the endpoint deployment has finished.
predictor = model.deploy(
    initial_instance_count=1,
    instance_type='ml.t2.medium',
    endpoint_name=tf_endpoint_name,
    # accelerator_type='ml.eia1.medium',
    wait=True,
)

# Step 8
This step waits for the endpoint to become In-Service.  The response is printed for observability.  The endpoint name is captured as a variable and will be passed to the third notebook.

In [None]:
import boto3
import json
import pprint
client = boto3.client('sagemaker')

# Statuses after which further polling cannot help.  The original test was
# `not in ('InService')`, which is a plain string (substring match) rather
# than a tuple, and it never terminated when the endpoint ended up 'Failed'.
ENDPOINT_SETTLED_STATUSES = (
    'InService',
    'Failed',
    'OutOfService',
    'UpdateRollbackFailed',
)

response = client.describe_endpoint(EndpointName=tf_endpoint_name)

# Poll every 15 seconds until the endpoint settles, echoing each status.
while response['EndpointStatus'] not in ENDPOINT_SETTLED_STATUSES:
    time.sleep(15)
    response = client.describe_endpoint(EndpointName=tf_endpoint_name)
    print(response['EndpointStatus'])

pprint.pprint(response)
print('tf_endpoint_name=', tf_endpoint_name)

# Fail loudly so an unusable endpoint name is not handed to the next notebook.
if response['EndpointStatus'] != 'InService':
    raise RuntimeError(
        'Endpoint {} ended with status {}: {}'.format(
            tf_endpoint_name,
            response['EndpointStatus'],
            response.get('FailureReason', 'no failure reason reported'),
        )
    )

# Step 9
Pass variables to next notebook.

In [None]:
# Persist these variables in IPython's store so a later notebook can restore
# them with `%store -r`.
%store tf_endpoint_name
%store data_bucket_name