In [None]:
import boto3
import sagemaker
import pandas as pd

# SageMaker session, its default S3 bucket, and the execution role of this notebook.
sess = sagemaker.Session()
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()

# Region configured for the default boto3 session.
region = boto3.Session().region_name

# Low-level SageMaker API client, used further down for Experiments and tuning-job status.
sm = boto3.Session().client('sagemaker', region_name=region)

In [None]:
!pip install -q smdebug==0.7.2
!pip install -q sagemaker-experiments==0.1.11

# Specify the S3 Location of the Features

In [None]:
# Read `scikit_processing_job_s3_output_prefix` back from IPython's %store storage
# (presumably saved with `%store` by an earlier notebook -- TODO confirm).
%store -r scikit_processing_job_s3_output_prefix

In [None]:
# The restored value is the S3 output prefix of the processing job, so label it as such.
print('Previous Scikit Processing Job S3 Output Prefix: {}'.format(scikit_processing_job_s3_output_prefix))

In [None]:
# All three splits sit under '<processing job prefix>/output'.
output_root = '{}/output'.format(scikit_processing_job_s3_output_prefix)

# S3 key prefixes of the train / validation / test splits.
prefix_train = output_root + '/bert-train-all'
prefix_validation = output_root + '/bert-validation-all'
prefix_test = output_root + '/bert-test-all'

# The same prefixes expressed as paths relative to the working directory.
path_train = './' + prefix_train
path_validation = './' + prefix_validation
path_test = './' + prefix_test

# Full S3 URIs of the splits inside the session's default bucket.
train_s3_uri = 's3://{}/{}'.format(bucket, prefix_train)
validation_s3_uri = 's3://{}/{}'.format(bucket, prefix_validation)
test_s3_uri = 's3://{}/{}'.format(bucket, prefix_test)

In [None]:
# One input channel per split, each requesting the 'ShardedByS3Key' distribution.
s3_input_train_data = sagemaker.s3_input(distribution='ShardedByS3Key', s3_data=train_s3_uri)
s3_input_validation_data = sagemaker.s3_input(distribution='ShardedByS3Key', s3_data=validation_s3_uri)
s3_input_test_data = sagemaker.s3_input(distribution='ShardedByS3Key', s3_data=test_s3_uri)

# Show the configuration of every channel, in train / validation / test order.
for channel in (s3_input_train_data, s3_input_validation_data, s3_input_test_data):
    print(channel.config)

In [None]:
# Print the training script that the estimator below uses as its entry point.
!cat src/tf_bert_reviews.py

# Setup Debugger Rules

In [None]:
from sagemaker.debugger import Rule, CollectionConfig, rule_configs

# S3 location handed to the estimator as its output path.
model_output_path = 's3://{}/models/tf2-bert'.format(bucket)


def _saved_collections():
    """Return fresh CollectionConfig objects for 'losses' and 'metrics', each with save_interval '100'."""
    return [
        CollectionConfig(name=collection_name, parameters={'save_interval': '100'})
        for collection_name in ('losses', 'metrics')
    ]


# Built-in Debugger rules attached to the training job; each rule gets its own collection configs.
rules = [
    Rule.sagemaker(
        rule_configs.loss_not_decreasing(),
        rule_parameters={
            'collection_names': 'losses,metrics',
            'use_losses_collection': 'true',
            'num_steps': '5',
            'diff_percent': '5',
        },
        collections_to_save=_saved_collections(),
    ),
    Rule.sagemaker(
        rule_configs.overtraining(),
        rule_parameters={
            'collection_names': 'losses,metrics',
            'patience_train': '5',
            'patience_validation': '10',
            'delta': '0.1',
        },
        collections_to_save=_saved_collections(),
    ),
]

# Setup Hyper-Parameters

In [None]:
# --- Training schedule ---
epochs = 10
train_steps_per_epoch = validation_steps = test_steps = 1000

# --- Batch sizes (identical for all three splits) ---
train_batch_size = validation_batch_size = test_batch_size = 128

# --- Infrastructure ---
train_instance_count = 1
train_instance_type = 'ml.p3dn.24xlarge'
train_volume_size = 1800
input_mode = 'File'  # 'File' or 'Pipe'

# --- Model / runtime switches ---
max_seq_length = 128
use_xla = use_amp = False
freeze_bert_layer = False
use_parameter_server = False
enable_sagemaker_debugger = True

In [None]:
# Track Experiments
import time
from smexperiments.experiment import Experiment

# Suffix shared by the experiment, tracker and trial names: input mode plus the current epoch second.
unique_id = '{}-{}'.format(input_mode, int(time.time()))

experiment = Experiment.create(
    sagemaker_boto_client=sm,
    experiment_name='train-reviews-bert-{}'.format(unique_id),
    description='Train Reviews BERT',
)

In [None]:
from smexperiments.tracker import Tracker

tracker_display_name = 'Train-Reviews-BERT-Tracker-{}'.format(unique_id)
print(tracker_display_name)

tracker = Tracker.create(display_name=tracker_display_name, sagemaker_boto_client=sm)

# Record every hyper-parameter of this run on the tracker.
tracked_parameters = {
    'epochs': epochs,
    'train_batch_size': train_batch_size,
    'validation_batch_size': validation_batch_size,
    'test_batch_size': test_batch_size,
    'train_steps_per_epoch': train_steps_per_epoch,
    'validation_steps': validation_steps,
    'test_steps': test_steps,
    'train_instance_count': train_instance_count,
    'train_instance_type': train_instance_type,
    'train_volume_size': train_volume_size,
    'use_xla': use_xla,
    'use_amp': use_amp,
    'max_seq_length': max_seq_length,
    'freeze_bert_layer': freeze_bert_layer,
    'enable_sagemaker_debugger': enable_sagemaker_debugger,
    'use_parameter_server': use_parameter_server,
    'input_mode': input_mode,  # 'File' or 'Pipe'
}
tracker.log_parameters(tracked_parameters)

# Record the S3 URI of each dataset split as an input of the run.
for input_name, s3_uri in (
    ('reviews_dataset_train', train_s3_uri),
    ('reviews_dataset_validation', validation_s3_uri),
    ('reviews_dataset_test', test_s3_uri),
):
    tracker.log_input(name=input_name, media_type='s3/uri', value=s3_uri)

In [None]:
from smexperiments.trial import Trial

trial_name = 'train-reviews-bert-training-job-{}'.format(unique_id)
trial_component_display_name = 'Train-Reviews-BERT-Trial-{}'.format(unique_id)

# Create the trial inside the experiment and attach the tracker's trial component to it.
trial = Trial.create(
    trial_name=trial_name,
    experiment_name=experiment.experiment_name,
    sagemaker_boto_client=sm,
)
trial.add_trial_component(tracker.trial_component)

# Passed to estimator.fit() so the training job is recorded under this experiment and trial.
experiment_config = {
    'ExperimentName': experiment.experiment_name,
    'TrialName': trial.trial_name,
    'TrialComponentDisplayName': trial_component_display_name,
}

In [None]:
from sagemaker.tensorflow import TensorFlow

#tensorboard_output_config = TensorBoardOutputConfig('s3://smdebug-dev-demo-pdx/mnist/tensorboard')

# Regular expressions used to pull metric values out of the training job's log output.
# The training patterns start with \b so they cannot match inside 'val_loss' / 'val_accuracy':
# '_' is a word character, so there is no word boundary between 'val_' and 'loss' / 'accuracy'.
# Without the anchor, every validation value would also be reported as a training value.
metrics_definitions = [
     {'Name': 'train:loss', 'Regex': '\\bloss: ([0-9\\.]+)'},
     {'Name': 'train:accuracy', 'Regex': '\\baccuracy: ([0-9\\.]+)'},
     {'Name': 'validation:loss', 'Regex': 'val_loss: ([0-9\\.]+)'},
     {'Name': 'validation:accuracy', 'Regex': 'val_accuracy: ([0-9\\.]+)'},
]

# Hyper-parameters handed to the entry-point script (dash-separated names).
estimator_hyperparameters = {
    'use-xla': use_xla,
    'use-amp': use_amp,
    'train-batch-size': train_batch_size,
    'validation-batch-size': validation_batch_size,
    'test-batch-size': test_batch_size,
    'epochs': epochs,
    'train-steps-per-epoch': train_steps_per_epoch,
    'validation-steps': validation_steps,
    'test-steps': test_steps,
    'max-seq-length': max_seq_length,
    'freeze-bert-layer': freeze_bert_layer,
    'enable-sagemaker-debugger': enable_sagemaker_debugger,
}

estimator = TensorFlow(
    entry_point='tf_bert_reviews.py',
    source_dir='src',
    role=role,
    # Make sure you have at least this number of input files or the ShardedByS3Key
    # distribution strategy will fail the job due to no data available
    train_instance_count=train_instance_count,
    train_instance_type=train_instance_type,
    train_volume_size=train_volume_size,
    py_version='py3',
    framework_version='2.0.0',
    output_path=model_output_path,
    hyperparameters=estimator_hyperparameters,
    distributions={'parameter_server': {'enabled': use_parameter_server}},
    input_mode=input_mode,
    enable_cloudwatch_metrics=True,
    metric_definitions=metrics_definitions,
    rules=rules,
    # tensorboard_output_config=tensorboard_output_config
)

# Train the Model

In [None]:
# Channel name -> S3 input for the three dataset splits.
training_inputs = {
    'train': s3_input_train_data,
    'validation': s3_input_validation_data,
    'test': s3_input_test_data,
}

# wait=False: start the job and return without blocking on its completion.
estimator.fit(inputs=training_inputs, experiment_config=experiment_config, wait=False)

In [None]:
# Name of the training job most recently started by the estimator; the console links
# and the model download further down are built from it.
training_job_name = estimator.latest_training_job.name
print('training_job_name:  {}'.format(training_job_name))

In [None]:
# `IPython.display` is the public import location; importing `display` from
# `IPython.core.display` is deprecated and emits a warning on recent IPython versions.
from IPython.display import display, HTML

# Link to the training job in the SageMaker console.
display(HTML('<b>Review <a href="https://console.aws.amazon.com/sagemaker/home?region={}#/jobs/{}">Training Job</a> After About 5 Minutes</b>'.format(region, training_job_name)))

In [None]:
# `IPython.display` is the public import location; importing `display` from
# `IPython.core.display` is deprecated and emits a warning on recent IPython versions.
from IPython.display import display, HTML

# Link to the CloudWatch log streams whose names start with the training job name.
display(HTML('<b>Review <a href="https://console.aws.amazon.com/cloudwatch/home?region={}#logStream:group=/aws/sagemaker/TrainingJobs;prefix={};streamFilter=typeLogStreamPrefix">CloudWatch Logs</a> After About 5 Minutes</b>'.format(region, training_job_name)))

In [None]:
# `IPython.display` is the public import location; importing `display` from
# `IPython.core.display` is deprecated and emits a warning on recent IPython versions.
from IPython.display import display, HTML

# Key prefix of this job's output under the bucket (mirrors model_output_path + job name).
training_job_s3_output_prefix = 'models/tf2-bert/{}'.format(training_job_name) # 'models/tf-bert/script-mode/training-runs/{}'.format(training_job_name)

# Link to that prefix in the S3 console.
display(HTML('<b>Review <a href="https://s3.console.aws.amazon.com/s3/buckets/{}/{}/?region={}&tab=overview">S3 Output Data</a> After The Training Job Has Completed</b>'.format(bucket, training_job_s3_output_prefix, region)))

# Analyze Debugger Results

In [None]:
# estimator.latest_training_job.rule_job_summary()

In [None]:
# from smdebug.trials import create_trial

# # this is where we create a Trial object that allows access to saved tensors
# trial = create_trial(estimator.latest_job_debugger_artifacts_path())


## Add Lambda Function to Stop the TrainingJob Early
In your AWS console, go to the Lambda Management Console.
Create a new function by choosing Create Function.
Choose Python 3.7 as the language and paste in the sample code shown below, which stops the training job if one of the Rule statuses is "IssuesFound".

### CloudWatch Events for Rules
Rule status changes in a training job trigger CloudWatch Events. These events can be acted upon by configuring a CloudWatch Rule (different from an Amazon SageMaker Debugger Rule) to trigger each time a Debugger Rule changes status. In this notebook we'll go through how you can create a CloudWatch Rule that directs Training Job State Change events to a Lambda function, which stops the training job when a rule triggers and has the status "IssuesFound".

Create a new execution role for the Lambda.
Then, in your IAM console, search for the role and attach the "AmazonSageMakerFullAccess" policy to it. This is needed for the code in your Lambda function to stop the training job.


### Create a CloudWatch Rule to Trigger a Lambda
In your AWS Console, go to CloudWatch and select Rule from the left column,
Hit Create Rule. The console will redirect you to the Rule creation page,
For the Service Name, select "SageMaker".
For the Event Type, select "SageMaker Training Job State Change".
In the Targets select the Lambda function you created above, and
For this example notebook, we'll leave everything as is.


#### Create the Lambda

```
import json
import boto3
import logging

# The root logger's default level is WARNING, so without this the info messages
# below would be filtered out and never reach the function's logs.
logger = logging.getLogger()
logger.setLevel(logging.INFO)


def lambda_handler(event, context):
    """Stop a SageMaker training job when one of its Debugger rules reports "IssuesFound".

    Args:
        event: The triggering event as a dict. The training job name is read from
            event["detail"]["TrainingJobName"] and the rule statuses from
            event["detail"]["DebugRuleEvaluationStatuses"].
        context: Lambda context object; not used.

    Returns:
        A dict with 'statusCode' 200 and body "Nothing to do" when the event carries
        no rule statuses (or no "detail" at all); otherwise None.

    Raises:
        Exception: Whatever `stop_training_job` raised, after it has been logged.
    """
    # Tolerate events without a "detail" section instead of failing on None.get(...).
    detail = event.get("detail") or {}
    training_job_name = detail.get("TrainingJobName")
    eval_statuses = detail.get("DebugRuleEvaluationStatuses")

    if not eval_statuses:
        logger.info("Couldn't find any debug rule statuses, skipping...")
        return {
            'statusCode': 200,
            'body': json.dumps('Nothing to do')
        }

    for status in eval_statuses:
        if status.get("RuleEvaluationStatus") != "IssuesFound":
            continue

        logger.info(
            'Evaluation of rule configuration {} resulted in "IssuesFound". '
            'Attempting to stop training job {}'.format(
                status.get("RuleConfigurationName"), training_job_name
            )
        )
        try:
            # The client is only needed once a rule has actually reported issues.
            client = boto3.client('sagemaker')
            client.stop_training_job(
                TrainingJobName=training_job_name
            )
        except Exception as e:
            logger.error(
                "Encountered error while trying to "
                "stop training job {}: {}".format(
                    training_job_name, str(e)
                )
            )
            raise
        # One stop request is enough; do not send another one for every further
        # rule that also reports "IssuesFound".
        break
    return None
```

* Create a new execution role for the Lambda.
* Then, in your IAM console, search for the role and attach the "AmazonSageMakerFullAccess" policy to it. This is needed for the code in your Lambda function to stop the training job.

#### Create a CloudWatch Rule

* In your AWS Console, go to CloudWatch and select Rule from the left column,
* Hit Create Rule. The console will redirect you to the Rule creation page,
 * For the Service Name, select "SageMaker".
 * For the Event Type, select "SageMaker Training Job State Change".
* In the Targets select the Lambda function you created above, and
* For this example notebook, we'll leave everything as is.

SageMaker kicked off rule evaluation jobs, one for each of the SageMaker rules - `Overtraining` and `LossNotDecreasing` - as specified in the estimator. If we set up a CloudWatch Rule to stop the training job, we would see the `TrainingJobStatus` change to `Stopped` once the `RuleEvaluationStatus` for one of the rules changes to `IssuesFound`.

In [None]:
# # This utility gives the link to monitor the CW event
# def _get_rule_job_name(training_job_name, rule_configuration_name, rule_job_arn):
#         """Helper function to get the rule job name"""
#         return "{}-{}-{}".format(
#             training_job_name[:26], rule_configuration_name[:26], rule_job_arn[-8:]
#         )
    
# def _get_cw_url_for_rule_job(rule_job_name, region):
#     return "https://{}.console.aws.amazon.com/cloudwatch/home?region={}#logStream:group=/aws/sagemaker/ProcessingJobs;prefix={};streamFilter=typeLogStreamPrefix".format(region, region, rule_job_name)


# def get_rule_jobs_cw_urls(estimator):
#     region = boto3.Session().region_name
#     training_job = estimator.latest_training_job
#     training_job_name = training_job.describe()["TrainingJobName"]
#     rule_eval_statuses = training_job.describe()["DebugRuleEvaluationStatuses"]
    
#     result={}
#     for status in rule_eval_statuses:
#         if status.get("RuleEvaluationJobArn", None) is not None:
#             rule_job_name = _get_rule_job_name(training_job_name, status["RuleConfigurationName"], status["RuleEvaluationJobArn"])
#             result[status["RuleConfigurationName"]] = _get_cw_url_for_rule_job(rule_job_name, region)
#     return result

# get_rule_jobs_cw_urls(estimator)

In [None]:
# estimator.latest_training_job.describe()["TrainingJobStatus"]


# Analyze Experiment

In [None]:
from sagemaker.analytics import ExperimentAnalytics

# # Might need to convert ' => "
# search_expression = {
#     'Filters':[
#         {
#             'Name': 'DisplayName',
#             'Operator': 'Equals',
#             'Value': 'Training'
#         }
#     ]
# }

# Trial components of this experiment, ordered by their maximum validation accuracy (best first).
trial_component_analytics = ExperimentAnalytics(
    experiment_name=experiment.experiment_name,
    sagemaker_session=sess,
    # search_expression=search_expression,
    metric_names=['validation:accuracy'],
    parameter_names=['epochs', 'train_batch_size'],
    sort_by='metrics.validation:accuracy.max',
    sort_order='Descending',
)

In [None]:
# Materialise the analytics query as a DataFrame; the bare name on the last line displays it.
analytics_table = trial_component_analytics.dataframe()
analytics_table

In [None]:
# search_expression={
#     'Filters':[{
#         'Name': 'Parents.TrialName',
#         'Operator': 'Equals',
#         'Value': ??
#     }]
# },

# All trial components of this experiment, oldest first.
# Note: `lineage_table` holds the analytics query object; the DataFrame is produced on the last line.
lineage_table = ExperimentAnalytics(
    experiment_name=experiment.experiment_name,
    sagemaker_session=sess,
    # search_expression=search_expression,
    sort_order='Ascending',
    sort_by='CreationTime',
)
lineage_table.dataframe()

# Setup Hyper-Parameter Ranges to Explore


In [None]:
from sagemaker.tuner import IntegerParameter
from sagemaker.tuner import ContinuousParameter
from sagemaker.tuner import CategoricalParameter
from sagemaker.tuner import HyperparameterTuner

#  'use-amp': use_amp,
#  'train-batch-size': train_batch_size,
#  'validation-batch-size': validation_batch_size,
#  'test-batch-size': test_batch_size,
#  'epochs': epochs,
#  'train-steps-per-epoch': train_steps_per_epoch,
#  'validation-steps': validation_steps,
#  'test-steps': test_steps,
#  'max-seq-length': max_seq_length,
#  'freeze-bert-layer': freeze_bert_layer,
#  'enable-sagemaker-debugger': enable_sagemaker_debugger},
                                                
# Metric the tuning job optimises; its name matches an entry in metrics_definitions.
objective_metric_name = 'validation:accuracy'

# Search space. Commented-out entries are hyper-parameters currently left at the
# estimator's fixed values.
hyperparameter_ranges = {
    'use-xla': CategoricalParameter([True, False]),
    'use-amp': CategoricalParameter([True, False]),
    # 'train-batch-size': CategoricalParameter([128]),
    # 'validation-batch-size': CategoricalParameter([128]),
    # 'test-batch-size': CategoricalParameter([128]),
    'epochs': IntegerParameter(2, 8, scaling_type='Logarithmic'),
    'train-steps-per-epoch': CategoricalParameter([10, 100, 1000]),
    # 'validation-steps': CategoricalParameter([100]),
    # 'test-steps': CategoricalParameter([100]),
    # 'max-seq-length': CategoricalParameter([128]),
    'freeze-bert-layer': CategoricalParameter([True, False]),
    # 'enable-sagemaker-debugger': CategoricalParameter([True])
}

# Bayesian search: 12 training jobs in total, at most 3 running at the same time.
tuner = HyperparameterTuner(
    estimator=estimator,
    hyperparameter_ranges=hyperparameter_ranges,
    objective_metric_name=objective_metric_name,
    metric_definitions=metrics_definitions,
    strategy='Bayesian',
    max_jobs=12,
    max_parallel_jobs=3,
)

# Start Tuning Job

In [None]:
# Channel name -> S3 input for the three dataset splits.
tuning_inputs = {
    'train': s3_input_train_data,
    'validation': s3_input_validation_data,
    'test': s3_input_test_data,
}

# Start the hyper-parameter tuning job on those channels.
tuner.fit(tuning_inputs, include_cls_metadata=False)

In [None]:
from pprint import pprint

# Fetch the current description of the tuning job started above.
job_description = sm.describe_hyper_parameter_tuning_job(
    HyperParameterTuningJobName=tuner.latest_tuning_job.job_name
)

status = job_description['HyperParameterTuningJobStatus']

print('\n')
print(status)
print('\n')
pprint(job_description)

if status != 'Completed':
    job_count = job_description['TrainingJobStatusCounters']['Completed']
    # Fill the placeholder with the count; previously the literal '{}' was printed.
    print('Not yet complete, but {} jobs have completed.'.format(job_count))

    # 'BestTrainingJob' is only looked up when present in the description.
    if job_description.get('BestTrainingJob', None):
        print("Best candidate:")
        pprint(job_description['BestTrainingJob']['TrainingJobName'])
        pprint(job_description['BestTrainingJob']['FinalHyperParameterTuningJobObjectiveMetric'])
    else:
        print("No training jobs have reported results yet.")

Wait 30-60 seconds for this...

In [None]:
from sagemaker.analytics import HyperparameterTuningJobAnalytics

# Analytics for the latest tuning job, as a DataFrame.
hp_results = HyperparameterTuningJobAnalytics(
    hyperparameter_tuning_job_name=tuner.latest_tuning_job.name,
    sagemaker_session=sess,
)
df_results = hp_results.dataframe()

# Display the results ordered by final objective value, highest first.
df_results.sort_values(by='FinalObjectiveValue', ascending=False)

In [None]:
# Top row after ordering by final objective value, highest first.
df_results.sort_values(by='FinalObjectiveValue', ascending=False).head(1)

# Download and Load the Trained Model

In [None]:
# download the model artifact from AWS S3

!aws s3 cp $model_output_path/$training_job_name/output/model.tar.gz ./models

#!aws s3 cp s3://sagemaker-us-east-1-835319576252/models/tf-bert/script-mode/training-runs/tensorflow-training-2020-03-24-04-41-39-405/output/model.tar.gz ./models/tf2-bert/

In [None]:
import tarfile
import pickle as pkl

# Unpack the downloaded archive next to itself. The `with` block closes the archive even
# if extraction raises.
# NOTE(review): extractall() trusts the member paths inside the archive; only use it on
# artifacts produced by your own training jobs.
with tarfile.open('./models/model.tar.gz') as tar:
    tar.extractall(path='./models')

In [None]:
# List the contents of ./models to check what was extracted.
!ls -al ./models

In [None]:
# Must upgrade wrapt before installing TF
!pip install -q pip --upgrade
!pip install -q wrapt --upgrade --ignore-installed
!pip install -q tensorflow==2.0.0

# Load the model

In [None]:
# TODO