In [None]:
import boto3
import sagemaker
import pandas as pd

# SageMaker session, its default S3 bucket and the execution role used by the jobs below.
sess = sagemaker.Session()
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()

# A single boto3 session supplies both the region name and the low-level SageMaker client.
boto_session = boto3.Session()
region = boto_session.region_name
sm = boto_session.client(service_name='sagemaker', region_name=region)

In [None]:
!pip install -q smdebug==0.7.2
!pip install -q sagemaker-experiments==0.1.11

# Specify the S3 Location of the Features

In [None]:
# Restore the variable previously persisted with %store (presumably by the Scikit processing notebook — confirm).
%store -r scikit_processing_job_s3_output_prefix

In [None]:
# Echo the restored value so a missing or stale %store entry is noticed immediately.
processing_job_message = 'Previous Scikit Processing Job Name: {}'
print(processing_job_message.format(scikit_processing_job_s3_output_prefix))

In [None]:
# For each data split: the S3 key prefix of the processed features, the matching
# relative local path, and the full S3 URI inside the session's default bucket.

# Train split
prefix_train = '{}/output/bert-train-all'.format(scikit_processing_job_s3_output_prefix)
path_train = './{}'.format(prefix_train)
train_s3_uri = 's3://{}/{}'.format(bucket, prefix_train)

# Validation split
prefix_validation = '{}/output/bert-validation-all'.format(scikit_processing_job_s3_output_prefix)
path_validation = './{}'.format(prefix_validation)
validation_s3_uri = 's3://{}/{}'.format(bucket, prefix_validation)

# Test split
prefix_test = '{}/output/bert-test-all'.format(scikit_processing_job_s3_output_prefix)
path_test = './{}'.format(prefix_test)
test_s3_uri = 's3://{}/{}'.format(bucket, prefix_test)

In [None]:
# One S3 input per channel; every channel uses the same S3 distribution setting.
shard_by_key = 'ShardedByS3Key'

s3_input_train_data = sagemaker.s3_input(s3_data=train_s3_uri, distribution=shard_by_key)
s3_input_validation_data = sagemaker.s3_input(s3_data=validation_s3_uri, distribution=shard_by_key)
s3_input_test_data = sagemaker.s3_input(s3_data=test_s3_uri, distribution=shard_by_key)

# Show the resulting channel configurations (train, validation, test) for inspection.
for channel_input in (s3_input_train_data, s3_input_validation_data, s3_input_test_data):
    print(channel_input.config)

In [None]:
# Print the training script so it can be reviewed before the job is launched.
# NOTE(review): the estimator in this notebook is configured with source_dir='src', while this
# command shows the copy under src_bert_tf2/ — confirm both refer to the same script.
!cat src_bert_tf2/tf_bert_reviews.py

In [None]:
import time

from sagemaker.debugger import Rule, DebuggerHookConfig, TensorBoardOutputConfig, CollectionConfig, rule_configs
from sagemaker.tensorflow import TensorFlow
from smexperiments.experiment import Experiment
from smexperiments.tracker import Tracker
from smexperiments.trial import Trial

# S3 location passed to the estimator as its output path.
model_output_path = 's3://{}/models/tf2-bert'.format(bucket)


def _collections_to_save(save_interval='5'):
    """Return fresh `losses` and `metrics` CollectionConfig objects for one Debugger rule.

    Args:
        save_interval: Value of the `save_interval` parameter applied to both collections.

    Returns:
        A two-element list: the `losses` collection followed by the `metrics` collection.
    """
    return [
        CollectionConfig(name=collection_name, parameters={'save_interval': save_interval})
        for collection_name in ('losses', 'metrics')
    ]


# SageMaker Debugger built-in rules attached to the training job.
rules = [
    Rule.sagemaker(
        rule_configs.loss_not_decreasing(),
        rule_parameters={
            'collection_names': 'losses,metrics',
            'use_losses_collection': 'true',
            'num_steps': '5',
            'diff_percent': '5'
        },
        collections_to_save=_collections_to_save()
    ),
    Rule.sagemaker(
        rule_configs.overtraining(),
        rule_parameters={
            'collection_names': 'losses,metrics',
            'patience_train': '10',
            'patience_validation': '20',
            'delta': '0.1'
        },
        collections_to_save=_collections_to_save()
    )
]

# Hyper-parameters
epochs = 2
train_batch_size = 128
train_instance_count = 1
use_parameter_server = False
input_mode = 'File'  # 'File' or 'Pipe'

# Suffix that keeps experiment, tracker and trial names distinct between runs of this cell.
unique_name = '{}-{}'.format(input_mode, int(time.time()))

# Track Experiments
experiment = Experiment.create(
    experiment_name='train-reviews-bert-{}'.format(unique_name),
    description='Train Reviews BERT',
    sagemaker_boto_client=sm)

tracker_display_name = 'Train-Reviews-BERT-Tracker-{}'.format(unique_name)
with Tracker.create(display_name=tracker_display_name, sagemaker_boto_client=sm) as tracker:
    # Log every parameter the analytics cell later requests by name ('epochs', 'train_batch_size').
    tracker.log_parameters({
        'epochs': epochs,
        'train_batch_size': train_batch_size,
    })
    # Record the S3 locations of the three input datasets.
    tracker.log_input(name='reviews_dataset_train', media_type='s3/uri', value=train_s3_uri)
    tracker.log_input(name='reviews_dataset_validation', media_type='s3/uri', value=validation_s3_uri)
    tracker.log_input(name='reviews_dataset_test', media_type='s3/uri', value=test_s3_uri)

trial_name = 'train-reviews-bert-training-job-{}'.format(unique_name)
trial = Trial.create(trial_name=trial_name, experiment_name=experiment.experiment_name, sagemaker_boto_client=sm)
trial.add_trial_component(tracker.trial_component)

trial_component_display_name = 'Train-Reviews-BERT-Trial-{}'.format(unique_name)

# Passed to fit() so the training job is recorded under the experiment and trial created above.
experiment_config = {'ExperimentName': experiment.experiment_name,
                     'TrialName': trial.trial_name,
                     'TrialComponentDisplayName': trial_component_display_name}

# The training and monitoring cells refer to the estimator as `bert_estimator`.
# NOTE(review): source_dir is 'src' while the script displayed earlier lives under src_bert_tf2/ — confirm.
bert_estimator = TensorFlow(entry_point='tf_bert_reviews.py',
                            source_dir='src',
                            role=role,
                            train_instance_count=train_instance_count, # Make sure you have at least this number of input files or the ShardedByS3Key distribution strategy will fail the job due to no data available
                            train_instance_type='ml.p3.2xlarge',
                            train_volume_size=1800,
                            py_version='py3',
                            framework_version='2.0.0',
                            output_path=model_output_path,
                            hyperparameters={'use-xla': False,
                                             'use-amp': False,
                                             'train-batch-size': train_batch_size,
                                             'validation-batch-size': 128,
                                             'test-batch-size': 128,
                                             'epochs': epochs,
                                             'train-steps-per-epoch': 100,
                                             'validation-steps': 100,
                                             'test-steps': 100,
                                             'max-seq-length': 128,
                                             'freeze-bert-layer': False,
                                             'enable-sagemaker-debugger': True},
                            distributions={'parameter_server': {'enabled': use_parameter_server}},
                            input_mode=input_mode,
                            enable_cloudwatch_metrics=True,
                            # NOTE(review): 'loss: ' is also a substring of 'val_loss: ' (likewise for
                            # accuracy), so the train:* patterns may also capture validation values —
                            # confirm against the training log format.
                            metric_definitions=[
                                 {'Name': 'train:loss', 'Regex': 'loss: ([0-9\\.]+)'},
                                 {'Name': 'train:accuracy', 'Regex': 'accuracy: ([0-9\\.]+)'},
                                 {'Name': 'validation:loss', 'Regex': 'val_loss: ([0-9\\.]+)'},
                                 {'Name': 'validation:accuracy', 'Regex': 'val_accuracy: ([0-9\\.]+)'},
                            ],
                            rules=rules
                           )

# Keep the shorter name as an alias for cells that refer to the estimator as `estimator`.
estimator = bert_estimator

In [None]:
from smdebug.trials import create_trial

# An smdebug trial gives access to the tensors saved by SageMaker Debugger.
# The Debugger artifacts path is only available once the estimator is associated with a
# training job, so skip (instead of failing) when this cell runs before fit(); re-run it
# after the job has been launched.
# The result is stored in `debugger_trial` so the SageMaker Experiments `trial` object
# is not overwritten.
if estimator.latest_training_job is None:
    debugger_trial = None
    print('No training job has been launched yet; re-run this cell after calling fit().')
else:
    debugger_trial = create_trial(estimator.latest_job_debugger_artifacts_path())


# Train the model

In [None]:
# Map each channel name to its S3 input.
training_inputs = {
    'train': s3_input_train_data,
    'validation': s3_input_validation_data,
    'test': s3_input_test_data,
}

# wait=False: do not block the notebook on the job; it is tracked under the experiment configuration.
bert_estimator.fit(inputs=training_inputs,
                   experiment_config=experiment_config,
                   wait=False)

In [None]:
# Name of the most recent training job launched by the estimator; later cells build links from it.
latest_job = bert_estimator.latest_training_job
training_job_name = latest_job.name
print('training_job_name:  {}'.format(training_job_name))

In [None]:
# `display` and `HTML` are imported from the public IPython.display module;
# importing `display` from IPython.core.display is deprecated.
from IPython.display import display, HTML

# Render a link to this training job's page in the SageMaker console.
display(HTML('<b>Review <a href="https://console.aws.amazon.com/sagemaker/home?region={}#/jobs/{}">Training Job</a> After About 5 Minutes</b>'.format(region, training_job_name)))


In [None]:
# `display` and `HTML` are imported from the public IPython.display module;
# importing `display` from IPython.core.display is deprecated.
from IPython.display import display, HTML

# Render a link to the CloudWatch log streams whose prefix is this training job's name.
display(HTML('<b>Review <a href="https://console.aws.amazon.com/cloudwatch/home?region={}#logStream:group=/aws/sagemaker/TrainingJobs;prefix={};streamFilter=typeLogStreamPrefix">CloudWatch Logs</a> After About 5 Minutes</b>'.format(region, training_job_name)))


In [None]:
# `display` and `HTML` are imported from the public IPython.display module;
# importing `display` from IPython.core.display is deprecated.
from IPython.display import display, HTML

# Key prefix of this job's output inside the bucket; mirrors the 'models/tf2-bert' output path
# configured on the estimator.
training_job_s3_output_prefix = 'models/tf2-bert/{}'.format(training_job_name)

# Render a link to that prefix in the S3 console.
display(HTML('<b>Review <a href="https://s3.console.aws.amazon.com/s3/buckets/{}/{}/?region={}&tab=overview">S3 Output Data</a> After The Training Job Has Completed</b>'.format(bucket, training_job_s3_output_prefix, region)))


# Analyze Experiment

In [None]:
# # Might need to convert ' => "
# search_expression = {
#     'Filters':[
#         {
#             'Name': 'DisplayName',
#             'Operator': 'Equals',
#             'Value': 'Training'
#         }
#     ]
# }

In [None]:
from sagemaker.analytics import ExperimentAnalytics

# Query settings: trial components of this experiment, ordered by their best validation
# accuracy (highest first), reporting the selected metric and parameters.
# A `search_expression` filter can be added here to narrow the components returned.
analytics_settings = dict(
    sagemaker_session=sess,
    experiment_name=experiment.experiment_name,
    sort_by='metrics.validation:accuracy.max',
    sort_order='Descending',
    metric_names=['validation:accuracy'],
    parameter_names=['epochs', 'train_batch_size'],
)

trial_component_analytics = ExperimentAnalytics(**analytics_settings)

In [None]:
# Materialize the analytics query as a table and show it as the cell output.
analytics_table = trial_component_analytics.dataframe()
analytics_table

In [None]:
# All trial components of the experiment in creation order (oldest first).
# To restrict the listing to a single trial, pass a `search_expression` with a filter on
# 'Parents.TrialName' (Operator 'Equals') set to that trial's name.
lineage_table = ExperimentAnalytics(
    sagemaker_session=sess,
    experiment_name=experiment.experiment_name,
    sort_by="CreationTime",
    sort_order="Ascending",
)

# Building the analytics object alone shows nothing; materialize it and display the result.
lineage_df = lineage_table.dataframe()
lineage_df

# Download and Load the Trained Model

In [None]:
# download the model artifact from AWS S3

!aws s3 cp $model_output_path/$training_job_name/output/model.tar.gz ./models

#!aws s3 cp s3://sagemaker-us-east-1-835319576252/models/tf-bert/script-mode/training-runs/tensorflow-training-2020-03-24-04-41-39-405/output/model.tar.gz ./models/tf2-bert/

In [None]:
import tarfile
import pickle as pkl

# Unpack the downloaded model archive into ./models.
# The context manager closes the archive even if extraction raises.
with tarfile.open('./models/model.tar.gz') as tar:
    tar.extractall(path='./models')

In [None]:
# List the contents of ./models to check what the extraction produced.
!ls -al ./models

In [None]:
# Must upgrade wrapt before installing TF
!pip install -q pip --upgrade
!pip install -q wrapt --upgrade --ignore-installed
!pip install -q tensorflow==2.0.0

# Load the model

In [None]:
# TODO