In [None]:
# Install smdebug with the %pip magic so the package lands in the environment
# of the running kernel; a `!` shell line uses whichever `python` is first on
# the subshell PATH, which is not necessarily the kernel's interpreter.
%pip install smdebug

In [20]:
# File name of the training script used as the estimator's entry point.
entry_point_script = "sm-tf.py"

In [21]:
import boto3
import os
import sagemaker
from sagemaker.tensorflow import TensorFlow
from sagemaker.debugger import DebuggerHookConfig, CollectionConfig, TensorBoardOutputConfig, Rule, rule_configs



# SageMaker session handle.
# NOTE(review): this session is not passed to the estimator below — confirm
# whether it is meant to be used.
sagemaker_session = sagemaker.Session()

# Bucket and key prefix combined into an S3 URI for Debugger tensors.
BUCKET_NAME = "ml-pipeline-1511"
LOCATION_IN_BUCKET = "smdebug-tf"
# NOTE(review): s3_bucket_for_tensors is built here but is not referenced by
# the estimator configuration in this cell — confirm whether it should be
# supplied through a DebuggerHookConfig.
s3_bucket_for_tensors = f"s3://{BUCKET_NAME}/{LOCATION_IN_BUCKET}"

# Built-in Debugger rules attached to the training job, in evaluation-list order.
rules = [
    Rule.sagemaker(make_rule_config())
    for make_rule_config in (
        rule_configs.vanishing_gradient,
        rule_configs.loss_not_decreasing,
    )
]

# TensorFlow estimator for the training script, with the Debugger rules attached.
estimator = TensorFlow(
    # What to run.
    entry_point=entry_point_script,
    framework_version="2.1.0",
    py_version="py3",
    script_mode=True,
    # Where and for how long to run it.
    role=sagemaker.get_execution_role(),
    base_job_name="smdebug-sm-tf",
    train_instance_count=1,
    train_instance_type="ml.m5.xlarge",
    train_max_run=3600,
    # Debugger rules defined above.
    rules=rules,
)

In [22]:
# Start the training job configured on the estimator. With wait=True the cell
# stays attached to the job; the captured log below runs through
# "Training job completed".
estimator.fit(wait=True)

2020-04-29 15:53:53 Starting - Starting the training job...
2020-04-29 15:53:56 Starting - Launching requested ML instances
********* Debugger Rule Status *********
*
*  VanishingGradient: InProgress        
*  LossNotDecreasing: InProgress        
*
****************************************
...
2020-04-29 15:54:53 Starting - Preparing the instances for training......
2020-04-29 15:55:46 Downloading - Downloading input data
2020-04-29 15:55:46 Training - Downloading the training image..[34m2020-04-29 15:56:03,140 sagemaker-containers INFO     Imported framework sagemaker_tensorflow_container.training[0m
[34m2020-04-29 15:56:03,147 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2020-04-29 15:56:03,423 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2020-04-29 15:56:03,439 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2020-04-29 15:56:03,454 sagemaker-containers INFO    


2020-04-29 15:56:21 Training - Training image download completed. Training in progress.
2020-04-29 15:56:41 Uploading - Uploading generated training model
2020-04-29 15:56:41 Completed - Training job completed
Training seconds: 62
Billable seconds: 62


In [19]:
# Show the evaluation status of each Debugger rule for the latest training job.
# NOTE(review): this cell's execution count (In [19]) is lower than the fit
# cell's (In [22]), and the LastModifiedTime values in the captured output
# (14:58) are earlier than that fit run's start (15:53) — the output shown
# appears to come from an earlier job; re-run this cell after fit to refresh it.
estimator.latest_training_job.rule_job_summary()

[{'RuleConfigurationName': 'VanishingGradient',
  'RuleEvaluationJobArn': 'arn:aws:sagemaker:ap-south-1:851154475795:processing-job/smdebug-sm-tf-2020-04-29-1-vanishinggradient-7c6e93d1',
  'RuleEvaluationStatus': 'InProgress',
  'LastModifiedTime': datetime.datetime(2020, 4, 29, 14, 58, 39, 397000, tzinfo=tzlocal())},
 {'RuleConfigurationName': 'LossNotDecreasing',
  'RuleEvaluationJobArn': 'arn:aws:sagemaker:ap-south-1:851154475795:processing-job/smdebug-sm-tf-2020-04-29-1-lossnotdecreasing-2d9c7426',
  'RuleEvaluationStatus': 'NoIssuesFound',
  'LastModifiedTime': datetime.datetime(2020, 4, 29, 14, 58, 39, 397000, tzinfo=tzlocal())}]

In [17]:
def _get_rule_job_name(training_job_name, rule_configuration_name, rule_job_arn):
        """Helper function to get the rule job name with correct casing"""
        return "{}-{}-{}".format(
            training_job_name[:26], rule_configuration_name[:26], rule_job_arn[-8:]
        )
    
def _get_cw_url_for_rule_job(rule_job_name, region):
    return "https://{}.console.aws.amazon.com/cloudwatch/home?region={}#logStream:group=/aws/sagemaker/ProcessingJobs;prefix={};streamFilter=typeLogStreamPrefix".format(region, region, rule_job_name)


def get_rule_jobs_cw_urls(estimator):
    """Map each Debugger rule of the latest training job to its CloudWatch logs URL.

    Args:
        estimator: Object exposing ``latest_training_job``, whose ``describe()``
            returns the training job description as a dict.

    Returns:
        dict: ``{RuleConfigurationName: CloudWatch console URL}`` for every rule
        evaluation status that carries a ``RuleEvaluationJobArn``. Empty when
        the description lists no rule evaluation statuses.

    Raises:
        KeyError: If the description has no ``"TrainingJobName"`` entry.
    """
    region = boto3.Session().region_name

    # Describe the job once so the job name and the rule statuses come from the
    # same response, instead of issuing two separate describe() calls.
    job_description = estimator.latest_training_job.describe()
    training_job_name = job_description["TrainingJobName"]
    # A missing or empty status list yields an empty result rather than a KeyError.
    rule_eval_statuses = job_description.get("DebugRuleEvaluationStatuses") or []

    result = {}
    for status in rule_eval_statuses:
        rule_job_arn = status.get("RuleEvaluationJobArn")
        if rule_job_arn is None:
            # No evaluation job ARN on this status, so there is no log stream to link.
            continue
        rule_name = status["RuleConfigurationName"]
        rule_job_name = _get_rule_job_name(training_job_name, rule_name, rule_job_arn)
        result[rule_name] = _get_cw_url_for_rule_job(rule_job_name, region)
    return result

# Display the CloudWatch log URL for each rule evaluation job of the estimator's latest training job.
get_rule_jobs_cw_urls(estimator)

{'VanishingGradient': 'https://ap-south-1.console.aws.amazon.com/cloudwatch/home?region=ap-south-1#logStream:group=/aws/sagemaker/ProcessingJobs;prefix=smdebug-sm-tf-2020-04-29-1-VanishingGradient-7c6e93d1;streamFilter=typeLogStreamPrefix',
 'LossNotDecreasing': 'https://ap-south-1.console.aws.amazon.com/cloudwatch/home?region=ap-south-1#logStream:group=/aws/sagemaker/ProcessingJobs;prefix=smdebug-sm-tf-2020-04-29-1-LossNotDecreasing-2d9c7426;streamFilter=typeLogStreamPrefix'}

In [23]:
from smdebug.trials import create_trial
# Open the Debugger output of the estimator's latest job as an smdebug trial.
debug_artifacts_path = estimator.latest_job_debugger_artifacts_path()
trial = create_trial(debug_artifacts_path)

# Names of the tensors recorded in the trial (displayed as the cell output).
trial.tensor_names()

[2020-04-29 15:59:13.062 ip-172-16-10-133:5042 INFO s3_trial.py:42] Loading trial debug-output at path s3://sagemaker-ap-south-1-851154475795/smdebug-sm-tf-2020-04-29-15-53-53-534/debug-output
[2020-04-29 15:59:13.454 ip-172-16-10-133:5042 INFO trial.py:198] Training has ended, will refresh one final time in 1 sec.
[2020-04-29 15:59:14.469 ip-172-16-10-133:5042 INFO trial.py:210] Loaded all steps


['accuracy', 'batch', 'loss', 'size', 'val_accuracy', 'val_loss']