**Note**: When running this notebook on SageMaker Studio, you should make sure the 'SageMaker JumpStart Tensorflow 1.0' image/kernel is used. You can run run all cells at once or step through the notebook.
# Policy Training

This notebook outlines the steps involved in building and deploying a Battlesnake model using Ray RLlib and TensorFlow on Amazon SageMaker.

Library versions currently in use:  TensorFlow 2.1, Ray RLlib 0.8.2

The model is first trained using multi-agent PPO, and then deployed to a managed _TensorFlow Serving_ SageMaker endpoint that can be used for inference.

In [21]:
import sagemaker
from sagemaker.rl import RLEstimator, RLToolkit, RLFramework
import boto3
import botocore
import json
from sagemaker.tuner import (IntegerParameter, CategoricalParameter, ContinuousParameter, 
                             HyperparameterTuner)

In [24]:
with open("../stack_outputs.json") as f:
    info = json.load(f)

## Initialise sagemaker
We need to define several parameters prior to running the training job. 

In [25]:
sm_session = sagemaker.session.Session()
s3_bucket = info["S3Bucket"]

s3_output_path = 's3://{}/'.format(s3_bucket)
print("S3 bucket path: {}".format(s3_output_path))

S3 bucket path: s3://sagemaker-soln-bs-snake-bucket-farbod/


In [26]:
job_name_prefix = info["SolutionPrefix"]+'-job-rllib'

role = info["SageMakerIamRoleArn"]
print(role)

arn:aws:iam::681627153266:role/sagemaker-soln-bs-us-west-2-nb-role


Change local_mode to True if you want to do local training within this Notebook instance

In [27]:
local_mode = False

if local_mode:
    instance_type = 'local'
else:
    instance_type = info["SagemakerTrainingInstanceType"]
    
# If training locally, do some Docker housekeeping..
if local_mode:
    !/bin/bash ./common/setup.sh

# Train your model here

In [28]:
region = sm_session.boto_region_name
device = "cpu"
image_name = '462105765813.dkr.ecr.{region}.amazonaws.com/sagemaker-rl-ray-container:ray-0.8.2-tf-{device}-py36'.format(region=region, device=device)

In [29]:
%%time

# Define and execute our training job
# Adjust hyperparameters and train_instance_count accordingly

metric_definitions =  [
#     {'Name': 'training_iteration', 'Regex': 'training_iteration: ([-+]?[0-9]*[.]?[0-9]+([eE][-+]?[0-9]+)?)'}, 
#     {'Name': 'episodes_total', 'Regex': 'episodes_total: ([-+]?[0-9]*[.]?[0-9]+([eE][-+]?[0-9]+)?)'}, 
#     {'Name': 'num_steps_trained', 'Regex': 'num_steps_trained: ([-+]?[0-9]*[.]?[0-9]+([eE][-+]?[0-9]+)?)'}, 
    {'Name': 'timesteps_total', 'Regex': 'timesteps_total: ([-+]?[0-9]*[.]?[0-9]+([eE][-+]?[0-9]+)?)'}
#     {'Name': 'training_iteration', 'Regex': 'training_iteration: ([-+]?[0-9]*[.]?[0-9]+([eE][-+]?[0-9]+)?)'},

#     {'Name': 'episode_reward_max', 'Regex': 'episode_reward_max: ([-+]?[0-9]*[.]?[0-9]+([eE][-+]?[0-9]+)?)'}, 
#     {'Name': 'episode_reward_mean', 'Regex': 'episode_reward_mean: ([-+]?[0-9]*[.]?[0-9]+([eE][-+]?[0-9]+)?)'}, 
#     {'Name': 'episode_reward_min', 'Regex': 'episode_reward_min: ([-+]?[0-9]*[.]?[0-9]+([eE][-+]?[0-9]+)?)'}, 
    
#     {'Name': 'episode_len_max', 'Regex': 'episode_len_mean: ([-+]?[0-9]*[.]?[0-9]+([eE][-+]?[0-9]+)?)'}, 
#     {'Name': 'episode_len_mean', 'Regex': 'episode_len_mean: ([-+]?[0-9]*[.]?[0-9]+([eE][-+]?[0-9]+)?)'}, 
#     {'Name': 'episode_len_min', 'Regex': 'episode_len_mean: ([-+]?[0-9]*[.]?[0-9]+([eE][-+]?[0-9]+)?)'}, 

#     {'Name': 'best_snake_episode_len_max', 'Regex': 'best_snake_episode_len_max: ([-+]?[0-9]*[.]?[0-9]+([eE][-+]?[0-9]+)?)'},
#     {'Name': 'worst_snake_episode_len_max', 'Regex': 'worst_snake_episode_len_max: ([-+]?[0-9]*[.]?[0-9]+([eE][-+]?[0-9]+)?)'},

#     {'Name': 'Snake_hit_wall_max', 'Regex': 'Snake_hit_wall_max: ([-+]?[0-9]*[.]?[0-9]+([eE][-+]?[0-9]+)?)'},
#     {'Name': 'Snake_was_eaten_max', 'Regex': 'Snake_was_eaten_max: ([-+]?[0-9]*[.]?[0-9]+([eE][-+]?[0-9]+)?)'},
#     {'Name': 'Killed_another_snake_max', 'Regex': 'Killed_another_snake_max: ([-+]?[0-9]*[.]?[0-9]+([eE][-+]?[0-9]+)?)'},
#     {'Name': 'Snake_hit_body_max', 'Regex': 'Snake_hit_body_max: ([-+]?[0-9]*[.]?[0-9]+([eE][-+]?[0-9]+)?)'},
#     {'Name': 'Starved_max', 'Regex': 'Starved_max: ([-+]?[0-9]*[.]?[0-9]+([eE][-+]?[0-9]+)?)'},
#     {'Name': 'Forbidden_move_max', 'Regex': 'Forbidden_move_max: ([-+]?[0-9]*[.]?[0-9]+([eE][-+]?[0-9]+)?)'}
] 

algorithm = "PPO"
map_size = 11
num_agents = 4
additional_config = {
    'lambda': 0.90,
    'gamma': 0.999,
    'kl_coeff': 0.2,
    'clip_rewards': True,
    'vf_clip_param': 175.0,
    'train_batch_size': 9216,
    'sample_batch_size': 96,
    'sgd_minibatch_size': 256,
    'num_sgd_iter': 3,
    'lr': 5.0e-4,
}

estimator = RLEstimator(entry_point="train-mabs.py",
                        source_dir='training/training_src',
                        dependencies=["training/common/sagemaker_rl", "inference/inference_src/", "../BattlesnakeGym/"],
                        toolkit=RLToolkit.COACH,
                        toolkit_version=RLEstimator.COACH_LATEST_VERSION_MXNET,
                        framework=RLFramework.MXNET,
                        image_name=image_name,
                        role=role,
                        train_instance_type=instance_type,
                        train_instance_count=1,
                        output_path=s3_output_path,
                        base_job_name=job_name_prefix,
                        metric_definitions=metric_definitions,
                        hyperparameters={
                            # See train-mabs.py to add additional hyperparameters
                            # Also see ray_launcher.py for the rl.training.* hyperparameters
                            
                            "num_iters": 15,
                            # number of snakes in the gym
                            "num_agents": num_agents,

                            "iterate_map_size": False,
                            "map_size": map_size,
                            "algorithm": algorithm,
                            "additional_configs": additional_config,
                            "use_heuristics_action_masks": False
                        }
                    )

# estimator.fit()

# job_name = estimator.latest_training_job.job_name
# print("Training job: %s" % job_name)

train_instance_count has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_instance_type has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
image_name has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


CPU times: user 21.4 ms, sys: 40 µs, total: 21.5 ms
Wall time: 24.2 ms


In [30]:
hyperparameter_ranges = {
    'buffer_size': IntegerParameter(1000, 6000),
    'update_every': IntegerParameter(10, 20),
    'batch_size': IntegerParameter(16, 256),

    'lr_start': ContinuousParameter(1e-5, 1e-3),
    'lr_factor': ContinuousParameter(0.5, 1.0),
    'lr_step': IntegerParameter(5000, 30000),
    
    'tau': ContinuousParameter(1e-4, 1e-3),
    'gamma': ContinuousParameter(0.85, 0.99),
    
    'depth': IntegerParameter(10, 256),
    'depthS': IntegerParameter(10, 256),
}

max_jobs = 3
max_parallel_jobs = 3
run_hpo = True
tuner = HyperparameterTuner(estimator,
                            objective_metric_name='timesteps',
                            objective_type='Maximize',
                            hyperparameter_ranges=hyperparameter_ranges,
                            metric_definitions=metric_definitions,
                            max_jobs=max_jobs,
                            max_parallel_jobs=max_parallel_jobs,
                            base_tuning_job_name=job_name_prefix)
if run_hpo:
    tuner.fit()



ClientError: An error occurred (ValidationException) when calling the CreateHyperParameterTuningJob operation: A metric is required for this hyperparameter tuning job objective. Provide a metric in the metric definitions.

In [13]:
# Where is the model stored in S3?
estimator.model_data

's3://sagemaker-soln-bs-snake-bucket-farbod/sagemaker-soln-bs-job-rllib-2021-08-07-00-37-55-315/output/model.tar.gz'

# Create an endpoint to host the policy
Firstly, we will delete the previous endpoint and model

In [14]:
sm_client = boto3.client(service_name='sagemaker')
waiter = sm_client.get_waiter('endpoint_in_service')
waiter.wait(EndpointName=info['SagemakerEndPointName'])
try:
    sm_client.delete_endpoint(EndpointName=info['SagemakerEndPointName'])
    sm_client.delete_endpoint_config(EndpointConfigName=info['SagemakerEndPointName'])
    sm_client.delete_model(ModelName=info['SagemakerEndPointName'])
    ep_waiter = sm_client.get_waiter('endpoint_deleted')
    ep_waiter.wait(EndpointName=info['SagemakerEndPointName'])
except botocore.exceptions.ClientError:
    pass
    
# Copy the endpoint to a central location
model_data = "s3://{}/pretrainedmodels/model.tar.gz".format(s3_bucket)
!aws s3 cp {estimator.model_data} {model_data}

from sagemaker.tensorflow.serving import Model

model = Model(model_data=model_data,
              role=role,
              entry_point="inference.py",
              source_dir='inference/inference_src',
              framework_version='2.1.0',
              name=info['SagemakerEndPointName'],
              code_location='s3://{}//code'.format(s3_bucket)
             )

if local_mode:
    inf_instance_type = 'local'
else:
    inf_instance_type = info["SagemakerInferenceInstanceType"]

# Deploy an inference endpoint
predictor = model.deploy(initial_instance_count=1, instance_type=inf_instance_type,
                         endpoint_name=info['SagemakerEndPointName'])

Completed 6.2 MiB/6.2 MiB (19.7 MiB/s) with 1 file(s) remainingcopy: s3://sagemaker-soln-bs-snake-bucket-farbod/sagemaker-soln-bs-job-rllib-2021-08-07-00-37-55-315/output/model.tar.gz to s3://sagemaker-soln-bs-snake-bucket-farbod/pretrainedmodels/model.tar.gz


The class sagemaker.tensorflow.serving.Model has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
update_endpoint is a no-op in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


---------------!

# Test the endpoint

This example is using single observation for a 5-agent environment 
The last axis is 12 because the current MultiAgentEnv is concatenating 2 frames
5 agent maps + 1 food map = 6 maps total    6 maps * 2 frames = 12

In [None]:
import numpy as np
from time import time

state = np.zeros(shape=(1, 21, 21, 6), dtype=np.float32).tolist()

health_dict = {0: 50, 1: 50}
json = {"turn": 4,
        "board": {
                "height": 11,
                "width": 11,
                "food": [],
                "snakes": []
                },
            "you": {
                "id": "snake-id-string",
                "name": "Sneky Snek",
                "health": 90,
                "body": [{"x": 1, "y": 3}]
                }
            }

before = time()
action_mask = np.array([1, 1, 1, 1]).tolist()

action = predictor.predict({"state": state, "action_mask": action_mask,
                            "prev_action": -1, 
                           "prev_reward": -1, "seq_lens": -1,  
                           "all_health": health_dict, "json": json})
elapsed = time() - before

action_to_take = action["outputs"]["heuristisc_action"]
print("Action to take {}".format(action_to_take))
print("Inference took %.2f ms" % (elapsed*1000))

# Navigation
- To go back to the introduction click [here](./1_Introduction.ipynb)
- To build some heuristics click [here](./3_HeuristicsDeveloper.ipynb)