https://medium.com/distributed-computing-with-ray/intro-to-rllib-example-environments-3a113f532c70

In [18]:
%load_ext autoreload
import ray
import ray.rllib.agents.ppo as ppo
from ray.tune.logger import pretty_print

# Shut down first so that re-running this cell in the same kernel does not
# stack a new Ray instance on top of one left over from an earlier run.
ray.shutdown()
# Start Ray for this notebook. ignore_reinit_error=True is passed so a repeated
# init is tolerated rather than treated as an error. Because this is the last
# expression in the cell, the info dict it returns is displayed as the output.
ray.init(ignore_reinit_error=True)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


2020-11-12 14:41:53,865	INFO services.py:1164 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m


{'node_ip_address': '128.40.41.23',
 'raylet_ip_address': '128.40.41.23',
 'redis_address': '128.40.41.23:6379',
 'object_store_address': '/tmp/ray/session_2020-11-12_14-41-52_956587_196182/sockets/plasma_store',
 'raylet_socket_name': '/tmp/ray/session_2020-11-12_14-41-52_956587_196182/sockets/raylet',
 'webui_url': '127.0.0.1:8265',
 'session_dir': '/tmp/ray/session_2020-11-12_14-41-52_956587_196182',
 'metrics_export_port': 62988}

## Configure Checkpoint Saving

In [19]:
import shutil
import os

# Folder (relative to the notebook's working directory) that receives the
# agent checkpoints. It is wiped here so each run starts from a clean slate;
# ignore_errors=True makes this a no-op when the folder does not exist yet.
CHECKPOINT_ROOT = 'tmp/ppo/cartpole_v0'
shutil.rmtree(CHECKPOINT_ROOT, ignore_errors=True)

# Ray results folder inside the user's home directory.
# expanduser() is used instead of os.getenv('HOME') + '/ray_results' because
# getenv returns None when HOME is unset, which made the concatenation raise
# TypeError.
# NOTE: this removes *all* previous results stored in that folder.
RAY_RESULTS = os.path.join(os.path.expanduser('~'), 'ray_results')
print(RAY_RESULTS)
shutil.rmtree(RAY_RESULTS, ignore_errors=True)

/home/zciccwf/ray_results


## Configure RL Params

In [20]:
%autoreload

# Begin from a copy of the stock PPO ('proximal policy optimisation') settings
# so the library defaults themselves are not reassigned.
config = ppo.DEFAULT_CONFIG.copy()
print(config.keys())

# Override only the handful of settings this notebook cares about.
config.update({
    'num_gpus': 1,
    'num_workers': 1,
    'eager_tracing': False,
    'log_level': 'WARN',
})

# Build the PPO trainer for the CartPole-v0 environment with those settings.
agent = ppo.PPOTrainer(config=config, env='CartPole-v0')

dict_keys(['num_workers', 'num_envs_per_worker', 'rollout_fragment_length', 'batch_mode', 'num_gpus', 'train_batch_size', 'model', 'optimizer', 'gamma', 'horizon', 'soft_horizon', 'no_done_at_end', 'env_config', 'env', 'normalize_actions', 'clip_rewards', 'clip_actions', 'preprocessor_pref', 'lr', 'monitor', 'log_level', 'callbacks', 'ignore_worker_failures', 'log_sys_usage', 'fake_sampler', 'framework', 'eager_tracing', 'no_eager_on_workers', 'explore', 'exploration_config', 'evaluation_interval', 'evaluation_num_episodes', 'in_evaluation', 'evaluation_config', 'evaluation_num_workers', 'custom_eval_function', 'sample_async', '_use_trajectory_view_api', 'observation_filter', 'synchronize_filters', 'tf_session_args', 'local_tf_session_args', 'compress_observations', 'collect_metrics_timeout', 'metrics_smoothing_episodes', 'remote_worker_envs', 'remote_env_batch_wait_ms', 'min_iter_time_s', 'timesteps_per_iteration', 'seed', 'extra_python_environs_for_driver', 'extra_python_environs_for

[2m[36m(pid=94578)[0m Instructions for updating:
[2m[36m(pid=94578)[0m non-resource variables are not supported in the long term
[2m[36m(pid=94578)[0m Instructions for updating:
[2m[36m(pid=94578)[0m If using Keras pass *_constraint arguments to layers.


## Train RL Agent

In [4]:
%autoreload

# Number of training iterations; a checkpoint is saved after every one of them.
N_ITER = 50
# Progress line layout: iteration number, min/mean/max episode reward,
# mean episode length, and the path returned by agent.save().
progress_fmt = "{:3d} | reward {:6.2f}/{:6.2f}/{:6.2f} | len {:6.2f} | saved agent to {}"

for iteration in range(1, N_ITER + 1):
    # Run a single PPO training iteration, then checkpoint the agent.
    result = agent.train()
    file_name = agent.save(CHECKPOINT_ROOT)

    reward_stats = (
        result["episode_reward_min"],
        result["episode_reward_mean"],
        result["episode_reward_max"],
    )
    print(progress_fmt.format(iteration, *reward_stats, result["episode_len_mean"], file_name))

[2m[36m(pid=196365)[0m Instructions for updating:
[2m[36m(pid=196365)[0m If using Keras pass *_constraint arguments to layers.


Instructions for updating:
Prefer Variable.assign which has equivalent behavior in 2.X.


[2m[36m(pid=196365)[0m Instructions for updating:
[2m[36m(pid=196365)[0m Prefer Variable.assign which has equivalent behavior in 2.X.


  1 reward   9.00/ 21.03/ 67.00 len  21.03 saved tmp/ppo/cartpole_v0/checkpoint_1/checkpoint-1
  2 reward   9.00/ 43.81/200.00 len  43.81 saved tmp/ppo/cartpole_v0/checkpoint_2/checkpoint-2
  3 reward  10.00/ 72.86/200.00 len  72.86 saved tmp/ppo/cartpole_v0/checkpoint_3/checkpoint-3
  4 reward  13.00/101.47/200.00 len 101.47 saved tmp/ppo/cartpole_v0/checkpoint_4/checkpoint-4
  5 reward  13.00/129.61/200.00 len 129.61 saved tmp/ppo/cartpole_v0/checkpoint_5/checkpoint-5
  6 reward  13.00/154.00/200.00 len 154.00 saved tmp/ppo/cartpole_v0/checkpoint_6/checkpoint-6
  7 reward  20.00/173.38/200.00 len 173.38 saved tmp/ppo/cartpole_v0/checkpoint_7/checkpoint-7
  8 reward  20.00/185.62/200.00 len 185.62 saved tmp/ppo/cartpole_v0/checkpoint_8/checkpoint-8
  9 reward  70.00/198.02/200.00 len 198.02 saved tmp/ppo/cartpole_v0/checkpoint_9/checkpoint-9
 10 reward  70.00/198.70/200.00 len 198.70 saved tmp/ppo/cartpole_v0/checkpoint_10/checkpoint-10
 11 reward 200.00/200.00/200.00 len 200.00 saved

## Examining the Policy

In [5]:
# Fetch the policy held by the trained agent and the model object behind it.
policy = agent.get_policy()
model = policy.model
# summary() prints the layer table itself and returns None, so it is called
# bare: wrapping it in print() appended a stray "None" line after the table.
model.base_model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
observations (InputLayer)       [(None, 4)]          0                                            
__________________________________________________________________________________________________
fc_1 (Dense)                    (None, 256)          1280        observations[0][0]               
__________________________________________________________________________________________________
fc_value_1 (Dense)              (None, 256)          1280        observations[0][0]               
__________________________________________________________________________________________________
fc_2 (Dense)                    (None, 256)          65792       fc_1[0][0]                       
______________________________________________________________________________________________

## Rollout a Trained Agent from Saved Checkpoint

In [11]:
# Roll out the saved agent through the RLlib command line for 2000 steps on
# CartPole-v0. The path points at checkpoint 50, i.e. the last checkpoint the
# training cell writes when it runs with N_ITER = 50.
# NOTE(review): the recorded output shows this command aborting with
# CUDA_ERROR_OUT_OF_MEMORY -- presumably the GPU was still held by another
# process (possibly this notebook's own trainer); confirm and re-run.
!rllib rollout tmp/ppo/cartpole_v0/checkpoint_50/checkpoint-50 --config "{\"env\": \"CartPole-v0\"}" --run PPO --steps 2000

Instructions for updating:
non-resource variables are not supported in the long term
2020-11-06 11:53:02,219	INFO services.py:1164 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m
2020-11-06 11:53:03,774	INFO trainer.py:591 -- Tip: set framework=tfe or the --eager flag to enable TensorFlow eager execution
2020-11-06 11:53:03,774	INFO trainer.py:616 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
2020-11-06 11:53:04.793561: F tensorflow/stream_executor/lib/statusor.cc:34] Attempting to fetch value instead of handling error Internal: failed initializing StreamExecutor for CUDA device ordinal 1: Internal: failed call to cuDevicePrimaryCtxRetain: CUDA_ERROR_OUT_OF_MEMORY: out of memory; total memory reported: 16945512448
*** Aborted at 1604663584 (unix time) try "date -d @1604663584" if you are using GNU date ***
PC: @                0x0 (unknown)
*** SIGABRT (@0x82500003ef5) received by PID 16117 (TID 