In [1]:
import multiprocessing

import tensorflow as tf

import ray

# Number of CPUs reported by the multiprocessing module for this machine;
# the bare name on the last line displays the value as the cell output.
n_cpus = multiprocessing.cpu_count()
n_cpus

4

In [2]:
# Ask TensorFlow which physical GPU devices it can see and keep the count;
# later cells use n_gpus to decide whether to request GPU resources.
gpu_devices = tf.config.list_physical_devices('GPU')
n_gpus = len(gpu_devices)
n_gpus

1

In [3]:
# Start the Ray runtime for this notebook session.
# https://ray.readthedocs.io/en/latest/package-ref.html#ray.init
ray.init(
    # do not fail if Ray is already running (e.g. the cell is executed again)
    ignore_reinit_error=True,
    # keep worker log lines out of the notebook output
    log_to_driver=False,
    # NOTE(review): presumably binds the dashboard to all interfaces so it is
    # reachable from outside the host -- confirm this exposure is intended.
    webui_host='0.0.0.0',
)


2020-02-24 12:31:05,810	INFO resource_spec.py:212 -- Starting Ray with 5.86 GiB memory available for workers and up to 2.94 GiB for objects. You can adjust these settings with ray.init(memory=<bytes>, object_store_memory=<bytes>).
2020-02-24 12:31:06,260	INFO services.py:1093 -- View the Ray dashboard at 172.30.1.184:8268


{'node_ip_address': '172.30.1.184',
 'redis_address': '172.30.1.184:37548',
 'object_store_address': '/tmp/ray/session_2020-02-24_12-31-05_807697_9762/sockets/plasma_store',
 'raylet_socket_name': '/tmp/ray/session_2020-02-24_12-31-05_807697_9762/sockets/raylet',
 'webui_url': '172.30.1.184:8268',
 'session_dir': '/tmp/ray/session_2020-02-24_12-31-05_807697_9762'}

In [None]:
# Run one PPO training iteration on CartPole and display the result dict.
# https://github.com/ray-project/ray/blob/master/rllib/agents/ppo/ppo.py
import copy

from ray.rllib.agents.ppo import PPOTrainer, DEFAULT_CONFIG

env_name = 'CartPole-v1'

# Deep copy, not DEFAULT_CONFIG.copy(): the config contains nested dicts
# ('model', 'tf_session_args'). A shallow copy shares those nested dicts with
# RLlib's module-level DEFAULT_CONFIG, so the nested assignments below would
# silently change the library defaults for everything else in this kernel.
ppo_config = copy.deepcopy(DEFAULT_CONFIG)
if n_gpus:
    # Only request GPUs when TensorFlow reported at least one.
    ppo_config['num_gpus'] = n_gpus
    ppo_config['tf_session_args']['device_count']['GPU'] = n_gpus

ppo_config['num_workers'] = 1
ppo_config['num_sgd_iter'] = 2
ppo_config['sgd_minibatch_size'] = 128
ppo_config['lr'] = 0.0003
ppo_config['gamma'] = 0.99
ppo_config['model']['fcnet_hiddens'] = [64, 64]
ppo_config['timesteps_per_iteration'] = 2000
ppo_config['train_batch_size'] = 8000
# This avoids running out of resources in the notebook environment when this
# cell is re-executed.
ppo_config['num_cpus_per_worker'] = 0

agent = PPOTrainer(ppo_config, env_name)
result = agent.train()

result

In [None]:
# Tune hyperparameters with a grid search.
# https://github.com/ray-project/ray/blob/master/python/ray/tune/tune.py
# `tune` is used throughout this cell but was not imported by any cell above,
# so the cell failed with NameError on a fresh kernel.
from ray import tune

ray.init(ignore_reinit_error=True)
env_name = 'CartPole-v1'

# Search space: 4 network sizes x 2 learning rates x 2 discount factors
# = 16 hyperparameter combinations.
ppo_config = {
    "env": env_name,
    "num_workers": 1,
    'model': {
        'fcnet_hiddens': tune.grid_search([
            [16, 16], [32, 32], [64, 64], [128, 128],
        ])
    },
    'train_batch_size': 1000,
    "lr": tune.grid_search([0.0003, 0.0001]),
    'gamma': tune.grid_search([0.99, 0.999]),
    "eager": False,
    'num_gpus': n_gpus,
}

analysis = tune.run(
    "PPO",
    name='cartpole_test',
    verbose=1,

    # stop a trial once its mean episode reward reaches 300
    stop={"episode_reward_mean": 300},
    config=ppo_config,
    checkpoint_freq=10,
    checkpoint_at_end=True,
    checkpoint_score_attr='episode_reward_mean',
    num_samples=1,  # for grid search, number of times to run each hyperparameter combo
    #     with_server=True,
    #     server_port=8267,
)
