<a href="https://colab.research.google.com/github/denklewer/ray-custom-agents/blob/master/trpo-ray.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Environment setup cell: installs system packages, then builds Ray from a
# fresh git checkout.  The commands depend on each other's order.
# System packages (presumably build prerequisites for Ray -- verify).
!sudo apt-get install -y build-essential curl unzip psmisc
# Cython is pinned to exactly 0.29.0.
!pip install cython==0.29.0
# Clone the Ray sources and run the Bazel install script shipped in that checkout.
!git clone https://github.com/ray-project/ray.git
!ray/ci/travis/install-bazel.sh
!pip install lz4
!pip install setproctitle
# Rename the checkout to ray-distr, then install its python/ package in editable (-e) mode.
!mv ray ray-distr
!pip install -e ray-distr/python/. --verbose  # Add --user if you see a permission denied error.

Reading package lists... Done
Building dependency tree       
Reading state information... Done
build-essential is already the newest version (12.4ubuntu1).
unzip is already the newest version (6.0-21ubuntu1).
curl is already the newest version (7.58.0-2ubuntu3.6).
The following package was automatically installed and is no longer required:
  libnvidia-common-410
Use 'sudo apt autoremove' to remove it.
The following NEW packages will be installed:
  psmisc
0 upgraded, 1 newly installed, 0 to remove and 7 not upgraded.
Need to get 52.5 kB of archives.
After this operation, 266 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic-updates/main amd64 psmisc amd64 23.1-1ubuntu0.1 [52.5 kB]
Fetched 52.5 kB in 0s (114 kB/s)
debconf: unable to initialize frontend: Dialog
debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debconf/FrontEnd/Dialog.pm line 76, <> line 1.)
debconf: falling back to

In [0]:
import sys
import os
# Make the locally built Ray checkout importable (to find the local version of
# the library).  The path is absolute, so it is used directly instead of being
# passed through os.path.join with an empty first component, which returned it
# unchanged.  The membership check keeps sys.path free of duplicate entries
# when this cell is executed more than once.
if '/content/ray-distr/python' not in sys.path:
    sys.path.append('/content/ray-distr/python')

In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import argparse
import gym
import logging
import ray
from ray import tune
from ray.rllib.evaluation import PolicyGraph, PolicyEvaluator, SampleBatch
from ray.rllib.evaluation.metrics import collect_metrics
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import scipy
import scipy.signal
from ray.rllib.utils.annotations import override
logger = logging.getLogger(__name__)
from ray.rllib.evaluation.postprocessing import compute_advantages,Postprocessing

from torch.autograd import Variable
# Command-line options for the notebook: a GPU switch plus the number of
# training iterations and of sampling workers.
parser = argparse.ArgumentParser()
_CLI_OPTIONS = (
    ("--gpu", dict(action="store_true")),
    ("--num-iters", dict(type=int, default=20)),
    ("--num-workers", dict(type=int, default=2)),
)
for _flag, _settings in _CLI_OPTIONS:
    parser.add_argument(_flag, **_settings)

class TRPOAgent(nn.Module):
    """Small feed-forward policy network that outputs log-probabilities over discrete actions."""

    def __init__(self, state_shape, n_actions, hidden_size=32):
        '''
        Builds the network: Linear -> ReLU -> Linear -> LogSoftmax.
        The output is LOG-PROBABILITIES because the loss is computed from them.

        :param state_shape: shape of a single observation; only state_shape[0] is used as input width
        :param n_actions: number of discrete actions (width of the output layer)
        :param hidden_size: number of units in the hidden layer
        '''
        nn.Module.__init__(self)

        self.n_actions = n_actions
        self.state_shape = state_shape
        # Alias kept for backward compatibility: the attribute was originally
        # stored under this misspelled name.
        self.state_hape = state_shape

        self.model = nn.Sequential(
            nn.Linear(state_shape[0], hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, n_actions),
            # Normalise over the action axis explicitly.  Without `dim` the
            # axis is guessed from the input rank (and a warning is emitted).
            nn.LogSoftmax(dim=-1)
        )

    def forward(self, states):
        """
        takes agent's observation, returns log-probabilities
        :param states: a batch of states, shape = [batch_size, state_shape]
        :returns: log-probabilities, one row per state and one column per action
        """
        return self.model(states)

    def get_log_probs(self, states):
        '''
        Log-probs for training
        '''
        return self.forward(states)

    def get_probs(self, states):
        '''
        Probs for interaction
        '''
        return torch.exp(self.forward(states))

    def act(self, obs, sample=True):
        '''
        Samples action from policy distribution (sample = True) or takes most likely action (sample = False)
        :param: obs - single observation vector
        :param sample: if True, samples from the policy pi, otherwise takes most likely action
        :returns: action (single integer) and probabilities for all actions
        '''
        # Add a leading batch axis of size 1 so the network sees a batch.
        obs_batch = torch.as_tensor(np.asarray(obs), dtype=torch.float32).unsqueeze(0)
        # Acting never needs gradients, so skip building the autograd graph.
        with torch.no_grad():
            probs = self.get_probs(obs_batch).numpy()

        if sample:
            action = int(np.random.choice(self.n_actions, p=probs[0]))
        else:
            action = int(np.argmax(probs[0]))

        return action, probs[0]



Instructions for updating:
non-resource variables are not supported in the long term


In [0]:
# UTIL FUNCTIONS
def get_cummulative_returns(r, gamma=1):
  """
  Computes cummulative discounted rewards given immediate rewards
        G_i = r_i + gamma*r_{i+1} + gamma^2*r_{i+2} + ...
        Also known as R(s,a)
  """
  r = np.array(r)
  assert r.ndim >= 1
  return scipy.signal.lfilter([1], [1, -gamma], r[::-1], axis=0)[::-1]

def conjugate_gradient(f_Ax, b, cg_iters=10, residual_tol=1e-10):
    """
    This method solves system of equation Ax=b using iterative method called conjugate gradients
    :f_Ax: function that returns Ax
    :b: targets for Ax (a tensor; it is not modified)
    :cg_iters: how many iterations this method should do at most
    :residual_tol: stop early once the squared residual norm drops below this value
    :returns: approximate solution x, with the same shape, dtype and device as b
    """
    p = b.clone()
    r = b.clone()
    # zeros_like keeps the solution on the dtype/device of b.  A default
    # float32 buffer cannot absorb the in-place update below when b is float64.
    x = torch.zeros_like(b)
    rdotr = torch.sum(r * r)
    for _ in range(cg_iters):
        z = f_Ax(p)
        # The 1e-8 terms guard the divisions against a zero denominator.
        v = rdotr / (torch.sum(p * z) + 1e-8)
        x += v * p
        r -= v * z
        newrdotr = torch.sum(r * r)
        mu = newrdotr / (rdotr + 1e-8)
        p = r + mu * p
        rdotr = newrdotr
        if rdotr < residual_tol:
            break
    return x


In [0]:
class CustomPolicy(PolicyGraph):
    """TRPO policy graph for discrete actions, written from scratch on top of PolicyGraph.

    Wraps a TRPOAgent network.  Actions are sampled from the network's output
    distribution, and learn_on_batch performs one trust-region update
    (conjugate gradient followed by a backtracking line search) on a batch.
    You might find it more convenient to extend TF/TorchPolicyGraph instead
    for a real policy.
    """

    def __init__(self, observation_space, action_space, config):
        PolicyGraph.__init__(self, observation_space, action_space, config)
        # example parameter (not read anywhere in this class)
        self.w = 1.0
        self.observation_shape = observation_space.shape
        self.n_actions = action_space.n
        self.agent = TRPOAgent(self.observation_shape, self.n_actions)
        # not read anywhere in this class
        self.policy = []

    def compute_actions(self,
                        obs_batch,
                        state_batches,
                        prev_action_batch=None,
                        prev_reward_batch=None,
                        info_batch=None,
                        episodes=None,
                        **kwargs):
        """
        Samples one action per observation from the agent's current policy.
        :param: obs_batch - iterable of single observations
        :returns: (actions, [], {"action_probs": ...}) where action_probs holds, for every
                  observation, the probabilities of all actions; learn_on_batch later reads
                  them as the old-policy probabilities
        """
        actions = []
        action_probs = []
        for obs in obs_batch:
            action, probs = self.agent.act(obs)
            actions.append(action)
            action_probs.append(probs)

        return actions, [], {"action_probs": action_probs}

    @override(PolicyGraph)
    def postprocess_trajectory(self,
                               sample_batch,
                               other_agent_batches=None,
                               episode=None):
        """
        Copies every column of the batch and adds a "cummulative_returns" column
        computed from the rewards with get_cummulative_returns (default gamma).
        """
        traj = {key: np.stack(sample_batch[key]) for key in sample_batch}
        # The helper returns a reversed view of its result; store a contiguous
        # copy so the column is an ordinary array.
        traj["cummulative_returns"] = np.ascontiguousarray(
            get_cummulative_returns(traj[SampleBatch.REWARDS]))
        return SampleBatch(traj)

    def get_flat_params_from(self, model):
        """Returns all parameters of `model` concatenated into one flat tensor."""
        return torch.cat([param.data.view(-1) for param in model.parameters()])

    def set_flat_params_to(self, model, flat_params):
        """
        Copies `flat_params` into the parameters of `model`, consuming consecutive
        slices in the order model.parameters() yields them.
        """
        prev_ind = 0
        for param in model.parameters():
            flat_size = int(np.prod(list(param.size())))
            param.data.copy_(
                flat_params[prev_ind:prev_ind + flat_size].view(param.size()))
            prev_ind += flat_size

    def get_loss(self, agent, observations, actions, cummulative_returns, old_probs):
        """
        Computes TRPO objective
        :param: observations - batch of observations
        :param: actions - batch of actions
        :param: cummulative_returns - batch of cummulative returns
        :param: old_probs - batch of probabilities computed by old network
        :returns: scalar value of the objective function
        """
        batch_size = observations.shape[0]
        probs_all = torch.exp(agent.get_log_probs(observations))

        # Pick, for every row, the probability of the action that was taken.
        rows = torch.arange(batch_size)
        probs_for_actions = probs_all[rows, actions]
        old_probs_for_actions = old_probs[rows, actions]

        # Compute surrogate loss, aka importance-sampled policy gradient
        return -torch.mean(cummulative_returns * (probs_for_actions / old_probs_for_actions))

    def get_kl(self, agent, observations, actions, cummulative_returns, old_probs_all):
        """
        Computes KL-divergence between network policy and old policy
        :param: observations - batch of observations
        :param: actions - batch of actions (unused, kept for a signature parallel to get_loss)
        :param: cummulative_returns - batch of cummulative returns (we don't need it actually)
        :param: old_probs_all - batch of probabilities computed by old network
        :returns: scalar value of the KL-divergence
        """
        batch_size = observations.shape[0]
        log_probs_all = agent.get_log_probs(observations)

        # KL is summed over all actions, not just the ones the agent took.
        # The 1e-10 keeps the logarithm finite for zero probabilities.
        old_log_probs_all = torch.log(old_probs_all + 1e-10)

        return torch.sum(old_probs_all * (old_log_probs_all - log_probs_all)) / batch_size

    def get_entropy(self, agent, observations):
        """
        Computes entropy of the network policy
        :param: observations - batch of observations (converted to a float tensor here)
        :returns: scalar value of the entropy
        """
        observations = Variable(torch.FloatTensor(observations))

        batch_size = observations.shape[0]
        log_probs_all = agent.get_log_probs(observations)
        probs_all = torch.exp(log_probs_all)

        return torch.sum(-probs_all * log_probs_all) / batch_size

    def linesearch(self, f, x, fullstep, max_kl):
        """
        Linesearch finds the best parameters of neural networks in the direction of fullstep constrained by KL divergence.
        :param: f - function that takes parameters and returns (loss, kl).
        :param: x - old parameters of neural network.
        :param: fullstep - direction in which we make search.
        :param: max_kl - constraint of KL divergence.
        :returns: the last accepted parameters, or x itself if no candidate was accepted.
        """
        max_backtracks = 10
        loss, _, = f(x)
        for stepfrac in .5 ** np.arange(max_backtracks):
            xnew = x + stepfrac * fullstep
            new_loss, kl = f(xnew)
            actual_improve = new_loss - loss
            # Accept only candidates that respect the KL bound and lower the
            # loss.  An accepted candidate becomes the new base point, so the
            # remaining (smaller) fractions are tried on top of it.
            if kl.data.numpy() <= max_kl and actual_improve.data.numpy() < 0:
                x = xnew
                loss = new_loss
        return x

    def learn_on_batch(self, samples):
        """
        Performs one TRPO update on a sample batch.
        :param: samples - batch with 'obs', 'actions', 'cummulative_returns' and 'action_probs'
        :returns: dict with the post-update "loss" and "kl" as plain Python floats
        """
        max_kl = 0.01
        # Updating policy.
        observations = samples['obs']
        actions = samples['actions']
        returns = samples['cummulative_returns']
        old_probs = samples['action_probs']
        loss, kl = self.update_step(observations, actions, returns, old_probs, max_kl)

        # Report numbers rather than tensors so the stats carry no autograd state.
        return {
            "loss": float(loss),
            "kl": float(kl)
        }

    def get_weights(self):
        """Returns the agent's parameters as one flat tensor."""
        return self.get_flat_params_from(self.agent)

    def set_weights(self, weights):
        """Loads a flat parameter tensor (as produced by get_weights) into the agent."""
        self.set_flat_params_to(self.agent, weights)

    def update_step(self, observations, actions, cummulative_returns, old_probs, max_kl):
        """
        This function does the TRPO update step
        :param: observations - batch of observations
        :param: actions - batch of actions
        :param: cummulative_returns - batch of cummulative returns
        :param: old_probs - batch of probabilities computed by old network
        :param: max_kl - controls how big KL divergence may be between old and new policy every step.
        :returns: [loss, kl] evaluated at the new parameters, in that order.
        """
        agent = self.agent

        # Here we prepare the information
        observations = Variable(torch.FloatTensor(observations))
        actions = torch.LongTensor(actions)
        cummulative_returns = Variable(torch.FloatTensor(cummulative_returns))
        old_probs = Variable(torch.FloatTensor(old_probs))

        # Here we compute gradient of the loss function
        loss = self.get_loss(agent, observations, actions,
                             cummulative_returns, old_probs)
        grads = torch.autograd.grad(loss, agent.parameters())
        loss_grad = torch.cat([grad.view(-1) for grad in grads]).data

        def Fvp(v):
            # Computes F v, used to solve F x = g with conjugate gradients.
            # It is obtained by differentiating (grad KL . v) once more, so
            # the matrix F itself is never formed.
            kl = self.get_kl(agent, observations, actions,
                             cummulative_returns, old_probs)

            grads = torch.autograd.grad(kl, agent.parameters(), create_graph=True)
            flat_grad_kl = torch.cat([grad.view(-1) for grad in grads])

            kl_v = (flat_grad_kl * Variable(v)).sum()
            grads = torch.autograd.grad(kl_v, agent.parameters())
            flat_grad_grad_kl = torch.cat(
                [grad.contiguous().view(-1) for grad in grads]).data

            # The added 0.1 * v is a damping term.
            return flat_grad_grad_kl + v * 0.1

        # Here we solve the F x = g system using conjugate gradients
        stepdir = conjugate_gradient(Fvp, -loss_grad, 10)

        # Here we scale the direction to get the initial step of the line search
        shs = 0.5 * (stepdir * Fvp(stepdir)).sum(0, keepdim=True)

        lm = torch.sqrt(shs / max_kl)
        fullstep = stepdir / lm[0]

        # Here we get the start point
        prev_params = self.get_weights()

        def get_loss_kl(params):
            # Helper for the line search: set new params and return [loss, kl].
            # Only the values are needed, so no autograd graph is recorded.
            self.set_weights(params)
            with torch.no_grad():
                return [self.get_loss(agent, observations, actions, cummulative_returns, old_probs),
                        self.get_kl(agent, observations, actions, cummulative_returns, old_probs)]

        # Here we find our new parameters
        new_params = self.linesearch(get_loss_kl, prev_params, fullstep, max_kl)

        # Evaluating once more also leaves the agent set to new_params.
        return get_loss_kl(new_params)


In [0]:
def training_workflow(config, reporter):
    """
    Trains a CustomPolicy on CartPole-v0 with remote sampling workers.

    Each iteration broadcasts the local policy's weights to the workers,
    gathers one sample batch from every worker, updates the local policy on
    the concatenated batch and reports the workers' collected metrics.

    :param config: dict providing "num_workers" and "num_iters"
    :param reporter: callable that receives the collected metrics as keyword arguments
    """
    # Setup policy and policy evaluation actors
    env = gym.make("CartPole-v0")
    policy = CustomPolicy(env.observation_space, env.action_space, {})
    workers = [
        PolicyEvaluator.as_remote().remote(lambda c: gym.make("CartPole-v0"),
                                           CustomPolicy)
        for _ in range(config["num_workers"])
    ]

    for it in range(config["num_iters"]):
        print("\n********** Iteration %i ************" % it)
        # Broadcast weights to the policy evaluation workers
        weights = ray.put({"default_policy": policy.get_weights()})
        for w in workers:
            w.set_weights.remote(weights)

        # Gather a batch of samples
        samples = SampleBatch.concat_samples(ray.get([w.sample.remote() for w in workers]))

        # Improve the policy using the batch
        policy.learn_on_batch(samples)

        # Report current progress
        reporter(**collect_metrics(remote_evaluators=workers))


In [5]:
# Start Ray with its default settings (no arguments are passed).
ray.init()

2019-05-19 22:13:56,161	INFO node.py:498 -- Process STDOUT and STDERR is being redirected to /tmp/ray/session_2019-05-19_22-13-56_160574_13215/logs.
2019-05-19 22:13:56,286	INFO services.py:409 -- Waiting for redis server at 127.0.0.1:60639 to respond...
2019-05-19 22:13:56,432	INFO services.py:409 -- Waiting for redis server at 127.0.0.1:24143 to respond...
2019-05-19 22:13:56,439	INFO services.py:806 -- Starting Redis shard with 2.52 GB max memory.
2019-05-19 22:13:56,500	INFO node.py:512 -- Process STDOUT and STDERR is being redirected to /tmp/ray/session_2019-05-19_22-13-56_160574_13215/logs.
2019-05-19 22:13:56,503	INFO services.py:1442 -- Starting the Plasma object store with 3.78 GB memory using /dev/shm.


{'node_ip_address': '172.28.0.2',
 'object_store_address': '/tmp/ray/session_2019-05-19_22-13-56_160574_13215/sockets/plasma_store',
 'raylet_socket_name': '/tmp/ray/session_2019-05-19_22-13-56_160574_13215/sockets/raylet',
 'redis_address': '172.28.0.2:60639',
 'session_dir': '/tmp/ray/session_2019-05-19_22-13-56_160574_13215',
 'webui_url': None}

In [99]:

# Run training_workflow as a Tune trial.  The config supplies the two keys the
# function reads ("num_workers" and "num_iters").  The trial requests no GPU,
# one CPU, and one "extra_cpu" (presumably for the single remote worker the
# function creates -- verify).
tune.run(
        training_workflow,
        resources_per_trial={
            "gpu": 0,
            "cpu": 1,
            "extra_cpu": 1,
        },
        config={
            "num_workers": 1,
            "num_iters": 3,
        })

2019-05-19 23:52:23,984	INFO tune.py:60 -- Tip: to resume incomplete experiments, pass resume='prompt' or resume=True to run()
2019-05-19 23:52:23,986	INFO tune.py:223 -- Starting a new experiment.


== Status ==
Using FIFO scheduling algorithm.
Resources requested: 0/2 CPUs, 0/1 GPUs
Memory usage on this node: 6.5/13.7 GB

== Status ==
Using FIFO scheduling algorithm.
Resources requested: 2/2 CPUs, 0/1 GPUs
Memory usage on this node: 6.5/13.7 GB
Result logdir: /root/ray_results/training_workflow
Number of trials: 1 ({'RUNNING': 1})
RUNNING trials:
 - training_workflow_0:	RUNNING

[2m[36m(pid=15582)[0m Instructions for updating:
[2m[36m(pid=15582)[0m non-resource variables are not supported in the long term
[2m[36m(pid=15586)[0m Instructions for updating:
[2m[36m(pid=15586)[0m non-resource variables are not supported in the long term
[2m[36m(pid=15582)[0m 
[2m[36m(pid=15582)[0m ********** Iteration 0 ************
[2m[36m(pid=15586)[0m 2019-05-19 23:52:27,812	INFO policy_evaluator.py:732 -- Built policy map: {'default_policy': <__main__.CustomPolicy object at 0x7fb2a17bb9e8>}
[2m[36m(pid=15586)[0m 2019-05-19 23:52:27,812	INFO policy_evaluator.py:733 -- Built 

2019-05-19 23:52:28,141	INFO ray_trial_executor.py:180 -- Destroying actor for trial training_workflow_0. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.


[2m[36m(pid=15582)[0m 
[2m[36m(pid=15582)[0m ********** Iteration 2 ************
== Status ==
Using FIFO scheduling algorithm.
Resources requested: 0/2 CPUs, 0/1 GPUs
Memory usage on this node: 6.9/13.7 GB
Result logdir: /root/ray_results/training_workflow
Number of trials: 1 ({'TERMINATED': 1})
TERMINATED trials:
 - training_workflow_0:	TERMINATED, [2 CPUs, 0 GPUs], [pid=15582], 0 s, 3 iter, 45 rew



[training_workflow_0]

In [0]:

# Make the ray-custom-agents checkout importable (to find the local version of
# the library).  The path is absolute, so it is used directly instead of being
# passed through os.path.join with an empty first component, which returned it
# unchanged.  The membership check keeps sys.path free of duplicate entries
# when this cell is executed more than once.
if '/content/ray-custom-agents' not in sys.path:
    sys.path.append('/content/ray-custom-agents')


In [0]:
from trpo.trainer import DEFAULT_CONFIG,TRPOTrainer

In [0]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
"""Example of using two different training methods at once in multi-agent.
Here we create a number of CartPole agents, some of which are trained with
DQN, and some of which are trained with PPO. We periodically sync weights
between the two trainers (note that no such syncing is needed when using just
a single training method).
For a simpler example, see also: multiagent_cartpole.py
"""

import argparse
import gym

import ray
from ray.rllib.agents.dqn.dqn import DQNTrainer
from ray.rllib.agents.dqn.dqn_policy_graph import DQNPolicyGraph
from ray.rllib.agents.ppo.ppo import PPOTrainer
from ray.rllib.agents.ppo.ppo_policy_graph import PPOTFPolicy
from ray.rllib.tests.test_multi_agent_env import MultiCartpole
from ray.tune.logger import pretty_print
from ray.tune.registry import register_env

Instructions for updating:
non-resource variables are not supported in the long term


In [0]:
# Rebinds `parser` to a new ArgumentParser that only knows --num-iters (default 20).
# NOTE(review): no parse_args() call is visible in the cells shown here, and the
# next cell sets num_iters by hand -- confirm whether this parser is still needed.
parser = argparse.ArgumentParser()
parser.add_argument("--num-iters", type=int, default=20)

_StoreAction(option_strings=['--num-iters'], dest='num_iters', nargs=None, const=None, default=20, type=<class 'int'>, choices=None, help=None, metavar=None)

In [0]:
# Number of training iterations for the loop further down.
num_iters = 20
# An earlier cell of this notebook already calls ray.init(); tolerate an
# existing Ray instance so this cell also works when the notebook is executed
# from top to bottom.
ray.init(ignore_reinit_error=True)

# Simple environment with 4 independent cartpole entities
register_env("multi_cartpole", lambda _: MultiCartpole(4))
# A single CartPole env is created only to read its observation/action spaces.
single_env = gym.make("CartPole-v0")
obs_space = single_env.observation_space
act_space = single_env.action_space

# You can also have multiple policy graphs per trainer, but here we just
# show one each for PPO and DQN.
policy_graphs = {
    "ppo_policy": (PPOTFPolicy, obs_space, act_space, {}),
    "dqn_policy": (DQNPolicyGraph, obs_space, act_space, {}),
}

def policy_mapping_fn(agent_id):
    """Assign even agent ids to "ppo_policy" and all other ids to "dqn_policy"."""
    is_even = agent_id % 2 == 0
    return "ppo_policy" if is_even else "dqn_policy"


2019-05-19 14:11:18,685	INFO node.py:498 -- Process STDOUT and STDERR is being redirected to /tmp/ray/session_2019-05-19_14-11-18_684709_123/logs.
2019-05-19 14:11:18,807	INFO services.py:409 -- Waiting for redis server at 127.0.0.1:61930 to respond...
2019-05-19 14:11:18,954	INFO services.py:409 -- Waiting for redis server at 127.0.0.1:21384 to respond...
2019-05-19 14:11:18,960	INFO services.py:806 -- Starting Redis shard with 2.58 GB max memory.
2019-05-19 14:11:19,015	INFO node.py:512 -- Process STDOUT and STDERR is being redirected to /tmp/ray/session_2019-05-19_14-11-18_684709_123/logs.
2019-05-19 14:11:19,018	INFO services.py:1442 -- Starting the Plasma object store with 3.87 GB memory using /dev/shm.


In [0]:
# PPO trainer on the registered "multi_cartpole" env.  It is given both policy
# graphs, but only "ppo_policy" is listed in policies_to_train.
ppo_trainer = PPOTrainer(
    env="multi_cartpole",
    config={
        "multiagent": {
            "policy_graphs": policy_graphs,
            "policy_mapping_fn": policy_mapping_fn,
            "policies_to_train": ["ppo_policy"],
        },
        # disable filters, otherwise we would need to synchronize those
        # as well to the DQN agent
        "observation_filter": "NoFilter",
})

2019-05-19 14:11:45,290	INFO ppo.py:149 -- In multi-agent mode, policies will be optimized sequentially by the multi-GPU optimizer. Consider setting simple_optimizer=True if this doesn't work for you.
2019-05-19 14:11:45,299	INFO policy_evaluator.py:312 -- Creating policy evaluation worker 0 on CPU (please ignore any CUDA init errors)


Instructions for updating:
Use keras.layers.dense instead.
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use `tf.GraphKeys.GLOBAL_VARIABLES` instead.
Instructions for updating:
Use tf.random.categorical instead.
Instructions for updating:
Use tf.cast instead.


2019-05-19 14:11:47,424	INFO dynamic_tf_policy_graph.py:265 -- Initializing loss function with dummy input:

{ 'action_prob': <tf.Tensor 'ppo_policy/action_prob:0' shape=(?,) dtype=float32>,
  'actions': <tf.Tensor 'ppo_policy/actions:0' shape=(?,) dtype=int64>,
  'advantages': <tf.Tensor 'ppo_policy/advantages:0' shape=(?,) dtype=float32>,
  'behaviour_logits': <tf.Tensor 'ppo_policy/behaviour_logits:0' shape=(?, 2) dtype=float32>,
  'dones': <tf.Tensor 'ppo_policy/dones:0' shape=(?,) dtype=bool>,
  'new_obs': <tf.Tensor 'ppo_policy/new_obs:0' shape=(?, 4) dtype=float32>,
  'obs': <tf.Tensor 'ppo_policy/observation:0' shape=(?, 4) dtype=float32>,
  'prev_actions': <tf.Tensor 'ppo_policy/action:0' shape=(?,) dtype=int64>,
  'prev_rewards': <tf.Tensor 'ppo_policy/prev_reward:0' shape=(?,) dtype=float32>,
  'rewards': <tf.Tensor 'ppo_policy/rewards:0' shape=(?,) dtype=float32>,
  'value_targets': <tf.Tensor 'ppo_policy/value_targets:0' shape=(?,) dtype=float32>,
  'vf_preds': <tf.Tensor 

Instructions for updating:
Deprecated in favor of operator or tf.math.divide.


2019-05-19 14:11:48,909	INFO policy_evaluator.py:732 -- Built policy map: {'dqn_policy': <ray.rllib.agents.dqn.dqn_policy_graph.DQNPolicyGraph object at 0x7f964a2bd898>, 'ppo_policy': <ray.rllib.evaluation.tf_policy_template.PPOTFPolicy object at 0x7f963b33cd30>}
2019-05-19 14:11:48,911	INFO policy_evaluator.py:733 -- Built preprocessor map: {'dqn_policy': <ray.rllib.models.preprocessors.NoPreprocessor object at 0x7f964a2bd4a8>, 'ppo_policy': <ray.rllib.models.preprocessors.NoPreprocessor object at 0x7f964a2bd5c0>}
2019-05-19 14:11:48,918	INFO policy_evaluator.py:344 -- Built filter map: {'dqn_policy': <ray.rllib.utils.filter.NoFilter object at 0x7f963b356b70>, 'ppo_policy': <ray.rllib.utils.filter.NoFilter object at 0x7f963b294550>}
2019-05-19 14:11:49,167	INFO multi_gpu_optimizer.py:80 -- LocalMultiGPUOptimizer devices ['/cpu:0']


[2m[36m(pid=11718)[0m Instructions for updating:
[2m[36m(pid=11718)[0m non-resource variables are not supported in the long term
[2m[36m(pid=11719)[0m Instructions for updating:
[2m[36m(pid=11719)[0m non-resource variables are not supported in the long term
[2m[36m(pid=11718)[0m 2019-05-19 14:11:54,565	INFO policy_evaluator.py:312 -- Creating policy evaluation worker 1 on CPU (please ignore any CUDA init errors)
[2m[36m(pid=11718)[0m 2019-05-19 14:11:54.615841: I tensorflow/core/platform/profile_utils/cpu_utils.cc:94] CPU Frequency: 2300000000 Hz
[2m[36m(pid=11718)[0m 2019-05-19 14:11:54.616146: I tensorflow/compiler/xla/service/service.cc:150] XLA service 0x1824840 executing computations on platform Host. Devices:
[2m[36m(pid=11718)[0m 2019-05-19 14:11:54.616185: I tensorflow/compiler/xla/service/service.cc:158]   StreamExecutor device (0): <undefined>, <undefined>
[2m[36m(pid=11719)[0m 2019-05-19 14:11:54,657	INFO policy_evaluator.py:312 -- Creating policy 

In [0]:
# DQN trainer on the same env and policy graphs as the PPO trainer; here only
# "dqn_policy" is listed in policies_to_train.  gamma and n_step are set
# explicitly for this trainer.
dqn_trainer = DQNTrainer(
    env="multi_cartpole",
    config={
        "multiagent": {
            "policy_graphs": policy_graphs,
            "policy_mapping_fn": policy_mapping_fn,
            "policies_to_train": ["dqn_policy"],
        },
        "gamma": 0.95,
        "n_step": 3,
})

2019-05-19 14:12:20,039	INFO policy_evaluator.py:312 -- Creating policy evaluation worker 0 on CPU (please ignore any CUDA init errors)
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
2019-05-19 14:12:23,485	INFO policy_evaluator.py:732 -- Built policy map: {'dqn_policy': <ray.rllib.agents.dqn.dqn_policy_graph.DQNPolicyGraph object at 0x7f9552336e80>, 'ppo_policy': <ray.rllib.evaluation.tf_policy_template.PPOTFPolicy object at 0x7f95502e73c8>}
2019-05-19 14:12:23,487	INFO policy_evaluator.py:733 -- Built preprocessor map: {'dqn_policy': <ray.rllib.models.preprocessors.NoPreprocessor object at 0x7f9552336a90>, 'ppo_policy': <ray.rllib.models.preprocessors.NoPreprocessor object at 0x7f9552336ba8>}
2019-05-19 14:12:23,488	INFO policy_evaluator.py:344 -- Built filter map: {'dqn_policy': <ray.rllib.utils.filter.NoFilter object at 0x7f95502c8fd0>, 'ppo_policy': <ray.rllib.utils.filter.NoFilter object at 0x7f95502e7cf8>}


In [0]:

# disable DQN exploration when used by the PPO trainer
# For every evaluator of the PPO trainer's optimizer, call set_epsilon(0.0) on
# the policy registered under the id "dqn_policy".
ppo_trainer.optimizer.foreach_evaluator(
    lambda ev: ev.for_policy(
        lambda pi: pi.set_epsilon(0.0), policy_id="dqn_policy"))


[None, None, None]

In [0]:
# Alternate one DQN and one PPO training step per iteration, printing each
# trainer's result, then exchange the freshly trained policies.
# You should see both the printed X and Y approach 200 as this trains:
# info:
#   policy_reward_mean:
#     dqn_policy: X
#     ppo_policy: Y
for i in range(num_iters):
    print("== Iteration", i, "==")

    # improve the DQN policy
    print("-- DQN --")
    print(pretty_print(dqn_trainer.train()))

    # improve the PPO policy
    print("-- PPO --")
    print(pretty_print(ppo_trainer.train()))

    # swap weights to synchronize
    # Each get_weights call is restricted to the one policy that trainer
    # trains, so each trainer receives only the other trainer's policy.
    dqn_trainer.set_weights(ppo_trainer.get_weights(["ppo_policy"]))
    ppo_trainer.set_weights(dqn_trainer.get_weights(["dqn_policy"]))

2019-05-19 14:13:47,632	INFO policy_evaluator.py:438 -- Generating sample batch of size 4
2019-05-19 14:13:47,635	INFO sampler.py:308 -- Raw obs from env: { 0: { 0: np.ndarray((4,), dtype=float64, min=-0.047, max=0.028, mean=-0.015),
       1: np.ndarray((4,), dtype=float64, min=-0.049, max=0.044, mean=0.003),
       2: np.ndarray((4,), dtype=float64, min=-0.009, max=0.037, mean=0.022),
       3: np.ndarray((4,), dtype=float64, min=-0.033, max=0.044, mean=0.005)}}
2019-05-19 14:13:47,641	INFO sampler.py:309 -- Info return from env: {0: {0: {}, 1: {}, 2: {}, 3: {}}}
2019-05-19 14:13:47,644	INFO sampler.py:407 -- Preprocessed obs: np.ndarray((4,), dtype=float64, min=-0.047, max=0.028, mean=-0.015)
2019-05-19 14:13:47,646	INFO sampler.py:411 -- Filtered obs: np.ndarray((4,), dtype=float64, min=-0.047, max=0.028, mean=-0.015)
2019-05-19 14:13:47,650	INFO sampler.py:525 -- Inputs to compute_actions():

{ 'dqn_policy': [ { 'data': { 'agent_id': 1,
                              'env_id': 0,
 

== Iteration 0 ==
-- DQN --


2019-05-19 14:13:47,886	INFO sampler.py:552 -- Outputs of compute_actions():

{ 'dqn_policy': ( np.ndarray((2,), dtype=int64, min=1.0, max=1.0, mean=1.0),
                  [],
                  { 'q_values': np.ndarray((2, 2), dtype=float32, min=0.026, max=0.042, mean=0.032)}),
  'ppo_policy': ( np.ndarray((2,), dtype=int64, min=0.0, max=1.0, mean=0.5),
                  [],
                  { 'action_prob': np.ndarray((2,), dtype=float32, min=0.5, max=0.5, mean=0.5),
                    'behaviour_logits': np.ndarray((2, 2), dtype=float32, min=-0.0, max=0.0, mean=-0.0),
                    'vf_preds': np.ndarray((2,), dtype=float32, min=-0.0, max=0.0, mean=-0.0)})}

2019-05-19 14:13:47,911	INFO sample_batch_builder.py:161 -- Trajectory fragment after postprocess_trajectory():

{ 0: { 'data': { 'action_prob': np.ndarray((4,), dtype=float32, min=0.5, max=0.5, mean=0.5),
                 'actions': np.ndarray((4,), dtype=int64, min=0.0, max=1.0, mean=0.75),
                 'advantages

custom_metrics: {}
date: 2019-05-19_14-13-50
done: false
episode_len_mean: 34.67857142857143
episode_reward_max: 147.0
episode_reward_mean: 86.82142857142857
episode_reward_min: 50.0
episodes_this_iter: 28
episodes_total: 28
experiment_id: 15a9bb44309949c293858c3ef351adf4
hostname: 44573652ebd5
info:
  grad_time_ms: .nan
  learner: {}
  max_exploration: 1.0
  min_exploration: 1.0
  num_steps_sampled: 1000
  num_steps_trained: 0
  num_target_updates: 1
  opt_peak_throughput: 0.0
  opt_samples: .nan
  replay_time_ms: .nan
  sample_time_ms: 9.656
  update_time_ms: 0.002
iterations_since_restore: 1
node_ip: 172.28.0.2
num_healthy_workers: 0
num_metric_batches_dropped: 0
off_policy_estimator: {}
pid: 123
policy_reward_mean:
  dqn_policy: 21.875
  ppo_policy: 21.535714285714285
sampler_perf:
  mean_env_wait_ms: 0.10670862950526032
  mean_inference_ms: 1.8492776316243575
  mean_processing_ms: 0.7551121306824278
time_since_restore: 3.345728635787964
time_this_iter_s: 3.345728635787964
time_tot

2019-05-19 14:13:59,544	INFO multi_gpu_impl.py:146 -- Training on concatenated sample batches:

{ 'inputs': [ np.ndarray((4674,), dtype=int64, min=0.0, max=1.0, mean=0.493),
              np.ndarray((4674,), dtype=float32, min=0.0, max=1.0, mean=0.955),
              np.ndarray((4674, 4), dtype=float32, min=-2.844, max=2.871, mean=0.002),
              np.ndarray((4674,), dtype=int64, min=0.0, max=1.0, mean=0.514),
              np.ndarray((4674,), dtype=float32, min=-1.283, max=4.432, mean=0.0),
              np.ndarray((4674, 2), dtype=float32, min=-0.01, max=0.01, mean=0.0),
              np.ndarray((4674,), dtype=float32, min=0.999, max=53.412, mean=12.765),
              np.ndarray((4674,), dtype=float32, min=-0.005, max=0.005, mean=-0.0)],
  'placeholders': [ <tf.Tensor 'ppo_policy/action:0' shape=(?,) dtype=int64>,
                    <tf.Tensor 'ppo_policy/prev_reward:0' shape=(?,) dtype=float32>,
                    <tf.Tensor 'ppo_policy/observation:0' shape=(?, 4) dtype=floa

custom_metrics: {}
date: 2019-05-19_14-14-08
done: false
episode_len_mean: 38.71568627450981
episode_reward_max: 208.0
episode_reward_mean: 85.72549019607843
episode_reward_min: 40.0
episodes_this_iter: 102
episodes_total: 102
experiment_id: a4bfe1eb1cbb431bbf10d96dc753816f
hostname: 44573652ebd5
info:
  grad_time_ms: 9091.688
  learner:
    ppo_policy:
      cur_kl_coeff: 0.19999998807907104
      cur_lr: 4.999999873689376e-05
      entropy: 0.6632583141326904
      kl: 0.030282454565167427
      policy_loss: -0.04262179508805275
      total_loss: 90.29634857177734
      vf_explained_var: 0.14817292988300323
      vf_loss: 90.33291625976562
  load_time_ms: 101.647
  num_steps_sampled: 4000
  num_steps_trained: 4608
  sample_time_ms: 5291.572
  update_time_ms: 3096.36
iterations_since_restore: 1
node_ip: 172.28.0.2
num_healthy_workers: 2
num_metric_batches_dropped: 0
off_policy_estimator: {}
pid: 123
policy_reward_mean:
  dqn_policy: 20.230392156862745
  ppo_policy: 22.63235294117647
s

2019-05-19 14:14:09,971	INFO policy_evaluator.py:565 -- Training on concatenated sample batches:

{ 'count': 32,
  'policy_batches': { 'dqn_policy': { 'data': { 'actions': np.ndarray((32,), dtype=int64, min=0.0, max=1.0, mean=0.469),
                                                'batch_indexes': np.ndarray((32,), dtype=int64, min=17.0, max=1199.0, mean=630.531),
                                                'dones': np.ndarray((32,), dtype=bool, min=0.0, max=1.0, mean=0.125),
                                                'new_obs': np.ndarray((32, 4), dtype=float32, min=-2.331, max=1.388, mean=-0.049),
                                                'obs': np.ndarray((32, 4), dtype=float32, min=-1.982, max=1.249, mean=-0.039),
                                                'rewards': np.ndarray((32,), dtype=float32, min=1.0, max=2.852, mean=2.164),
                                                'weights': np.ndarray((32,), dtype=float64, min=1.0, max=1.0, mean=1.0)},
          

== Iteration 1 ==
-- DQN --


2019-05-19 14:14:10,233	INFO policy_evaluator.py:587 -- Training output:

{ 'dqn_policy': { 'learner_stats': { 'cur_lr': 0.0005000000237487257,
                                     'max_q': 1.1783957,
                                     'mean_q': 0.13968769,
                                     'mean_td_error': -2.2150373,
                                     'min_q': -0.3040439,
                                     'model': {}},
                  'td_error': np.ndarray((32,), dtype=float32, min=-3.565, max=-0.607, mean=-2.215)}}



custom_metrics: {}
date: 2019-05-19_14-14-17
done: false
episode_len_mean: 43.108695652173914
episode_reward_max: 196.0
episode_reward_mean: 98.91304347826087
episode_reward_min: 50.0
episodes_this_iter: 18
episodes_total: 46
experiment_id: 15a9bb44309949c293858c3ef351adf4
hostname: 44573652ebd5
info:
  grad_time_ms: 10.871
  learner:
    dqn_policy:
      cur_lr: 0.0005000000237487257
      max_q: 5.458551406860352
      mean_q: 4.397507190704346
      mean_td_error: 0.17537327110767365
      min_q: 2.0352535247802734
      model: {}
  max_exploration: 0.902
  min_exploration: 0.902
  num_steps_sampled: 2000
  num_steps_trained: 8000
  num_target_updates: 3
  opt_peak_throughput: 2943.694
  opt_samples: 32.0
  replay_time_ms: 7.406
  sample_time_ms: 11.561
  update_time_ms: 0.002
iterations_since_restore: 2
node_ip: 172.28.0.2
num_healthy_workers: 0
num_metric_batches_dropped: 0
off_policy_estimator: {}
pid: 123
policy_reward_mean:
  dqn_policy: 20.554347826086957
  ppo_policy: 28.902

In [0]:
# Build the trainer configuration on top of DEFAULT_CONFIG.
# NOTE: dict.copy() is shallow, so nested dicts (e.g. config['model']) are
# still the very same objects that DEFAULT_CONFIG holds.
config = DEFAULT_CONFIG.copy()
config['num_workers'] = 2
config['num_sgd_iter'] = 30
# NOTE(review): hard-coded GPU count; confirm the runtime really exposes 3 GPUs.
config['num_gpus'] = 3
config['sgd_minibatch_size'] = 128
# Rebind 'model' to a fresh dict instead of assigning into the shared nested
# dict, so DEFAULT_CONFIG['model'] is not modified as a side effect of running
# this cell.
config['model'] = dict(config['model'], fcnet_hiddens=[100, 100])
config['num_cpus_per_worker'] = 0  # This avoids running out of resources in the notebook environment when this cell is re-executed

# Create the trainer for the CartPole-v0 environment with the config above.
agent = TRPOTrainer(config, 'CartPole-v0')

FYI: By default, the value function will not share layers with the policy model ('vf_share_layers': False).
2019-05-18 19:21:34,429	INFO policy_evaluator.py:312 -- Creating policy evaluation worker 0 on CPU (please ignore any CUDA init errors)


Instructions for updating:
Use keras.layers.dense instead.
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.random.categorical instead.
Instructions for updating:
Use tf.cast instead.


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Instructions for updating:
Deprecated in favor of operator or tf.math.divide.


2019-05-18 19:21:35,945	INFO policy_evaluator.py:732 -- Built policy map: {'default_policy': <trpo.trpo_policy_graph.TRPOPolicyGraph object at 0x7f6e7f6f74e0>}
2019-05-18 19:21:35,947	INFO policy_evaluator.py:733 -- Built preprocessor map: {'default_policy': <ray.rllib.models.preprocessors.NoPreprocessor object at 0x7f6e7f6f7198>}
2019-05-18 19:21:35,949	INFO policy_evaluator.py:344 -- Built filter map: {'default_policy': <ray.rllib.utils.filter.NoFilter object at 0x7f6e7f61a048>}
2019-05-18 19:21:36,006	INFO multi_gpu_optimizer.py:80 -- LocalMultiGPUOptimizer devices ['/gpu:0', '/gpu:1', '/gpu:2']


[2m[36m(pid=11726)[0m Instructions for updating:
[2m[36m(pid=11726)[0m non-resource variables are not supported in the long term
[2m[36m(pid=11727)[0m Instructions for updating:
[2m[36m(pid=11727)[0m non-resource variables are not supported in the long term
[2m[36m(pid=11726)[0m 2019-05-18 19:21:41,039	INFO policy_evaluator.py:312 -- Creating policy evaluation worker 1 on CPU (please ignore any CUDA init errors)
[2m[36m(pid=11726)[0m 2019-05-18 19:21:41.065532: I tensorflow/core/platform/profile_utils/cpu_utils.cc:94] CPU Frequency: 2200000000 Hz
[2m[36m(pid=11726)[0m 2019-05-18 19:21:41.065807: I tensorflow/compiler/xla/service/service.cc:150] XLA service 0x18ae840 executing computations on platform Host. Devices:
[2m[36m(pid=11726)[0m 2019-05-18 19:21:41.065845: I tensorflow/compiler/xla/service/service.cc:158]   StreamExecutor device (0): <undefined>, <undefined>
[2m[36m(pid=11726)[0m Instructions for updating:
[2m[36m(pid=11726)[0m Use keras.layers.den

In [0]:
# Run a fixed number of training iterations on the trainer created above and
# print each iteration's result in pretty_print's human-readable form.
NUM_TRAIN_ITERATIONS = 2
for _iteration in range(NUM_TRAIN_ITERATIONS):
    # Keep the latest result bound to `result` so it stays inspectable
    # after the cell finishes.
    result = agent.train()
    report = pretty_print(result)
    print(report)

[2m[36m(pid=11726)[0m 2019-05-18 19:22:07,265	INFO policy_evaluator.py:438 -- Generating sample batch of size 200
[2m[36m(pid=11726)[0m 2019-05-18 19:22:07,266	INFO sampler.py:308 -- Raw obs from env: { 0: { 'agent0': np.ndarray((4,), dtype=float64, min=-0.048, max=0.037, mean=-0.014)}}
[2m[36m(pid=11726)[0m 2019-05-18 19:22:07,266	INFO sampler.py:309 -- Info return from env: {0: {'agent0': None}}
[2m[36m(pid=11726)[0m 2019-05-18 19:22:07,266	INFO sampler.py:407 -- Preprocessed obs: np.ndarray((4,), dtype=float64, min=-0.048, max=0.037, mean=-0.014)
[2m[36m(pid=11726)[0m 2019-05-18 19:22:07,266	INFO sampler.py:411 -- Filtered obs: np.ndarray((4,), dtype=float64, min=-0.048, max=0.037, mean=-0.014)
[2m[36m(pid=11726)[0m 2019-05-18 19:22:07,267	INFO sampler.py:525 -- Inputs to compute_actions():
[2m[36m(pid=11726)[0m 
[2m[36m(pid=11726)[0m { 'default_policy': [ { 'data': { 'agent_id': 'agent0',
[2m[36m(pid=11726)[0m                                   'env_id': 0

2019-05-18 19:22:10,333	INFO multi_gpu_impl.py:146 -- Training on concatenated sample batches:

{ 'inputs': [ np.ndarray((4000, 4), dtype=float32, min=-2.671, max=2.54, mean=0.01),
              np.ndarray((4000,), dtype=float32, min=0.996, max=51.984, mean=12.926),
              np.ndarray((4000,), dtype=float32, min=-1.266, max=4.144, mean=-0.0),
              np.ndarray((4000,), dtype=int64, min=0.0, max=1.0, mean=0.5),
              np.ndarray((4000, 2), dtype=float32, min=-0.008, max=0.007, mean=-0.0),
              np.ndarray((4000,), dtype=float32, min=-0.005, max=0.005, mean=-0.0),
              np.ndarray((4000,), dtype=int64, min=0.0, max=1.0, mean=0.477),
              np.ndarray((4000,), dtype=float32, min=0.0, max=1.0, mean=0.956)],
  'placeholders': [ <tf.Tensor 'default_policy/obs:0' shape=(?, 4) dtype=float32>,
                    <tf.Tensor 'default_policy/value_targets:0' shape=(?,) dtype=float32>,
                    <tf.Tensor 'default_policy/advantages:0' shape=(?,

custom_metrics: {}
date: 2019-05-18_19-22-15
done: false
episode_len_mean: 22.977011494252874
episode_reward_max: 78.0
episode_reward_mean: 22.977011494252874
episode_reward_min: 9.0
episodes_this_iter: 174
episodes_total: 174
experiment_id: 9ae62e6c9087415bb982b06d080c2260
hostname: 202c4c6687c7
info:
  grad_time_ms: 4804.961
  learner:
    default_policy:
      cur_kl_coeff: 0.19999995827674866
      cur_lr: 4.999999873689376e-05
      entropy: 0.6644614934921265
      kl: 0.029476119205355644
      policy_loss: -0.04188487306237221
      total_loss: 176.8184814453125
      vf_explained_var: 0.02176438644528389
      vf_loss: 176.8544921875
  load_time_ms: 138.061
  num_steps_sampled: 4000
  num_steps_trained: 3906
  sample_time_ms: 3111.643
  update_time_ms: 1341.804
iterations_since_restore: 1
node_ip: 172.28.0.2
num_healthy_workers: 2
num_metric_batches_dropped: 0
off_policy_estimator: {}
pid: 125
policy_reward_mean: {}
sampler_perf:
  mean_env_wait_ms: 0.0887114286492576
  mean_i