In [1]:
import rospy
import gym, ray
import numpy as np
import os
import torch.nn as nn

from ray import tune
from ray.rllib.agents import ppo
from ray.tune.registry import register_env
from neuroracer_gym.tasks.neuroracer_discrete_task import NeuroRacerDiscreteTask

from ray.tune import grid_search
from ray.rllib.models import ModelCatalog
from ray.rllib.models.tf.tf_modelv2 import TFModelV2
from ray.rllib.models.torch.torch_modelv2 import TorchModelV2
from ray.rllib.models.tf.fcnet import FullyConnectedNetwork
from ray.rllib.models.torch.visionnet import VisionNetwork
import ray.rllib.agents.ppo as ppo
from ray.rllib.agents.ppo import PPOTrainer
from ray.rllib.agents.dqn import DQNTrainer
from ray.rllib.models.tf.misc import normc_initializer
from ray.tune.logger import pretty_print
from matplotlib import pyplot as plt
import tensorflow as tf
from neuroracer_discrete import NeuroRacer

# rospy.init_node('neuroracer_qlearn', anonymous=True, log_level=rospy.INFO)


Instructions for updating:
non-resource variables are not supported in the long term
None


In [2]:
# Build the simulator-backed environment through Gym's registry.
# NOTE(review): the 'NeuroRacer-v0' id is presumably registered as a side effect
# of one of the neuroracer imports in the first cell — confirm.
env = gym.make('NeuroRacer-v0')

# Show the observation space so its bounds/shape can be compared with the
# actual observations sampled in the next cell.
print(env.observation_space)

[ERROR] [1610481006.206943, 195.271000]: NOT Initialising Simulation Physics Parameters
[WARN] [1610481006.211594, 0.002000]: Start Init ControllersConnection
[WARN] [1610481006.212852, 0.002000]: END Init ControllersConnection


Box(0.0, 10.0, (300,), float32)


In [3]:
# Sanity check: drive the simulator for a few steps with a fixed action and
# print the value range of the last observation received.
PROBE_ACTION = 2   # discrete action index sent on every probe step
PROBE_STEPS = 10   # number of environment steps to take

obs = env.reset()
for _step in range(PROBE_STEPS):
    # Keep the info dict under its own name instead of rebinding `_`, which is
    # both the throwaway loop variable and IPython's "last output" name.
    obs, reward, done, info = env.step(PROBE_ACTION)
    if done:
        # Gym leaves step() undefined once an episode has ended, so start a
        # fresh episode before continuing to probe.
        obs = env.reset()
print(obs.min())
print(obs.max())

2.1470845
10.0


In [4]:
class CustomModel(TFModelV2):
    """TFModelV2 wrapper that hands all real work to an inner fully connected net."""

    def __init__(self, obs_space, action_space, num_outputs, model_config,
                 name):
        # The wrapper and the wrapped network are built from the same arguments.
        ctor_args = (obs_space, action_space, num_outputs, model_config, name)
        super().__init__(*ctor_args)
        self.model = FullyConnectedNetwork(*ctor_args)

        # Expose the inner network's variables through this wrapper.
        inner_variables = self.model.variables()
        self.register_variables(inner_variables)

    def forward(self, input_dict, state, seq_lens):
        """Delegate the forward pass to the wrapped network and return its result."""
        delegated_result = self.model.forward(input_dict, state, seq_lens)
        return delegated_result

    def value_function(self):
        """Return the wrapped network's value-function output."""
        value_estimate = self.model.value_function()
        return value_estimate

class MyKerasModel(TFModelV2):
    """Keras model with two ReLU hidden layers feeding a policy head and a value head."""

    def __init__(self, obs_space, action_space, num_outputs, model_config,
                 name):
        super().__init__(obs_space, action_space, num_outputs, model_config,
                         name)

        def dense(units, layer_name, activation, init_std):
            # Every layer in this model is a Dense layer with a normc initializer;
            # only width, name, activation and initializer scale differ.
            return tf.keras.layers.Dense(
                units,
                name=layer_name,
                activation=activation,
                kernel_initializer=normc_initializer(init_std))

        self.inputs = tf.keras.layers.Input(
            shape=obs_space.shape, name="observations")

        # Shared trunk: 256 -> 128 units.
        hidden_1 = dense(256, "Dense1", tf.nn.relu, 1.0)(self.inputs)
        hidden_2 = dense(128, "Dense2", tf.nn.relu, 1.0)(hidden_1)

        # Two linear heads on top of the last hidden layer.
        policy_head = dense(num_outputs, "my_out", None, 0.01)(hidden_2)
        value_head = dense(1, "value_out", None, 0.01)(hidden_2)

        self.base_model = tf.keras.Model(self.inputs, [policy_head, value_head])
        self.register_variables(self.base_model.variables)

    def forward(self, input_dict, state, seq_lens):
        """Run the Keras model on input_dict["obs"]; cache the value head output."""
        head_outputs = self.base_model(input_dict["obs"])
        model_out, self._value_out = head_outputs
        return model_out, state

    def value_function(self):
        """Return the value output cached by the last forward() call, flattened to 1-D."""
        flat_shape = [-1]
        return tf.reshape(self._value_out, flat_shape)

    def metrics(self):
        """Return a dict with a single constant entry under the key "foo"."""
        constant_metric = tf.constant(42.0)
        return {"foo": constant_metric}

In [5]:
# Start the local Ray runtime. `ignore_reinit_error=True` keeps this cell
# re-runnable: a bare ray.init() raises if Ray is already running in the kernel.
ray.init(ignore_reinit_error=True)

# Make CustomModel selectable from the trainer config under the key "my_model".
ModelCatalog.register_custom_model("my_model", CustomModel)

# Trainer configuration consumed by PPOTrainer in the next cell.
config = {
    # The environment class is passed directly; no string id is registered
    # with Tune in this notebook.
    "env": NeuroRacerDiscreteTask,
    "env_config": {
        # NOTE(review): looks like a leftover from RLlib's corridor example —
        # confirm NeuroRacerDiscreteTask actually reads this key.
        "corridor_length": 5,
    },
    # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
    "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
    "model": {
        "custom_model": "my_model",
    },
    "num_workers": 1,  # parallelism
    "vf_share_layers": False,
}

# Stopping criteria. Not referenced by the cells visible in this part of the
# notebook, which train with an explicit loop instead.
stop = {
    "training_iteration": 500,
    "timesteps_total": 1000000
}

2021-01-12 20:50:09,949	INFO services.py:1171 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m


In [6]:
# Build the PPO trainer from the config above and grab its policy so the
# underlying network can be inspected.
trainer = PPOTrainer(config=config)
policy = trainer.get_policy()

# policy.model is the registered custom model; its inner network exposes the
# Keras model as `base_model`. Keras' summary() prints the table itself and
# returns None, so it is called directly — wrapping it in print() would only
# append a stray "None" to the output.
policy.model.model.base_model.summary()

2021-01-12 20:50:11,513	INFO trainer.py:591 -- Tip: set framework=tfe or the --eager flag to enable TensorFlow eager execution
2021-01-12 20:50:11,514	INFO trainer.py:616 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m[36m(pid=4029258)[0m Instructions for updating:
[2m[36m(pid=4029258)[0m non-resource variables are not supported in the long term
[2m[36m(pid=4029258)[0m [ERROR] [1610481014.180349, 0.000000]: NOT Initialising Simulation Physics Parameters
[2m[36m(pid=4029258)[0m [WARN] [1610481014.184074, 0.000000]: Start Init ControllersConnection
[2m[36m(pid=4029258)[0m [WARN] [1610481014.185442, 0.000000]: END Init ControllersConnection


[2m[36m(pid=4029258)[0m None
Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
observations (InputLayer)       [(None, 300)]        0                                            
__________________________________________________________________________________________________
fc_1 (Dense)                    (None, 256)          77056       observations[0][0]               
__________________________________________________________________________________________________
fc_value_1 (Dense)              (None, 256)          77056       observations[0][0]               
__________________________________________________________________________________________________
fc_2 (Dense)                    (None, 256)          65792       fc_1[0][0]                       
______________________________________________________________

In [7]:
# Train with an explicit loop, keeping every iteration's result dict and
# periodically checkpointing and reporting.
NUM_ITERATIONS = 250     # total calls to trainer.train()
CHECKPOINT_EVERY = 5     # save a checkpoint every N iterations
REPORT_EVERY = 20        # pretty-print the full result dict every N iterations

results = []
for i in range(NUM_ITERATIONS):
    result = trainer.train()
    results.append(result)

    # Also checkpoint after the final iteration; with the periodic rule alone
    # the last save would happen several iterations before training ends and
    # the final trained weights would never be written out.
    is_last_iteration = i == NUM_ITERATIONS - 1
    if i % CHECKPOINT_EVERY == 0 or is_last_iteration:
        checkpoint = trainer.save()
        print("checkpoint saved at", checkpoint)

    if i % REPORT_EVERY == 0:
        print(pretty_print(result))

[2m[36m(pid=4029258)[0m Instructions for updating:
[2m[36m(pid=4029258)[0m Prefer Variable.assign which has equivalent behavior in 2.X.


Instructions for updating:
Prefer Variable.assign which has equivalent behavior in 2.X.
checkpoint saved at /home/dschori/ray_results/PPO_NeuroRacerDiscreteTask_2021-01-12_20-50-116mp15u2n/checkpoint_1/checkpoint-1
custom_metrics: {}
date: 2021-01-12_20-52-35
done: false
episode_len_mean: 172.08695652173913
episode_reward_max: 30.940000000001277
episode_reward_mean: -194.65217391304347
episode_reward_min: -362.92999999999904
episodes_this_iter: 23
episodes_total: 23
experiment_id: 8ef5b927a9584569864261b159c35931
hostname: workstation
info:
  learner:
    default_policy:
      cur_kl_coeff: 0.20000000298023224
      cur_lr: 4.999999873689376e-05
      entropy: 1.0767686367034912
      entropy_coeff: 0.0
      kl: 0.02213660627603531
      model: {}
      policy_loss: -0.05445832386612892
      total_loss: 7458.43310546875
      vf_explained_var: 0.004315183963626623
      vf_loss: 7458.484375
  num_steps_sampled: 4000
  num_steps_trained: 4000
iterations_since_restore: 1
node_ip: 192.1