# Imports

In [9]:
import collections
import math
import gym
import numpy as np
import random
import tensorflow as tf
import tensorflow_probability as tfp
import pyoneer as pynr
import pyoneer.rl as pyrl

# Seed the environment.
random.seed(42)
np.random.seed(42)
tf.random.set_seed(42)

# Problem

In [10]:
def create_env_model(batch_size=None):
    # Create the gym env, wrapped in a vectorized manner.
    env_spec = 'CartPole-v1'

    def make_fn():
        env = gym.make(env_spec)
        env.observation_space.dtype = 'float64'
        return env

    if batch_size is None:
        gym_env = make_fn()
    else:
        gym_env = pyrl.wrappers.Batch(make_fn, batch_size)

    # Wrap it in a Model.
    env_model = pyrl.rollouts.Env(gym_env)
    return env_model

In [11]:
env_model = create_env_model(1)
env_outputs = env_model.reset()
print(env_outputs.next_state)
env_model.close()

tf.Tensor([[-0.01138523 -0.04794491 -0.01941054  0.01141418]], shape=(1, 4), dtype=float64)


In [12]:
AgentPolicyOutput = collections.namedtuple(
    'AgentPolicyOutput', ['action'])
AgentValueOutput = collections.namedtuple(
    'AgentValueOutput', ['value'])
AgentPolicyValueOutput = collections.namedtuple(
    'AgentPolicyValueOutput', ['log_prob', 'entropy', 'value'])


class CartPoleAgent(tf.Module):

    def __init__(self, action_spec):
        super(CartPoleAgent, self).__init__(name='CartPoleAgent')
        self._hidden = tf.keras.layers.Dense(32, activation=tf.nn.relu)
        self._logits = tf.keras.Sequential([
            tf.keras.layers.Dense(8, activation=tf.nn.relu),
            tf.keras.layers.Dense(2)])
        self._value = tf.keras.Sequential([
            tf.keras.layers.Dense(8, activation=tf.nn.relu),
            tf.keras.layers.Dense(1)])
        self._policy = tfp.distributions.Categorical

        self.action_spec = action_spec
        self.output_specs = AgentPolicyOutput(
            action=self.action_spec)
        self.output_shapes = tf.nest.map_structure(
            lambda spec: spec.shape, self.output_specs)
        self.output_dtypes = tf.nest.map_structure(
            lambda spec: spec.dtype, self.output_specs)

    @property
    def value_trainable_variables(self):
        return self._hidden.trainable_variables + self._value.trainable_variables

    @property
    def policy_trainable_variables(self):
        return self._hidden.trainable_variables + self._logits.trainable_variables

    @tf.function
    def _scale_state(self, state):
        state = tf.cast(state, tf.float32)
        state = (state / [[2.4, 10., 1., 10.]])
        state = tf.concat(
            [state, tf.stack([tf.math.cos(state[..., 2] / math.pi),
                              tf.math.sin(state[..., 2] / math.pi)],
                             axis=-1)],
            axis=-1)
        return tf.clip_by_value(state, -1., 1.)

    @tf.function
    def initialize(self, env_outputs, agent_outputs):
        state = self._scale_state(env_outputs.state)
        hidden = self._hidden(state)
        _ = self._value(hidden)
        _ = self._logits(hidden)
    
    @tf.function
    def value(self, env_outputs, agent_outputs):
        state = self._scale_state(env_outputs.state)
        hidden = self._hidden(state)
        value = tf.squeeze(self._value(hidden), axis=-1)
        return AgentValueOutput(value=value)

    @tf.function
    def policy_value(self, env_outputs, agent_outputs):
        state = self._scale_state(env_outputs.state)
        hidden = self._hidden(state)
        logits = self._logits(hidden)
        policy = self._policy(logits=logits)
        entropy = policy.entropy()
        log_prob = policy.log_prob(agent_outputs.action)
        value = tf.squeeze(self._value(hidden), axis=-1)
        return AgentPolicyValueOutput(log_prob=log_prob,
                                      entropy=entropy,
                                      value=value)

    @tf.function
    def reset(self, env_outputs, explore=True):
        initial_action = pynr.debugging.mock_spec(
            tf.TensorShape([env_outputs.state.shape[0]]), 
            self.action_spec, 
            tf.zeros)
        return AgentPolicyOutput(
            action=initial_action)

    @tf.function
    def step(self, env_outputs, agent_outputs, time_step, explore=True):
        state = env_outputs.next_state
        state = self._scale_state(state)
        hidden = self._hidden(state)
        logits = self._logits(hidden)
        policy = self._policy(logits=logits)

        if explore:
            action = policy.sample()
        else:
            action = policy.mode()

        action = tf.nest.map_structure(
            lambda t, s: tf.cast(t, s.dtype), 
            action, self.action_spec)
        return AgentPolicyOutput(action=action)


# Strategies

In [13]:
class Strategy(object):

    def __init__(self, agent, explore):
        self.agent = agent
        self.explore = explore

    @tf.function
    def reset(self, *args, **kwargs):
        return self.agent.reset(*args, explore=self.explore, **kwargs)

    @tf.function
    def step(self, *args, **kwargs):
        return self.agent.step(*args, explore=self.explore, **kwargs)


# On-Policy AWR

In [None]:
HyperParameters = collections.namedtuple(
    'HyperParameters', 
    ['iterations',
     'value_steps',
     'policy_steps',
     'discounts',
     'lambdas',
     'beta',
     'score_max',
     'value_scale',
     'eval_every',
     'learning_rate'])

explore_size = 128
exploit_size = 100
max_steps = 500

explore_env_model = create_env_model(explore_size)
exploit_env_model = create_env_model(explore_size)

agent_model = CartPoleAgent(explore_env_model.action_spec)

explore_strategy = Strategy(agent_model, True)
exploit_strategy = Strategy(agent_model, False)

explore_rollout = pyrl.rollouts.Rollout(explore_env_model, explore_strategy, max_steps)
exploit_rollout = pyrl.rollouts.Rollout(exploit_env_model, exploit_strategy, max_steps)

hparams = HyperParameters(
    iterations=100,
    value_steps=50,
    policy_steps=10,
    discounts=.99,
    lambdas=.95,
    beta=.05,
    score_max=100.,
    value_scale=.5,
    eval_every=1,
    learning_rate=1e-4,
)
value_optimizer = tf.keras.optimizers.Adam(hparams.learning_rate)
policy_optimizer = tf.keras.optimizers.Adam(hparams.learning_rate)

discounted_returns = tf.function(pyrl.targets.discounted_returns)
generalized_advantage_estimate = tf.function(pyrl.targets.generalized_advantage_estimate)

mock_env_outputs = pynr.debugging.mock_spec(
    tf.TensorShape([1, max_steps]), explore_env_model.output_specs, tf.zeros)
mock_agent_outputs = pynr.debugging.mock_spec(
    tf.TensorShape([1, max_steps]), agent_model.output_specs, tf.zeros)
agent_model.initialize(mock_env_outputs, mock_agent_outputs)

explore_env_model.seed(42)
for iteration in range(hparams.iterations):
    if (iteration % hparams.eval_every) == 0:
        (_, eval_env_outputs) = exploit_rollout().outputs
        eval_returns = tf.reduce_sum(
            tf.cast(eval_env_outputs.reward, tf.float32) * eval_env_outputs.weight, 
            axis=1)
        tf.print(tf.reduce_mean(eval_returns))

    (agent_outputs, env_outputs) = explore_rollout().outputs
    returns = discounted_returns(
        tf.cast(env_outputs.reward, tf.float32) * env_outputs.weight, 
        discounts=hparams.discounts)
    
    for _ in range(hparams.value_steps):
        with tf.GradientTape() as tape:
            agent_value_outputs = agent_model.value(env_outputs, agent_outputs)
            value_loss = hparams.value_scale * tf.reduce_sum(
                (tf.square(agent_value_outputs.value - tf.stop_gradient(returns)) *
                 env_outputs.weight))
            loss = value_loss / (explore_size * max_steps)

        variables = agent_model.value_trainable_variables
        grads = tape.gradient(loss, variables)
        value_optimizer.apply_gradients(zip(grads, variables))

    agent_value_outputs = agent_model.value(env_outputs, agent_outputs)
    advantages = generalized_advantage_estimate(
        tf.cast(env_outputs.reward, tf.float32) * env_outputs.weight, 
        agent_value_outputs.value * env_outputs.weight,
        discounts=hparams.discounts, 
        lambdas=hparams.lambdas, 
        weights=env_outputs.weight)

    for _ in range(hparams.policy_steps):
        with tf.GradientTape() as tape:
            agent_estimates_output = agent_model.policy_value(
                env_outputs, agent_outputs)
            score = tf.minimum(tf.exp(advantages / hparams.beta), hparams.score_max)
            policy_loss = -tf.reduce_sum(
                agent_estimates_output.log_prob * tf.stop_gradient(score) * env_outputs.weight)
            loss = policy_loss / (explore_size * max_steps)

        variables = agent_model.policy_trainable_variables
        grads = tape.gradient(loss, variables)
        policy_optimizer.apply_gradients(zip(grads, variables))

explore_env_model.close()
exploit_env_model.close()

9.3515625
9.4765625
9.421875
9.3515625
9.4140625
9.4453125
9.375
9.3515625
9.421875
9.3046875
9.3203125
9.2578125
9.234375
9.375
9.2421875
9.3203125
9.3125
9.296875
9.3671875
9.3203125
9.375
9.3828125
9.4296875
9.359375
9.515625
9.3984375
9.4140625
9.3203125
9.421875
9.453125
9.4296875
9.3828125
9.3046875
9.40625
9.25
9.234375
9.2734375
9.3984375
9.46875
9.421875
9.34375
9.3125
9.390625
9.296875
9.359375
9.390625
9.375
9.2890625
9.3515625
9.3671875
9.3359375
9.3828125
9.2421875
9.3984375
9.5625
10.0546875
11.625
14.78125
23.9921875
34.2578125
29.515625
36.859375
47.140625
54.6484375
55.8515625
59.703125
65.9140625
68.625
72.8984375
73.2578125
73.8984375
81.5078125
77.1796875
72.3671875
67.96875
76.84375
102.453125
144.242188
115.492188
89.3359375
97.7109375
114.679688
123.414062
103.476562
90.5703125
101.703125
120.664062
109.25
98.9375
95.4375
96.078125
121.117188
102.257812
92.3515625
92.7890625
102.171875
114.796875
104.742188
