In [1]:
import os
import tf_agents.policies
import tf_agents.specs
from tf_agents.environments import suite_gym, parallel_py_environment
from tf_agents.environments import tf_py_environment
from tf_agents.metrics import tf_metrics
from tf_agents.policies import policy_saver
from tf_agents.policies.tf_py_policy import TFPyPolicy
from tf_agents.replay_buffers import tf_uniform_replay_buffer, episodic_replay_buffer
from tf_agents.drivers import dynamic_episode_driver, dynamic_step_driver
import tensorflow as tf
from tf_agents.trajectories.policy_step import PolicyStep
# Make no GPU visible to TensorFlow (empty device list for the 'GPU' type).
# NOTE(review): this sits in the middle of the imports; presumably it has to
# run before the imports below touch TensorFlow devices -- confirm before
# reordering this cell.
tf.config.set_visible_devices([], 'GPU')  #  allows testing during training
from tf_agents.trajectories import time_step as ts, policy_step, trajectory
from reinforcement_learning import labeling_functions
# Labeling function looked up under the LunarLanderContinuous-v2 key; the
# cells below pass it to the dataset mapping / generator / environment wrapper.
labeling_function = labeling_functions['LunarLanderContinuous-v2']
from util.io.dataset_generator import map_rl_trajectory_to_vae_input
from util.io.dataset_generator import ErgodicMDPTransitionGenerator
import tensorflow_probability as tfp
# Short alias for the TensorFlow Probability distributions module.
tfd = tfp.distributions

In [2]:
# Load the Gym environment through TF-Agents and reset it once.
py_env = suite_gym.load(environment_name='LunarLanderContinuous-v2')
py_env.reset()

# Wrap the Python environment in a TFPyEnvironment.
tf_env = tf_py_environment.TFPyEnvironment(environment=py_env)

# Last expression of the cell: display the time-step spec of the wrapped env.
tf_env.time_step_spec()

TimeStep(step_type=TensorSpec(shape=(), dtype=tf.int32, name='step_type'), reward=TensorSpec(shape=(), dtype=tf.float32, name='reward'), discount=BoundedTensorSpec(shape=(), dtype=tf.float32, name='discount', minimum=array(0., dtype=float32), maximum=array(1., dtype=float32)), observation=BoundedTensorSpec(shape=(8,), dtype=tf.float32, name='observation', minimum=array(-3.4028235e+38, dtype=float32), maximum=array(3.4028235e+38, dtype=float32)))

In [3]:
# Wraps py_env in a fresh TFPyEnvironment and rebinds tf_env to it. This
# repeats the wrapping already done when py_env was loaded in the previous cell.
tf_env = tf_py_environment.TFPyEnvironment(py_env)

In [None]:
# Value passed as `max_length` to the replay buffer below.
replay_buffer_capacity = 1280

# specs: a policy step that only carries the environment's action (empty
# policy state and info), combined with the environment's time-step spec for
# both the current and the next step to obtain the trajectory spec.
policy_step_spec = policy_step.PolicyStep(action=tf_env.action_spec(), state=(), info=())
trajectory_spec = trajectory.from_transition(
    tf_env.time_step_spec(), policy_step_spec, tf_env.time_step_spec())

replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    data_spec=trajectory_spec,
    batch_size=tf_env.batch_size,
    max_length=replay_buffer_capacity,
)


def dataset_generator():
    """Build a dataset of VAE inputs from 2-step samples of the replay buffer.

    Each element produced by ``replay_buffer.as_dataset(num_steps=2)`` is a
    pair; the first component is passed, together with the notebook-level
    ``labeling_function``, to ``map_rl_trajectory_to_vae_input`` and the second
    component is ignored.

    Returns:
        The mapped dataset.
    """

    def to_vae_input(sampled_trajectory, _):
        return map_rl_trajectory_to_vae_input(sampled_trajectory, labeling_function)

    two_step_samples = replay_buffer.as_dataset(
        num_parallel_calls=tf.data.experimental.AUTOTUNE,
        num_steps=2,
    )
    return two_step_samples.map(
        map_func=to_vae_input,
        num_parallel_calls=tf.data.experimental.AUTOTUNE,
        #  deterministic=False  # TF version >= 2.2.0
    )

In [None]:
class SavedTFPolicy(tf_agents.policies.tf_policy.Base):
    """TF-Agents policy that delegates to a policy loaded from a SavedModel.

    The time-step, action, info and policy-state specs are read from the
    collect-policy spec file (``policy_saver.COLLECT_POLICY_SPEC``) located in
    the same directory as the saved policy.
    """

    def __init__(self, saved_policy_path):
        """Load the saved policy and its specs.

        Args:
            saved_policy_path: directory containing the exported policy and
                its collect-policy spec file.
        """
        # Load from the path given by the caller. The previous version read
        # the notebook-level `sac_policy_dir` here, which silently ignored
        # this argument and failed with NameError unless that global had
        # already been defined by another cell.
        self.saved_policy = tf.compat.v2.saved_model.load(saved_policy_path)
        spec_path = os.path.join(saved_policy_path, policy_saver.COLLECT_POLICY_SPEC)
        policy_specs = tf_agents.specs.tensor_spec.from_pbtxt_file(spec_path)
        collect_data_spec = policy_specs['collect_data_spec']
        super().__init__(
            time_step_spec=tf_agents.trajectories.time_step.time_step_spec(
                collect_data_spec.observation),
            action_spec=collect_data_spec.action,
            info_spec=collect_data_spec.policy_info,
            policy_state_spec=policy_specs['policy_state_spec'])

    def _distribution(self, time_step, policy_state):
        """Return a PolicyStep whose action is a Deterministic distribution.

        The saved policy's ``action`` method is called and the action it
        returns is wrapped in ``tfd.Deterministic``; the state and info of the
        returned step are forwarded unchanged.
        """
        step = self.saved_policy.action(time_step, policy_state)
        return PolicyStep(tfd.Deterministic(step.action), step.state, step.info)

    def _get_initial_state(self, batch_size):
        """Delegate to the saved policy's ``get_initial_state``."""
        return self.saved_policy.get_initial_state(batch_size)

# Read the collect-policy specs stored in the saved policy directory.
spec_path = os.path.join(
    '../saves/LunarLanderContinuous-v2/policy',
    policy_saver.COLLECT_POLICY_SPEC,
)
policy_specs = tf_agents.specs.tensor_spec.from_pbtxt_file(spec_path)

In [None]:
def display_labeling(trajectory):
    """Print messages describing the labels of a trajectory's observation.

    The LunarLanderContinuous-v2 labeling function is applied to
    ``trajectory.observation``; each message below is printed when the
    corresponding combination of label entries (indexed on the last axis)
    holds for at least one / for no element of the label tensor.

    Args:
        trajectory: object exposing an ``observation`` attribute accepted by
            the labeling function.
    """
    label = labeling_functions['LunarLanderContinuous-v2'](trajectory.observation)

    def any_set(index):
        # True when any element of the label slice at `index` is set.
        return bool(tf.reduce_any(label[..., index]))

    if any_set(2) and any_set(6):
        print('close to the lunar pad with high speed')
    if not any_set(7):
        print('unsafe lander angle')
    if any_set(2) and not any_set(8):
        print('close to the lunar pad with unsafe lander angle')
    if any_set(1):
        print('lander too close to the edge of the frame')

# Directory of the exported SAC policy, loaded through SavedTFPolicy.
sac_policy_dir = '../saves/LunarLanderContinuous-v2/policy'
saved_policy = SavedTFPolicy(sac_policy_dir)

# Epsilon-greedy wrapper around the saved policy (epsilon = 0.9).
policy = tf_agents.policies.epsilon_greedy_policy.EpsilonGreedyPolicy(
    policy=saved_policy,
    epsilon=0.9,
)

reward_metric = tf_metrics.AverageReturnMetric()

# Run 10 episodes; every observer below is called by the driver: labels are
# printed, the Gym window is rendered, the data is added to the replay buffer
# and the return metric is updated.
dynamic_episode_driver.DynamicEpisodeDriver(
    tf_env,
    policy,
    num_episodes=10,
    observers=[
        display_labeling,
        lambda _: py_env.render(mode='human'),
        replay_buffer.add_batch,
        reward_metric,
    ],
).run()

# Last expression of the cell: display the average return.
reward_metric.result()

In [None]:
# Draw one 2-step sample from the replay buffer and print its components.
dataset = replay_buffer.as_dataset(
    num_parallel_calls=tf.data.experimental.AUTOTUNE,
    num_steps=2
)
iterator = iter(dataset)
# The sample is bound to `sampled_trajectory` rather than `trajectory`:
# `trajectory` is the module imported from tf_agents.trajectories and is used
# by the replay-buffer cell (`trajectory.from_transition`). Rebinding it here
# made that cell fail when re-run after this one.
sampled_trajectory, _ = next(iterator)

# Index 0 / 1 on the leading axis select the first / second of the two steps.
state = sampled_trajectory.observation[0, ...]
labels = tf.cast(labeling_function(sampled_trajectory.observation), tf.float32)
if tf.rank(labels) == 1:
    # Give rank-1 labels a trailing axis.
    labels = tf.expand_dims(labels, axis=-1)
label = labels[0, ...]
action = tf.cast(sampled_trajectory.action[0, ...], dtype=tf.float32)
reward = sampled_trajectory.reward[0, ...]
if tf.rank(reward) == 1:
    reward = tf.expand_dims(reward, axis=-1)
next_state = sampled_trajectory.observation[1, ...]
next_label = labels[1, ...]

print("\nstate", state)
print('\nlabels', labels)
print('\nlabel', label)
print('\naction', action)
print('\nreward', reward)
print('\nnext_state', next_state)
print('\nnext_label', next_label)

In [None]:
# Transition generator built from the labeling function and the replay buffer;
# it is used as the map function of the dataset below.
generator = ErgodicMDPTransitionGenerator(labeling_function, replay_buffer)

# 2-step samples -> generator output -> batches of 8 (incomplete batch dropped).
dataset = replay_buffer.as_dataset(num_parallel_calls=4, num_steps=2)
dataset = dataset.map(
    map_func=generator,
    num_parallel_calls=4,
    #  deterministic=False  # TF version >= 2.2.0
)
dataset = dataset.batch(batch_size=8, drop_remainder=True)

iterator = iter(dataset)
# Last expression of the cell: display the first batch.
next(iterator)

In [5]:
import variational_action_discretizer

# Load the VAE-MDP at training step 880000. The commented-out paths are other
# saved models kept for reference.
vae_mdp = variational_action_discretizer.load(
    # "../../saves/Pendulum-v0/models/vae_LS12_MC1_ER20.0-decay=7.5e-05-min=-10_KLA0.0-growth=1e-06_TD0.70-0.50_1e-06-2e-06/policy/action_discretizer/LA3_MC1_ER20.0-decay=7.5e-05-min=-10_KLA0.0-growth=1e-06_TD0.50-0.33_1e-06-2e-06_params=full_vae_optimization/step140000/eval_elbo-3.784"
    # "../../saves/Pendulum-v0/models/vae_LS13_MC1_ER20.0-decay=2e-05-min=-10_KLA0.0-growth=1e-06_TD0.70-0.50_1e-06-2e-06/policy/action_discretizer/LA5_MC1_ER20.0-decay=2e-05-min=-10_KLA0.0-growth=1e-06_TD0.25-0.17_1e-06-2e-06_params=full_vae_optimization/step270000/eval_elbo-1.704"
    # "../../saves/Pendulum-v0/models/vae_LS13_MC1_ER20.0-decay=7.5e-05-min=-10_KLA0.0-growth=1e-06_TD0.70-0.50_1e-06-2e-06/permissive_variance_policy-multiplier=10.0/action_discretizer/LA5_MC1_ER20.0-decay=7.5e-05-min=-10_KLA0.0-growth=1e-06_TD0.25-0.17_1e-06-2e-06_params=full_vae_optimization/step70000/eval_elbo-7.265"
    # "../../saves/Pendulum-v0/models/vae_LS12_MC3_CER100.0-decay=1e-05_KLA1e-06-growth=1e-07_TD0.95-0.90_1e-06-2e-06/policy/action_discretizer/LA3_MC3_CER100.0-decay=1e-05_KLA1e-06-growth=1e-07_TD0.50-0.33_1e-06-2e-06_params=full_vae_optimization/step1010000/eval_elbo11.174"
    '../../saves/LunarLanderContinuous-v2/models/vae_LS20_MC1_ER10.0-decay=1e-05-min=0_KLA0.0-growth=5e-05_TD0.67-0.50_1e-06-2e-06_seed=20421/policy/action_discretizer/LA4_MC1_ER10.0-decay=1e-05-min=0_KLA0.0-growth=5e-05_TD0.33-0.22_1e-06-2e-06_params=full_vae_optimization-relaxed_state_encoding/base',
    step=880000,
)
print("VAE MDP loaded")

# Wrap the TF environment with the loaded model, reset it, and show the specs
# of the wrapped environment.
discrete_tf_env = vae_mdp.wrap_tf_environment(
    tf_env=tf_env,
    labeling_function=labeling_function,
    deterministic_embedding_functions=True,
)
discrete_tf_env.reset()
print(discrete_tf_env.time_step_spec(), discrete_tf_env.action_spec())

# Run the model's latent policy for 20 episodes in the wrapped environment,
# rendering the Gym window and accumulating the average return.
reward_metric = tf_metrics.AverageReturnMetric()
policy = vae_mdp.get_latent_policy()

dynamic_episode_driver.DynamicEpisodeDriver(
    env=discrete_tf_env,
    policy=policy,
    num_episodes=20,
    observers=[
        lambda _: py_env.render(mode='human'),
        reward_metric,
    ],
).run()

print("avg rewards", reward_metric.result())

VAE MDP loaded
TimeStep(step_type=TensorSpec(shape=(), dtype=tf.int32, name='step_type'), reward=TensorSpec(shape=(), dtype=tf.float32, name='reward'), discount=BoundedTensorSpec(shape=(), dtype=tf.float32, name='discount', minimum=array(0., dtype=float32), maximum=array(1., dtype=float32)), observation=BoundedTensorSpec(shape=(20,), dtype=tf.int32, name='observation', minimum=array(0, dtype=int32), maximum=array(1, dtype=int32))) BoundedTensorSpec(shape=(), dtype=tf.int32, name='action', minimum=array(0, dtype=int32), maximum=array(3, dtype=int32))
avg rewards tf.Tensor(-76.85185, shape=(), dtype=float32)
