In [1]:
import sys
import os

# NOTE: "__file__" is a quoted string literal here, not the module attribute,
# so abspath() resolves it against the current working directory and `path`
# ends up being that working directory.
path = os.path.dirname(os.path.abspath("__file__"))
# Put the directory two levels above `path` first on the import search path.
# Presumably this is the project root that provides the local packages
# imported below (policies, reinforcement_learning, util) -- TODO confirm.
sys.path.insert(0, path + '/../..')

import tf_agents.policies
import tf_agents.specs
from tf_agents.environments import suite_gym, parallel_py_environment
from tf_agents.environments import tf_py_environment
from tf_agents.metrics import tf_metrics
from tf_agents.policies import policy_saver
from tf_agents.policies.tf_py_policy import TFPyPolicy
from tf_agents.replay_buffers import tf_uniform_replay_buffer, episodic_replay_buffer
from tf_agents.drivers import dynamic_episode_driver, dynamic_step_driver
import tensorflow as tf
from tf_agents.trajectories.policy_step import PolicyStep
from policies import SavedTFPolicy
# Give TensorFlow an empty list of visible GPU devices, i.e. run on CPU only.
tf.config.set_visible_devices([], 'GPU')  #  allows testing during training
from tf_agents.trajectories import time_step as ts, policy_step, trajectory
from reinforcement_learning import labeling_functions
# Labeling function registered under the 'LunarLander-v2' key; it is passed to
# vae_mdp.eval_policy in a later cell.
labeling_function = labeling_functions['LunarLander-v2']
from util.io.dataset_generator import map_rl_trajectory_to_vae_input
from util.io.dataset_generator import ErgodicMDPTransitionGenerator
import tensorflow_probability as tfp
tfd = tfp.distributions

# NOTE(review): imported only for its side effects; presumably this registers
# the custom LunarLander* environment ids loaded in later cells -- confirm.
import reinforcement_learning.environments

In [2]:
# Load the 'LunarLander-v2' Gym environment through TF-Agents and wrap it as a
# TF environment. `py_env` and `tf_env` are notebook globals that the
# evaluation cells below read, so re-running this cell changes what they use.
py_env = suite_gym.load('LunarLander-v2')
py_env.reset()
tf_env = tf_py_environment.TFPyEnvironment(py_env)
# Last expression of the cell: displays the environment's time-step spec.
tf_env.time_step_spec()

TimeStep(step_type=TensorSpec(shape=(), dtype=tf.int32, name='step_type'), reward=TensorSpec(shape=(), dtype=tf.float32, name='reward'), discount=BoundedTensorSpec(shape=(), dtype=tf.float32, name='discount', minimum=array(0., dtype=float32), maximum=array(1., dtype=float32)), observation=BoundedTensorSpec(shape=(8,), dtype=tf.float32, name='observation', minimum=array(-3.4028235e+38, dtype=float32), maximum=array(3.4028235e+38, dtype=float32)))

In [None]:
def display_labeling(trajectory):
    """Print one diagnostic line per labelled situation found in `trajectory`.

    The observation of `trajectory` is passed to the labeling function
    registered under 'LunarLanderContinuous-v2'; a message is printed for each
    condition below that holds for at least one entry of the corresponding
    label component. Nothing is returned.

    Args:
        trajectory: object exposing an `observation` attribute accepted by the
            labeling function (the cell below passes this function as a driver
            observer).
    """
    label = labeling_functions['LunarLanderContinuous-v2'](trajectory.observation)

    def flagged(component):
        # True when any entry of the given label component is set.
        return bool(tf.reduce_any(label[..., component]))

    # Conditions are evaluated lazily and in this order, so a component is only
    # indexed when the check that needs it is actually reached.
    # NOTE(review): the standalone angle check reads component 7 while the
    # "close to the pad" angle check reads component 8 -- confirm both indices
    # against the labeling function's layout.
    checks = (
        (lambda: flagged(2) and flagged(6),
         'close to the lunar pad with high speed'),
        (lambda: not flagged(7),
         'unsafe lander angle'),
        (lambda: flagged(2) and not flagged(8),
         'close to the lunar pad with unsafe lander angle'),
        (lambda: flagged(1),
         'lander too close to the edge of the frame'),
    )
    for holds, message in checks:
        if holds():
            print(message)

# Roll out the saved policy for 5 episodes on the current `tf_env`.
# Observers passed to the driver: print the labels, render the underlying
# `py_env` window, and accumulate the average return.
reward_metric = tf_metrics.AverageReturnMetric()
# NOTE(review): the variable is named `sac_policy_dir` but the path points at
# a 'dqn_policy' directory -- confirm which policy is actually stored there.
sac_policy_dir = '../saves/LunarLander-v2/dqn_policy'
saved_policy = SavedTFPolicy(sac_policy_dir)
dynamic_episode_driver.DynamicEpisodeDriver(tf_env, saved_policy, num_episodes=5,
                                            observers=[
                                                display_labeling,
                                                lambda _: py_env.render(mode='human'),
                                                reward_metric
                                            ]).run()
# Last expression of the cell: displays the metric's result.
reward_metric.result()

In [None]:
import variational_mdp

# Load a saved VAE-MDP model for LunarLander-v2 (checkpoint selected by `step`).
# NOTE(review): this path starts with '../../saves' while the policy cells in
# this notebook use '../saves' -- confirm which root is correct for the
# directory the kernel runs in.
vae_mdp = variational_mdp.load(
    # The commented-out entries below are alternative Pendulum-v0 model paths
    # kept for reference; only the LunarLander-v2 path is active.
    # "../../saves/Pendulum-v0/models/vae_LS12_MC1_ER20.0-decay=7.5e-05-min=-10_KLA0.0-growth=1e-06_TD0.70-0.50_1e-06-2e-06/policy/action_discretizer/LA3_MC1_ER20.0-decay=7.5e-05-min=-10_KLA0.0-growth=1e-06_TD0.50-0.33_1e-06-2e-06_params=full_vae_optimization/step140000/eval_elbo-3.784"
    # "../../saves/Pendulum-v0/models/vae_LS13_MC1_ER20.0-decay=2e-05-min=-10_KLA0.0-growth=1e-06_TD0.70-0.50_1e-06-2e-06/policy/action_discretizer/LA5_MC1_ER20.0-decay=2e-05-min=-10_KLA0.0-growth=1e-06_TD0.25-0.17_1e-06-2e-06_params=full_vae_optimization/step270000/eval_elbo-1.704"
    # "../../saves/Pendulum-v0/models/vae_LS13_MC1_ER20.0-decay=7.5e-05-min=-10_KLA0.0-growth=1e-06_TD0.70-0.50_1e-06-2e-06/permissive_variance_policy-multiplier=10.0/action_discretizer/LA5_MC1_ER20.0-decay=7.5e-05-min=-10_KLA0.0-growth=1e-06_TD0.25-0.17_1e-06-2e-06_params=full_vae_optimization/step70000/eval_elbo-7.265"
    # "../../saves/Pendulum-v0/models/vae_LS12_MC3_CER100.0-decay=1e-05_KLA1e-06-growth=1e-07_TD0.95-0.90_1e-06-2e-06/policy/action_discretizer/LA3_MC3_CER100.0-decay=1e-05_KLA1e-06-growth=1e-07_TD0.50-0.33_1e-06-2e-06_params=full_vae_optimization/step1010000/eval_elbo11.174"
    '../../saves/LunarLander-v2/models/vae_LS20_MC1_ER10.0-decay=1e-05-min=0_KLA0.0-growth=5e-05_TD0.67-0.50_1e-06-2e-06_seed=20210510_PER-priority_exponent=0.99-WIS_exponent=0.4-WIS_growth_rate=7.5e-05loss_based_priorities_params=full_vae_optimization-relaxed_state_encoding-latent_policy/base',
    step=720000,
    discrete_action=True
)
print("VAE MDP loaded")

In [None]:
# Evaluate the loaded VAE-MDP's policy on the current `py_env` for 20 episodes
# with rendering enabled, using the 'LunarLander-v2' labeling function bound in
# the first cell.
vae_mdp.eval_policy(eval_env=py_env, labeling_function=labeling_function, num_eval_episodes=20, render=True)

In [2]:
# Rebind the notebook-global `py_env` / `tf_env` to the
# 'LunarLanderNoRewardShaping-v2' environment; cells run after this one use it.
py_env = suite_gym.load('LunarLanderNoRewardShaping-v2')
py_env.reset()
tf_env = tf_py_environment.TFPyEnvironment(py_env)
# Last expression of the cell: displays the environment's time-step spec.
tf_env.time_step_spec()

TimeStep(step_type=TensorSpec(shape=(), dtype=tf.int32, name='step_type'), reward=TensorSpec(shape=(), dtype=tf.float32, name='reward'), discount=BoundedTensorSpec(shape=(), dtype=tf.float32, name='discount', minimum=array(0., dtype=float32), maximum=array(1., dtype=float32)), observation=BoundedTensorSpec(shape=(8,), dtype=tf.float32, name='observation', minimum=array(-3.4028235e+38, dtype=float32), maximum=array(3.4028235e+38, dtype=float32)))

In [4]:
# Roll out the saved policy for 5 episodes on whichever `tf_env` / `py_env`
# are currently bound in the kernel, rendering each step and accumulating the
# average return.
reward_metric = tf_metrics.AverageReturnMetric()
# NOTE(review): the variable is named `sac_policy_dir` but the path points at
# a 'dqn_policy' directory -- confirm which policy is actually stored there.
sac_policy_dir = '../saves/LunarLander-v2/dqn_policy'
saved_policy = SavedTFPolicy(sac_policy_dir)
dynamic_episode_driver.DynamicEpisodeDriver(tf_env, saved_policy, num_episodes=5,
                                            observers=[
                                                lambda _: py_env.render(mode='human'),
                                                reward_metric
                                            ]).run()
# Last expression of the cell: displays the metric's result.
reward_metric.result()

<tf.Tensor: shape=(), dtype=float32, numpy=55.57008>

In [3]:
# Rebind the notebook-global `py_env` / `tf_env` to the
# 'LunarLanderRewardShapingAugmented-v2' environment. Its recorded spec shows a
# 9-dimensional observation, one more than the 8 of the environments loaded in
# the earlier cells.
py_env = suite_gym.load('LunarLanderRewardShapingAugmented-v2')
py_env.reset()
tf_env = tf_py_environment.TFPyEnvironment(py_env)
# Last expression of the cell: displays the environment's time-step spec.
tf_env.time_step_spec()

TimeStep(step_type=TensorSpec(shape=(), dtype=tf.int32, name='step_type'), reward=TensorSpec(shape=(), dtype=tf.float32, name='reward'), discount=BoundedTensorSpec(shape=(), dtype=tf.float32, name='discount', minimum=array(0., dtype=float32), maximum=array(1., dtype=float32)), observation=BoundedTensorSpec(shape=(9,), dtype=tf.float32, name='observation', minimum=array(-3.4028235e+38, dtype=float32), maximum=array(3.4028235e+38, dtype=float32)))

In [27]:
from tf_agents.policies.tf_policy import TFPolicy
from tf_agents.typing import types
from tf_agents import specs

class MyPolicy(TFPolicy):
    """Adapter that lets a policy accept observations with one extra trailing feature.

    The advertised observation spec is the wrapped policy's spec with its last
    dimension enlarged by one. At decision time the last observation feature
    is dropped and the remainder is forwarded to the wrapped policy, so the
    wrapped policy's behaviour is unchanged.
    """

    def __init__(self, tf_policy: TFPolicy):
        """Wrap `tf_policy` and widen its observation spec by one feature.

        Args:
            tf_policy: the policy to wrap; its time-step spec must contain a
                bounded observation spec of rank >= 1.
        """
        wrapped_obs_spec = tf_policy.time_step_spec.observation

        # Derive the widened shape from the policy being wrapped, not from a
        # notebook-level variable: the result must not depend on whatever
        # `saved_policy` happens to be bound to in the kernel.
        wrapped_shape = tf.TensorShape(wrapped_obs_spec.shape)
        augmented_shape = wrapped_shape[:-1].concatenate([wrapped_shape[-1] + 1])

        # The bounds are reused as-is; this assumes they broadcast to the
        # widened shape (they are scalars in this notebook's recorded spec).
        observation_spec = specs.BoundedTensorSpec(
            shape=augmented_shape,
            dtype=wrapped_obs_spec.dtype,
            name=wrapped_obs_spec.name,
            minimum=wrapped_obs_spec.minimum,
            maximum=wrapped_obs_spec.maximum)

        super().__init__(tf_policy.time_step_spec._replace(observation=observation_spec), tf_policy.action_spec)
        self.wrapped_tf_policy = tf_policy

    def _distribution(self, time_step: ts.TimeStep, policy_state: types.NestedTensorSpec) -> policy_step.PolicyStep:
        """Drop the last observation feature and delegate to the wrapped policy."""
        _time_step = time_step._replace(observation=time_step.observation[..., :-1])
        return self.wrapped_tf_policy._distribution(_time_step, policy_state)

    def _get_initial_state(self, batch_size: int) -> types.NestedTensor:
        """Return the wrapped policy's initial state for `batch_size`."""
        return self.wrapped_tf_policy._get_initial_state(batch_size)

    def _variables(self):
        """Return the wrapped policy's variables."""
        return self.wrapped_tf_policy._variables()



In [12]:
# Load the saved policy, wrap it in MyPolicy, and roll the wrapper out for 5
# episodes on the current `tf_env`, rendering each step and accumulating the
# average return.
reward_metric = tf_metrics.AverageReturnMetric()
policy_dir = '../saves/LunarLander-v2/dqn_policy'
saved_policy = SavedTFPolicy(policy_dir)
augmented_policy = MyPolicy(saved_policy)
dynamic_episode_driver.DynamicEpisodeDriver(tf_env, augmented_policy, num_episodes=5,
                                            observers=[
                                                lambda _: py_env.render(mode='human'),
                                                reward_metric
                                            ]).run()
# Last expression of the cell: displays the metric's result.
reward_metric.result()

<tf.Tensor: shape=(), dtype=float32, numpy=260.72534>

In [28]:
# Rebuild the wrapper around whatever `saved_policy` is currently bound to in
# the kernel and display its time-step spec (the recorded output shows a
# 9-dimensional observation).
augmented_policy = MyPolicy(saved_policy)
augmented_policy.time_step_spec

TimeStep(step_type=TensorSpec(shape=(), dtype=tf.int32, name='step_type'), reward=TensorSpec(shape=(), dtype=tf.float32, name='reward'), discount=BoundedTensorSpec(shape=(), dtype=tf.float32, name='discount', minimum=array(0., dtype=float32), maximum=array(1., dtype=float32)), observation=BoundedTensorSpec(shape=(9,), dtype=tf.float32, name='observation', minimum=array(-3.4028235e+38, dtype=float32), maximum=array(3.4028235e+38, dtype=float32)))

In [30]:
from tf_agents.policies import policy_saver

# Export `augmented_policy` with TF-Agents' PolicySaver to
# ../saves/LunarLander-v2/dqn_policy_reward_shaping_augmented, the directory
# the following cell loads from.
saver = policy_saver.PolicySaver(augmented_policy)
saver.save(os.path.join('..', 'saves', 'LunarLander-v2', 'dqn_policy_reward_shaping_augmented'))



INFO:tensorflow:Assets written to: ../saves/LunarLander-v2/dqn_policy_reward_shaping_augmented/assets


INFO:tensorflow:Assets written to: ../saves/LunarLander-v2/dqn_policy_reward_shaping_augmented/assets


In [31]:
# Reload the policy exported in the previous cell and roll it out for 5
# episodes on the current `tf_env`, rendering each step and accumulating the
# average return.
reward_metric = tf_metrics.AverageReturnMetric()
policy_dir = '../saves/LunarLander-v2/dqn_policy_reward_shaping_augmented'
# NOTE: this rebinds the notebook-global `saved_policy` to the exported,
# already-augmented policy; cells re-run after this one see the new binding.
saved_policy = SavedTFPolicy(policy_dir)
dynamic_episode_driver.DynamicEpisodeDriver(tf_env, saved_policy, num_episodes=5,
                                            observers=[
                                                lambda _: py_env.render(mode='human'),
                                                reward_metric
                                            ]).run()
# Last expression of the cell: displays the metric's result.
reward_metric.result()

<tf.Tensor: shape=(), dtype=float32, numpy=237.3409>