In [1]:
import os
from typing import Tuple
from tensorflow.python.keras.utils.generic_utils import Progbar
from tf_agents.environments import suite_gym, parallel_py_environment
from tf_agents.environments import tf_py_environment
from tf_agents.metrics import tf_metrics
from tf_agents.replay_buffers import tf_uniform_replay_buffer, episodic_replay_buffer
from tf_agents.trajectories.trajectory import Trajectory
from tf_agents.networks import actor_distribution_network
from tf_agents.drivers import dynamic_episode_driver, dynamic_step_driver
from tf_agents.policies import random_tf_policy
import tensorflow as tf
import numpy as np
from tf_agents.utils import common
from reinforcement_learning.sac_training import NumberOfSafetyViolations
from tf_agents.trajectories import time_step as ts, policy_step, trajectory
from reinforcement_learning import sac_training
from reinforcement_learning import labeling_functions

In [2]:
py_env = suite_gym.load('BipedalWalker-v2')
py_env.render(mode='human')
py_env.reset()
tf_env = tf_py_environment.TFPyEnvironment(py_env)
tf_env.time_step_spec()


TimeStep(step_type=TensorSpec(shape=(), dtype=tf.int32, name='step_type'), reward=TensorSpec(shape=(), dtype=tf.float32, name='reward'), discount=BoundedTensorSpec(shape=(), dtype=tf.float32, name='discount', minimum=array(0., dtype=float32), maximum=array(1., dtype=float32)), observation=BoundedTensorSpec(shape=(24,), dtype=tf.float32, name='observation', minimum=array(-3.4028235e+38, dtype=float32), maximum=array(3.4028235e+38, dtype=float32)))

In [None]:
bad_state_detection = lambda trajectory: print("bad state!: {}".format(trajectory.observation[..., 0]))\
    if tf.math.abs(trajectory.observation[...,0]) > np.pi / 3 else None

tf_env = tf_py_environment.TFPyEnvironment(py_env)
policy = random_tf_policy.RandomTFPolicy(time_step_spec=tf_env.time_step_spec(), action_spec=tf_env.action_spec())
dynamic_episode_driver.DynamicEpisodeDriver(tf_env, policy, num_episodes=15,
                                            observers=[bad_state_detection, lambda _: py_env.render(mode='human')]).run()

In [None]:
def bad_state_detection(trajectory):
    # if trajectory.reward[..., 0] <= -100:
    if trajectory.observation[..., 0] < -1. or trajectory.observation[..., 0] > 1:
        py_env.render(mode='human')
        print(trajectory.observation)
        return True
    else:
        return False

walk = True
while walk:
    action = policy.action(time_step=tf_env.current_time_step())
    time_step = tf_env.step(action)
    walk = not bad_state_detection(time_step)


In [None]:
from reinforcement_learning import labeling_functions

labeling_function = labeling_functions['BipedalWalker-v2']
safety_violations = NumberOfSafetyViolations(labeling_function)
progressbar = Progbar(target=None, interval=0.5, stateful_metrics=['violation'])

tf_env = tf_py_environment.TFPyEnvironment(py_env)
policy = random_tf_policy.RandomTFPolicy(
    time_step_spec=tf_env.time_step_spec(),
    action_spec=tf_env.action_spec())
dynamic_episode_driver.DynamicEpisodeDriver(
    tf_env,
    policy,
    num_episodes=15,
    observers=[safety_violations,
               lambda _: progressbar.add(
                   1, [('violation', safety_violations.average())]),
               lambda _: py_env.render(mode='human')]
).run()

safety_violations._num_episodes

In [2]:
# Parallel environments.
num_parallel_environments = 4
tf_env = tf_py_environment.TFPyEnvironment(
    parallel_py_environment.ParallelPyEnvironment(
    [lambda : suite_gym.load('BipedalWalker-v2')] * num_parallel_environments))
tf_env.reset()
tf_env.time_step_spec()

TimeStep(step_type=TensorSpec(shape=(), dtype=tf.int32, name='step_type'), reward=TensorSpec(shape=(), dtype=tf.float32, name='reward'), discount=BoundedTensorSpec(shape=(), dtype=tf.float32, name='discount', minimum=array(0., dtype=float32), maximum=array(1., dtype=float32)), observation=BoundedTensorSpec(shape=(24,), dtype=tf.float32, name='observation', minimum=array(-3.4028235e+38, dtype=float32), maximum=array(3.4028235e+38, dtype=float32)))

In [None]:
labeling_function = labeling_functions['BipedalWalker-v2']

policy = random_tf_policy.RandomTFPolicy(
    time_step_spec=tf_env.time_step_spec(),
    action_spec=tf_env.action_spec())

safety_violations = NumberOfSafetyViolations(labeling_function)

dynamic_step_driver.DynamicStepDriver(
    tf_env,
    policy,
    num_steps=5000,
    observers=[safety_violations]
).run()

print('Safety violations')
print('episodes', safety_violations._num_episodes)
print('result=', safety_violations.result())
print('average=', safety_violations.average())

In [None]:
import importlib
from tf_agents.environments import suite_gym
from reinforcement_learning import labeling_functions
from reinforcement_learning import sac_training

importlib.reload(sac_training)

learner = sac_training.SACLearner(
    env_name='BipedalWalker-v2',
    env_suite=suite_gym,
    num_iterations=int(1e6),
    num_parallel_environments=8,
    labeling_function=labeling_functions['BipedalWalker-v2']
)

In [None]:
learner.train_and_eval()

In [None]:
# Before running this cell, load the single py environment
tf_env = tf_py_environment.TFPyEnvironment(py_env)
stochastic_policy_dir = "../saves/BipedalWalker-v2/policy"
policy = tf.compat.v2.saved_model.load(stochastic_policy_dir)
dynamic_episode_driver.DynamicEpisodeDriver(
    tf_env,
    policy,
    num_episodes=15,
    observers=[lambda _: py_env.render(mode='human')]
).run()

In [None]:
from reinforcement_learning import sac_training

learner = sac_training.SACLearner(
    env_name='BipedalWalker-v2',
    env_suite=suite_gym,
    labeling_function=labeling_functions['BipedalWalker-v2'],
    save_directory_location='..'
)

In [None]:
variance_multiplier = 3.

In [None]:
learner.save_permissive_variance_policy(variance_multiplier=variance_multiplier)

In [None]:
# Before running this cell, load the single py environment
stochastic_policy_dir = os.path.join(
    learner.save_directory_location,
    'policy',
    "permissive_variance_policy-multiplier={}".format(
        variance_multiplier)
)
policy = tf.compat.v2.saved_model.load(stochastic_policy_dir)
safety_violations = NumberOfSafetyViolations(
    labeling_function=labeling_functions['BipedalWalker-v2'])

dynamic_episode_driver.DynamicEpisodeDriver(
    tf_env,
    policy,
    num_episodes=1000,
    observers=[
        #  lambda _: py_env.render(mode='human'),
        safety_violations
    ]
).run()

print("avg number of safety violations per episode", safety_violations.average())

In [4]:
import variational_action_discretizer

vae_mdp = variational_action_discretizer.load(
    # "/home/florentdelgrange/workspace/hpc_hydra/policy/Bipedal-walker/vae_LS15_MC16_CER10.0-decay=0.0015_KLA0.0-growth=5e-06_TD1.00-0.95_1e-06-2e-06_step400000_eval_elbo53.687/step3000000/eval_elbo0.227"
    #
    # "../saves/BipedalWalker-v2/models/vae_LS13_MC3_CER10.0-decay=0.0015_KLA0.0-growth=5e-06_TD1.00-0.90_1e-06-2e"
    # "-06_params=relaxed_state_encoding_step320000_eval_elbo55.821/step320000/eval_elbo55.821/policy"
    # "/action_discretizer/LA5_MC1_CER1.0-decay=0.001_KLA0.0-growth=5e-06_TD0.25-0.17_1e-06-2e-06_params"
    # "=one_output_per_action-relaxed_state_encoding/step200000/eval_elbo-0.866"
    # "../saves/BipedalWalker-v2/models/vae_LS13_MC3_CER10.0-decay=0.001_KLA0.0-growth=5e-06_TD0.99-0.95_1e-06-2e-06/policy/action_discretizer/LA5_MC3_CER10.0-decay=0.001_KLA0.0-growth=5e-06_TD0.50-0.40_1e-06-2e-06_params=one_output_per_action-full_vae_optimization-relaxed_state_encoding/step400000/eval_elbo53.956"
    #
    # "/home/florentdelgrange/workspace/hpc_hydra/policy/Bipedal-walker/LA5_MC3_CER10.0-decay=0.001_KLA0.0-growth=5e-06_TD0.50-0.40_1e-06-2e-06_params=one_output_per_action-full_vae_optimization-relaxed_state_encoding/step650000/eval_elbo58.928"
    #
    # "/home/florentdelgrange/workspace/hpc_hydra/policy/Bipedal-walker/LA5_MC3_CER10.0-decay=0.001_KLA0.0-growth=5e-06_TD0.50-0.40_1e-06-2e-06_params=one_output_per_action-full_vae_optimization-relaxed_state_encoding/step1330000/eval_elbo62.704"
    #
    "/home/florentdelgrange/workspace/hpc_hydra/policy/Bipedal-walker/LA5_MC3_CER10.0-decay=0.001_KLA0.0-growth=5e-06_TD0.50-0.40_1e-06-2e-06_params=one_output_per_action-full_vae_optimization-relaxed_state_encoding/step2000000/eval_elbo64.010"
)
print("VAE MDP loaded")

discrete_tf_env = vae_mdp.wrap_tf_environment(
    tf_env=tf_env,
    labeling_function=labeling_functions['BipedalWalker-v2']
)
z = discrete_tf_env.reset()

safety_violations = NumberOfSafetyViolations(
    labeling_function=labeling_functions['BipedalWalker-v2'])
reward_metric = tf_metrics.AverageReturnMetric()

policy = random_tf_policy.RandomTFPolicy(
    time_step_spec=discrete_tf_env.time_step_spec(),
    action_spec=discrete_tf_env.action_spec())

dynamic_episode_driver.DynamicEpisodeDriver(
    discrete_tf_env,
    policy,
    num_episodes=30,
    observers=[
        lambda _: py_env.render(mode='human'),
        lambda _: safety_violations(tf_env.current_time_step()),
        reward_metric
    ]
).run()

print("avg number of safety violations per episode", safety_violations.average())
print("avg rewards", reward_metric.result())

VAE MDP loaded
avg number of safety violations per episode tf.Tensor(19.258064, shape=(), dtype=float32)
avg rewards tf.Tensor(-107.633286, shape=(), dtype=float32)


In [3]:
import variational_action_discretizer

vae_mdp = variational_action_discretizer.load(
    # "/home/florentdelgrange/workspace/hpc_hydra/policy/Bipedal-walker/vae_LS15_MC16_CER10.0-decay=0.0015_KLA0.0-growth=5e-06_TD1.00-0.95_1e-06-2e-06_step400000_eval_elbo53.687/step3000000/eval_elbo0.227"
    #
    # "../saves/BipedalWalker-v2/models/vae_LS13_MC3_CER10.0-decay=0.0015_KLA0.0-growth=5e-06_TD1.00-0.90_1e-06-2e"
    # "-06_params=relaxed_state_encoding_step320000_eval_elbo55.821/step320000/eval_elbo55.821/policy"
    # "/action_discretizer/LA5_MC1_CER1.0-decay=0.001_KLA0.0-growth=5e-06_TD0.25-0.17_1e-06-2e-06_params"
    # "=one_output_per_action-relaxed_state_encoding/step200000/eval_elbo-0.866"
    #
    # "../saves/BipedalWalker-v2/models/vae_LS13_MC3_CER10.0-decay=0.001_KLA0.0-growth=5e-06_TD0.99-0.95_1e-06-2e-06/policy/action_discretizer/LA5_MC3_CER10.0-decay=0.001_KLA0.0-growth=5e-06_TD0.50-0.40_1e-06-2e-06_params=one_output_per_action-full_vae_optimization-relaxed_state_encoding/step400000/eval_elbo53.956"
    #
    # "/home/florentdelgrange/workspace/hpc_hydra/policy/Bipedal-walker/LA5_MC3_CER10.0-decay=0.001_KLA0.0-growth=5e-06_TD0.50-0.40_1e-06-2e-06_params=one_output_per_action-full_vae_optimization-relaxed_state_encoding/step650000/eval_elbo58.928"
    #
    # "/home/florentdelgrange/workspace/hpc_hydra/policy/Bipedal-walker/LA5_MC3_CER10.0-decay=0.001_KLA0.0-growth=5e-06_TD0.50-0.40_1e-06-2e-06_params=one_output_per_action-full_vae_optimization-relaxed_state_encoding/step1330000/eval_elbo62.704"
    #
    "/home/florentdelgrange/workspace/hpc_hydra/policy/Bipedal-walker/LA5_MC3_CER10.0-decay=0.001_KLA0.0-growth=5e-06_TD0.50-0.40_1e-06-2e-06_params=one_output_per_action-full_vae_optimization-relaxed_state_encoding/step2000000/eval_elbo64.010"
)
print("VAE MDP loaded")

discrete_tf_env = vae_mdp.wrap_tf_environment(
    tf_env=tf_env,
    labeling_function=labeling_functions['BipedalWalker-v2']
)
discrete_tf_env.reset()

safety_violations = NumberOfSafetyViolations(
    labeling_function=labeling_functions['BipedalWalker-v2'])
reward_metric = tf_metrics.AverageReturnMetric()

policy = vae_mdp.get_abstract_policy()

dynamic_episode_driver.DynamicEpisodeDriver(
    discrete_tf_env,
    policy,
    num_episodes=50,
    observers=[
        # lambda _: py_env.render(mode='human'),
        lambda _: safety_violations(tf_env.current_time_step()),
        reward_metric
    ]
).run()

print("avg number of safety violations per episode", safety_violations.average())
print("avg rewards", reward_metric.result())

VAE MDP loaded


ValueError: in user code:

    /home/florentdelgrange/anaconda3/envs/vae-mdp/lib/python3.7/site-packages/tf_agents/metrics/tf_metrics.py:164 call  *
        self._return_accumulator.assign(
    /home/florentdelgrange/anaconda3/envs/vae-mdp/lib/python3.7/site-packages/tensorflow/python/ops/resource_variable_ops.py:846 assign  **
        self._shape.assert_is_compatible_with(value_tensor.shape)
    /home/florentdelgrange/anaconda3/envs/vae-mdp/lib/python3.7/site-packages/tensorflow/python/framework/tensor_shape.py:1117 assert_is_compatible_with
        raise ValueError("Shapes %s and %s are incompatible" % (self, other))

    ValueError: Shapes (1,) and (4,) are incompatible
