In [11]:
import sys
import os

path = os.path.dirname(os.path.abspath("__file__"))
sys.path.insert(0, path + '/../..')

import base64
import IPython
import importlib
import logging
logging.getLogger().setLevel(logging.ERROR)
import random

from tf_agents.environments import suite_gym, parallel_py_environment
from tf_agents.environments import tf_py_environment
from tf_agents.metrics import tf_metrics
from tf_agents.replay_buffers import tf_uniform_replay_buffer, episodic_replay_buffer
from tf_agents.drivers import dynamic_episode_driver, dynamic_step_driver
from tf_agents.trajectories import time_step as ts, policy_step, trajectory
from tf_agents.utils import common

import tensorflow as tf
tf.get_logger().setLevel('ERROR')
tf.autograph.set_verbosity(3)

import numpy as np
import pandas as pd

from reinforcement_learning import labeling_functions
from util.io.dataset_generator import map_rl_trajectory_to_vae_input
from util.io.dataset_generator import ErgodicMDPTransitionGenerator
import reinforcement_learning.environments
from reinforcement_learning.environments import perturbed_env
from policies.saved_policy import SavedTFPolicy
from policies.epsilon_mimic import EpsilonMimicPolicy
from policies.latent_policy import LatentPolicyOverRealStateAndActionSpaces

from util.io import video
import variational_mdp
import variational_action_discretizer

# set seed
seed = 42
os.environ['PYTHONHASHSEED'] = str(seed)
random.seed(seed)
np.random.seed(seed)
tf.random.set_seed(seed)

In [3]:
def embed_mp4(filename):
  """Embeds an mp4 file in the notebook."""
  video = open(filename,'rb').read()
  b64 = base64.b64encode(video)
  tag = '''
  <video width="640" height="480" controls>
    <source src="data:video/mp4;base64,{0}" type="video/mp4">
  Your browser does not support the video tag.
  </video>'''.format(b64.decode())

  return IPython.display.HTML(tag)

# CartPole

In [6]:
with suite_gym.load('CartPole-v0') as py_env:
    py_env.reset()
    tf_env = tf_py_environment.TFPyEnvironment(py_env)

    policy_dir = '../saves/CartPole-v0/policy/'
    policy = SavedTFPolicy(policy_dir)
    num_episodes=10

    reward_metric = tf_metrics.AverageReturnMetric()
    video_observer = video.VideoEmbeddingObserver(py_env, 'pendulum_sac_policy', num_episodes=num_episodes)
    dynamic_episode_driver.DynamicEpisodeDriver(tf_env, policy, num_episodes=num_episodes,
                                                observers=[
                                                    reward_metric,
                                                    video_observer,
                                                ]).run()

    tf.print('avg. episode return:', reward_metric.result())

embed_mp4(video_observer.file_name)

avg. episode return: 200


In [14]:
vae_mdp = variational_mdp.load(
    '../../saves/CartPole-v0/models/vae_LS9_ER10.0-decay=1e-05-min=0_KLA0.0-growth=5e-05_TD0.67-0.50_activation=leaky_relu_lr=0.001_seed=20210510_PER-P_exp=0.33-WIS_exponent=0.4-WIS_growth=7e-05_buckets_based_params=latent_policy/base',
    discrete_action=True,
    step=980000
)
print("VAE MDP loaded")

with suite_gym.load('CartPole-v0') as py_env:
    py_env.reset()
    tf_env = tf_py_environment.TFPyEnvironment(py_env)
    tf_env = vae_mdp.wrap_tf_environment(tf_env, labeling_functions['Pendulum-v0'])
    policy = vae_mdp.get_latent_policy()
    
    num_episodes=10
    reward_metric = tf_metrics.AverageReturnMetric()
    video_observer = video.VideoEmbeddingObserver(py_env, 'distilled_policy', num_episodes=num_episodes)
    dynamic_episode_driver.DynamicEpisodeDriver(
        tf_env, policy, num_episodes=num_episodes,
        observers=[
            reward_metric,
            video_observer
        ]).run()
    
    tf.print('avg. episode return:', reward_metric.result())

embed_mp4(video_observer.file_name)    

ValueError: Shapes (5,) and (1,) are incompatible

# Pendulum

In [30]:
with suite_gym.load('Pendulum-v0') as py_env:
    py_env.reset()
    tf_env = tf_py_environment.TFPyEnvironment(py_env)

    sac_policy_dir = '../saves/PendulumRandomInit-v0/sac_policy'
    policy = SavedTFPolicy(sac_policy_dir)
    num_episodes=10

    reward_metric = tf_metrics.AverageReturnMetric()
    video_observer = video.VideoEmbeddingObserver(py_env, 'pendulum_sac_policy', num_episodes=num_episodes)
    dynamic_episode_driver.DynamicEpisodeDriver(tf_env, policy, num_episodes=num_episodes,
                                                observers=[
                                                    reward_metric,
                                                    video_observer,
                                                ]).run()

    reward_metric.result()

embed_mp4(video_observer.file_name)

In [61]:
vae_mdp = variational_action_discretizer.load(
    '../../saves/PendulumRandomInit-v0/models/vae_LS13_ER10.0-decay=7.5e-05-min=0_KLA0.0-growth=7.5e-05_TD0.67-0.50_activation=relu_lr=0.0001_seed=22222222/sac_policy/action_discretizer/LA3_ER10.0-decay=7.5e-05-min=0_KLA0.0-growth=7.5e-05_TD0.50-0.33_PER-P_exp=0.3-WIS_exponent=0.4-WIS_growth=1e-05_loss_based_epsilon_greedy=0.5-decay=1e-05/base',
    step=350000
)
print("VAE MDP loaded")

with suite_gym.load('Pendulum-v0') as py_env:
    py_env.reset()
    tf_env = tf_py_environment.TFPyEnvironment(py_env)
    tf_env = vae_mdp.wrap_tf_environment(tf_env, labeling_functions['Pendulum-v0'])
    policy = vae_mdp.get_latent_policy()
    
    num_episodes=10
    video_observer = video.VideoEmbeddingObserver(py_env, 'distilled_policy', num_episodes=num_episodes)
    dynamic_episode_driver.DynamicEpisodeDriver(
        tf_env, policy, num_episodes=num_episodes,
        observers=[video_observer]
    ).run()

embed_mp4(video_observer.file_name)    

VAE MDP loaded


## Robustness test