# DDPG

In [1]:
from tf_agents.agents.ddpg import ddpg_agent
from tf_agents.agents.dqn import dqn_agent
from tf_agents.environments import suite_gym, tf_py_environment
from tf_agents.networks import sequential, q_network, nest_map
from tf_agents.specs import tensor_spec
from tf_agents.utils import common
from tf_agents.policies import random_tf_policy
from tf_agents.replay_buffers import reverb_replay_buffer
from tf_agents.replay_buffers import reverb_utils
from tf_agents.drivers import py_driver
from tf_agents.policies import py_tf_eager_policy
from tf_agents.metrics import tf_metrics
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.drivers import dynamic_step_driver
from tf_agents.eval import metric_utils
from tf_agents.keras_layers import inner_reshape

import matplotlib
import matplotlib.pyplot as plt

import reverb

import tensorflow as tf
import pyvirtualdisplay
import functools


import PIL
from PIL import ImageDraw, ImageFont
import numpy as np
import IPython
import imageio
import base64

import os
import time

from absl import logging

In [2]:
# Emit absl log messages at INFO level and above, so the progress lines
# written with `logging.info` in the cells below show up in the output.
logging.set_verbosity(logging.INFO)

In [3]:
# Start a 1400x900 virtual display with visible=0.
# NOTE(review): presumably required so the Gym environment can render
# 'rgb_array' frames on a machine without a physical screen — confirm.
display = pyvirtualdisplay.Display(visible=0, size=(1400, 900)).start()


In [20]:
# Hyperparameters and run configuration for the DDPG training notebook.
# NOTE(review): `learning_rate`, `replay_buffer_max_length`, `fc_layer_params`,
# `epsilon_greedy` and `train_sequence_length` are not referenced by any of the
# visible cells below — confirm whether they are still needed.

# Gym environment id, whether to run the training loop at all, and how many
# collect/train iterations the loop performs.
env_name = 'Pendulum-v1'
train_or_retrain = True
num_iterations = 100_000

# Hidden layer sizes for the actor and for the three parts of the critic
# (observation branch, action branch, joint layers). A falsy value makes the
# critic builder use an identity layer for that part.
actor_fc_layers = (400, 300)
critic_obs_fc_layers = (400,)
critic_action_fc_layers=None
critic_joint_fc_layers=(300,)

# Collection / evaluation sizes. The optimizers use `actor_learning_rate` and
# `critic_learning_rate` (below), and the replay buffer uses
# `replay_buffer_capacity` (below).
learning_rate = 1e-4
num_eval_episodes = 10
replay_buffer_max_length = 100000
initial_collect_steps = 100
batch_size = 64
collect_steps_per_iteration = 1

# Intervals, in global steps, for logging, evaluation and video recording.
log_interval = 200
eval_interval = 1000
video_recording_interval = 1_000

# Output root for checkpoints, summaries and videos; summary flush period.
root_dir = os.path.join('./data', env_name)
summaries_flush_secs = 10

# Values forwarded to the DdpgAgent constructor (except `fc_layer_params`).
fc_layer_params = (100, 50)
gamma = 0.99
reward_scale_factor = 1.0
gradient_clipping = None
debug_summaries = False
summarize_grads_and_vars = False

# Params for train
use_tf_functions = True
train_steps_per_iteration = 1
actor_learning_rate=1e-4
critic_learning_rate=1e-3
dqda_clipping=None
td_errors_loss_fn = tf.compat.v1.losses.huber_loss

# Params for collect
epsilon_greedy = 0.1
replay_buffer_capacity = 100_000
ou_stddev=0.2
ou_damping=0.15

# Params for target update
target_update_tau = 0.05
target_update_period = 5

# Params for summaries and logging
summary_interval = 1_000
eval_metrics_callback = None

train_sequence_length = 1

# Params for checkpoints
train_checkpoint_interval = 10_000
policy_checkpoint_interval = 5_000
rb_checkpoint_interval = 20_000

## Setup

In [21]:
# Resolve the output root (expanding a leading '~') and derive the
# sub-directories used for training artifacts, eval summaries and videos.
root_dir = os.path.expanduser(root_dir)
train_dir = os.path.join(root_dir, 'train')
eval_dir = os.path.join(root_dir, 'eval')
video_dir = os.path.join(root_dir, 'video')

# Create the video recording directory. `exist_ok=True` replaces the separate
# existence check, so re-running the cell is safe and there is no window
# between checking and creating.
os.makedirs(video_dir, exist_ok=True)

In [22]:
# Train Summary Writer
# Writes summaries under `train_dir`, flushing every `summaries_flush_secs`
# seconds (converted to milliseconds here). It is installed as the default
# writer, so summary calls that do not name a writer go to it.
train_summary_writer = tf.summary.create_file_writer(
    train_dir, flush_millis=summaries_flush_secs * 1000)
train_summary_writer.set_as_default()

In [23]:
# Eval Summary Writer
# Writes summaries under `eval_dir` with the same flush period as the train
# writer. It is not set as default; the training cell passes it explicitly to
# `metric_utils.eager_compute`.
eval_summary_writer = tf.summary.create_file_writer(
    eval_dir, flush_millis=summaries_flush_secs * 1000)
# Metrics computed during evaluation; both use a buffer of
# `num_eval_episodes` entries.
eval_metrics = [
    tf_metrics.AverageReturnMetric(buffer_size=num_eval_episodes),
    tf_metrics.AverageEpisodeLengthMetric(buffer_size=num_eval_episodes)
]

In [24]:
# Create global_step
# The training cell passes this variable to the agent as its
# `train_step_counter` and uses it to decide when to log, evaluate,
# checkpoint and record videos.
global_step = tf.compat.v1.train.get_or_create_global_step()

### Video

In [25]:
def get_timestamp():
    """Return the current local time as a POSIX timestamp (float seconds)."""
    from datetime import datetime

    current_moment = datetime.now()
    return current_moment.timestamp()

In [26]:
def embed_mp4(filename):
    """Build an inline HTML video player for an MP4 file.

    The whole file is read into memory, base64-encoded and embedded as a
    ``data:`` URI inside a ``<video>`` tag.

    Args:
        filename: Path of the MP4 file to embed.

    Returns:
        An ``IPython.display.HTML`` object wrapping the ``<video>`` tag.
    """
    # Read through a context manager so the file handle is closed right away
    # instead of being left to the garbage collector.
    with open(filename, 'rb') as video_file:
        video = video_file.read()
    b64 = base64.b64encode(video)

    tag = '''
    <video width="640" height="480" controls>
        <source src="data:video/mp4;base64,{0}" type="video/mp4">
    Your browser does not support the video tag.
    </video>'''.format(b64.decode())

    return IPython.display.HTML(tag)

In [27]:
@functools.lru_cache(maxsize=4)
def _load_overlay_font(font_path, font_size):
    """Load a TrueType font once per (path, size) pair and reuse it afterwards."""
    return ImageFont.truetype(font_path, font_size)


def enhance_frame(
        frame: np.ndarray,
        text=None,
        font_path: str = '/usr/share/fonts/truetype/dejavu/DejaVuSansMono.ttf',
        font_size: int = 20) -> np.ndarray:
    """Draw an optional text overlay onto a rendered frame.

    Args:
        frame: Image array accepted by ``PIL.Image.fromarray``.
        text: Text to draw with its top-left corner at pixel (30, 30). When
            ``None`` the frame is returned unchanged (same object).
        font_path: Path of the TrueType font file used for the overlay.
        font_size: Font size passed to ``ImageFont.truetype``.

    Returns:
        ``frame`` itself when ``text`` is ``None``; otherwise a new RGB array
        with black text outlined in white.

    Raises:
        OSError: If the font file at ``font_path`` cannot be opened.
    """
    if text is None:
        return frame

    # Convert array to PIL.Image
    image = PIL.Image.fromarray(frame).convert('RGB')

    # Get draw context
    draw = ImageDraw.Draw(image, 'RGB')

    # The font is cached: this function runs once per video frame, and
    # re-reading the font file each time is wasted work.
    font = _load_overlay_font(font_path, font_size)

    # Black fill with a 1px white stroke keeps the text readable on both
    # light and dark backgrounds.
    draw.text((30, 30), text, font=font, fill=(0, 0, 0), stroke_width=1, stroke_fill=(255, 255, 255))

    return np.array(image)

In [28]:
def create_policy_eval_video(policy, eval_env, eval_py_env, filename=None, 
        num_episodes=3, fps=30, env_name=env_name, freeze_seconds=0,
        step=None):
    """Run a policy for some episodes and record the rendered frames as MP4.

    Each frame carries a text overlay with the environment name, the optional
    training step, the episode number, the frame index and the accumulated
    reward of the episode.

    Args:
        policy: Object whose ``action(time_step)`` returns a step with an
            ``action`` attribute.
        eval_env: Environment that is reset and stepped; its time steps must
            provide ``reward.numpy()`` and ``is_last()``. Only element 0 of the
            reward is accumulated.
        eval_py_env: Environment that is rendered with
            ``render(mode='rgb_array')``. NOTE(review): presumably the Python
            environment wrapped by ``eval_env`` — confirm at the call site.
        filename: Output path without extension; ``'.mp4'`` is always
            appended. Defaults to the current timestamp.
        num_episodes: Number of episodes to record.
        fps: Frames per second of the written video.
        env_name: Name shown in the overlay and the log. The default is bound
            to the module-level ``env_name`` when this function is defined.
        freeze_seconds: Seconds for which the final frame of each episode is
            repeated; fractional values are truncated to whole frames.
        step: Optional training step shown in the overlay.

    Returns:
        The path of the written video file, including the ``.mp4`` suffix.
    """
    if filename is None:
        filename = str(get_timestamp())

    filename = filename + '.mp4'
    logging.info('Env: %s', env_name)
    logging.info('Filename: %s', filename)

    def render_annotated_frame(episode_idx, frame_idx, total_reward):
        # Single place that builds the overlay text and renders the frame;
        # used for the initial frame and for every following step.
        text = f'Env: {env_name}'
        if step is not None:
            text += f'\nStp: {step}'
        text += f'\nEp:  {episode_idx+1}/{num_episodes}\nFrm: {frame_idx}'
        text += f'\nRw:  {total_reward:.2f}'
        return enhance_frame(eval_py_env.render(mode='rgb_array'), text)

    # range() needs an int, so truncate to support a float fps or
    # fractional freeze_seconds.
    freeze_frames = int(fps * freeze_seconds) if freeze_seconds > 0 else 0

    with imageio.get_writer(filename, fps=fps) as video:
        for idx in range(num_episodes):
            logging.info('Begin #%d of %d', idx+1, num_episodes)
            time_step = eval_env.reset()
            frame_idx = 0
            total_reward = 0.0

            total_reward += time_step.reward.numpy()[0]

            frame = render_annotated_frame(idx, frame_idx, total_reward)
            video.append_data(frame)

            while not time_step.is_last():
                action_step = policy.action(time_step)
                time_step = eval_env.step(action_step.action)
                frame_idx += 1
                total_reward += time_step.reward.numpy()[0]

                frame = render_annotated_frame(idx, frame_idx, total_reward)
                video.append_data(frame)

                # Freeze frame for a few seconds
                if time_step.is_last():
                    for _ in range(freeze_frames):
                        video.append_data(frame)

    logging.info('All done')
    return filename

### Train

In [29]:
# Factory for hidden layers: `dense(num_units)` builds a Keras Dense layer with
# ReLU activation and a uniform variance-scaling kernel initializer
# (scale=1/3, fan_in mode). The initializer object is created once here and
# bound into the partial, so every layer built through `dense` receives the
# same initializer instance.
dense = functools.partial(
    tf.keras.layers.Dense,
    activation=tf.keras.activations.relu,
    kernel_initializer=tf.compat.v1.variance_scaling_initializer(
        scale=1.0/3.0, mode='fan_in', distribution='uniform'
    )
)

In [30]:
# Create an actor network
def create_actor_network(fc_layer_units, action_spec):
    """Build the DDPG actor as a sequential stack.

    The stack is: one ReLU hidden layer per entry of ``fc_layer_units``, a
    tanh output layer with one unit per action element, and a final layer
    that rescales the tanh output to the action spec via
    ``common.scale_to_spec``.

    Args:
        fc_layer_units: Iterable with the number of units of each hidden layer.
        action_spec: Action spec (possibly nested) holding exactly one tensor
            spec.

    Returns:
        A ``sequential.Sequential`` network.

    Raises:
        ValueError: If ``action_spec`` flattens to more than one tensor spec.
    """
    action_specs = tf.nest.flatten(action_spec)
    if len(action_specs) > 1:
        raise ValueError('Only a single action tensor is supported by this network.')
    single_action_spec = action_specs[0]

    def scale_to_action_range(x):
        return common.scale_to_spec(x, single_action_spec)

    layers = [dense(num_units) for num_units in fc_layer_units]

    # Small uniform initial weights for the output layer.
    output_initializer = tf.keras.initializers.RandomUniform(
        minval=-0.003, maxval=0.003
    )
    layers.append(
        tf.keras.layers.Dense(
            single_action_spec.shape.num_elements(),
            activation=tf.keras.activations.tanh,
            kernel_initializer=output_initializer,
        )
    )
    layers.append(tf.keras.layers.Lambda(scale_to_action_range))

    return sequential.Sequential(layers)

In [31]:
def create_identity_layer():
    """Return a Keras Lambda layer that passes its input through unchanged."""
    def passthrough(x):
        return x

    return tf.keras.layers.Lambda(passthrough)

def create_fc_network(layer_units):
    """Return a Sequential of `dense` layers, one per entry of `layer_units`."""
    hidden_layers = []
    for num_units in layer_units:
        hidden_layers.append(dense(num_units))
    return sequential.Sequential(hidden_layers)

In [32]:
def create_critic_network(obs_fc_layer_units,
                          action_fc_layer_units,
                          joint_fc_layer_units):
    """Build the DDPG critic as a sequential stack.

    The network indexes its input as ``inputs[0]`` (observation) and
    ``inputs[1]`` (action), runs each through its own branch, concatenates the
    results, applies the joint layers and finishes with a single linear unit.

    Args:
        obs_fc_layer_units: Hidden layer sizes for the observation branch.
        action_fc_layer_units: Hidden layer sizes for the action branch.
        joint_fc_layer_units: Hidden layer sizes applied after concatenation.
            For all three arguments a falsy value (``None`` or empty) selects
            an identity layer instead.

    Returns:
        A ``sequential.Sequential`` network.
    """

    def branch_for(layer_units):
        # Falsy unit spec -> identity layer; otherwise a stack of dense layers.
        if layer_units:
            return create_fc_network(layer_units)
        return create_identity_layer()

    def to_named_inputs(inputs):
        return {'observation': inputs[0], 'action': inputs[1]}

    observation_branch = branch_for(obs_fc_layer_units)
    action_branch = branch_for(action_fc_layer_units)
    joint_branch = branch_for(joint_fc_layer_units)

    # Linear single-unit output with small uniform initial weights.
    q_value_layer = tf.keras.layers.Dense(
        1,
        activation=None,
        kernel_initializer=tf.keras.initializers.RandomUniform(
            minval=-0.003, maxval=0.003))

    per_input_branches = nest_map.NestMap({
        'observation': observation_branch,
        'action': action_branch
    })

    pipeline = [
        tf.keras.layers.Lambda(to_named_inputs),
        per_input_branches,
        nest_map.NestFlatten(),
        tf.keras.layers.Concatenate(),
        joint_branch,
        q_value_layer,
        # NOTE(review): presumably drops the trailing size-1 dimension so the
        # output is one value per input row — confirm against InnerReshape.
        inner_reshape.InnerReshape([1], []),
    ]
    return sequential.Sequential(pipeline)

In [33]:
# Main DDPG train/eval cell. In order it:
#   1. builds the train/eval environments, the actor/critic networks and the
#      agent;
#   2. sets up train metrics, the replay buffer, the collect drivers and the
#      checkpointers (restoring any existing checkpoints);
#   3. if `train_or_retrain` is set, collects initial experience, runs a
#      baseline evaluation and then alternates collecting and training for
#      `num_iterations` iterations, periodically logging, checkpointing,
#      evaluating and recording a video.
# Summaries are recorded only on global steps that are a multiple of
# `summary_interval`.
with tf.summary.record_if(lambda: tf.math.equal(global_step % summary_interval, 0)):
    # Create env
    tf_env = tf_py_environment.TFPyEnvironment(suite_gym.load(env_name))
    # The eval Python env is kept separately because it is also used for
    # rendering video frames.
    eval_py_env = suite_gym.load(env_name)
    eval_tf_env = tf_py_environment.TFPyEnvironment(eval_py_env)
    
    # Create Actor Net
    actor_net = create_actor_network(actor_fc_layers, tf_env.action_spec())

    # Create Critic Net
    critic_net = create_critic_network(critic_obs_fc_layers,
                                       critic_action_fc_layers,
                                       critic_joint_fc_layers)
    
    # Create Agent
    tf_agent = ddpg_agent.DdpgAgent(
        tf_env.time_step_spec(),
        tf_env.action_spec(),
        actor_network=actor_net,
        critic_network=critic_net,
        actor_optimizer=tf.compat.v1.train.AdamOptimizer(
            learning_rate=actor_learning_rate),
        critic_optimizer=tf.compat.v1.train.AdamOptimizer(
            learning_rate=critic_learning_rate),
        ou_stddev=ou_stddev,
        ou_damping=ou_damping,
        target_update_tau=target_update_tau,
        target_update_period=target_update_period,
        dqda_clipping=dqda_clipping,
        td_errors_loss_fn=td_errors_loss_fn,
        gamma=gamma,
        reward_scale_factor=reward_scale_factor,
        gradient_clipping=gradient_clipping,
        debug_summaries=debug_summaries,
        summarize_grads_and_vars=summarize_grads_and_vars,
        train_step_counter=global_step)
    tf_agent.initialize()
    
    
    # Train Metrics
    train_metrics = [
        tf_metrics.NumberOfEpisodes(),
        tf_metrics.EnvironmentSteps(),
        tf_metrics.AverageReturnMetric(),
        tf_metrics.AverageEpisodeLengthMetric(),
    ]
    
    # Policies
    eval_policy = tf_agent.policy
    collect_policy = tf_agent.collect_policy
    
    # Replay Buffer
    replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
        data_spec=tf_agent.collect_data_spec,
        batch_size=tf_env.batch_size,
        max_length=replay_buffer_capacity)
    
    # Initial Collect Driver (fills the buffer only; does not update metrics)
    initial_collect_driver = dynamic_step_driver.DynamicStepDriver(
        tf_env,
        collect_policy,
        observers=[replay_buffer.add_batch],
        num_steps=initial_collect_steps)
    
    # Collect Driver (fills the buffer and updates the train metrics)
    collect_driver = dynamic_step_driver.DynamicStepDriver(
        tf_env,
        collect_policy,
        observers=[replay_buffer.add_batch] + train_metrics,
        num_steps=collect_steps_per_iteration)
    
    # Checkpointers
    train_checkpointer = common.Checkpointer(
        ckpt_dir=train_dir,
        agent=tf_agent,
        global_step=global_step,
        metrics=metric_utils.MetricsGroup(train_metrics, 'train_metrics'))
    policy_checkpointer = common.Checkpointer(
        ckpt_dir=os.path.join(train_dir, 'policy'),
        policy=eval_policy,
        global_step=global_step)
    rb_checkpointer = common.Checkpointer(
        ckpt_dir=os.path.join(train_dir, 'replay_buffer'),
        max_to_keep=1,
        replay_buffer=replay_buffer)
    
    train_checkpointer.initialize_or_restore()
    policy_checkpointer.initialize_or_restore()
    rb_checkpointer.initialize_or_restore()
    
    if train_or_retrain:
        # Speed up with common.function
        if use_tf_functions:
            initial_collect_driver.run = common.function(initial_collect_driver.run)
            collect_driver.run = common.function(collect_driver.run)
            tf_agent.train = common.function(tf_agent.train)

        # The initial collect driver is built with `collect_policy`, so the
        # message names that policy rather than claiming a random one.
        logging.info(
            'Initializing replay buffer by collecting experience for %d steps with '
            'the collect policy.', initial_collect_steps)

        initial_collect_driver.run()

        # Baseline evaluation before any training step of this run.
        results = metric_utils.eager_compute(
            eval_metrics,
            eval_tf_env,
            eval_policy,
            num_episodes=num_eval_episodes,
            train_step=global_step,
            summary_writer=eval_summary_writer,
            summary_prefix='Metrics',
        )
        if eval_metrics_callback is not None:
            eval_metrics_callback(results, global_step.numpy())
        metric_utils.log_metrics(eval_metrics)

        time_step = None
        policy_state = collect_policy.get_initial_state(tf_env.batch_size)

        # Variables for logging time (steps_per_sec)
        timed_at_step = global_step.numpy()
        time_acc = 0  # Time accumulation

        # Dataset generates trajectories with shape [Bx2x...]
        dataset = replay_buffer.as_dataset(
            num_parallel_calls=3,
            sample_batch_size=batch_size,
            num_steps=2,
        ).prefetch(3)
        iterator = iter(dataset)

        def train_step():
            experience, _ = next(iterator)
            return tf_agent.train(experience)

        if use_tf_functions:
            train_step = common.function(train_step)

        for _ in range(num_iterations):
            start_time = time.time()
            time_step, policy_state = collect_driver.run(
                time_step=time_step,
                policy_state=policy_state,
            )

            for _ in range(train_steps_per_iteration):
                train_loss = train_step()
            time_acc += time.time() - start_time

            # Nothing below advances the global step, so read it once per
            # iteration instead of calling `.numpy()` for every check.
            current_step = global_step.numpy()

            if current_step % log_interval == 0:
                logging.info('step = %d, loss = %f', current_step,
                             train_loss.loss)
                steps_per_sec = (current_step - timed_at_step) / time_acc
                logging.info('%.3f steps/sec', steps_per_sec)
                tf.compat.v2.summary.scalar(
                    name='global_steps_per_sec', data=steps_per_sec, step=global_step)

                # Reset time.
                timed_at_step = current_step
                time_acc = 0

            for train_metric in train_metrics:
                train_metric.tf_summaries(
                    train_step=global_step, step_metrics=train_metrics[:2])

            if current_step % train_checkpoint_interval == 0:
                train_checkpointer.save(global_step=current_step)
            if current_step % policy_checkpoint_interval == 0:
                policy_checkpointer.save(global_step=current_step)
            if current_step % rb_checkpoint_interval == 0:
                rb_checkpointer.save(global_step=current_step)

            if current_step % eval_interval == 0:
                results = metric_utils.eager_compute(
                    eval_metrics,
                    eval_tf_env,
                    eval_policy,
                    num_episodes=num_eval_episodes,
                    train_step=global_step,
                    summary_writer=eval_summary_writer,
                    summary_prefix='Metrics',
                )
                if eval_metrics_callback is not None:
                    eval_metrics_callback(results, current_step)
                metric_utils.log_metrics(eval_metrics)

            # Record a video of current eval agent policy
            if current_step % video_recording_interval == 0:
                filename = '{}_{}'.format(current_step, get_timestamp())
                full_filename = os.path.join(video_dir, filename)
                create_policy_eval_video(
                    eval_policy,
                    eval_tf_env,
                    eval_py_env,
                    filename=full_filename,
                    fps=30,
                    freeze_seconds=3,
                    num_episodes=1,
                    step=current_step,
                )

INFO:absl:No checkpoint available at ./data/Pendulum-v1/train
INFO:absl:No checkpoint available at ./data/Pendulum-v1/train/policy
INFO:absl:No checkpoint available at ./data/Pendulum-v1/train/replay_buffer
INFO:absl:Initializing replay buffer by collecting experience for 100 steps with a random policy.
INFO:absl: 
		 AverageReturn = -1347.712158203125
		 AverageEpisodeLength = 200.0


Instructions for updating:
Use `as_dataset(..., single_deterministic_pass=False) instead.


Instructions for updating:
Use `as_dataset(..., single_deterministic_pass=False) instead.
INFO:absl:step = 200, loss = 0.248881
INFO:absl:37.083 steps/sec
INFO:absl:step = 400, loss = 0.144126
INFO:absl:75.180 steps/sec
INFO:absl:step = 600, loss = 0.298098
INFO:absl:66.623 steps/sec
INFO:absl:step = 800, loss = 1.093385
INFO:absl:61.005 steps/sec
INFO:absl:step = 1000, loss = 0.525363
INFO:absl:70.155 steps/sec
INFO:absl: 
		 AverageReturn = -1360.58349609375
		 AverageEpisodeLength = 200.0
INFO:absl:Env: Pendulum-v1
INFO:absl:Filename: ./data/Pendulum-v1/video/1000_1643997910.447711.mp4
INFO:absl:Begin #1 of 1
INFO:absl:All done
INFO:absl:step = 1200, loss = 0.599682
INFO:absl:69.003 steps/sec
INFO:absl:step = 1400, loss = 0.707077
INFO:absl:60.162 steps/sec
INFO:absl:step = 1600, loss = 1.152277
INFO:absl:73.140 steps/sec
INFO:absl:step = 1800, loss = 1.260137
INFO:absl:63.179 steps/sec
INFO:absl:step = 2000, loss = 1.018130
INFO:absl:65.497 steps/sec
INFO:absl: 
		 AverageReturn = 



INFO:absl: 
		 AverageReturn = -125.23780822753906
		 AverageEpisodeLength = 200.0
INFO:absl:Env: Pendulum-v1
INFO:absl:Filename: ./data/Pendulum-v1/video/4000_1643998016.292384.mp4
INFO:absl:Begin #1 of 1
INFO:absl:All done
INFO:absl:step = 4200, loss = 2.174500
INFO:absl:82.476 steps/sec
INFO:absl:step = 4400, loss = 2.187369
INFO:absl:84.578 steps/sec
INFO:absl:step = 4600, loss = 4.360679
INFO:absl:96.475 steps/sec
INFO:absl:step = 4800, loss = 2.632923
INFO:absl:100.257 steps/sec
INFO:absl:step = 5000, loss = 3.537304
INFO:absl:99.387 steps/sec
INFO:absl:Saved checkpoint: ./data/Pendulum-v1/train/policy/ckpt-5000




INFO:absl: 
		 AverageReturn = -193.1265106201172
		 AverageEpisodeLength = 200.0
INFO:absl:Env: Pendulum-v1
INFO:absl:Filename: ./data/Pendulum-v1/video/5000_1643998044.129182.mp4
INFO:absl:Begin #1 of 1
INFO:absl:All done
INFO:absl:step = 5200, loss = 5.218864
INFO:absl:100.760 steps/sec
INFO:absl:step = 5400, loss = 3.679869
INFO:absl:102.237 steps/sec
INFO:absl:step = 5600, loss = 4.827673
INFO:absl:97.285 steps/sec
INFO:absl:step = 5800, loss = 2.840884
INFO:absl:104.152 steps/sec
INFO:absl:step = 6000, loss = 3.158718
INFO:absl:98.470 steps/sec
INFO:absl: 
		 AverageReturn = -169.31570434570312
		 AverageEpisodeLength = 200.0
INFO:absl:Env: Pendulum-v1
INFO:absl:Filename: ./data/Pendulum-v1/video/6000_1643998069.031164.mp4
INFO:absl:Begin #1 of 1
INFO:absl:All done
INFO:absl:step = 6200, loss = 4.375756
INFO:absl:101.889 steps/sec
INFO:absl:step = 6400, loss = 10.851881
INFO:absl:102.673 steps/sec
INFO:absl:step = 6600, loss = 2.968659
INFO:absl:102.569 steps/sec
INFO:absl:step =

In [34]:
# Reset the Python eval environment directly and keep the returned time step.
# NOTE(review): this rebinds the module-level `time_step` that the training
# cell also assigns — confirm that is intended.
time_step = eval_py_env.reset()

In [23]:
# Record one episode of the agent's policy at 15 fps, repeating the final
# frame for 3 seconds. No filename is given, so the video is named after the
# current timestamp (plus '.mp4'); the returned path is kept for embedding.
saved_filename = create_policy_eval_video(tf_agent.policy, eval_tf_env, eval_py_env, fps=15, freeze_seconds=3, num_episodes=1)

INFO:absl:Env: Pendulum-v1
INFO:absl:Filename: 1643997780.938523.mp4
INFO:absl:Begin #1 of 1
INFO:absl:All done


In [None]:
# Show the video recorded in the previous cell inline in the notebook.
embed_mp4(saved_filename)