# DQN

Ref: https://www.tensorflow.org/agents/tutorials/1_dqn_tutorial

In [89]:
from tf_agents.environments import suite_gym, tf_py_environment
from tf_agents.networks import sequential, q_network
from tf_agents.specs import tensor_spec
from tf_agents.utils import common
from tf_agents.agents.dqn import dqn_agent
from tf_agents.policies import random_tf_policy
from tf_agents.replay_buffers import reverb_replay_buffer
from tf_agents.replay_buffers import reverb_utils
from tf_agents.drivers import py_driver
from tf_agents.policies import py_tf_eager_policy
from tf_agents.metrics import tf_metrics
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.drivers import dynamic_step_driver, dynamic_episode_driver
from tf_agents.eval import metric_utils

import matplotlib
import matplotlib.pyplot as plt

import reverb

import tensorflow as tf
import pyvirtualdisplay


import PIL
from PIL import ImageDraw, ImageFont
import numpy as np
import IPython
import imageio
import base64

import os
import time

from absl import logging

In [90]:
# Emit absl log messages down to DEBUG level for the rest of the notebook.
logging.set_verbosity(logging.DEBUG)

In [91]:
# Start a 1400x900 virtual display (visible=0) and keep its handle in `display`.
display = pyvirtualdisplay.Display(visible=0, size=(1400, 900)).start()


In [92]:
# Hyper-parameters and settings read by the cells below.
env_name = 'Taxi-v3'
# NOTE(review): train_or_retrain is not read by any cell visible in this notebook.
train_or_retrain = True
# Number of collect-then-train rounds executed by the training loop.
num_iterations = 1_500
# Episode step limit passed to suite_gym.load for both train and eval environments.
max_episode_steps=200

learning_rate = 0.001 # 1e-4
# Episodes per evaluation; also the buffer size of the eval metrics.
num_eval_episodes = 10
# NOTE(review): the replay buffer is built with replay_buffer_capacity (defined
# further down); replay_buffer_max_length is not read by the visible cells.
replay_buffer_max_length = 100_000
# initial_collect_steps = 10_000
# Random-policy episodes collected into the replay buffer before training starts.
initial_collect_episodes = 640

# Sample batch size used when reading the replay buffer as a dataset.
batch_size = 64
# NOTE(review): collect_steps_per_iteration is not read by the visible cells;
# collection is episode-based (collect_episodes_per_iteration).
collect_steps_per_iteration = 1
collect_episodes_per_iteration = 1

# Intervals are compared against global_step with a modulo test.
log_interval = 50
eval_interval = 100
# Only referenced from commented-out video recording code in the training cell.
video_recording_interval = 1_000

root_dir = os.path.join('./data', env_name)
# Flush period for the summary writers (converted to milliseconds where used).
summaries_flush_secs = 10

# NOTE(review): fc_layer_params is not read by the visible cells; the Q-network
# layer sizes are hard-coded in get_q_net / get_q_net_v2.
fc_layer_params = (100, 50, 50)
gamma = 0.9
reward_scale_factor = 1.0
gradient_clipping = None
debug_summaries = False
summarize_grads_and_vars = False

# Params for train
use_tf_functions = True
# Number of train_step() calls made after each collect_driver.run().
train_steps_per_iteration = max_episode_steps

# Params for collect
# epsilon_greedy = 0.1
replay_buffer_capacity = 100_000

# Params for target update
target_update_tau = 0.1
target_update_period = 5

# Params for summaries and logging
summary_interval = 100
# Optional callable invoked as eval_metrics_callback(results, step) after each evaluation.
eval_metrics_callback = None

# The dataset samples train_sequence_length + 1 consecutive steps per item.
train_sequence_length = 1

# Params for checkpoints
# NOTE(review): none of the three checkpoint intervals is read by the visible cells.
train_checkpoint_interval = 1_000
policy_checkpoint_interval = 5000
rb_checkpoint_interval = 2_000

## Setup

In [93]:
# Expand '~' in the output root, then derive one sub-directory per artifact kind.
root_dir = os.path.expanduser(root_dir)
train_dir, eval_dir, video_dir = (
    os.path.join(root_dir, leaf) for leaf in ('train', 'eval', 'video')
)

# Only the video recording directory is created explicitly here.
os.makedirs(video_dir, exist_ok=True)

In [94]:
# Train Summary Writer: writes under train_dir, flushes every
# summaries_flush_secs seconds, and is installed as the default writer.
train_summary_writer = tf.summary.create_file_writer(
    train_dir, flush_millis=summaries_flush_secs * 1000)
train_summary_writer.set_as_default()

In [95]:
# Eval Summary Writer: writes under eval_dir with the same flush period.
# It is not made the default; it is passed explicitly to metric_utils.eager_compute.
eval_summary_writer = tf.summary.create_file_writer(
    eval_dir, flush_millis=summaries_flush_secs * 1000)
# Evaluation metrics, each buffering num_eval_episodes episodes.
eval_metrics = [
    tf_metrics.AverageReturnMetric(buffer_size=num_eval_episodes),
    tf_metrics.AverageEpisodeLengthMetric(buffer_size=num_eval_episodes)
]

In [96]:
# Create global_step. It is later handed to the agent as train_step_counter and
# used as the step for summaries and the epsilon schedule.
global_step = tf.compat.v1.train.get_or_create_global_step()

## Train

In [97]:
def get_q_net(tf_env):
    """Build an embedding + dense Q-network sized from the environment specs.

    The number of discrete observations and actions is derived from each
    spec as ``maximum - minimum + 1``.

    Args:
        tf_env: environment exposing ``observation_spec()`` and
            ``action_spec()``, each with ``minimum`` / ``maximum`` bounds.

    Returns:
        A ``sequential.Sequential`` network: a 10-wide embedding of the
        observation, two 50-unit ReLU layers, and a linear output with one
        unit per action.
    """
    obs_spec = tf_env.observation_spec()
    act_spec = tf_env.action_spec()
    num_observations = obs_spec.maximum - obs_spec.minimum + 1
    num_actions = act_spec.maximum - act_spec.minimum + 1

    layers = [
        tf.keras.layers.Embedding(num_observations, 10, input_length=1),
        tf.keras.layers.Reshape((10,)),
        tf.keras.layers.Dense(50, activation='relu'),
        tf.keras.layers.Dense(50, activation='relu'),
        tf.keras.layers.Dense(num_actions, activation='linear'),
    ]
    return sequential.Sequential(layers)

In [98]:
def get_q_net_v2(tf_env):
    """Build a single-layer Q-network: one embedding row of action values per observation.

    The embedding has ``maximum - minimum + 1`` rows (from the observation
    spec) and as many columns as there are actions (from the action spec),
    so each discrete observation maps directly to a vector with one entry
    per action. No dense layers follow.

    Args:
        tf_env: environment exposing ``observation_spec()`` and
            ``action_spec()``, each with ``minimum`` / ``maximum`` bounds.

    Returns:
        A ``sequential.Sequential`` network containing only the embedding layer.
    """
    obs_spec = tf_env.observation_spec()
    act_spec = tf_env.action_spec()
    num_observations = obs_spec.maximum - obs_spec.minimum + 1
    num_actions = act_spec.maximum - act_spec.minimum + 1

    table = tf.keras.layers.Embedding(num_observations, num_actions, input_length=1)
    return sequential.Sequential([table])

In [99]:
def initial_logger(_):
    """Debug observer: print the global step and the received item, then return it.

    It is appended to the observer list of the initial random-collection driver.
    NOTE(review): that driver's run is wrapped with common.function when
    use_tf_functions is set, so this Python print presumably fires only while
    the function is being traced — confirm.
    """
    print('global_step', global_step, _)
    
    return _

In [100]:
def epsilon_greedy_func(global_step):
    """Return a zero-argument callable that yields an exponentially decaying epsilon.

    The schedule is ``epsilon_min + (1 - epsilon_min) * exp(-epsilon_decay * step)``
    with ``epsilon_decay = 0.0001`` and ``epsilon_min = 0.001``, so epsilon starts
    at 1.0 for step 0 and approaches ``epsilon_min`` as the step grows.

    Args:
        global_step: step counter (cast to float32 on every evaluation); it is
            also used as the step of the 'epsilon' summary.

    Returns:
        A callable taking no arguments that computes the current epsilon,
        writes it as the scalar summary 'epsilon', and returns it.
    """
    epsilon_decay = 0.0001
    epsilon_min = 0.001

    def value():
        # Re-read global_step on every call so epsilon follows training progress.
        val = epsilon_min + (1 - epsilon_min) * tf.exp(tf.cast(global_step, dtype=tf.float32) * -1 * epsilon_decay)

        tf.compat.v2.summary.scalar(
                name='epsilon', data=val, step=global_step)

        return val

    return value

In [101]:

# Training environment wrapped for TF, plus a separate evaluation environment
# (kept both as the raw py environment and as its TF wrapper).
# NOTE(review): the training cell below creates these three objects again with
# the same arguments, replacing the ones made here.
tf_env = tf_py_environment.TFPyEnvironment(suite_gym.load(env_name, max_episode_steps=max_episode_steps))
eval_py_env = suite_gym.load(env_name, max_episode_steps=max_episode_steps)
eval_tf_env = tf_py_environment.TFPyEnvironment(eval_py_env)

# Uniform random policy over the environment's action spec.
# NOTE(review): random_policy is not read by any other visible cell.
random_policy = random_tf_policy.RandomTFPolicy(
    tf_env.time_step_spec(),
    tf_env.action_spec(),
)

In [None]:
# Build the DQN agent, pre-fill the replay buffer with random-policy episodes,
# evaluate once, then alternate collection and training for num_iterations rounds.
# Summaries issued inside this block are recorded only when global_step is a
# multiple of summary_interval.
with tf.summary.record_if(lambda: tf.math.equal(global_step % summary_interval, 0)):
    # Create env
    # NOTE(review): these three names are also assigned by an earlier cell with
    # identical arguments; the objects created here replace them.
    tf_env = tf_py_environment.TFPyEnvironment(suite_gym.load(env_name, max_episode_steps=max_episode_steps))
    eval_py_env = suite_gym.load(env_name, max_episode_steps=max_episode_steps)
    eval_tf_env = tf_py_environment.TFPyEnvironment(eval_py_env)

    # The embedding-only network is used; get_q_net (embedding + dense) is not.
    q_net = get_q_net_v2(tf_env)
    
    # Create Agent
    tf_agent = dqn_agent.DqnAgent(
        time_step_spec=tf_env.time_step_spec(),
        action_spec=tf_env.action_spec(),
        q_network=q_net,
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        
        # Params for collect
        # epsilon_greedy=epsilon_greedy,
        # A callable is passed, so epsilon is recomputed from global_step on each call.
        epsilon_greedy=epsilon_greedy_func(global_step),
        
        # Params for target network updates
        target_q_network=None,
        target_update_tau=target_update_tau,  # Default: 1.0, "Factor for soft update of the target network"
        target_update_period=target_update_period,  # Default: 1, "Period for soft update of the target network"
        
        # Params for training
        td_errors_loss_fn=common.element_wise_squared_loss,  # Default: common.element_wise_huber_loss
        gamma=gamma,  # Default: 1.0, Discount for future rewards.
        reward_scale_factor=reward_scale_factor,  # Default: 1.0
        gradient_clipping=gradient_clipping,  # Default: None, "Norm length to clip gradients"
        
        # Params for debugging
        # global_step doubles as the agent's train step counter.
        train_step_counter=global_step,
        debug_summaries=debug_summaries,
        summarize_grads_and_vars=summarize_grads_and_vars,
        name=None,  # Default: class name. The agent name.
    )
    tf_agent.initialize()
    
    # Train Metrics
    # The first two entries are later passed as step_metrics to tf_summaries.
    train_metrics = [
        tf_metrics.NumberOfEpisodes(),
        tf_metrics.EnvironmentSteps(),
        tf_metrics.AverageReturnMetric(),
        tf_metrics.AverageEpisodeLengthMetric(),
    ]
    
    # Policies
    eval_policy = tf_agent.policy
    collect_policy = tf_agent.collect_policy
    
    # Replay Buffer
    replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
        data_spec=tf_agent.collect_data_spec,
        batch_size=tf_env.batch_size,
        max_length=replay_buffer_capacity)
    
    # Collect Driver
    # Each run() collects collect_episodes_per_iteration episodes with the
    # agent's collect policy, feeding the replay buffer and the train metrics.
    collect_driver = dynamic_episode_driver.DynamicEpisodeDriver(
        tf_env,
        collect_policy,
        observers=[replay_buffer.add_batch] + train_metrics,
        num_episodes=collect_episodes_per_iteration)

    # Speed up with common.function
    if use_tf_functions:
        collect_driver.run = common.function(collect_driver.run)
        tf_agent.train = common.function(tf_agent.train)

        # dynamic_episode_driver.run = common.function(dynamic_episode_driver.run)

        ############################################
    print("#1")
    # Collect initial replay buffer data.

    initial_collect_policy = random_tf_policy.RandomTFPolicy(
        tf_env.time_step_spec(),
        tf_env.action_spec(),
    )

    logging.info(
        'Initializing replay buffer by collecting experience for %d episodes with '
        'a random policy.', initial_collect_episodes)
    
    

    # NOTE(review): train_metrics are attached to this driver as well, so they
    # also accumulate the random-policy episodes collected here.
    initial_driver = dynamic_episode_driver.DynamicEpisodeDriver(
        tf_env,
        initial_collect_policy,
        observers=[replay_buffer.add_batch] + train_metrics + [initial_logger],
        num_episodes=initial_collect_episodes)

    if use_tf_functions:
        initial_driver.run = common.function(initial_driver.run)
    initial_driver.run()
    
    
    ##################################################
    print("#2")

    # Baseline evaluation of the greedy policy before any training step.
    results = metric_utils.eager_compute(
        eval_metrics,
        eval_tf_env,
        eval_policy,
        num_episodes=num_eval_episodes,
        train_step=global_step,
        summary_writer=eval_summary_writer,
        summary_prefix='Metrics',
    )
    if eval_metrics_callback is not None:
        eval_metrics_callback(results, global_step.numpy())
    metric_utils.log_metrics(eval_metrics)

    # time_step starts as None for the first collect_driver.run call; afterwards
    # the values returned by the previous run are fed back in.
    time_step = None
    policy_state = collect_policy.get_initial_state(tf_env.batch_size)

    # Variables for logging time (steps_per_sec)
    timed_at_step = global_step.numpy()
    time_acc = 0  # Time accumulation

    # Dataset
    # Each sampled item spans train_sequence_length + 1 consecutive steps.
    dataset = replay_buffer.as_dataset(
        num_parallel_calls=3,
        sample_batch_size=batch_size,
        num_steps=train_sequence_length + 1,
        single_deterministic_pass=False,
    ).prefetch(3)
    iterator = iter(dataset)

    def train_step():
        # Pull one sampled batch from the replay buffer and run one agent update.
        experience, _ = next(iterator)
        return tf_agent.train(experience)

    if use_tf_functions:
        train_step = common.function(train_step)
    
    #############################
    print("#3")

    for _ in range(num_iterations):
        start_time = time.time()
        time_step, policy_state = collect_driver.run(
            time_step=time_step,
            policy_state=policy_state,
        )

        # NOTE(review): the inner loop reuses `_`, the outer loop's variable name;
        # harmless here because neither value is read.
        for _ in range(train_steps_per_iteration):
            train_loss = train_step()
        time_acc += time.time() - start_time

        # Checked once per outer iteration, i.e. after train_steps_per_iteration
        # train calls. Assumes each train call advances global_step by one — TODO confirm.
        if global_step.numpy() % log_interval == 0:
            logging.info('step = %d, loss = %f', global_step.numpy(),
                         train_loss.loss)
            steps_per_sec = (global_step.numpy() - timed_at_step) / time_acc
            logging.info('%.3f steps/sec', steps_per_sec)
            tf.compat.v2.summary.scalar(
                name='global_steps_per_sec', data=steps_per_sec, step=global_step)

            # Reset time.
            timed_at_step = global_step.numpy()
            time_acc = 0

        # Write every train metric, using NumberOfEpisodes and EnvironmentSteps
        # (the first two entries) as step metrics.
        for train_metric in train_metrics:
            train_metric.tf_summaries(
                train_step=global_step, step_metrics=train_metrics[:2])

        if global_step.numpy() % eval_interval == 0:
            results = metric_utils.eager_compute(
                eval_metrics,
                eval_tf_env,
                eval_policy,
                num_episodes=num_eval_episodes,
                train_step=global_step,
                summary_writer=eval_summary_writer,
                summary_prefix='Metrics',
            )
            if eval_metrics_callback is not None:
                eval_metrics_callback(results, global_step.numpy())
            metric_utils.log_metrics(eval_metrics)

        # # Record a video of current eval agent policy
        # if global_step.numpy() % video_recording_interval == 0:
        #     filename = '{}_{}'.format(global_step.numpy(), get_timestamp())
        #     full_filename = os.path.join(video_dir, filename)
        #     create_policy_eval_video(
        #         eval_policy,
        #         eval_tf_env,
        #         eval_py_env,
        #         filename=full_filename,
        #         fps=15,
        #         freeze_seconds=3,
        #         num_episodes=1,
        #         step=global_step.numpy(),
        #     )

In [None]:
# saved_filename = create_policy_eval_video(tf_agent.policy, eval_tf_env, eval_py_env, fps=15, freeze_seconds=3)

In [None]:
# embed_mp4(saved_filename)

In [103]:
# Display the agent's policy object (the same one bound to eval_policy in the training cell).
tf_agent.policy

<tf_agents.policies.greedy_policy.GreedyPolicy at 0x7fb4a53b92e0>

## Create Video

In [104]:
from IPython.display import clear_output


In [118]:
def play_env(eval_tf_env, policy, sleep_time=0.1, env_seed=None, max_steps=200):
    """Run `policy` in `eval_tf_env` for one episode, printing progress as it goes.

    Args:
        eval_tf_env: environment exposing ``reset()`` and ``step(action)``. Each
            returned time step must offer ``is_last()`` plus ``observation`` and
            ``reward`` whose element 0 has a ``.numpy()`` method.
        policy: object whose ``action(time_step)`` returns a step with an
            ``action`` field (element 0 is read with ``.numpy()``).
        sleep_time: seconds to pause after every step.
        env_seed: accepted for interface compatibility; currently unused.
        max_steps: only used for the final report; an episode that took at
            least this many steps is reported as 'Unsolved'. It does not stop
            the rollout early.

    Returns:
        Tuple ``(states, rewards, actions)`` of equally long lists. Entry 0 holds
        the state and reward of the reset time step with action ``None``; the
        reset reward is not counted in the printed running total.
    """

    def get_state_and_reward_from_time_step(ts):
        # Read from the argument rather than the enclosing `time_step` variable,
        # so the helper is correct for whichever time step it is given.
        return ts.observation[0].numpy(), ts.reward[0].numpy()

    time_step = eval_tf_env.reset()
    state, reward = get_state_and_reward_from_time_step(time_step)

    states = [state]
    rewards = [reward]
    actions = [None]

    total_reward = 0
    current_step = 0

    while not time_step.is_last():
        clear_output(wait=True)
        # Ask the supplied policy for the next action and apply it.
        action_step = policy.action(time_step)
        time_step = eval_tf_env.step(action_step.action)

        print(time_step)

        state, reward = get_state_and_reward_from_time_step(time_step)
        action = action_step.action[0].numpy()

        states.append(state)
        rewards.append(reward)
        actions.append(action)

        total_reward += reward
        current_step += 1

        # Print header
        print('Step: {:03d}, Reward: {}\n'.format(
            current_step,
            total_reward,
        ))

        time.sleep(sleep_time)

    if current_step < max_steps:
        print('\nResult: Done with {} steps and total reward is {}.'.format(
            current_step,
            total_reward,
        ))
    else:
        print('\nResult: Unsolved')

    return states, rewards, actions

In [119]:
# Roll out one episode with the agent's policy and keep the trajectory.
# max_steps only changes the final 'Done' / 'Unsolved' message printed by play_env.
states, rewards, actions = play_env(eval_tf_env, tf_agent.policy, max_steps=20)

TimeStep(
{'discount': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([0.], dtype=float32)>,
 'observation': <tf.Tensor: shape=(1,), dtype=int64, numpy=array([85])>,
 'reward': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([20.], dtype=float32)>,
 'step_type': <tf.Tensor: shape=(1,), dtype=int32, numpy=array([2], dtype=int32)>})
Step: 007, Reward: 14.0


Result: Done with 7 steps and total reward is 14.0.


In [120]:
def get_char_txt(char_row, char_col, char='█'):
    """Return a block of spaces with `char` placed at one grid cell.

    The text has ``char_row + 2`` lines, each ``char_col * 2 + 2`` characters
    wide and terminated by a newline. Every position is a space except the
    last column of the last line, which holds `char`.
    """
    target_row = char_row + 1
    target_col = char_col * 2 + 1

    lines = []
    for row in range(target_row + 1):
        cells = [
            char if (row == target_row and col == target_col) else ' '
            for col in range(target_col + 1)
        ]
        lines.append(''.join(cells) + '\n')

    return ''.join(lines)

def get_char_by_index(idx: int):
    """Map a location index to its display character.

    Indices 0-3 give 'R', 'G', 'Y', 'B'; index 4 gives '_'; any other index
    gives a single space.
    """
    symbols = {0: 'R', 1: 'G', 2: 'Y', 3: 'B', 4: '_'}
    return symbols.get(idx, ' ')

def get_char_pos_by_index(idx: int, taxi_row: int, taxi_col: int):
    """Return the ``[row, col]`` grid position for a location index.

    Indices 0-3 map to the fixed cells [0, 0], [0, 4], [4, 0] and [4, 3];
    any other index resolves to the taxi's own position.
    """
    fixed_cells = {0: (0, 0), 1: (0, 4), 2: (4, 0), 3: (4, 3)}
    # Build a new list on every call so callers can mutate the result freely.
    return list(fixed_cells.get(idx, (taxi_row, taxi_col)))

def get_char_color_by_index(idx: int):
    """Return the RGB tuple for location index 0-3, or None for any other index."""
    palette = {
        0: (255, 0, 0),    # Red
        1: (0, 255, 0),    # Green
        2: (255, 255, 0),  # Yellow
        3: (0, 0, 255),    # Blue
    }
    return palette.get(idx)

In [121]:
def enhance_frame(frame: np.ndarray, main_text=None, state=None, side_text=None, done=False) -> np.ndarray:
    """Draw the text map, the decoded state and a side panel onto a frame.

    Args:
        frame: image array used as the background.
        main_text: map text drawn at offset (30, 30). When None, `frame` is
            returned untouched and nothing is drawn.
        state: encoded state; when given it is decoded with the module-level
            ``env.decode`` into (taxi_row, taxi_col, passenger_location,
            destination), the decoded list is printed, and the taxi,
            passenger and destination markers are drawn around the map.
        side_text: optional text drawn at offset (220, 30) in a smaller font.
        done: when true, the taxi keeps its default yellow colour even if
            passenger_location is 4.

    Returns:
        `frame` itself when `main_text` is None, otherwise a new RGB array.
    """
    if main_text is None:
        return frame

    # Work on an RGB PIL copy of the array.
    canvas = PIL.Image.fromarray(frame).convert('RGB')
    pen = ImageDraw.Draw(canvas, 'RGB')

    font_path = '/usr/share/fonts/truetype/dejavu/DejaVuSansMono.ttf'
    map_font = ImageFont.truetype(font_path, 24)
    panel_font = ImageFont.truetype(font_path, 20)

    map_origin = (30, 30)
    panel_origin = (220, 30)
    black, white, grey = (0, 0, 0), (255, 255, 255), (100, 100, 100)

    def stamp(text, fill, outline=white):
        # Every map-layer text shares the same origin and font so that the
        # character grids line up on top of each other.
        pen.text(map_origin, text, font=map_font, fill=fill, stroke_width=1, stroke_fill=outline)

    if state is None:
        # No state to render: draw the map only.
        stamp(main_text, black)
    else:
        decoded = list(env.decode(state))
        taxi_row, taxi_col, passenger_location, destination = decoded
        print(decoded)

        # Taxi block first (so the map is drawn over it): green while
        # passenger_location is 4 and the episode is not done, else yellow.
        carrying = passenger_location == 4 and not done
        taxi_fill = (0, 255, 0) if carrying else (255, 255, 0)
        stamp(get_char_txt(taxi_row, taxi_col), taxi_fill, outline=grey)

        # Map on top of the taxi block.
        stamp(main_text, black)

        # Passenger (blue) and then destination (magenta) markers.
        for location, fill in ((passenger_location, (0, 0, 255)), (destination, (255, 0, 255))):
            marker = get_char_by_index(location)
            row, col = get_char_pos_by_index(location, taxi_row, taxi_col)
            stamp(get_char_txt(row, col, char=marker), fill)

    if side_text is not None:
        pen.text(panel_origin, side_text, font=panel_font, fill=black, stroke_width=1, stroke_fill=white)

    return np.array(canvas)

In [132]:
env_name = 'Taxi-v3'
def create_states_video(
    states, rewards, filename=None, fps=30,
    env_name=env_name, freeze_seconds=0, freeze_begin_seconds=0, step=None,
    actions=None):
    """Render a recorded trajectory to an MP4 file, one frame per recorded step.

    Args:
        states: sequence of encoded states, one per frame.
        rewards: sequence of rewards aligned with `states`; a reward equal to
            20 marks the frame as done.
        filename: output path without extension ('.mp4' is appended). When
            None, ``str(get_timestamp())`` is used.
            NOTE(review): get_timestamp is not defined in the visible cells.
        fps: frames per second of the output video.
        env_name: label shown in the side panel.
        freeze_seconds: how long to repeat the final frame.
        freeze_begin_seconds: how long to show an intro frame before the first step.
        step: optional value shown as 'Stp' in the side panel.
        actions: sequence of action indices aligned with `states` (None entries
            are shown as '--'). When omitted, the module-level ``actions``
            variable is used if it exists, which is what this function relied
            on before the parameter was added; otherwise every action is
            shown as '--'.

    Returns:
        The filename written, including the '.mp4' extension.
    """
    if actions is None:
        # Backward compatibility: older callers depended on a global `actions`.
        actions = globals().get('actions')
    if actions is None:
        actions = [None] * len(states)

    if filename is None:
        filename = str(get_timestamp())

    filename = filename + '.mp4'
    logging.info('Env: %s', env_name)
    logging.info('Filename: %s', filename)
    map_txt = '\n'.join(taxi_map)

    def overlay_text(frame_no, reward_so_far, action_name=None, done=False):
        # Side-panel text; the action line is omitted for the intro frame.
        lines = [f'Env: {env_name}']
        if step is not None:
            lines.append(f'Stp: {step}')
        lines.append(f'Frm: {frame_no}')
        lines.append(f'Rw:  {reward_so_far:.2f}')
        if action_name is not None:
            lines.append(f'Act: {action_name}')
        text = '\n'.join(lines)
        if done:
            text += '\n\nDone!\nFeb 26, 2022'
        return text

    def render(state, text, done=False):
        # Start every frame from a fresh light-grey 270x480 background.
        background = np.full((270, 480), 240.0)
        return enhance_frame(background, '{}'.format(map_txt), side_text=text, state=state, done=done)

    # Materialise the trajectory so the last frame can be detected reliably,
    # even when the three sequences differ in length.
    trajectory = list(zip(states, rewards, actions))
    last_idx = len(trajectory) - 1

    with imageio.get_writer(filename, fps=fps) as video:
        logging.info('Begin')
        total_reward = 0.0

        for idx, (state, reward, action) in enumerate(trajectory):
            done = reward == 20

            # Freeze frame for a few seconds - At beginning
            if idx == 0 and freeze_begin_seconds > 0:
                intro = render(state, overlay_text(idx, total_reward))
                for _ in range(int(fps * freeze_begin_seconds)):
                    video.append_data(intro)

            action_name = action_names[action] if action is not None else '--'
            total_reward += reward

            frame = render(state, overlay_text(idx, total_reward, action_name, done), done=done)
            video.append_data(frame)

            # Freeze frame for a few seconds - At the end
            if idx == last_idx and freeze_seconds > 0:
                for _ in range(int(fps * freeze_seconds)):
                    video.append_data(frame)

    logging.info('All done')
    return filename

In [133]:
from gym.envs.toy_text.taxi import MAP as taxi_map
import gym

In [134]:
# Plain gym Taxi-v3 instance; enhance_frame calls env.decode(state) on this module-level object.
env = gym.make('Taxi-v3')

In [130]:
# Display labels looked up as action_names[action] by create_states_video.
# NOTE(review): the order presumably follows Taxi-v3's action numbering — confirm.
action_names = [
    'South (↓)',
    'North (↑)',
    'East (→)',
    'West (←)',
    'Pickup',
    'Drop off',
]

In [135]:
# Record a fresh episode for the video. create_states_video reads the
# module-level `actions` assigned here in addition to the arguments it is given.
states, rewards, actions = play_env(eval_tf_env, tf_agent.policy, max_steps=100)

TimeStep(
{'discount': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([0.], dtype=float32)>,
 'observation': <tf.Tensor: shape=(1,), dtype=int64, numpy=array([85])>,
 'reward': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([20.], dtype=float32)>,
 'step_type': <tf.Tensor: shape=(1,), dtype=int32, numpy=array([2], dtype=int32)>})
Step: 016, Reward: 5.0


Result: Done with 16 steps and total reward is 5.0.


In [136]:
# Write the recorded episode to 'taxi.mp4' at 2 fps, holding an intro frame
# for 2 seconds and the final frame for 3 seconds.
create_states_video(states, rewards, filename='taxi', fps=2, env_name='Taxi-v3', freeze_seconds=3, freeze_begin_seconds=2)

INFO:absl:Env: Taxi-v3
INFO:absl:Filename: taxi.mp4
INFO:absl:Begin


[2, 4, 0, 1]
[2, 4, 0, 1]
[2, 3, 0, 1]
[2, 2, 0, 1]
[2, 1, 0, 1]
[1, 1, 0, 1]
[0, 1, 0, 1]
[0, 0, 0, 1]
[0, 0, 4, 1]
[1, 0, 4, 1]
[1, 1, 4, 1]
[2, 1, 4, 1]
[2, 2, 4, 1]
[2, 3, 4, 1]
[1, 3, 4, 1]
[1, 4, 4, 1]
[0, 4, 4, 1]
[0, 4, 1, 1]


INFO:absl:All done


'taxi.mp4'