In [10]:
import tensorflow as tf
from tf_agents.environments import suite_gym
from tf_agents.environments import tf_py_environment
from tf_agents.networks import q_network
from tf_agents.agents.dqn import dqn_agent
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.trajectories import trajectory
from tf_agents.utils import common
from tf_agents.policies import random_tf_policy
from tf_agents.metrics import tf_metric
from tf_agents.drivers import dynamic_step_driver

import numpy as np

In [3]:
## Create the cartpole environment

# Load two independent instances of the same Gym task: one is stepped during
# training/data collection, the other is reserved for evaluation.
env_name = 'CartPole-v1'
train_py_env, eval_py_env = (suite_gym.load(env_name) for _ in range(2))

In [4]:
## Convert to tf environment

# Wrap each Python environment in a TFPyEnvironment (training first, then eval).
train_env, eval_env = map(tf_py_environment.TFPyEnvironment,
                          (train_py_env, eval_py_env))

In [7]:
## Create an agent

# Q-network built from the training environment's observation/action specs,
# with a single fully connected hidden layer of 100 units.
q_net = q_network.QNetwork(train_env.observation_spec(),
                           train_env.action_spec(),
                           fc_layer_params=(100,))

optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)

# Counter handed to the agent; the training loop reads it back through
# `agent.train_step_counter` to decide when to evaluate.
train_step_counter = tf.Variable(0)

# Double-DQN agent trained with an element-wise squared TD-error loss.
agent = dqn_agent.DdqnAgent(train_env.time_step_spec(),
                            train_env.action_spec(),
                            q_network=q_net,
                            optimizer=optimizer,
                            td_errors_loss_fn=common.element_wise_squared_loss,
                            train_step_counter=train_step_counter)

# TF-Agents expects `initialize()` to be called once after construction and
# before the first `train()` call (for DQN agents this syncs the target
# network with the online Q-network).
agent.initialize()

In [11]:
## Creating a replay buffer

# Capacity passed to the buffer as `max_length`.
replay_buffer_max_length = 100_000

# The buffer stores items matching the agent's collect data spec, batched the
# same way as the training environment.
replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    max_length=replay_buffer_max_length,
    batch_size=train_env.batch_size,
    data_spec=agent.collect_data_spec,
)

In [13]:
## Create Policy for Data Collection

# Random policy built from the training environment's specs; it is used to
# prefill the replay buffer before the agent's own collect policy takes over.
random_policy = random_tf_policy.RandomTFPolicy(
    train_env.time_step_spec(),
    train_env.action_spec(),
)

## Function to collect steps
def collect_step(environment, policy, buffer=None):
    """Take one environment step with `policy` and store the transition.

    Args:
        environment: Environment exposing `current_time_step()` and `step(action)`.
        policy: Policy exposing `action(time_step)`; the returned step's
            `.action` is what gets applied to the environment.
        buffer: Replay buffer exposing `add_batch(trajectory)`. When omitted,
            the notebook-level `replay_buffer` is used, which keeps existing
            two-argument calls working unchanged.
    """
    # Resolve the target buffer at call time so the default always refers to
    # the current notebook-level `replay_buffer`.
    target_buffer = replay_buffer if buffer is None else buffer

    time_step = environment.current_time_step()
    action_step = policy.action(time_step)
    next_time_step = environment.step(action_step.action)

    # Package (time_step, action, next_time_step) into a trajectory and record it.
    traj = trajectory.from_transition(time_step, action_step, next_time_step)
    target_buffer.add_batch(traj)

In [14]:
## Prefill the replay buffer

# Number of random-policy steps recorded before training starts.
initial_collect_steps = 1000

for _ in range(initial_collect_steps):
    collect_step(train_env, random_policy)

In [15]:
## Evaluation

def compute_avg_return(environment, policy, num_episodes=10):
    """Run `policy` for `num_episodes` full episodes and return the mean return.

    Args:
        environment: Environment exposing `reset()` and `step(action)`; both
            must return a time step with `is_last()` and `reward`.
        policy: Policy exposing `action(time_step)`; the returned step's
            `.action` is applied to the environment.
        num_episodes: Number of episodes to average over; must be at least 1.

    Returns:
        The first element of the averaged return after conversion with
        `.numpy()`.

    Raises:
        ValueError: If `num_episodes` is less than 1. Without this check a
            value of 0 fails with a division by zero, and a negative value
            fails on `.numpy()` because no reward was ever accumulated.
    """
    if num_episodes < 1:
        raise ValueError(
            'num_episodes must be at least 1, got {}'.format(num_episodes))

    total_return = 0.0
    for _ in range(num_episodes):
        time_step = environment.reset()
        episode_return = 0.0
        # Step until the environment reports the final time step of the episode.
        while not time_step.is_last():
            action_step = policy.action(time_step)
            time_step = environment.step(action_step.action)
            episode_return = episode_return + time_step.reward
        total_return = total_return + episode_return

    average_return = total_return / num_episodes
    return average_return.numpy()[0]

In [16]:
## Data Collection

# Draw batches of 16 samples from the replay buffer, each sample spanning
# 2 consecutive steps, then prefetch up to 3 elements.
sampled_experience = replay_buffer.as_dataset(
    sample_batch_size=16,
    num_steps=2,
    num_parallel_calls=3,
)
dataset = sampled_experience.prefetch(3)

Instructions for updating:
Use `as_dataset(..., single_deterministic_pass=False) instead.


In [17]:
# Iterator over the replay-buffer dataset; the training loop pulls one
# (experience, info) pair from it per iteration via `next(iterator)`.
iterator = iter(dataset)

In [19]:
## Training the agent

num_iterations = 20000  # total collect + train iterations
eval_interval = 100     # evaluate every this many training steps

# Wrap the training step once with TF-Agents' `common.function` so it is not
# re-executed eagerly, op by op, on every iteration.
train_fn = common.function(agent.train)

for _ in range(num_iterations):
    # Collect one new transition with the agent's collect policy, then train
    # on one batch drawn from the replay buffer.
    collect_step(train_env, agent.collect_policy)
    # The second element of each dataset item is not used here; give it its
    # own name instead of overwriting the loop variable `_`.
    experience, unused_info = next(iterator)
    train_loss = train_fn(experience).loss

    step = agent.train_step_counter.numpy()
    if step % eval_interval == 0:
        avg_return = compute_avg_return(eval_env, agent.policy)
        print('Step: {}, Avg Return: {}, Loss: {}'.format(step, avg_return, train_loss))

Instructions for updating:
back_prop=False is deprecated. Consider using tf.stop_gradient instead.
Instead of:
results = tf.foldr(fn, elems, back_prop=False)
Use:
results = tf.nest.map_structure(tf.stop_gradient, tf.foldr(fn, elems))
Step: 100, Avg Return: 9.0, Loss: 1.725217342376709
Step: 200, Avg Return: 9.600000381469727, Loss: 7.7565178871154785
Step: 300, Avg Return: 9.399999618530273, Loss: 5.708325386047363
Step: 400, Avg Return: 9.600000381469727, Loss: 31.228527069091797
Step: 500, Avg Return: 9.800000190734863, Loss: 2.7479429244995117
Step: 600, Avg Return: 9.600000381469727, Loss: 19.563976287841797
Step: 700, Avg Return: 15.800000190734863, Loss: 18.677509307861328
Step: 800, Avg Return: 14.600000381469727, Loss: 16.022676467895508
Step: 900, Avg Return: 12.699999809265137, Loss: 39.27983856201172
Step: 1000, Avg Return: 14.899999618530273, Loss: 36.966400146484375
Step: 1100, Avg Return: 15.399999618530273, Loss: 1.9076933860778809
Step: 1200, Avg Return: 10.899999618530

KeyboardInterrupt: 