In [1]:
import tensorflow as tf
import tf_agents as tfa
from tf_agents.agents.dqn import dqn_agent
from tf_agents.environments import suite_gym, tf_py_environment
from tf_agents.networks import q_network
from tf_agents.policies import random_tf_policy
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.trajectories import trajectory
from tf_agents.utils import common
import numpy as np

# Exercise 1: Prepare the Environment and Tools

In [2]:
# Single source of truth for the Gym task id so the training and evaluation
# environments cannot silently drift apart.
env_name = 'CartPole-v0'

# Two independent instances: one is stepped during data collection, the other
# is reserved for evaluation episodes.
train_py_env = suite_gym.load(env_name)
eval_py_env = suite_gym.load(env_name)

In [3]:
# Wrap both Python environments in TFPyEnvironment so they can be driven by
# the TF policies and agent used in the rest of the notebook.
train_env, eval_env = (
    tf_py_environment.TFPyEnvironment(py_env)
    for py_env in (train_py_env, eval_py_env)
)

In [4]:
# Show what the agent will observe and which actions it may emit.
print(f"Observation Spec: {train_env.observation_spec()}")
print(f"Action Spec: {train_env.action_spec()}")

Observation Spec: BoundedTensorSpec(shape=(4,), dtype=tf.float32, name='observation', minimum=array([-4.8000002e+00, -3.4028235e+38, -4.1887903e-01, -3.4028235e+38],
      dtype=float32), maximum=array([4.8000002e+00, 3.4028235e+38, 4.1887903e-01, 3.4028235e+38],
      dtype=float32))
Action Spec: BoundedTensorSpec(shape=(), dtype=tf.int64, name='action', minimum=array(0, dtype=int64), maximum=array(1, dtype=int64))


In [5]:
# Smoke test: run a few episodes with uniformly random actions and report the
# total reward of each one.
num_test_episodes = 3

# Build the random policy from the environment's own specs so the sampled
# actions always match the action bounds and batching the environment expects,
# instead of hard-coding the action set and passing a bare NumPy scalar.
random_eval_policy = random_tf_policy.RandomTFPolicy(
    eval_env.time_step_spec(), eval_env.action_spec()
)

for episode in range(num_test_episodes):
    time_step = eval_env.reset()
    episode_reward = 0
    while not time_step.is_last():
        action_step = random_eval_policy.action(time_step)
        time_step = eval_env.step(action_step.action)
        episode_reward += time_step.reward
    print(f"Test Episode {episode + 1}, Reward: {episode_reward}")

Test Episode 1, Reward: [10.]
Test Episode 2, Reward: [16.]
Test Episode 3, Reward: [15.]


# Exercise 2: Create the Network and the Agent

In [6]:
# Layer configuration handed to QNetwork below via `fc_layer_params`:
# two fully connected layers of 100 and 50 units.
fc_layer_params = (100, 50)

In [7]:
# Q-network sized from the training environment's observation and action specs.
q_net = q_network.QNetwork(
    input_tensor_spec=train_env.observation_spec(),
    action_spec=train_env.action_spec(),
    fc_layer_params=fc_layer_params,
)

In [8]:
# Counter handed to the agent as `train_step_counter`; the training loop reads
# it to decide when to log and evaluate.
train_step_counter = tf.Variable(0)

# Adam optimizer used by the agent to update the Q-network.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)

# Exercise 3: Training and Evaluation

In [9]:
# DQN agent tying together the Q-network, optimizer and step counter.
agent = dqn_agent.DqnAgent(
    time_step_spec=train_env.time_step_spec(),
    action_spec=train_env.action_spec(),
    q_network=q_net,
    optimizer=optimizer,
    epsilon_greedy=0.1,         # exploration rate of the collect policy
    gamma=0.99,                 # discount factor for future rewards
    target_update_period=100,   # train steps between target-network updates
    td_errors_loss_fn=common.element_wise_squared_loss,
    train_step_counter=train_step_counter,
)

In [10]:
# Run the agent's one-time initialization before any call to agent.train.
agent.initialize()

In [11]:
# Replace the agent's train method with a version wrapped by TF-Agents'
# `common.function` helper.
# NOTE(review): re-executing this cell wraps the already-wrapped method again;
# prefer Restart & Run All over re-running it in place.
agent.train = common.function(agent.train)
# Reset the step counter so the `step % log_interval` / `step % eval_interval`
# checks in the training loop start from zero. As the cell's last expression,
# the result of assign() is echoed as the cell output.
agent.train_step_counter.assign(0)

<tf.Variable 'UnreadVariable' shape=() dtype=int32, numpy=0>

In [12]:
# Capacity bound for the replay buffer.
replay_buffer_max_length = 100_000

# Uniform replay buffer whose item layout follows the agent's collect data
# spec and whose batch dimension matches the training environment.
replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    agent.collect_data_spec,
    train_env.batch_size,
    max_length=replay_buffer_max_length,
)

In [13]:
# Policy the agent exposes for gathering experience; the training loop passes
# it to collect_step, while evaluation uses agent.policy instead.
collect_policy = agent.collect_policy

In [14]:
# Random policy over the training environment's specs; used only to pre-fill
# the replay buffer before training starts.
random_policy = random_tf_policy.RandomTFPolicy(
    time_step_spec=train_env.time_step_spec(),
    action_spec=train_env.action_spec(),
)

In [15]:
def collect_step(environment, policy, buffer):
    """Advance `environment` by one step under `policy` and store the transition.

    Args:
        environment: Object providing `current_time_step()` and `step(action)`.
        policy: Object whose `action(time_step)` result has an `.action` field.
        buffer: Object providing `add_batch(item)`; receives the trajectory
            built from the (time step, action step, next time step) triple.

    Returns:
        None. The only effects are stepping the environment and appending to
        the buffer.
    """
    current = environment.current_time_step()
    decision = policy.action(current)
    following = environment.step(decision.action)
    buffer.add_batch(trajectory.from_transition(current, decision, following))


In [16]:
# Warm-up: store this many random-policy transitions in the replay buffer
# before training, so the sampling dataset built later has data to draw from.
initial_collect_steps = 1000
for _ in range(initial_collect_steps):
    collect_step(train_env, random_policy, replay_buffer)

In [17]:
# Training-loop configuration.
num_iterations = 20_000            # passes through the training loop
collect_steps_per_iteration = 1    # environment steps collected per pass
batch_size = 64                    # items sampled from the replay buffer per batch

# Reporting cadence, measured in agent train steps.
log_interval = 200                 # print the loss every N steps
eval_interval = 1_000              # run evaluation episodes every N steps
num_eval_episodes = 10             # episodes averaged per evaluation

In [18]:
# Sample batches from the replay buffer as a tf.data pipeline. num_steps=2
# makes every sampled item span two adjacent steps, giving the DQN loss both
# the current and the next observation.
sampled = replay_buffer.as_dataset(
    sample_batch_size=batch_size,
    num_steps=2,
    num_parallel_calls=3,
)
dataset = sampled.prefetch(3)

# Iterator consumed once per training iteration via next(iterator).
iterator = iter(dataset)

Instructions for updating:
Use `tf.data.Dataset.counter(...)` instead.
Instructions for updating:
Use `as_dataset(..., single_deterministic_pass=False) instead.


In [19]:
# Latest evaluation result and the history of all evaluation results; both
# are updated by the training loop.
avg_return, returns = 0, []

In [20]:
def compute_avg_return(environment, policy, num_episodes):
    """Average the undiscounted episode return of `policy` over several episodes.

    Args:
        environment: Environment to evaluate in; it is reset at the start of
            every episode.
        policy: Policy whose `action(time_step)` result drives the environment.
        num_episodes: Number of complete episodes to run.

    Returns:
        The sum of per-step rewards over all episodes divided by
        `num_episodes`, in whatever type the environment's rewards
        accumulate to.
    """
    total_return = 0
    for _ in range(num_episodes):
        time_step = environment.reset()
        episode_return = 0
        while not time_step.is_last():
            action_step = policy.action(time_step)
            time_step = environment.step(action_step.action)
            episode_return += time_step.reward
        total_return += episode_return
    return total_return / num_episodes


for iteration in range(num_iterations):
    # Gather fresh experience with the agent's collect policy.
    for _ in range(collect_steps_per_iteration):
        collect_step(train_env, collect_policy, replay_buffer)

    # Train on one batch sampled from the replay buffer.
    experience, unused_info = next(iterator)
    train_loss = agent.train(experience).loss

    # Logging and evaluation are keyed on the agent's own step counter rather
    # than on the loop index.
    step = agent.train_step_counter.numpy()

    if step % log_interval == 0:
        print(f"Step {step}, Loss: {train_loss}")

    if step % eval_interval == 0:
        # Evaluate agent.policy (not the collect policy) on the separate
        # evaluation environment.
        avg_return = compute_avg_return(eval_env, agent.policy, num_eval_episodes)
        print(f"Step {step}, Average Return: {avg_return}")
        returns.append(avg_return)

Instructions for updating:
back_prop=False is deprecated. Consider using tf.stop_gradient instead.
Instead of:
results = tf.foldr(fn, elems, back_prop=False)
Use:
results = tf.nest.map_structure(tf.stop_gradient, tf.foldr(fn, elems))
Step 200, Loss: 0.06393811106681824
Step 400, Loss: 0.4865455627441406
Step 600, Loss: 0.28280216455459595
Step 800, Loss: 0.18510796129703522
Step 1000, Loss: 0.29275640845298767
Step 1000, Average Return: [145.]
Step 1200, Loss: 1.2382336854934692
Step 1400, Loss: 0.4353741407394409
Step 1600, Loss: 0.42893850803375244
Step 1800, Loss: 2.8919677734375
Step 2000, Loss: 0.9182277917861938
Step 2000, Average Return: [163.]
Step 2200, Loss: 1.0051295757293701
Step 2400, Loss: 0.8649256229400635
Step 2600, Loss: 1.0985932350158691
Step 2800, Loss: 2.6630988121032715
Step 3000, Loss: 7.576536178588867
Step 3000, Average Return: [196.5]
Step 3200, Loss: 11.505270957946777
Step 3400, Loss: 1.1949626207351685
Step 3600, Loss: 1.1867820024490356
Step 3800, Loss: 0

In [21]:

# avg_return holds the result of the most recent evaluation pass performed by
# the training loop.
print(f"Final Average Return after training: {avg_return}")

Final Average Return after training: [200.]
