# TF-Agents Basic Demo
This notebook demonstrates how to:

- Install TF-Agents
- Create a Gym environment wrapper
- Build a DQN agent with a simple Q-Network
- Collect data into a replay buffer
- Train the agent on sampled experiences
- Evaluate the trained policy


In [1]:
#!pip install tf-agents

In [2]:
import tensorflow as tf
from tf_agents.environments import suite_gym
from tf_agents.environments.tf_py_environment import TFPyEnvironment
from tf_agents.networks.q_network import QNetwork
from tf_agents.agents.dqn.dqn_agent import DqnAgent
from tf_agents.utils import common
from tf_agents.replay_buffers.tf_uniform_replay_buffer import TFUniformReplayBuffer
from tf_agents.trajectories import trajectory
from tf_agents.policies import random_tf_policy

2025-05-18 03:36:34.660902: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-05-18 03:36:34.672158: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2025-05-18 03:36:34.853203: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2025-05-18 03:36:34.856004: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# Create the Gym environment
env_name = 'CartPole-v0'
py_env = suite_gym.load(env_name)
tf_env = TFPyEnvironment(py_env)

In [4]:
# Build a simple Q-Network
fc_layer_params = (100,)
q_net = QNetwork(
    tf_env.observation_spec(),
    tf_env.action_spec(),
    fc_layer_params=fc_layer_params)

optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=1e-3)
train_step_counter = tf.Variable(0)

agent = DqnAgent(
    tf_env.time_step_spec(),
    tf_env.action_spec(),
    q_network=q_net,
    optimizer=optimizer,
    td_errors_loss_fn=common.element_wise_squared_loss,
    train_step_counter=train_step_counter)
agent.initialize()

In [5]:
# Set up replay buffer
replay_buffer = TFUniformReplayBuffer(
    data_spec=agent.collect_data_spec,
    batch_size=tf_env.batch_size,
    max_length=1000)

# Function to collect a step using a policy
def collect_step(environment, policy, buffer):
    time_step = environment.current_time_step()
    action_step = policy.action(time_step)
    next_time_step = environment.step(action_step.action)
    traj = trajectory.from_transition(time_step, action_step, next_time_step)
    buffer.add_batch(traj)

# Collect some initial random data
random_policy = random_tf_policy.RandomTFPolicy(
    tf_env.time_step_spec(), tf_env.action_spec())
for _ in range(100):
    collect_step(tf_env, random_policy, replay_buffer)

In [6]:
# Create a dataset from the replay buffer
dataset = replay_buffer.as_dataset(
    sample_batch_size=64, num_steps=2).prefetch(3)
iterator = iter(dataset)

# Convert train to a graph function for speed
agent.train = common.function(agent.train)

Instructions for updating:
Use `tf.data.Dataset.counter(...)` instead.
Instructions for updating:
Use `as_dataset(..., single_deterministic_pass=False) instead.


In [7]:
# Training loop
num_iterations = 200
for _ in range(num_iterations):
    experience, _ = next(iterator)
    train_loss = agent.train(experience)
print(f"Training completed after {num_iterations} iterations.")

Instructions for updating:
back_prop=False is deprecated. Consider using tf.stop_gradient instead.
Instead of:
results = tf.foldr(fn, elems, back_prop=False)
Use:
results = tf.nest.map_structure(tf.stop_gradient, tf.foldr(fn, elems))
Training completed after 200 iterations.


In [8]:
# Helper to compute average return
def compute_avg_return(environment, policy, num_episodes=5):
    total_return = 0.0
    for _ in range(num_episodes):
        time_step = environment.reset()
        episode_return = 0.0
        while not time_step.is_last():
            action_step = policy.action(time_step)
            time_step = environment.step(action_step.action)
            episode_return += time_step.reward
        total_return += episode_return
    avg_return = total_return / num_episodes
    return avg_return.numpy()[0]

# Evaluate the trained policy
avg_return = compute_avg_return(tf_env, agent.policy, num_episodes=5)
print(f'Average Return over 5 episodes: {avg_return}')

Average Return over 5 episodes: 9.399999618530273
