# MADRE - 12/01/2026

DQN agent for the training of Atari games

Code mostly taken from Keras documentation at
https://keras.io/examples/rl/deep_q_network_breakout/

# Install Dependencies to Render Gym Environment

In [1]:
# Run these cells ASAP since it can take up to 30 seconds
%%capture
!apt-get update
!apt-get install -y xvfb python-opengl ffmpeg


In [2]:
!pip install pyglet==1.3.2 gymnasium==1.2.2 pyvirtualdisplay==3.0



In [3]:

!pip install tensorflow==2.19.0 gast==0.7

Collecting tensorflow==2.19.0
  Downloading tensorflow-2.19.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.1 kB)
Downloading tensorflow-2.19.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (645.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m645.0/645.0 MB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tensorflow
  Attempting uninstall: tensorflow
    Found existing installation: tensorflow 2.19.1
    Uninstalling tensorflow-2.19.1:
      Successfully uninstalled tensorflow-2.19.1
Successfully installed tensorflow-2.19.0


In [4]:
!pip install ale-py gymnasium[other]
!pip freeze > requirements.txt



In [5]:
import gymnasium

from gymnasium import logger as gymlogger
from gymnasium.wrappers import AtariPreprocessing, FrameStackObservation #Monitor
from gymnasium.wrappers import RecordVideo

import ale_py

import tensorflow as tf
import numpy as np
import keras
from keras import layers

import glob
import io
import base64

In [6]:
# virtual display for notebook
from IPython.display import HTML
from IPython import display as ipythondisplay
from pyvirtualdisplay import Display

# Create a non-visible (visible=0) 1400x900 virtual display and start it.
# NOTE(review): presumably needed so the environment can render frames on a
# machine without a physical screen — confirm against the runtime used.
display = Display(visible=0, size=(1400, 900))
display.start()

<pyvirtualdisplay.display.Display at 0x7c99d1f76f90>

#### Check that there is a GPU available

In [7]:
# Ask TensorFlow for the physical GPU devices it can see and report the count.
gpu_list = tf.config.experimental.list_physical_devices('GPU')
gpu_count = len(gpu_list)
print('Number of GPUS available is {}'.format(gpu_count))

Number of GPUS available is 1


## Helper functions to visualize the performance of the agent

In [8]:
def show_video(video_folder='video'):
  """Display the most recently written MP4 from `video_folder` inline.

  The file is read, base64-encoded and embedded in an HTML <video> tag that
  is handed to IPython's display machinery.

  Args:
    video_folder: Directory searched (non-recursively) for `*.mp4` files.
      Defaults to 'video'.

  Returns:
    None. Prints "Video not found" when the folder holds no MP4 file.
  """
  import os  # Local import: `os` is not among the notebook's imports.

  mp4list = glob.glob(os.path.join(video_folder, '*.mp4'))
  if not mp4list:
    print("Video not found")
    return

  # glob returns matches in arbitrary order, so pick the newest file instead
  # of whichever one happens to come first.
  mp4 = max(mp4list, key=os.path.getmtime)
  # Read through a context manager so the file handle is always closed.
  with open(mp4, 'rb') as video_file:
    encoded = base64.b64encode(video_file.read())
  ipythondisplay.display(HTML(data='''<video alt="test" autoplay
                loop controls style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii'))))


def wrap_env(env):
  """Wrap `env` so that its episodes are recorded as videos under ./video.

  Uses gymnasium's `RecordVideo` wrapper. The previously used `Monitor`
  wrapper is not imported anywhere in this notebook and is not part of
  gymnasium, so calling it raised a NameError.

  Args:
    env: A gymnasium environment. It must have been created with a
      render mode that returns frames (e.g. render_mode="rgb_array"),
      otherwise `RecordVideo` cannot capture anything.

  Returns:
    The environment wrapped in `RecordVideo`.
  """
  env = RecordVideo(env, video_folder='./video')
  return env

# Start Environment and Build Double DQN Agent

In [9]:
# Register the environments provided by ale_py with gymnasium so that they
# can be created by id through gymnasium.make().
gymnasium.register_envs(ale_py)

In [None]:
# Selection of the environment
# Full list of Atari game environments at https://ale.farama.org/environments/
# The name is combined with "NoFrameskip-v4" when the environment is built and
# is also used as the file name of the saved model.
environment_name = "Breakout" # @param ["Pong", "Breakout", "MontezumaRevenge","MsPacman","SpaceInvaders"]

# When True, the training loop saves the online network every 1000 episodes.
save_model = True # @param {type:"boolean"}

# Hyperparameters.
# Number of training episodes (the loop runs num_episodes + 1 iterations).
num_episodes = 10000 # @param {type:"integer"}
# Initial exploration rate; the training loop decreases it after every step
# until it reaches roughly 0.01.
epsilon = 1.0 # @param {type:"number"}
# Number of transitions sampled from the replay buffers per gradient step.
batch_size = 32 # @param {type:"integer"}
# Discount factor applied to the bootstrapped next-state value.
discount = 0.99 # @param {type:"number"}
# How often (in environment steps) to copy the online weights to the target network
update_target_network = 500 # @param {type:"integer"}
# Run one training step every `update_after_actions` environment steps
update_after_actions = 4 # @param {type:"integer"}

# Seed passed to env.reset() by the training loop.
seed = 42


In [11]:
def make_env(environment_name, render='rgb_array'):
  """Build the preprocessed, frame-stacked and video-recorded Atari env.

  Args:
    environment_name: Game name, e.g. "Breakout"; "NoFrameskip-v4" is
      appended to form the gymnasium environment id.
    render: Render mode forwarded to gymnasium.make().

  Returns:
    The environment wrapped, innermost to outermost, in AtariPreprocessing,
    FrameStackObservation (4 frames) and RecordVideo (writes to ./video).
  """
  def record_this_episode(episode_index):
    # Record one video every 100 episodes (episodes 0, 100, 200, ...).
    return episode_index % 100 == 0

  # Alternative environment id: "ALE/"+environment_name+"-v5"
  base_env = gymnasium.make(environment_name+"NoFrameskip-v4", render_mode=render)

  # Standard Atari preprocessing, then stack four consecutive observations.
  stacked_env = FrameStackObservation(AtariPreprocessing(base_env), 4)

  return RecordVideo(
      stacked_env,
      video_folder="./video",
      episode_trigger=record_this_episode,
      name_prefix=environment_name,
      disable_logger=True,
  )

In [12]:
# Build the training environment and read the sizes the agent needs from it.
env = make_env(environment_name)
# Shape of one (frame-stacked) observation.
num_state_feats = env.observation_space.shape
# Number of discrete actions; also the size of the Q-network's output layer.
num_actions = env.action_space.n
# Upper bound of the observation space, element-wise.
max_observation_values = env.observation_space.high
print('Number of state features: {}'.format(num_state_feats))
print('Number of possible actions: {}'.format(num_actions))

Number of state features: (4, 84, 84)
Number of possible actions: 6


  logger.warn(


# DQN model

In [13]:
def create_q_model():
    """Build the convolutional Q-network (architecture of the Deepmind paper).

    The model takes a batch of frame stacks shaped (4, 84, 84) and returns one
    Q-value per action (`num_actions` linear outputs).

    Changes from the earlier version:
      * The input shape is declared once with `keras.Input`. The first Conv2D
        used to carry a second, channels-first `input_shape=(4, 84, 84)` even
        though it was not the first layer and receives (84, 84, 4) tensors.
      * The axis reordering uses the built-in `Permute` layer instead of a
        `Lambda` wrapping a Python lambda, so the saved `.keras` file does not
        depend on deserialising arbitrary Python code when it is reloaded.

    Returns:
        An un-compiled `keras.Sequential` model.
    """
    return keras.Sequential(
        [
            keras.Input(shape=(4, 84, 84)),
            # Move the frame axis last: (4, 84, 84) -> (84, 84, 4), which is
            # what Conv2D expects with Keras' default channels-last setting.
            # Permute indices are 1-based and exclude the batch axis.
            layers.Permute((2, 3, 1)),
            # Convolutions on the frames on the screen
            layers.Conv2D(32, 8, strides=4, activation="relu"),
            layers.Conv2D(64, 4, strides=2, activation="relu"),
            layers.Conv2D(64, 3, strides=1, activation="relu"),
            layers.Flatten(),
            layers.Dense(512, activation="relu"),
            layers.Dense(num_actions, activation="linear"),
        ]
    )

# Create Helper Functions

In [14]:
def select_epsilon_greedy_action(state, epsilon):
  """Pick an action epsilon-greedily using the online network `main_nn`.

  Args:
    state: Network input for the current observation, including the leading
      batch axis.
    epsilon: Probability of choosing a random action.

  Returns:
    A sample from `env.action_space` with probability `epsilon`, otherwise
    the argmax over the Q-values predicted by `main_nn` for `state`.
  """
  explore = np.random.uniform() < epsilon
  if explore:
    # Random action.
    return env.action_space.sample()
  # Greedy action for state.
  q_values = main_nn(state).numpy()
  return np.argmax(q_values)

# Set Up Function to Perform a Training Step

In [15]:
@tf.function
def train_step(states, actions, rewards, next_states, dones):
  """Run one Double-DQN gradient step on a batch of transitions.

  The online network (`main_nn`) chooses the best next action and the target
  network (`target_nn`) supplies the value of that action. Only `main_nn` is
  updated.

  Args:
    states: Batch of network inputs for the current observations.
    actions: Batch of action indices that were taken.
    rewards: Batch of rewards received.
    next_states: Batch of network inputs for the following observations.
    dones: Batch of 0/1 flags; 1 disables bootstrapping for that transition.

  Returns:
    The scalar loss computed by `loss_fn` for this batch.
  """
  # Bootstrapped target, computed outside the tape so no gradient flows
  # through it.
  online_next_q = main_nn(next_states)
  target_next_q = target_nn(next_states)
  best_next_action = tf.argmax(online_next_q, axis=-1)
  best_next_value = tf.reduce_sum(
      tf.one_hot(best_next_action, num_actions) * target_next_q, axis=-1)
  target = rewards + (1. - dones) * discount * best_next_value

  # One-hot mask selecting the Q-value of the action actually taken.
  taken_mask = tf.one_hot(actions, num_actions)
  with tf.GradientTape() as tape:
    predicted_q = tf.reduce_sum(taken_mask * main_nn(states), axis=-1)
    loss = loss_fn(target, predicted_q)

  params = main_nn.trainable_variables
  optimizer.apply_gradients(zip(tape.gradient(loss, params), params))
  return loss

# Start running the DQN algorithm and see how the algorithm learns.

In [16]:
# Experience replay buffers: parallel lists in which index i of every list
# describes the same stored transition.
state_history, state_next_history = [], []
action_history, rewards_history, done_history = [], [], []
# Per-episode reward log.
episode_reward_history = []
# Maximum replay length
# Note: The Deepmind paper suggests 1000000 however this causes memory issues
max_memory_length = 100_000

In [17]:
# Online network (trained by train_step) and target network (used only to
# evaluate next-state values).
main_nn = create_q_model()
target_nn = create_q_model()
# Start the target network from the online network's weights. Without this
# the two networks begin with unrelated random initialisations, and the first
# bootstrapped targets come from a network that has nothing in common with
# the one being trained until the first periodic sync.
target_nn.set_weights(main_nn.get_weights())

# Loss function and optimizer.
# Taken from the same `keras` namespace that builds the models, rather than
# mixing `tf.keras` and `keras` objects.
loss_fn = keras.losses.Huber()
optimizer = keras.optimizers.Adam(learning_rate=1e-5, clipnorm=10)

  super().__init__(**kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [None]:
# Start training.
cur_frame, last_100_ep_rewards = 0, []
# Defined up front so the progress print below cannot hit an unbound name if
# an episode finishes before the first gradient step has run.
loss = float('nan')
for episode in range(num_episodes + 1):
  # Seed only the very first reset. Passing the same seed on every reset
  # restarts the environment's random number generator each episode, so every
  # episode would replay the same random sequence.
  state, _info = env.reset(seed=seed if episode == 0 else None)
  ep_reward, done = 0, False
  while not done:
    state_in = np.expand_dims(np.array(state) / 255., axis=0)
    action = select_epsilon_greedy_action(state_in, epsilon)
    next_state, reward, terminated, truncated, info = env.step(action)
    # The episode is over on either signal; stepping a truncated environment
    # again without a reset is not allowed.
    done = terminated or truncated
    ep_reward += reward
    # Store only the sign of the reward (-1, 0 or +1).
    reward = np.sign(reward)

    # Save to experience replay. Only true termination is stored as "done":
    # a time-limit truncation is not a terminal state, so the target should
    # still bootstrap from next_state in that case.
    action_history.append(action)
    state_history.append(state)
    state_next_history.append(next_state)
    done_history.append(terminated)
    rewards_history.append(reward)

    state = next_state
    cur_frame += 1
    # Anneal exploration by a fixed amount per step down to about 0.01.
    if epsilon > 0.01:
      epsilon -= 1.1e-6

    if cur_frame % update_after_actions == 0 and len(done_history) > batch_size:
      # Uniform sampling with replacement. Drawing integers directly avoids
      # materialising an index array over the whole buffer on every step.
      indices = np.random.randint(len(done_history), size=batch_size)
      # Build fixed-dtype arrays so train_step always receives inputs with
      # the same shapes and dtypes (plain Python lists of scalars can make
      # tf.function trace a new graph per call).
      states = np.array(
          [state_history[i] for i in indices], dtype=np.float32) / 255.
      next_states = np.array(
          [state_next_history[i] for i in indices], dtype=np.float32) / 255.
      rewards = np.array(
          [rewards_history[i] for i in indices], dtype=np.float32)
      actions = np.array(
          [action_history[i] for i in indices], dtype=np.int32)
      dones = np.array(
          [done_history[i] for i in indices], dtype=np.float32)

      loss = train_step(states, actions, rewards, next_states, dones)

    # Copy main_nn weights to target_nn.
    if cur_frame % update_target_network == 0:
      target_nn.set_weights(main_nn.get_weights())

    # Limit the state and reward history by dropping the oldest transition
    # from every buffer.
    if len(rewards_history) > max_memory_length:
      del rewards_history[:1]
      del state_history[:1]
      del state_next_history[:1]
      del action_history[:1]
      del done_history[:1]

  # Keep a sliding window of the last 100 episode returns.
  if len(last_100_ep_rewards) == 100:
    last_100_ep_rewards = last_100_ep_rewards[1:]
  last_100_ep_rewards.append(ep_reward)

  if episode % 10 == 0:
    print(f'Episode: {episode}/{num_episodes}, Epsilon: {epsilon:.3f}, '
          f'Loss: {float(loss):.4f}, Return: {np.mean(last_100_ep_rewards):.2f}')

  # Periodic checkpoint of the online network.
  if episode > 0 and episode % 1000 == 0 and save_model:
    model_path = environment_name + ".keras"
    main_nn.save(model_path)

env.close()
show_video()

Episode: 0/10000, Epsilon: 0.999, Loss: 0.0003, Return: -21.00
Episode: 10/10000, Epsilon: 0.989, Loss: 0.0009, Return: -20.55
Episode: 20/10000, Epsilon: 0.980, Loss: 0.0002, Return: -20.52
Episode: 30/10000, Epsilon: 0.969, Loss: 0.0003, Return: -20.32
Episode: 40/10000, Epsilon: 0.959, Loss: 0.0011, Return: -20.34


# Display Result of Trained DQN Agent on Pong Environment

In [None]:
# Play one evaluation episode with the trained network and record it.
#
# The environment is built for `environment_name`, i.e. the game the network
# was trained on: the Q-network's output layer is sized for that game's
# action space, so evaluating on a different game would be meaningless.
# render_mode="rgb_array" is required for RecordVideo to capture frames.
env = gymnasium.make(environment_name + 'NoFrameskip-v4',
                     render_mode='rgb_array')
# scale_obs=True already returns observations in [0, 1], matching the /255
# scaling applied to network inputs during training, so no further division
# is done below.
env = AtariPreprocessing(env,
                         grayscale_obs=True,
                         scale_obs=True,
                         terminal_on_life_loss=False)
env = FrameStackObservation(env, 4)
env = RecordVideo(env, video_folder='./video',
                  name_prefix=environment_name + '-eval',
                  disable_logger=True)

# gymnasium's reset() returns (observation, info).
state, _info = env.reset()
done = False
ep_rew = 0
while not done:
  state_in = np.expand_dims(np.array(state), axis=0)
  action = select_epsilon_greedy_action(state_in, epsilon=0.01)
  # gymnasium's step() returns five values; the episode ends on either
  # termination or truncation.
  state, reward, terminated, truncated, info = env.step(action)
  done = terminated or truncated
  ep_rew += reward
print(f'Total Return: {ep_rew}')
env.close()
show_video()