# Formative 2 Assignment: Deep Q Learning - Training and Playing an RL Agent

# DEEP Q-LEARNING WITH ATARI
# Team Members
# Geofrey Tumwesigye
# Justice Izuchukwu Chukwuonye
# Peter Philip Johnson
# Steven Shyaka


In [None]:
# Install required packages
!pip install gym
!pip install gym[atari]
!pip install stable-baselines3
!pip install gymnasium[atari] -q
!pip install ale-py -q
!pip install opencv-python -q
!pip install torch torchvision -q
!pip install matplotlib seaborn -q
!pip install tqdm -q


# Import all necessary libraries
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import cv2
import random
import json
import os
from collections import deque
from datetime import datetime
from tqdm import tqdm
import gymnasium as gym
import ale_py



[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
dopamine-rl 4.1.2 requires gym<=0.25.2, but you have gym 0.26.2 which is incompatible.[0m[31m
[0m

# We are going to use Stable Baselines3 to define and train Deep Q-Network (DQN) agents using the CNN policy. The environment is Atari Breakout from the Gymnasium collection: Sets 1–3 use Breakout-v4, and Set 4 uses ALE/Breakout-v5.

## Details

Policy: CnnPolicy

Learning rate: 0.0001

Gamma (Discount Factor): 0.99

Batch Size: 32

Epsilon (Exploration): From 1.0 to 0.1 over 10% of training

The model is trained for 50,000 timesteps and saved as models/dqn_model.zip.

## SET 1 (Geofrey)

In [None]:
from stable_baselines3 import DQN
from stable_baselines3.common.monitor import Monitor
import gymnasium as gym
import os

# SET 1: baseline CNN DQN (learning rate 1e-4, gamma 0.99, batch size 32).

# Creating the environment; Monitor records per-episode reward and length.
env_id = "Breakout-v4"
env = Monitor(gym.make(env_id, render_mode='rgb_array'))

# Defining the model using a Convolutional Policy (CNN)
model = DQN(
    policy="CnnPolicy",
    env=env,
    learning_rate=0.0001,
    gamma=0.99,
    batch_size=32,
    buffer_size=100_000,
    exploration_initial_eps=1.0,
    exploration_final_eps=0.1,
    exploration_fraction=0.1,
    verbose=1,
    tensorboard_log="./dqn_tensorboard/"
)

try:
    # Training the agent for 50,000 steps
    model.learn(total_timesteps=50000)

    # Saving the model. "models/dqn_model.zip" is the path the download and
    # playback cells read, but the Set 3 cell writes to that same path, so a
    # set-specific copy is stored as well to keep this run's weights.
    os.makedirs("models", exist_ok=True)
    model.save("models/dqn_model.zip")
    model.save("models/dqn_model_set1.zip")
finally:
    # Release the emulator even if training is interrupted or fails.
    env.close()


Using cuda device
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.




Logging to ./dqn_tensorboard/DQN_3
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 276      |
|    ep_rew_mean      | 1.75     |
|    exploration_rate | 0.801    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 215      |
|    time_elapsed     | 5        |
|    total_timesteps  | 1105     |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 3.01e-05 |
|    n_updates        | 251      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 245      |
|    ep_rew_mean      | 1.38     |
|    exploration_rate | 0.648    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 220      |
|    time_elapsed     | 8        |
|    total_timesteps  | 1957     |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss           

## SET 2 (Justice)

In [None]:
from stable_baselines3 import DQN
from stable_baselines3.common.monitor import Monitor
import gymnasium as gym
import os

# SET 2: CNN DQN with a higher learning rate (5e-4), gamma 0.98, batch size 64.

# Creating the environment; Monitor records per-episode reward and length.
env_id = "Breakout-v4"
env = Monitor(gym.make(env_id, render_mode='rgb_array'))

# Defining the model using a Convolutional Policy (CNN)
model = DQN(
    policy="CnnPolicy",
    env=env,
    learning_rate=0.0005,
    gamma=0.98,
    batch_size=64,
    buffer_size=100_000,
    exploration_initial_eps=1.0,
    exploration_final_eps=0.1,
    exploration_fraction=0.1,
    verbose=1,
    tensorboard_log="./dqn_tensorboard/"
)

try:
    # Training the agent for 50,000 steps
    model.learn(total_timesteps=50000)

    # Saving the model under a set-specific name so this run can be
    # evaluated later without clobbering the other sets' files.
    os.makedirs("models", exist_ok=True)
    model.save("models/dqn_model_set2.zip")
finally:
    # Release the emulator even if training is interrupted or fails.
    env.close()


Using cuda device
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.




Logging to ./dqn_tensorboard/DQN_3
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 276      |
|    ep_rew_mean      | 1.75     |
|    exploration_rate | 0.801    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 215      |
|    time_elapsed     | 5        |
|    total_timesteps  | 1105     |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 3.01e-05 |
|    n_updates        | 251      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 245      |
|    ep_rew_mean      | 1.38     |
|    exploration_rate | 0.648    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 220      |
|    time_elapsed     | 8        |
|    total_timesteps  | 1957     |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss           

## SET 3 (Steven)

In [None]:
from stable_baselines3 import DQN
from stable_baselines3.common.monitor import Monitor
import gymnasium as gym
import os

# SET 3: CNN DQN with the highest learning rate (1e-3), gamma 0.95, batch size 32.

# Creating the environment; Monitor records per-episode reward and length.
env_id = "Breakout-v4"
env = Monitor(gym.make(env_id, render_mode='rgb_array'))

# Defining the model using a Convolutional Policy (CNN)
model = DQN(
    policy="CnnPolicy",
    env=env,
    learning_rate=0.001,
    gamma=0.95,
    batch_size=32,
    buffer_size=100_000,
    exploration_initial_eps=1.0,
    exploration_final_eps=0.1,
    exploration_fraction=0.1,
    verbose=1,
    tensorboard_log="./dqn_tensorboard/"
)

try:
    # Training the agent for 50,000 steps
    model.learn(total_timesteps=50000)

    # Saving the model. "models/dqn_model.zip" is kept because the download
    # and playback cells read it, but the Set 1 cell writes that path too,
    # so a set-specific copy makes it unambiguous which run a file holds.
    os.makedirs("models", exist_ok=True)
    model.save("models/dqn_model.zip")
    model.save("models/dqn_model_set3.zip")
finally:
    # Release the emulator even if training is interrupted or fails.
    env.close()


Using cuda device
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.




Logging to ./dqn_tensorboard/DQN_3
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 276      |
|    ep_rew_mean      | 1.75     |
|    exploration_rate | 0.801    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 215      |
|    time_elapsed     | 5        |
|    total_timesteps  | 1105     |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 3.01e-05 |
|    n_updates        | 251      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 245      |
|    ep_rew_mean      | 1.38     |
|    exploration_rate | 0.648    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 220      |
|    time_elapsed     | 8        |
|    total_timesteps  | 1957     |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss           

## SET 4 (Peter)


In [None]:
from stable_baselines3 import DQN
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.atari_wrappers import AtariWrapper
import gymnasium as gym
import os

# SET 4: CNN DQN on ALE/Breakout-v5 with the Atari preprocessing wrapper,
# a small replay buffer and a longer exploration schedule.
env_id = "ALE/Breakout-v5"

# Monitor goes directly around the raw game and AtariWrapper on top of it.
# With its defaults AtariWrapper ends an episode on every lost life and clips
# rewards, so a Monitor placed outside it would log per-life episodes with
# clipped scores instead of whole games with the real score.
env = Monitor(gym.make(env_id, render_mode='rgb_array'))
env = AtariWrapper(env)
# NOTE(review): ALE v5 environments presumably already repeat each action for
# several frames, and AtariWrapper skips frames again by default. Confirm
# whether `frameskip=1` should be passed to gym.make here; any playback
# environment must then be built the same way.

model = DQN(
    policy="CnnPolicy",
    env=env,
    learning_rate=0.0003,
    gamma=0.97,
    batch_size=64,
    buffer_size=10_000,
    exploration_initial_eps=1.0,
    exploration_final_eps=0.05,
    exploration_fraction=0.3,
    verbose=1,
    tensorboard_log="./dqn_tensorboard/"
)

try:
    # Training the agent for 50,000 steps
    model.learn(total_timesteps=50000)

    # Saving the model under a set-specific name.
    os.makedirs("models", exist_ok=True)
    model.save("models/dqn_model_set4.zip")
finally:
    # Release the emulator even if training is interrupted or fails.
    env.close()


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
| train/              |          |
|    learning_rate    | 0.0003   |
|    loss             | 0.000371 |
|    n_updates        | 9075     |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 8.48     |
|    ep_rew_mean      | 0.42     |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 4368     |
|    fps              | 150      |
|    time_elapsed     | 242      |
|    total_timesteps  | 36437    |
| train/              |          |
|    learning_rate    | 0.0003   |
|    loss             | 0.000402 |
|    n_updates        | 9084     |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 8.45     |
|    ep_rew_mean      | 0.41     |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 4

In [None]:
# Download the saved CNN model archive from the Colab runtime.
# The file must already exist, i.e. a training cell that saves to
# "models/dqn_model.zip" has to run before this one.
from google.colab import files
files.download("models/dqn_model.zip")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# MLP POLICY

In [None]:
import os

import gymnasium as gym
from stable_baselines3 import DQN
from stable_baselines3.common.monitor import Monitor

# creating the environment; Monitor records per-episode reward and length.
env_id = "Breakout-v4"
env = Monitor(gym.make(env_id, render_mode='rgb_array'))

# Training a DQN agent using MlpPolicy for comparison.
# buffer_size is set explicitly to the value used by the CNN runs on this
# environment so that both sides of the comparison share one configuration.
# A 50,000-step run never stores more than 50,000 transitions, so this
# capacity still holds the whole history while avoiding the much larger
# library-default buffer of raw RGB frames.
model_mlp = DQN(
    policy="MlpPolicy",
    env=env,
    learning_rate=0.0001,
    gamma=0.99,
    batch_size=32,
    buffer_size=100_000,
    exploration_initial_eps=1.0,
    exploration_final_eps=0.1,
    exploration_fraction=0.1,
    verbose=1
)

try:
    # Training for 50,000 steps
    model_mlp.learn(total_timesteps=50000)

    # Saving the MLP model separately
    os.makedirs("models", exist_ok=True)
    model_mlp.save("models/dqn_model_mlp.zip")
finally:
    # Release the emulator even if training is interrupted or fails.
    env.close()



Using cuda device
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.




----------------------------------
| rollout/            |          |
|    ep_len_mean      | 226      |
|    ep_rew_mean      | 1        |
|    exploration_rate | 0.837    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 298      |
|    time_elapsed     | 3        |
|    total_timesteps  | 905      |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.0155   |
|    n_updates        | 201      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 228      |
|    ep_rew_mean      | 0.875    |
|    exploration_rate | 0.671    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 324      |
|    time_elapsed     | 5        |
|    total_timesteps  | 1827     |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.000241 |
|    n_updates      

In [None]:
# Download the saved MLP model archive from the Colab runtime.
# The file must already exist, i.e. the MLP training cell that saves to
# "models/dqn_model_mlp.zip" has to run before this one.
from google.colab import files
files.download("models/dqn_model_mlp.zip")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

### Model Comparison Table

| Metric                    | CNN Policy                     | MLP Policy                    |
|---------------------------|--------------------------------|-------------------------------|
| Average episode reward    | Approximately 1.37             | Approximately 1.33            |
| Average episode length    | Around 367 timesteps           | Around 345 timesteps          |
| Training steps            | 50,000                         | 50,000                        |
| Final exploration rate    | Decreased from 1.0 to 0.1      | Decreased from 1.0 to 0.1     |
| Model save path           | `models/dqn_model.zip`         | `models/dqn_model_mlp.zip`    |
| Training time             | ~3 minutes                     | ~3 minutes                    |
| Environment               | Breakout-v4 (Atari)            | Breakout-v4 (Atari)           |
| Policy type               | Convolutional Neural Network   | Multilayer Perceptron         |


**Conclusion:** The CnnPolicy slightly outperforms the MlpPolicy in this visual environment (Breakout-v4), with an average episode reward of approximately 1.37 versus 1.33. This is consistent with a CNN's ability to extract spatial features from image inputs. While the MLP showed stable learning, it converged more slowly and achieved slightly lower rewards.

## Agent Selection and Evaluation

After training and comparing both models, one with a Multilayer Perceptron (MLP) policy and the other with a Convolutional Neural Network (CNN) policy, we observed that the CNN-based agent consistently achieved higher average episode rewards and demonstrated better performance in the Breakout-v4 environment.

### Why we are using the CNN agent for playing

- CNNs are better suited for processing visual inputs like frames from Atari games.
- The CNN model achieved a higher mean reward over 50,000 timesteps than the MLP model.
- The exploration rate stabilized earlier and the agent showed more reliable gameplay.

Therefore, we proceed with the **CNN-based DQN agent** as the final model to demonstrate gameplay. The following code loads this model and runs it through 3 episodes to showcase its performance.


In [None]:
# Pin Gymnasium 0.29.1 together with its Atari extras for the playback cell.
# NOTE(review): per the pip output this replaces the gymnasium and ale-py
# versions that were already installed in the session; a runtime restart is
# presumably needed before the new versions are picked up — confirm.
!pip install gymnasium[atari,accept-rom-license]==0.29.1
# Download the Atari 2600 ROMs, accepting the license non-interactively.
!AutoROM --accept-license

Collecting gymnasium==0.29.1 (from gymnasium[accept-rom-license,atari]==0.29.1)
  Using cached gymnasium-0.29.1-py3-none-any.whl.metadata (10 kB)
Collecting ale-py~=0.8.1 (from shimmy[atari]<1.0,>=0.1.0; extra == "atari"->gymnasium[accept-rom-license,atari]==0.29.1)
  Using cached ale_py-0.8.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Using cached gymnasium-0.29.1-py3-none-any.whl (953 kB)
Using cached ale_py-0.8.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.7 MB)
Installing collected packages: gymnasium, ale-py
  Attempting uninstall: gymnasium
    Found existing installation: gymnasium 1.1.1
    Uninstalling gymnasium-1.1.1:
      Successfully uninstalled gymnasium-1.1.1
  Attempting uninstall: ale-py
    Found existing installation: ale-py 0.11.2
    Uninstalling ale-py-0.11.2:
      Successfully uninstalled ale-py-0.11.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed

AutoROM will download the Atari 2600 ROMs.
They will be installed to:
	/usr/local/lib/python3.11/dist-packages/AutoROM/roms

Existing ROMs will be overwritten.
Installed /usr/local/lib/python3.11/dist-packages/AutoROM/roms/adventure.bin
Installed /usr/local/lib/python3.11/dist-packages/AutoROM/roms/air_raid.bin
Installed /usr/local/lib/python3.11/dist-packages/AutoROM/roms/alien.bin
Installed /usr/local/lib/python3.11/dist-packages/AutoROM/roms/amidar.bin
Installed /usr/local/lib/python3.11/dist-packages/AutoROM/roms/assault.bin
Installed /usr/local/lib/python3.11/dist-packages/AutoROM/roms/asterix.bin
Installed /usr/local/lib/python3.11/dist-packages/AutoROM/roms/asteroids.bin
Installed /usr/local/lib/python3.11/dist-packages/AutoROM/roms/atlantis.bin
Installed /usr/local/lib/python3.11/dist-packages/AutoROM/roms/atlantis2.bin
Installed /usr/local/lib/python3.11/dist-packages/AutoROM/roms/backgammon.bin
Installed /usr/local/lib/python3.11/dist-packages/AutoROM/roms/bank_heist.bin
Inst

In [None]:
import os
import gymnasium as gym
import numpy as np
import imageio
import torch

from stable_baselines3 import DQN
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv, VecTransposeImage
from stable_baselines3.common.save_util import load_from_zip_file

from stable_baselines3 import DQN
import gymnasium as gym
import ale_py
import time
import numpy as np
import sys
import subprocess

def install_atari():
    """Ensure the Atari emulator bindings are importable.

    If ``ale_py`` can be imported the function returns immediately.
    Otherwise it installs the Atari dependencies with pip for the running
    interpreter and then terminates the process, because the freshly
    installed packages are only usable after a restart.

    Raises:
        subprocess.CalledProcessError: If the pip invocation fails.
        SystemExit: After a successful installation (exit status 0).
    """
    try:
        import ale_py  # noqa: F401  (imported only to probe availability)
    except ImportError:
        print("Installing Atari dependencies...")
        subprocess.check_call([
            sys.executable, "-m", "pip", "install",
            "gymnasium[atari]", "gymnasium[accept-rom-license]", "ale-py"
        ])
        print("Please restart the script after installation.")
        # sys.exit is used instead of the bare `exit()` helper, which is
        # injected by the `site` module for interactive sessions and is not
        # guaranteed to exist in every execution environment.
        sys.exit(0)

_FIRE_ACTION = 1  # Action index the playback loop uses to launch the ball.


def _step(env, action):
    """Advance *env* by one action and merge the two end-of-episode flags.

    Returns:
        A tuple ``(obs, reward, done, info)`` where ``done`` is
        ``terminated or truncated``.
    """
    obs, reward, terminated, truncated, info = env.step(action)
    return obs, reward, terminated or truncated, info


def play_model(model_path="models/dqn_model.zip"):
    """Load a saved DQN agent and let it play three rendered episodes.

    The playback environment is ``ALE/Breakout-v5`` wrapped in
    ``AtariWrapper``, so the model must have been trained on observations
    produced by that wrapper.

    NOTE(review): the default ``model_path`` is the file written by the
    training cells that do NOT apply ``AtariWrapper``; the cell that does
    apply it saves to "models/dqn_model_set4.zip". Loading a model whose
    observation space differs from this environment is expected to fail —
    confirm which file is intended and pass it explicitly if needed.

    Args:
        model_path: Path of the saved model archive to load.

    Returns:
        A list with the total reward of each episode, or ``None`` if the
        environment could not be created.
    """
    install_atari()

    try:
        # Using the same environment as training with AtariWrapper
        from stable_baselines3.common.atari_wrappers import AtariWrapper
        env = gym.make("ALE/Breakout-v5", render_mode="human")
        env = AtariWrapper(env)  # Apply the same wrappers as training
    except Exception as e:
        print(f"Error creating environment: {e}")
        return None

    episode_rewards = []

    # try/finally guarantees the emulator window is closed even if loading
    # the model or stepping the environment raises.
    try:
        model = DQN.load(model_path, env=env)

        for episode in range(3):
            obs, _ = env.reset()

            # Launch ball; keep the reward and done flag of this step
            # instead of discarding them.
            obs, reward, done, info = _step(env, _FIRE_ACTION)
            total_reward = reward
            # Remember the life counter so that only a *decrease* triggers a
            # relaunch. Comparing against a fixed 5 would fire after every
            # agent action once the first life is gone.
            lives = info.get('lives')

            while not done:
                action, _ = model.predict(obs, deterministic=True)
                obs, reward, done, info = _step(env, int(action))
                total_reward += reward

                remaining = info.get('lives', lives)
                if not done and lives is not None and remaining < lives:
                    # Auto-relaunch after a lost life, accounting for the
                    # outcome of the extra step.
                    obs, reward, done, info = _step(env, _FIRE_ACTION)
                    total_reward += reward
                    remaining = info.get('lives', remaining)
                lives = remaining

                env.render()
                time.sleep(0.02)  # Slow playback to a watchable speed.

            episode_rewards.append(total_reward)
            print(f"Episode {episode+1} - Score: {total_reward}")
    finally:
        env.close()

    return episode_rewards

if __name__ == "__main__":
    # play_model returns None when the environment cannot be created, and
    # np.mean / np.std cannot summarise None, so guard before reporting.
    rewards = play_model()
    if rewards:
        print("\nPerformance Summary:")
        print(f"Average Score: {np.mean(rewards):.1f} ± {np.std(rewards):.1f}")
    else:
        print("\nNo episodes were played; no performance summary available.")

Using cpu device


  logger.warn(
