## Install Required Libraries

In [1]:
!pip install memory_profiler psutil gymnasium ale-py tensorflow matplotlib

Collecting memory_profiler
  Downloading memory_profiler-0.61.0-py3-none-any.whl.metadata (20 kB)
Collecting ale-py
  Downloading ale_py-0.10.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.2 kB)
Downloading memory_profiler-0.61.0-py3-none-any.whl (31 kB)
Downloading ale_py-0.10.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m70.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: memory_profiler, ale-py
Successfully installed ale-py-0.10.2 memory_profiler-0.61.0


In [2]:
import os
import ale_py
import gymnasium as gym
import numpy as np
import tensorflow as tf
from tensorflow.keras import Model, layers
from collections import deque, namedtuple
import random
import time
import gc
import matplotlib.pyplot as plt
import psutil

class TrainingConfig:
    """Hyperparameters for the Ms. Pac-Man DQN training run.

    Settings that never vary are class attributes. The memory-sensitive
    settings (replay buffer size, batch size, frame-stack depth, learning
    rate) are chosen per instance from the RAM that ``psutil`` reports as
    available at construction time.
    """

    CHECKPOINT_DIR = "./pacman_models"   # directory for checkpoints and the plot
    TARGET_UPDATE_FREQ = 1000            # steps between target-network syncs
    TOTAL_EPISODES = 1000                # number of training episodes
    CHECKPOINT_FREQ = 100                # episodes between checkpoints
    MIN_REPLAY_HISTORY = 10000           # transitions gathered before training

    def __init__(self):
        """Select a memory profile, set the remaining values, print a summary."""
        memory = psutil.virtual_memory()

        # More than 12 GiB free -> larger profile; otherwise the smaller one.
        if memory.available > 12 * 1024**3:
            profile = (80000, 64, 4, 0.00025)
        else:
            profile = (50000, 32, 3, 0.0001)
        (self.REPLAY_BUFFER_SIZE,
         self.BATCH_SIZE,
         self.FRAME_STACK_SIZE,
         self.LEARNING_RATE) = profile

        # Exploration schedule, discount factor and emulator frame skip.
        self.EPSILON_START = 1.0
        self.EPSILON_END = 0.01
        self.EPSILON_DECAY_STEPS = 200000
        self.GAMMA = 0.99
        self.FRAME_SKIP = 4

        summary_lines = (
            f"Using configuration (RAM available: {memory.available/1024**3:.1f}GB):",
            f"- Replay buffer: {self.REPLAY_BUFFER_SIZE}",
            f"- Batch size: {self.BATCH_SIZE}",
            f"- Frame stack: {self.FRAME_STACK_SIZE}",
        )
        for line in summary_lines:
            print(line)

# One replay-memory transition: (state, action, reward, next_state, done).
Experience = namedtuple(
    "Experience",
    ("state", "action", "reward", "next_state", "done"),
)

class ReplayBuffer:
    """Bounded first-in-first-out store of experiences with random sampling."""

    def __init__(self, capacity):
        """Create an empty buffer holding at most ``capacity`` experiences."""
        # A deque with a maxlen drops its oldest entry when a new one is
        # appended to a full buffer.
        self.buffer = deque([], capacity)

    def add(self, experience):
        """Append ``experience`` as the newest entry."""
        self.buffer.append(experience)

    def sample(self, batch_size):
        """Return a list of ``batch_size`` entries drawn without replacement.

        Raises:
            ValueError: propagated from ``random.sample`` when ``batch_size``
                exceeds the number of stored entries.
        """
        drawn = random.sample(self.buffer, batch_size)
        return drawn

    def __len__(self):
        """Number of experiences currently stored."""
        stored = self.buffer
        return len(stored)

class PacManDQN(Model):
    """Convolutional Q-network.

    Three ReLU convolution layers feed a flattened 512-unit ReLU dense layer,
    followed by a linear output layer with one unit per action.
    """

    def __init__(self, action_size=9):
        """Build the layers; ``action_size`` is the width of the output layer."""
        super().__init__()
        self.conv1 = layers.Conv2D(filters=32, kernel_size=(8, 8), strides=4, activation='relu')
        self.conv2 = layers.Conv2D(filters=64, kernel_size=(4, 4), strides=2, activation='relu')
        self.conv3 = layers.Conv2D(filters=64, kernel_size=(3, 3), strides=1, activation='relu')
        self.flatten = layers.Flatten()
        self.dense = layers.Dense(units=512, activation='relu')
        self.output_layer = layers.Dense(units=action_size, activation='linear')

    def call(self, x):
        """Run ``x`` through every layer in order and return the final output."""
        pipeline = (
            self.conv1,
            self.conv2,
            self.conv3,
            self.flatten,
            self.dense,
            self.output_layer,
        )
        for layer in pipeline:
            x = layer(x)
        return x

class DQNAgent:
    """DQN agent that trains a ``PacManDQN`` on ``ALE/MsPacman-v5``.

    The agent owns the environment, an online and a target Q-network, an Adam
    optimizer and a replay buffer. ``train()`` runs the whole procedure:
    random-policy warm-up, epsilon-greedy episodes with periodic gradient
    updates and target syncs, checkpointing, and a final reward plot.
    """

    # (height, width) of a preprocessed frame; must agree with preprocess_frame.
    FRAME_SHAPE = (88, 80)
    # One gradient update is performed every this many environment steps.
    TRAIN_EVERY_N_STEPS = 4
    # Progress is printed every this many episodes (and after episode 1);
    # the same value is the window of the printed running average.
    LOG_EVERY_N_EPISODES = 10

    def __init__(self, config):
        """Create the environment, networks, optimizer and replay buffer.

        Args:
            config: settings object providing CHECKPOINT_DIR, FRAME_SKIP,
                FRAME_STACK_SIZE, LEARNING_RATE, REPLAY_BUFFER_SIZE and the
                EPSILON_* values (plus the values read later by ``train``).
        """
        self.config = config
        os.makedirs(self.config.CHECKPOINT_DIR, exist_ok=True)

        self.env = gym.make("ALE/MsPacman-v5",
                            render_mode="rgb_array",
                            frameskip=self.config.FRAME_SKIP,
                            repeat_action_probability=0.0)

        self.action_size = self.env.action_space.n
        self.online_net = PacManDQN(self.action_size)
        self.target_net = PacManDQN(self.action_size)

        # A dummy forward pass makes both networks create their weights.
        dummy_input = tf.random.normal(
            (1, *self.FRAME_SHAPE, self.config.FRAME_STACK_SIZE))
        _ = self.online_net(dummy_input)
        _ = self.target_net(dummy_input)
        # Start the target as an exact copy of the online network; otherwise
        # the first TARGET_UPDATE_FREQ steps bootstrap from unrelated weights.
        self.update_target_network()

        self.optimizer = tf.keras.optimizers.Adam(learning_rate=self.config.LEARNING_RATE)
        # Built once here instead of once per gradient update.
        self.loss_fn = tf.keras.losses.Huber()
        self.replay_buffer = ReplayBuffer(self.config.REPLAY_BUFFER_SIZE)
        self.steps = 0
        self.episode_rewards = []

        # Linear epsilon decay: amount subtracted per training step.
        self.epsilon_decay = (
            (self.config.EPSILON_START - self.config.EPSILON_END)
            / self.config.EPSILON_DECAY_STEPS
        )

    def preprocess_frame(self, frame):
        """Crop, downsample and grayscale one frame.

        Takes every second row from 1 up to 175 and every second column, then
        averages the last (channel) axis and scales by 1/255. For a 210x160
        RGB frame this gives an 88x80 float32 array.
        """
        frame = frame[1:176:2, ::2]
        return np.mean(frame, axis=-1, dtype=np.float32) / 255.0

    def get_epsilon(self):
        """Current exploration rate: linear in ``self.steps``, floored at EPSILON_END."""
        return max(self.config.EPSILON_END,
                   self.config.EPSILON_START - self.steps * self.epsilon_decay)

    def update_target_network(self):
        """Copy the online network's weights into the target network."""
        self.target_net.set_weights(self.online_net.get_weights())

    def remember(self, state, action, reward, next_state, done):
        """Store one transition in the replay buffer."""
        self.replay_buffer.add(Experience(state, action, reward, next_state, done))

    def act(self, state):
        """Pick an action epsilon-greedily and return it as a Python int."""
        if random.random() < self.get_epsilon():
            return random.randint(0, self.action_size - 1)

        q_values = self.online_net(np.expand_dims(state, axis=0))
        # int() keeps the return type the same on both branches.
        return int(np.argmax(q_values.numpy()[0]))

    def create_initial_state(self, frame):
        """Build a state by repeating ``frame`` FRAME_STACK_SIZE times on a new last axis."""
        return np.stack([frame] * self.config.FRAME_STACK_SIZE, axis=-1)

    def update_state(self, state, new_frame):
        """Return a new state with the oldest frame dropped and ``new_frame`` appended."""
        return np.concatenate([state[..., 1:], np.expand_dims(new_frame, axis=-1)], axis=-1)

    def _reset_env(self):
        """Reset the environment and return the initial stacked state."""
        frame, _ = self.env.reset()
        return self.create_initial_state(self.preprocess_frame(frame))

    def _step_env(self, state, action):
        """Apply ``action``, store the transition, and report the outcome.

        Returns:
            (next_state, reward, episode_over), where ``episode_over`` is true
            when the environment reported termination OR truncation.
        """
        next_frame, reward, terminated, truncated, _ = self.env.step(action)
        next_state = self.update_state(state, self.preprocess_frame(next_frame))
        # Only true termination is stored as `done`: a truncated episode did
        # not reach a terminal state, so its target should still bootstrap.
        self.remember(state, action, reward, next_state, terminated)
        return next_state, reward, terminated or truncated

    def train(self):
        """Run warm-up, the full training loop, final checkpoint and plot."""
        print("Starting training...")
        start_time = time.time()

        # Warm-up: fill the replay buffer using uniformly random actions.
        print(f"Warming up replay buffer (min: {self.config.MIN_REPLAY_HISTORY})...")
        state = self._reset_env()
        while len(self.replay_buffer) < self.config.MIN_REPLAY_HISTORY:
            action = self.env.action_space.sample()
            state, _, episode_over = self._step_env(state, action)
            if episode_over:
                state = self._reset_env()

        print(f"Starting main training with {len(self.replay_buffer)} samples...")

        for episode in range(1, self.config.TOTAL_EPISODES + 1):
            state = self._reset_env()
            total_reward = 0
            episode_over = False

            while not episode_over:
                action = self.act(state)
                state, reward, episode_over = self._step_env(state, action)
                total_reward += reward
                self.steps += 1

                if (len(self.replay_buffer) >= self.config.BATCH_SIZE
                        and self.steps % self.TRAIN_EVERY_N_STEPS == 0):
                    self.replay()

                if self.steps % self.config.TARGET_UPDATE_FREQ == 0:
                    self.update_target_network()

            self.episode_rewards.append(total_reward)

            if episode % self.config.CHECKPOINT_FREQ == 0:
                self.save_checkpoint(episode)
                gc.collect()

            if episode % self.LOG_EVERY_N_EPISODES == 0 or episode == 1:
                avg_reward = np.mean(self.episode_rewards[-self.LOG_EVERY_N_EPISODES:])
                print(f"Ep {episode:4d} | R: {total_reward:6.1f} | "
                      f"Avg R: {avg_reward:6.1f} | ε: {self.get_epsilon():.3f} | "
                      f"Steps: {self.steps}")

        self.save_checkpoint('final')
        total_time = time.time() - start_time
        print(f"\nTraining completed in {total_time/60:.2f} minutes")
        # With zero episodes there is nothing to summarise; np.max([]) would raise.
        if self.episode_rewards:
            print(f"Average reward: {np.mean(self.episode_rewards):.1f}")
            print(f"Best reward: {np.max(self.episode_rewards):.1f}")
        self.plot_training()

    def replay(self):
        """Sample one batch and apply a single Huber-loss gradient step."""
        batch = self.replay_buffer.sample(self.config.BATCH_SIZE)
        states = np.stack([e.state for e in batch])
        actions = np.array([e.action for e in batch])
        rewards = np.array([e.reward for e in batch], dtype=np.float32)
        next_states = np.stack([e.next_state for e in batch])
        dones = np.array([e.done for e in batch], dtype=np.float32)

        # Targets are constants for this update, so they are computed outside
        # the tape: nothing from the target network needs to be recorded.
        next_q = self.target_net(next_states)
        target_q = rewards + (1 - dones) * self.config.GAMMA * tf.reduce_max(next_q, axis=1)

        with tf.GradientTape() as tape:
            current_q = self.online_net(states)
            # The one-hot mask selects the Q-value of the action actually taken.
            current_action_q = tf.reduce_sum(
                current_q * tf.one_hot(actions, self.action_size),
                axis=1
            )
            loss = self.loss_fn(target_q, current_action_q)

        grads = tape.gradient(loss, self.online_net.trainable_variables)
        self.optimizer.apply_gradients(zip(grads, self.online_net.trainable_variables))

    def save_checkpoint(self, episode):
        """Save the online network (without optimizer state) under CHECKPOINT_DIR.

        Args:
            episode: episode number or label embedded in the file name.
        """
        path = os.path.join(self.config.CHECKPOINT_DIR, f"pacman_ep{episode}.keras")
        self.online_net.save(path, include_optimizer=False)
        print(f"Saved checkpoint: {path}")

    def plot_training(self):
        """Write the per-episode reward curve to ``training_results.png``."""
        fig, ax = plt.subplots(figsize=(12, 5))
        ax.plot(self.episode_rewards)
        ax.set_title("Training Progress")
        ax.set_xlabel("Episode")
        ax.set_ylabel("Reward")

        plot_path = os.path.join(self.config.CHECKPOINT_DIR, "training_results.png")
        fig.savefig(plot_path)
        # Close the figure so repeated calls do not accumulate open figures.
        plt.close(fig)
        print(f"Saved training plot: {plot_path}")

if __name__ == "__main__":
    # Drop leftovers from a previous run of this cell before building new
    # models: collect garbage and reset Keras' global state.
    gc.collect()
    tf.keras.backend.clear_session()

    # The former "pip install on ImportError" fallback was removed: ale_py is
    # already imported unconditionally at the top of this cell, so the
    # fallback could never trigger, and it did not re-import after installing.
    config = TrainingConfig()
    agent = DQNAgent(config)
    try:
        agent.train()
    finally:
        # Release the emulator even when training is interrupted or fails.
        agent.env.close()

  File "/usr/local/lib/python3.10/dist-packages/gymnasium/envs/registration.py", line 594, in load_plugin_envs
    fn()
  File "/usr/local/lib/python3.10/dist-packages/shimmy/registration.py", line 304, in register_gymnasium_envs
    _register_atari_envs()
  File "/usr/local/lib/python3.10/dist-packages/shimmy/registration.py", line 244, in _register_atari_envs
    _register_atari_configs(
  File "/usr/local/lib/python3.10/dist-packages/shimmy/registration.py", line 168, in _register_atari_configs
    from ale_py.roms import utils as rom_utils
ImportError: cannot import name 'utils' from 'ale_py.roms' (/usr/local/lib/python3.10/dist-packages/ale_py/roms/__init__.py)
[0m
  logger.warn(f"plugin: {plugin.value} raised {traceback.format_exc()}")
  from jax import xla_computation as _xla_computation


Using configuration (RAM available: 30.1GB):
- Replay buffer: 80000
- Batch size: 64
- Frame stack: 4
Starting training...
Warming up replay buffer (min: 10000)...
Starting main training with 10000 samples...
Ep    1 | R:  310.0 | Avg R:  310.0 | ε: 0.997 | Steps: 513
Ep   10 | R:  190.0 | Avg R:  252.0 | ε: 0.975 | Steps: 5014
Ep   20 | R:  350.0 | Avg R:  285.0 | ε: 0.950 | Steps: 10092
Ep   30 | R:  640.0 | Avg R:  283.0 | ε: 0.924 | Steps: 15334
Ep   40 | R:  280.0 | Avg R:  332.0 | ε: 0.898 | Steps: 20546
Ep   50 | R:  240.0 | Avg R:  239.0 | ε: 0.875 | Steps: 25260
Ep   60 | R:  220.0 | Avg R:  314.0 | ε: 0.849 | Steps: 30508
Ep   70 | R:  120.0 | Avg R:  294.0 | ε: 0.824 | Steps: 35650
Ep   80 | R:  210.0 | Avg R:  278.0 | ε: 0.798 | Steps: 40850
Ep   90 | R:  120.0 | Avg R:  313.0 | ε: 0.771 | Steps: 46288
Saved checkpoint: ./pacman_models/pacman_ep100.keras
Ep  100 | R:  170.0 | Avg R:  279.0 | ε: 0.744 | Steps: 51696
Ep  110 | R:  230.0 | Avg R:  228.0 | ε: 0.719 | Steps: 568