# DQN Training for Q*bert - Kaggle Notebook

This notebook trains 3 DQN variants (Vanilla, Double, Dueling) on Q*bert and logs metrics for comparison.

**Algorithms:**
- Vanilla DQN (Mnih et al., 2015)
- Double DQN (Van Hasselt et al., 2015)
- Dueling DQN (Wang et al., 2015) — dueling network architecture, trained here with the Double DQN update rule

**Expected Performance:**
- Vanilla DQN: ~734 avg score
- Double DQN: ~1,428 avg score
- Dueling DQN: ~2,256 avg score

## 1. Install Dependencies

In [None]:
# Quiet (-q) installs of the packages imported in the cells below.
# Gymnasium with its Atari extra; ale_py is imported later and is presumably
# provided by this extra — TODO confirm against the installed version.
!pip install -q gymnasium[atari]
# Provides cv2, used for grayscale conversion and resizing of frames.
!pip install -q opencv-python
!pip install -q tqdm
!pip install -q pandas
# AutoROM package plus its CLI call with the ROM license accepted non-interactively.
!pip install -q autorom[accept-rom-license]
!AutoROM --accept-license

## 2. Imports

In [None]:
import gymnasium as gym
import ale_py
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import cv2
import random
import time
import csv
import json
import os
from collections import deque
from datetime import datetime
from tqdm import tqdm

gym.register_envs(ale_py)
print("✓ Imports complete")

## 3. Configuration

In [None]:
# Hyperparameters

# --- Observation / action space ---
FRAME_STACK = 4    # frames stacked into one state; also the networks' input channel count
# NOTE: preprocess_frame does not read these two constants; keep them in sync
# with the size it resizes to.
FRAME_WIDTH = 84
FRAME_HEIGHT = 84
NUM_ACTIONS = 6    # network output size, and range of the random exploratory action

# --- Optimisation ---
LEARNING_RATE = 0.0001   # Adam learning rate
GAMMA = 0.99             # discount factor in the TD target
BATCH_SIZE = 32          # transitions sampled per gradient step
UPDATE_FREQUENCY = 4     # one gradient step every N environment steps
TARGET_UPDATE = 1000     # copy policy weights into the target net every N environment steps

# --- Exploration: epsilon decreases linearly per environment step ---
EPSILON_START = 1.0
EPSILON_END = 0.01
EPSILON_DECAY_STEPS = 500000   # steps needed to go from EPSILON_START to EPSILON_END

# --- Replay memory ---
REPLAY_BUFFER_SIZE = 100000   # maximum number of stored transitions
MIN_REPLAY_SIZE = 10000       # gradient steps start once the buffer holds more than this

# --- Run length, checkpointing, logging ---
TOTAL_EPISODES = 5000  # Adjust based on Kaggle time limit
MAX_STEPS_PER_EPISODE = 10000   # hard cap on environment steps within one episode
SAVE_FREQUENCY = 500            # write a checkpoint every N episodes
STEP_LOG_FREQUENCY = 10         # write a step_log.csv row every N environment steps

# Train on the GPU when one is visible to PyTorch, otherwise on the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"✓ Using device: {DEVICE}")

## 4. Preprocessing Utils

In [None]:
def preprocess_frame(frame, width=84, height=84):
    """Convert an RGB frame to a normalized float32 grayscale image.

    Args:
        frame: RGB image array as returned by the environment
            (e.g. a 210x160x3 Atari frame).
        width: Output width in pixels.
        height: Output height in pixels.

    Returns:
        A 2-D ``float32`` array of shape ``(height, width)`` with values
        in ``[0, 1]``.
    """
    gray = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
    # cv2.resize takes the target size as (width, height).
    resized = cv2.resize(gray, (width, height), interpolation=cv2.INTER_AREA)
    # Cast before dividing so the result stays float32. Dividing the uint8
    # image by a Python float would yield float64, doubling the memory of
    # every frame kept in the replay buffer for no benefit: the networks
    # convert their input to float32 anyway.
    return resized.astype(np.float32) / 255.0

class FrameStack:
    """Rolling window over the most recent preprocessed frames.

    The window is a bounded deque, so pushing a new frame silently drops
    the oldest one once ``num_frames`` frames are held.
    """

    def __init__(self, num_frames=4):
        self.num_frames = num_frames
        self.frames = deque(maxlen=num_frames)

    def reset(self, frame):
        """Fill the whole window with the first frame of an episode.

        Returns the stacked state built from ``num_frames`` copies of the
        preprocessed frame.
        """
        first = preprocess_frame(frame)
        # The deque is bounded, so extending by a full window's worth of
        # entries replaces whatever was left over from a previous episode.
        self.frames.extend([first] * self.num_frames)
        return self._get_state()

    def step(self, frame):
        """Push one new frame and return the updated stacked state."""
        self.frames.append(preprocess_frame(frame))
        return self._get_state()

    def _get_state(self):
        """Stack the held frames along a new leading axis (oldest first)."""
        window = tuple(self.frames)
        return np.stack(window, axis=0)

print("✓ Preprocessing utils defined")

## 5. Replay Buffer

In [None]:
class ReplayBuffer:
    """Fixed-capacity experience store that overwrites its oldest entries.

    Transitions are kept as ``(state, action, reward, next_state, done)``
    tuples in a plain list; ``position`` is the slot the next push writes to.
    """

    def __init__(self, capacity=100000):
        self.capacity = capacity
        self.buffer = []
        self.position = 0

    def push(self, state, action, reward, next_state, done):
        """Store one transition, overwriting the oldest once full."""
        slot = self.position
        # Grow the list by one placeholder until capacity is reached; after
        # that the write below simply replaces an existing entry.
        if len(self.buffer) < self.capacity:
            self.buffer.append(None)
        self.buffer[slot] = (state, action, reward, next_state, done)
        self.position = (slot + 1) % self.capacity

    def sample(self, batch_size=32):
        """Draw ``batch_size`` distinct transitions uniformly at random.

        Returns a 5-tuple of arrays ``(states, actions, rewards,
        next_states, dones)``; rewards and dones are float32.
        """
        picked = random.sample(self.buffer, batch_size)
        # Transpose the list of transitions into one sequence per field.
        s, a, r, s_next, d = zip(*picked)
        rewards = np.array(r, dtype=np.float32)
        dones = np.array(d, dtype=np.float32)
        return (np.array(s), np.array(a), rewards, np.array(s_next), dones)

    def get_reward_stats(self):
        """Return min / max / mean / std of all stored rewards.

        An empty buffer yields zeros for every statistic.
        """
        if not self.buffer:
            return {'min': 0, 'max': 0, 'mean': 0, 'std': 0}
        rewards = np.asarray([transition[2] for transition in self.buffer])
        return {
            'min': rewards.min(),
            'max': rewards.max(),
            'mean': rewards.mean(),
            'std': rewards.std(),
        }

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.buffer)

print("✓ Replay buffer defined")

## 6. Logger

In [None]:
class DQNLogger:
    """CSV/JSON logger for one training run.

    Creates ``training_log.csv`` (one row per episode) and ``step_log.csv``
    (one row per logged step) inside ``log_dir``, both opened for writing
    at construction time. Call ``close()`` when the run is finished.
    """

    _EPISODE_COLUMNS = [
        'episode', 'total_reward', 'episode_length', 'avg_loss', 'avg_q_value',
        'epsilon', 'timestamp', 'total_steps', 'training_time_seconds',
        'level_reached', 'buffer_reward_min', 'buffer_reward_max',
        'buffer_reward_mean', 'buffer_reward_std',
    ]
    _STEP_COLUMNS = [
        'global_step', 'episode', 'step_in_episode', 'action',
        'reward', 'loss', 'q_value', 'epsilon', 'timestamp',
    ]

    def __init__(self, log_dir, algorithm_name):
        self.log_dir = log_dir
        self.algorithm_name = algorithm_name
        os.makedirs(log_dir, exist_ok=True)

        # Per-episode log, header written immediately.
        self.episode_log_path = os.path.join(log_dir, "training_log.csv")
        self.episode_log_file = open(self.episode_log_path, 'w', newline='')
        self.episode_writer = csv.writer(self.episode_log_file)
        self.episode_writer.writerow(self._EPISODE_COLUMNS)

        # Per-step log, header written immediately.
        self.step_log_path = os.path.join(log_dir, "step_log.csv")
        self.step_log_file = open(self.step_log_path, 'w', newline='')
        self.step_writer = csv.writer(self.step_log_file)
        self.step_writer.writerow(self._STEP_COLUMNS)

        self.episode_metrics = self._empty_metrics()

    @staticmethod
    def _empty_metrics():
        """Fresh accumulators for the values averaged at episode end."""
        return {'losses': [], 'q_values': [], 'rewards': []}

    @staticmethod
    def _mean_or_zero(values):
        """Arithmetic mean of ``values``, or 0 when there are none."""
        if not values:
            return 0
        return sum(values) / len(values)

    def log_step(self, global_step, episode, step_in_episode, action,
                 reward, loss, q_value, epsilon):
        """Append one row to the step log and accumulate episode metrics.

        A ``loss`` of None is written as an empty cell and left out of the
        episode average; a ``q_value`` of None is likewise not accumulated.
        """
        loss_cell = '' if loss is None else loss
        row = [
            global_step, episode, step_in_episode, action, reward,
            loss_cell, q_value, epsilon,
            datetime.now().isoformat(),
        ]
        self.step_writer.writerow(row)

        metrics = self.episode_metrics
        metrics['rewards'].append(reward)
        for key, value in (('losses', loss), ('q_values', q_value)):
            if value is not None:
                metrics[key].append(value)

    def log_episode(self, episode, total_reward, episode_length, epsilon,
                    total_steps, training_time, level_reached=1,
                    buffer_reward_stats=None):
        """Write the episode summary row, flush both files, reset metrics.

        ``buffer_reward_stats`` is a dict with 'min', 'max', 'mean' and
        'std' keys; when omitted, zeros are logged for all four.
        """
        avg_loss = self._mean_or_zero(self.episode_metrics['losses'])
        avg_q = self._mean_or_zero(self.episode_metrics['q_values'])

        stats = buffer_reward_stats
        if stats is None:
            stats = {'min': 0, 'max': 0, 'mean': 0, 'std': 0}

        summary = [
            episode, total_reward, episode_length, avg_loss, avg_q, epsilon,
            datetime.now().isoformat(), total_steps, training_time, level_reached,
        ]
        summary.extend(stats[key] for key in ('min', 'max', 'mean', 'std'))
        self.episode_writer.writerow(summary)

        # Push both logs to disk once per episode so a crash loses little.
        for handle in (self.episode_log_file, self.step_log_file):
            handle.flush()
        self.episode_metrics = self._empty_metrics()

    def save_config(self, config_dict):
        """Dump ``config_dict`` as indented JSON to ``config.json`` in the log dir."""
        with open(os.path.join(self.log_dir, "config.json"), 'w') as f:
            json.dump(config_dict, f, indent=2)

    def close(self):
        """Close both CSV files."""
        for handle in (self.episode_log_file, self.step_log_file):
            handle.close()

print("✓ Logger defined")

## 7. Neural Network Models

In [None]:
class VanillaDQN(nn.Module):
    """Three-layer convolutional Q-network with a single fully connected head.

    The first linear layer expects 64 feature maps of 7x7, which is what
    the three convolutions produce for 84x84 inputs.
    """

    def __init__(self, input_channels=4, num_actions=6):
        super().__init__()
        # Convolutional feature extractor.
        self.conv1 = nn.Conv2d(input_channels, 32, kernel_size=8, stride=4)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2)
        self.conv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1)
        # Fully connected head producing one Q-value per action.
        flat_features = 64 * 7 * 7
        self.fc1 = nn.Linear(flat_features, 512)
        self.fc2 = nn.Linear(512, num_actions)

    def forward(self, x):
        """Map a batch of stacked frames to a (batch, num_actions) Q-value tensor."""
        features = x
        for conv in (self.conv1, self.conv2, self.conv3):
            features = F.relu(conv(features))
        hidden = F.relu(self.fc1(features.flatten(start_dim=1)))
        return self.fc2(hidden)

class DoubleDQN(nn.Module):
    """Q-network used for the Double DQN run.

    Layer for layer this is the same architecture as VanillaDQN; the
    Double DQN difference is in how the agent builds its training target,
    not in the network.
    """

    def __init__(self, input_channels=4, num_actions=6):
        super().__init__()
        self.conv1 = nn.Conv2d(input_channels, 32, kernel_size=8, stride=4)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2)
        self.conv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1)
        # 64 channels x 7 x 7 spatial positions after the conv stack.
        conv_out_features = 64 * 7 * 7
        self.fc1 = nn.Linear(conv_out_features, 512)
        self.fc2 = nn.Linear(512, num_actions)

    def forward(self, x):
        """Return Q-values of shape (batch, num_actions) for input frames ``x``."""
        h = F.relu(self.conv1(x))
        h = F.relu(self.conv2(h))
        h = F.relu(self.conv3(h))
        h = h.flatten(start_dim=1)
        return self.fc2(F.relu(self.fc1(h)))

class DuelingDQN(nn.Module):
    """Convolutional Q-network with separate value and advantage streams.

    Both streams read the same flattened convolutional features. Their
    outputs are combined as ``V + (A - mean(A))``, so the mean Q-value
    over actions equals the value-stream output.
    """

    def __init__(self, input_channels=4, num_actions=6):
        super().__init__()
        # Shared convolutional trunk.
        self.conv1 = nn.Conv2d(input_channels, 32, kernel_size=8, stride=4)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2)
        self.conv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1)

        trunk_features = 64 * 7 * 7

        # State-value stream: one scalar per sample.
        self.value_fc1 = nn.Linear(trunk_features, 512)
        self.value_fc2 = nn.Linear(512, 1)

        # Advantage stream: one output per action.
        self.advantage_fc1 = nn.Linear(trunk_features, 512)
        self.advantage_fc2 = nn.Linear(512, num_actions)

    def forward(self, x):
        """Return Q-values of shape (batch, num_actions) for input frames ``x``."""
        trunk = x
        for conv in (self.conv1, self.conv2, self.conv3):
            trunk = F.relu(conv(trunk))
        trunk = trunk.flatten(start_dim=1)

        state_value = self.value_fc2(F.relu(self.value_fc1(trunk)))
        adv = self.advantage_fc2(F.relu(self.advantage_fc1(trunk)))

        # Centre the advantages per sample before adding the state value.
        centred_adv = adv - adv.mean(dim=1, keepdim=True)
        return state_value + centred_adv

print("✓ Models defined")
# Build throwaway instances with the default constructor arguments purely to
# report parameter counts. Only VanillaDQN is instantiated for the first line;
# DoubleDQN declares the same layers.
print(f"  Vanilla/Double DQN: {sum(p.numel() for p in VanillaDQN().parameters()):,} params")
print(f"  Dueling DQN: {sum(p.numel() for p in DuelingDQN().parameters()):,} params")

## 8. DQN Agents

In [None]:
class VanillaDQNAgent:
    """DQN agent with a policy network and a periodically synced target network.

    Args:
        policy_net: Network being trained; maps a batch of states to
            per-action Q-values of shape (batch, num_actions).
        target_net: Network with the same architecture. It is overwritten
            with the policy weights on construction and kept in eval mode.
        lr: Adam learning rate.
        gamma: Discount factor used in the TD target.
        device: Torch device both networks and all batches are moved to.
        num_actions: Size of the action set used for random exploration.
            When None it is read from the width of the policy network's
            output the first time an exploratory action is needed, so the
            agent always agrees with the network it was given.
    """

    def __init__(self, policy_net, target_net, lr=0.0001, gamma=0.99, device="cuda",
                 num_actions=None):
        self.policy_net = policy_net.to(device)
        self.target_net = target_net.to(device)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()
        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=lr)
        self.gamma = gamma
        self.device = device
        self.num_actions = num_actions

    def _state_batch(self, state):
        """Turn a single state array into a float32 batch of size 1 on the device."""
        tensor = torch.as_tensor(state, dtype=torch.float32, device=self.device)
        return tensor.unsqueeze(0)

    def _policy_q_values(self, state):
        """Q-values of the policy network for one state, without gradients."""
        with torch.no_grad():
            return self.policy_net(self._state_batch(state))

    def select_action(self, state, epsilon):
        """Epsilon-greedy action: random with probability ``epsilon``, else argmax Q."""
        if random.random() < epsilon:
            if self.num_actions is None:
                # Ask the network itself how many actions it scores instead
                # of trusting a module-level constant that may disagree with
                # how the network was constructed. Done once, then cached.
                self.num_actions = self._policy_q_values(state).shape[1]
            return random.randrange(self.num_actions)
        return self._policy_q_values(state).argmax(1).item()

    def get_max_q_value(self, state):
        """Largest Q-value the policy network assigns to ``state`` (a float)."""
        return self._policy_q_values(state).max().item()

    def train_step(self, replay_buffer, batch_size=32):
        """Run one gradient step on a sampled batch and return the loss as a float.

        The target is ``r + gamma * max_a Q_target(s', a)``, with the
        bootstrap term dropped for transitions whose done flag is 1.
        """
        states, actions, rewards, next_states, dones = replay_buffer.sample(batch_size)

        states = torch.as_tensor(states, dtype=torch.float32, device=self.device)
        actions = torch.as_tensor(actions, dtype=torch.int64, device=self.device)
        rewards = torch.as_tensor(rewards, dtype=torch.float32, device=self.device)
        next_states = torch.as_tensor(next_states, dtype=torch.float32, device=self.device)
        dones = torch.as_tensor(dones, dtype=torch.float32, device=self.device)

        # Q-value of the action that was actually taken in each transition.
        current_q = self.policy_net(states).gather(1, actions.unsqueeze(1)).squeeze(1)

        with torch.no_grad():
            next_q = self.target_net(next_states).max(1)[0]
            target_q = rewards + self.gamma * next_q * (1 - dones)

        loss = F.mse_loss(current_q, target_q)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        return loss.item()

    def update_target_network(self):
        """Copy the current policy weights into the target network."""
        self.target_net.load_state_dict(self.policy_net.state_dict())

class DoubleDQNAgent(VanillaDQNAgent):
    """Agent that replaces the max-based target with the Double DQN target.

    The policy network chooses the next action and the target network
    scores it; everything else is inherited from VanillaDQNAgent.
    """

    def train_step(self, replay_buffer, batch_size=32):
        """Run one gradient step on a sampled batch and return the loss as a float."""
        s, a, r, s_next, terminal = replay_buffer.sample(batch_size)

        dev = self.device
        s = torch.FloatTensor(s).to(dev)
        a = torch.LongTensor(a).to(dev)
        r = torch.FloatTensor(r).to(dev)
        s_next = torch.FloatTensor(s_next).to(dev)
        terminal = torch.FloatTensor(terminal).to(dev)

        # Q-value the policy network assigns to the action actually taken.
        q_taken = self.policy_net(s).gather(1, a.unsqueeze(1)).squeeze(1)

        with torch.no_grad():
            # Action selection by the policy net, evaluation by the target net.
            greedy = self.policy_net(s_next).argmax(dim=1, keepdim=True)
            bootstrap = self.target_net(s_next).gather(1, greedy).squeeze(1)
            td_target = r + self.gamma * bootstrap * (1 - terminal)

        loss = F.mse_loss(q_taken, td_target)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        return loss.item()

class DuelingDQNAgent(DoubleDQNAgent):
    """Agent used for the dueling run.

    Adds nothing to DoubleDQNAgent, so training uses the Double DQN update
    unchanged; any dueling behaviour has to come from the networks passed
    to the constructor.
    """
    pass

print("✓ Agents defined")

## 9. Training Function

In [None]:
def _save_checkpoint(path, policy_net, target_net, optimizer):
    """Write both networks' weights and the optimizer state to ``path``."""
    torch.save({
        'policy_net_state_dict': policy_net.state_dict(),
        'target_net_state_dict': target_net.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
    }, path)


def train_dqn(algorithm_name, model_class, agent_class, num_episodes=5000):
    """
    Train a DQN agent on ALE/Qbert-v5, logging metrics and saving checkpoints.

    Logs are written to ``logs/<algorithm_name>`` and checkpoints to
    ``checkpoints/<algorithm_name>``. The environment and the log files are
    closed even if training raises or is interrupted.

    Args:
        algorithm_name: "vanilla_dqn", "double_dqn", or "dueling_dqn"
        model_class: VanillaDQN, DoubleDQN, or DuelingDQN
        agent_class: VanillaDQNAgent, DoubleDQNAgent, or DuelingDQNAgent
        num_episodes: Number of episodes to train
    """
    log_dir = f"logs/{algorithm_name}"
    checkpoint_dir = f"checkpoints/{algorithm_name}"
    os.makedirs(log_dir, exist_ok=True)
    os.makedirs(checkpoint_dir, exist_ok=True)

    env = gym.make("ALE/Qbert-v5")
    logger = None
    try:
        policy_net = model_class(input_channels=FRAME_STACK, num_actions=NUM_ACTIONS)
        target_net = model_class(input_channels=FRAME_STACK, num_actions=NUM_ACTIONS)
        agent = agent_class(policy_net, target_net, lr=LEARNING_RATE, gamma=GAMMA, device=DEVICE)

        replay_buffer = ReplayBuffer(capacity=REPLAY_BUFFER_SIZE)
        logger = DQNLogger(log_dir, algorithm_name)
        logger.save_config({
            'algorithm': algorithm_name,
            'learning_rate': LEARNING_RATE,
            'gamma': GAMMA,
            'batch_size': BATCH_SIZE,
            'epsilon_start': EPSILON_START,
            'epsilon_end': EPSILON_END,
            'epsilon_decay_steps': EPSILON_DECAY_STEPS,
            'replay_buffer_size': REPLAY_BUFFER_SIZE,
            'target_update': TARGET_UPDATE,
        })

        global_step = 0
        epsilon = EPSILON_START
        # Linear schedule: subtract the same amount after every env step.
        epsilon_decay = (EPSILON_START - EPSILON_END) / EPSILON_DECAY_STEPS

        print(f"\n{'='*60}")
        print(f"Training {algorithm_name.upper()}")
        print(f"{'='*60}")

        for episode in tqdm(range(num_episodes), desc=f"Training {algorithm_name}"):
            episode_start_time = time.time()

            obs, info = env.reset()
            frame_stack = FrameStack(num_frames=FRAME_STACK)
            state = frame_stack.reset(obs)

            episode_reward = 0
            episode_length = 0
            level_reached = 1

            for step in range(MAX_STEPS_PER_EPISODE):
                action = agent.select_action(state, epsilon)
                next_obs, reward, terminated, truncated, info = env.step(action)
                done = terminated or truncated
                next_state = frame_stack.step(next_obs)

                # NOTE(review): level_reached only changes if the environment
                # puts a 'level' key in info; otherwise it stays at 1.
                if 'level' in info:
                    level_reached = max(level_reached, info['level'])

                # Store `terminated`, not `done`: a truncated episode was cut
                # off by a time limit rather than reaching a terminal state,
                # so its last transition must still bootstrap from next_state.
                replay_buffer.push(state, action, reward, next_state, terminated)

                loss = None
                if len(replay_buffer) > MIN_REPLAY_SIZE and global_step % UPDATE_FREQUENCY == 0:
                    loss = agent.train_step(replay_buffer, BATCH_SIZE)

                if global_step % TARGET_UPDATE == 0:
                    agent.update_target_network()

                if global_step % STEP_LOG_FREQUENCY == 0:
                    q_value = agent.get_max_q_value(state)
                    logger.log_step(global_step, episode, step, action, reward, loss, q_value, epsilon)

                state = next_state
                episode_reward += reward
                episode_length += 1
                global_step += 1
                epsilon = max(EPSILON_END, epsilon - epsilon_decay)

                if done:
                    break

            episode_time = time.time() - episode_start_time
            buffer_stats = replay_buffer.get_reward_stats()

            logger.log_episode(episode, episode_reward, episode_length, epsilon, global_step,
                               episode_time, level_reached, buffer_stats)

            if (episode + 1) % SAVE_FREQUENCY == 0:
                _save_checkpoint(os.path.join(checkpoint_dir, f"ep_{episode+1}.pth"),
                                 policy_net, target_net, agent.optimizer)

            if (episode + 1) % 100 == 0:
                print(f"\nEp {episode+1}/{num_episodes} | Reward: {episode_reward:.0f} | "
                      f"Steps: {episode_length} | Eps: {epsilon:.3f} | Time: {episode_time:.1f}s")

        _save_checkpoint(os.path.join(checkpoint_dir, "final.pth"),
                         policy_net, target_net, agent.optimizer)
    finally:
        # Release file handles and the emulator on success, on an exception,
        # and on a notebook interrupt alike.
        if logger is not None:
            logger.close()
        env.close()

    print(f"\n✓ {algorithm_name} training complete!")
    print(f"  Logs: {log_dir}")
    print(f"  Checkpoints: {checkpoint_dir}")

print("✓ Training function defined")

## 10. Train Vanilla DQN

In [None]:
# Vanilla run: writes to logs/vanilla_dqn and checkpoints/vanilla_dqn.
train_dqn("vanilla_dqn", VanillaDQN, VanillaDQNAgent, num_episodes=TOTAL_EPISODES)

## 11. Train Double DQN

In [None]:
# Double DQN run: writes to logs/double_dqn and checkpoints/double_dqn.
train_dqn("double_dqn", DoubleDQN, DoubleDQNAgent, num_episodes=TOTAL_EPISODES)

## 12. Train Dueling DQN

In [None]:
# Dueling run: writes to logs/dueling_dqn and checkpoints/dueling_dqn.
# Uses the shared TOTAL_EPISODES budget like the other two runs, so the
# per-algorithm averages in the summary are computed over equally long runs.
train_dqn("dueling_dqn", DuelingDQN, DuelingDQNAgent, num_episodes=TOTAL_EPISODES)

## 13. Download Logs

Zip the training logs and model checkpoints so they can be downloaded and visualized on your local machine.

In [None]:
# Zip logs and checkpoints for download (-r recurses into the per-algorithm subfolders)
!zip -r dqn_logs.zip logs/
!zip -r dqn_checkpoints.zip checkpoints/

print("✓ Logs and checkpoints zipped")
print("  Download: dqn_logs.zip, dqn_checkpoints.zip")

## 14. Quick Analysis

In [None]:
import pandas as pd

# Read the three episode-level logs up front, so a missing file fails
# before any part of the summary has been printed.
vanilla_df, double_df, dueling_df = (
    pd.read_csv(f"logs/{run}/training_log.csv")
    for run in ("vanilla_dqn", "double_dqn", "dueling_dqn")
)

# Summary header
banner = "=" * 60
print(banner)
print("TRAINING SUMMARY")
print(banner)

runs = (
    ("Vanilla DQN", vanilla_df),
    ("Double DQN", double_df),
    ("Dueling DQN", dueling_df),
)
for name, df in runs:
    rewards = df['total_reward']
    seconds = df['training_time_seconds']
    print(f"\n{name}:")
    print(f"  Average Reward: {rewards.mean():.2f} ± {rewards.std():.2f}")
    print(f"  Max Reward: {rewards.max():.2f}")
    print(f"  Average Training Time: {seconds.mean():.2f}s per episode")
    print(f"  Total Training Time: {seconds.sum() / 3600:.2f} hours")
    print(f"  Max Level Reached: {df['level_reached'].max()}")

print("\n" + banner)