In [56]:
import gymnasium as gym
import torch.nn as nn
import torch
from collections import deque
import random
from pathlib import Path
from torch import optim
import numpy as np
from datetime import datetime
import subprocess
import json
import time
import webbrowser

# Lunar landing problem
General game description

- Game specific description
- Action space
- Observation space
- Rewards
- Final states

In [57]:
SEED = 42


class Environment:
    """Thin wrapper around a Gymnasium environment that seeds it exactly once.

    The seed is forwarded to the wrapped environment only on the first
    ``reset()``. Later resets pass ``seed=None`` so the environment keeps
    drawing from its already-seeded random generator. Re-sending the same seed
    on every reset would make every episode start from the same initial state.

    Args:
        model_id: Environment id passed to ``gym.make``.
        render_mode: Render mode passed to ``gym.make``.
        seed: Seed used for the first ``reset()``; ``None`` leaves the
            environment unseeded.
    """

    def __init__(self, model_id: str, render_mode=None, seed=None):
        self.env = gym.make(model_id, render_mode=render_mode)
        self.seed = seed
        # Seed still waiting to be applied; cleared by the first reset().
        self._pending_seed = seed
        self.observation_space = self.env.observation_space
        self.action_space = self.env.action_space

    def reset(self):
        """Start a new episode and return ``(observation, info)``.

        Only the first call forwards the configured seed; subsequent calls
        pass ``seed=None`` to the wrapped environment.
        """
        seed, self._pending_seed = self._pending_seed, None
        observation, info = self.env.reset(seed=seed)
        return observation, info

    def step(self, action):
        """Forward ``action`` to the wrapped environment and return its result unchanged."""
        return self.env.step(action)

    def render(self):
        """Return whatever the wrapped environment's ``render()`` returns."""
        return self.env.render()

    def close(self):
        """Close the wrapped environment."""
        self.env.close()

    def get_action_space_size(self):
        """Return ``action_space.n`` (the action space must expose ``n``)."""
        return self.action_space.n

    def get_observation_space_shape(self):
        """Return the ``shape`` attribute of the observation space."""
        return self.observation_space.shape

In [58]:
# Wrap LunarLander-v3 (human render mode, seeded with SEED) so its spaces can be inspected.
lunar_landing_env = Environment(model_id="LunarLander-v3", render_mode="human", seed=SEED)

In [59]:
# Show the action and observation spaces exposed by the environment wrapper.
print(f"Action space: {lunar_landing_env.action_space}")
print(f"Observation space: {lunar_landing_env.observation_space}")

Action space: Discrete(4)
Observation space: Box([ -2.5        -2.5       -10.        -10.         -6.2831855 -10.
  -0.         -0.       ], [ 2.5        2.5       10.        10.         6.2831855 10.
  1.         1.       ], (8,), float32)


As the observation space shows, each state is an 8-dimensional vector whose components are on roughly the same
scale.

# Modeling: DQN

To solve this problem we have chosen the DQN algorithm because:

## Tech stack for modeling

Pytorch...

Architecture diagram

Layer sizes experiments... and results

In [60]:
class DQNNetwork(nn.Module):
    """Fully connected Q-network: Linear/ReLU hidden layers plus a linear output.

    Args:
        state_dim: Number of input features.
        action_dim: Number of outputs (one value per action).
        hidden_sizes: Iterable with the width of each hidden layer, in order.
            An empty iterable yields a single linear layer. The default is an
            immutable tuple so it cannot be shared and mutated across calls.
    """

    def __init__(self, state_dim, action_dim, hidden_sizes=(128, 128)):
        super().__init__()

        layers = []
        input_dim = state_dim

        # Each hidden layer consumes the previous layer's width.
        for hidden_size in hidden_sizes:
            layers.append(nn.Linear(input_dim, hidden_size))
            layers.append(nn.ReLU())
            input_dim = hidden_size

        # Output head: no activation, raw value per action.
        layers.append(nn.Linear(input_dim, action_dim))

        # Attribute name is kept: it determines the state_dict keys in checkpoints.
        self.network = nn.Sequential(*layers)

        self._initialize_weights()

    def _initialize_weights(self):
        """Apply Xavier-uniform weights and zero biases to every linear layer."""
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.xavier_uniform_(m.weight)
                nn.init.constant_(m.bias, 0)

    def forward(self, state):
        """Return the network output for ``state`` (last dim must equal ``state_dim``)."""
        return self.network(state)

# Training process

- Target network and Q-network, why
- Target update strategy, why
- Experience replay, why (maybe prioritized experience replay, why)
- Epsilon-greedy strategy, why

## Replay buffer

In [61]:
class ReplayBuffer:
    """Bounded FIFO store of transitions with uniform random sampling.

    Once ``capacity`` transitions are held, appending a new one drops the
    oldest. Sampled batches are returned as tensors placed on ``device``.
    """

    def __init__(self, capacity, device='cpu'):
        self.device = device
        self.buffer = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state, done):
        """Append one ``(state, action, reward, next_state, done)`` transition."""
        transition = (state, action, reward, next_state, done)
        self.buffer.append(transition)

    def sample(self, batch_size):
        """Draw ``batch_size`` distinct transitions and return them column-wise.

        Returns:
            ``(state, action, reward, next_state, done)`` tensors. ``action``
            is int64; the other four are float32.
        """
        picked = random.sample(self.buffer, batch_size)
        # Transpose the list of transitions into one tuple per field.
        states, actions, rewards, next_states, dones = zip(*picked)

        def to_tensor(values, np_dtype):
            # Go through NumPy first so nested arrays are stacked in one step.
            return torch.from_numpy(np.asarray(values, dtype=np_dtype)).to(self.device)

        return (
            to_tensor(states, np.float32),
            to_tensor(actions, np.int64),
            to_tensor(rewards, np.float32),
            to_tensor(next_states, np.float32),
            to_tensor(dones, np.float32),
        )

    def __len__(self):
        """Return the number of transitions currently stored."""
        return len(self.buffer)

# DQN Agent and DQN Agent Updater

In [62]:
class DQNAgent:
    """Epsilon-greedy agent holding an online Q-network and a target network.

    The agent picks its own device (CUDA, then MPS, then CPU) and creates two
    identically shaped networks; the target network starts as a copy of the
    online one and is kept in eval mode.

    Args:
        state_dim: Size of the observation vector.
        action_dim: Number of discrete actions.
        model_dir: Directory for checkpoints; created if missing.
        epsilon: Initial probability of choosing a random action in training.
        hidden_sizes: Hidden layer widths for both networks.
    """

    def __init__(
            self,
            state_dim,
            action_dim,
            model_dir='models/dqn',
            epsilon=1.0,
            hidden_sizes=(256, 256),
    ):
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.model_dir = Path(model_dir)
        self.model_dir.mkdir(parents=True, exist_ok=True)
        self.epsilon = epsilon

        self.device = self._select_device()

        # Online network first, then target, so the construction order is stable.
        self.q_network = DQNNetwork(
            state_dim, action_dim, hidden_sizes=list(hidden_sizes)
        ).to(self.device)
        self.target_network = DQNNetwork(
            state_dim, action_dim, hidden_sizes=list(hidden_sizes)
        ).to(self.device)
        self.target_network.load_state_dict(self.q_network.state_dict())
        self.target_network.eval()

    @staticmethod
    def _select_device():
        """Return ``"cuda"``, ``"mps"`` or ``"cpu"``, in that order of preference."""
        if torch.cuda.is_available():
            return "cuda"
        # Guarded lookup: torch.backends.mps does not exist in every torch build.
        mps = getattr(torch.backends, "mps", None)
        if mps is not None and mps.is_available():
            return "mps"
        return "cpu"

    def act(self, state, training=True):
        """Choose an action for ``state``.

        With ``training=True`` a uniformly random action is returned with
        probability ``self.epsilon``; otherwise (and always when
        ``training=False``) the argmax of the online network's output is used.
        """
        if training and np.random.random() < self.epsilon:
            return np.random.randint(self.action_dim)

        with torch.no_grad():
            state_tensor = torch.FloatTensor(state).unsqueeze(0).to(self.device)
            q_values = self.q_network(state_tensor)
            return q_values.argmax().item()

    def save(self, episode=None):
        """Write both networks' weights to ``model_dir``.

        Args:
            episode: When given (including ``0``), the file is named
                ``dqn_episode_{episode}.pth``. When ``None``, the checkpoint is
                written to ``dqn_final.pth`` and also to ``dqn_best.pth``.
        """
        # `is not None` rather than truthiness, so episode 0 gets its own file.
        filename = f"dqn_episode_{episode}.pth" if episode is not None else "dqn_final.pth"
        filepath = self.model_dir / filename

        checkpoint = {
            'q_network_state_dict': self.q_network.state_dict(),
            'target_network_state_dict': self.target_network.state_dict(),
            'episode': episode
        }

        torch.save(checkpoint, filepath)
        print(f"Model saved to {filepath}")

        if episode is None:
            best_path = self.model_dir / "dqn_best.pth"
            torch.save(checkpoint, best_path)

    def load(self, filepath=None):
        """Restore both networks from a checkpoint.

        Args:
            filepath: Checkpoint path. When ``None``, ``dqn_best.pth`` in
                ``model_dir`` is tried first, then ``dqn_final.pth``.

        Returns:
            ``True`` if a checkpoint was loaded, ``False`` if the file is missing.
        """
        if filepath is None:
            filepath = self.model_dir / "dqn_best.pth"
            if not filepath.exists():
                filepath = self.model_dir / "dqn_final.pth"

        filepath = Path(filepath)
        if not filepath.exists():
            print(f"No model found at {filepath}")
            return False

        checkpoint = torch.load(filepath, map_location=self.device)
        self.q_network.load_state_dict(checkpoint['q_network_state_dict'])
        self.target_network.load_state_dict(checkpoint['target_network_state_dict'])

        print(f"Model loaded from {filepath}")
        return True

class DQNUpdater:
    """Training-side companion of ``DQNAgent``: replay buffer, optimizer and TD updates.

    Each successful ``update()`` performs one gradient step on the agent's
    online network, periodically copies it into the target network, and
    multiplies the agent's epsilon by ``epsilon_decay`` (floored at
    ``epsilon_min``). Epsilon therefore decays once per update call, not once
    per episode.

    Args:
        agent: Agent whose networks and epsilon are trained.
        target_update_freq: Number of updates between target-network syncs.
        buffer_size: Replay buffer capacity.
        batch_size: Transitions per gradient step.
        lr: Adam learning rate.
        gamma: Discount factor.
        epsilon_min: Lower bound for the agent's epsilon.
        epsilon_decay: Multiplicative epsilon decay applied per update.
        device: Fallback device, used only when the agent has no ``device``
            attribute. Sampled batches must live on the same device as the
            agent's networks, so the agent's device takes precedence.
    """

    def __init__(
            self,
            agent: "DQNAgent",
            target_update_freq=10,
            buffer_size=10000,
            batch_size=64,
            lr=1e-3,
            gamma=0.99,
            epsilon_min=0.01,
            epsilon_decay=0.995,
            device='cpu'
    ):
        self.agent = agent
        # A batch on a different device than the networks would fail in update().
        agent_device = getattr(agent, 'device', None)
        self.device = agent_device if agent_device is not None else device
        self.batch_size = batch_size
        self.target_update_freq = target_update_freq
        self.gamma = gamma
        self.lr = lr
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay

        self.optimizer = optim.Adam(self.agent.q_network.parameters(), lr=lr)
        self.loss_fn = nn.MSELoss()

        self.update_counter = 0

        self.replay_buffer = ReplayBuffer(buffer_size, self.device)

    def store_transition(self, state, action, reward, next_state, done):
        """Append one transition to the replay buffer."""
        self.replay_buffer.push(state, action, reward, next_state, done)

    def update(self):
        """Run one gradient step on a sampled batch.

        Returns:
            The loss as a float, or ``None`` while the buffer holds fewer than
            ``batch_size`` transitions.
        """
        if len(self.replay_buffer) < self.batch_size:
            return None

        states, actions, rewards, next_states, dones = self.replay_buffer.sample(self.batch_size)

        # Value predicted by the online network for the action actually taken.
        current_q_values = self.agent.q_network(states).gather(1, actions.unsqueeze(1)).squeeze(1)

        with torch.no_grad():
            next_q_values = self.agent.target_network(next_states).max(1)[0]
            # (1 - dones) removes the bootstrap term for transitions flagged done.
            target_q_values = rewards + self.gamma * next_q_values * (1 - dones)

        loss = self.loss_fn(current_q_values, target_q_values)

        self.optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.agent.q_network.parameters(), 1.0)
        self.optimizer.step()

        self.update_counter += 1
        if self.update_counter % self.target_update_freq == 0:
            self.agent.target_network.load_state_dict(self.agent.q_network.state_dict())

        if self.agent.epsilon > self.epsilon_min:
            # Clamp so the last decay step cannot undershoot the floor.
            self.agent.epsilon = max(self.epsilon_min, self.agent.epsilon * self.epsilon_decay)

        return loss.item()

    def save(self, episode=None):
        """Write optimizer state, update counter and current epsilon to disk.

        The file goes into the agent's ``model_dir`` as
        ``dqn_updater_episode_{episode}.pth`` or, when ``episode`` is ``None``,
        ``dqn_updater_final.pth``.

        Returns:
            Path of the written checkpoint.
        """
        if episode is not None:
            filename = f"dqn_updater_episode_{episode}.pth"
        else:
            filename = "dqn_updater_final.pth"
        filepath = Path(self.agent.model_dir) / filename
        filepath.parent.mkdir(parents=True, exist_ok=True)

        checkpoint = {
            'optimizer_state_dict': self.optimizer.state_dict(),
            'update_counter': self.update_counter,
            'epsilon': self.agent.epsilon,
            'episode': episode
        }

        torch.save(checkpoint, filepath)
        print(f"Updater state saved to {filepath}")
        return filepath

    def load(self, filepath=None):
        """Restore optimizer state, update counter and epsilon from a checkpoint.

        Args:
            filepath: Checkpoint path; defaults to ``dqn_updater_final.pth`` in
                the agent's ``model_dir``.

        Returns:
            ``True`` if a checkpoint was loaded, ``False`` if the file is missing.
        """
        if filepath is None:
            filepath = Path(self.agent.model_dir) / "dqn_updater_final.pth"
        filepath = Path(filepath)

        if not filepath.exists():
            print(f"No updater state found at {filepath}")
            return False

        checkpoint = torch.load(filepath, map_location=self.device)
        self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        self.update_counter = checkpoint.get('update_counter', 0)
        if 'epsilon' in checkpoint:
            self.agent.epsilon = checkpoint['epsilon']

        print(f"Updater state loaded from {filepath}")
        return True

In [63]:
import os
from torch.utils.tensorboard import SummaryWriter

class Logger:
    """Collects per-episode training history and mirrors it to TensorBoard.

    Args:
        log_dir: Directory for JSON history files; created if missing.
        tensorboard_dir: Parent directory for TensorBoard runs. Each instance
            writes to a fresh ``DQN_<timestamp>`` subdirectory.
    """

    def __init__(self, log_dir='logs', tensorboard_dir='runs'):
        self.log_dir = log_dir
        self.history = {
            'episodes': [],
            'rewards': [],
            'steps': [],
            'timestamps': []
        }

        # exist_ok avoids the check-then-create race of a separate exists() test.
        os.makedirs(log_dir, exist_ok=True)

        # Initialize TensorBoard writer
        run_name = f"DQN_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
        self.tensorboard_dir = os.path.join(tensorboard_dir, run_name)
        self.writer = SummaryWriter(self.tensorboard_dir)

    def log_episode(self, episode, reward, steps):
        """Record one finished episode in the history and in TensorBoard.

        ``reward`` is stored as a plain ``float`` so the history stays
        JSON-serializable even when a NumPy scalar is passed in.
        """
        reward = float(reward)

        self.history['episodes'].append(episode)
        self.history['rewards'].append(reward)
        self.history['steps'].append(steps)
        self.history['timestamps'].append(datetime.now().isoformat())

        # Log to TensorBoard
        self.writer.add_scalar('Episode/Reward', reward, episode)
        self.writer.add_scalar('Episode/Steps', steps, episode)

        # Moving averages are only emitted once a full window is available.
        if len(self.history['rewards']) >= 10:
            avg_10 = np.mean(self.history['rewards'][-10:])
            self.writer.add_scalar('Average/Reward_10ep', avg_10, episode)

        if len(self.history['rewards']) >= 100:
            avg_100 = np.mean(self.history['rewards'][-100:])
            self.writer.add_scalar('Average/Reward_100ep', avg_100, episode)

    def save_history(self, filename=None):
        """Dump the history as JSON into ``log_dir``.

        Args:
            filename: File name inside ``log_dir``; defaults to a timestamped
                ``training_<timestamp>.json``.
        """
        if filename is None:
            filename = f"training_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"

        filepath = os.path.join(self.log_dir, filename)
        with open(filepath, 'w') as f:
            json.dump(self.history, f, indent=2)

        print(f"Training history saved to {filepath}")

    def get_history(self):
        """Return the live history dict (not a copy)."""
        return self.history

    def save(self, filepath):
        """Save the history with ``np.savez`` at ``filepath``.

        Missing parent directories are created. A bare file name (no directory
        part) is written relative to the current working directory.
        """
        directory = os.path.dirname(filepath)
        # dirname is '' for a bare file name; os.makedirs('') would raise.
        if directory:
            os.makedirs(directory, exist_ok=True)
        np.savez(filepath, **self.history)
        print(f"Training history saved to {filepath}")

    def get_statistics(self, last_n=100):
        """Summarize the most recent ``last_n`` episodes.

        Returns:
            Dict with ``mean_reward``, ``max_reward``, ``min_reward``,
            ``mean_steps`` and ``total_episodes``; an empty dict when nothing
            has been logged or ``last_n`` is 0.
        """
        if len(self.history['rewards']) < last_n:
            last_n = len(self.history['rewards'])

        if last_n == 0:
            return {}

        recent_rewards = self.history['rewards'][-last_n:]
        recent_steps = self.history['steps'][-last_n:]

        stats = {
            'mean_reward': sum(recent_rewards) / len(recent_rewards),
            'max_reward': max(recent_rewards),
            'min_reward': min(recent_rewards),
            'mean_steps': sum(recent_steps) / len(recent_steps),
            'total_episodes': len(self.history['episodes'])
        }

        return stats

    def log_training_metrics(self, episode, loss=None, epsilon=None, eval_reward=None):
        """Log additional training metrics to TensorBoard; ``None`` values are skipped."""
        if loss is not None:
            self.writer.add_scalar('Training/Loss', loss, episode)
        if epsilon is not None:
            self.writer.add_scalar('Training/Epsilon', epsilon, episode)
        if eval_reward is not None:
            self.writer.add_scalar('Evaluation/Reward', eval_reward, episode)

    def close(self):
        """Close the TensorBoard writer and return the run directory."""
        self.writer.close()
        print(f"TensorBoard logs saved to {self.tensorboard_dir}")
        return self.tensorboard_dir

# Training loop

In [64]:
def train_dqn(
    episodes=1000,
    max_steps=1000,
    save_freq=100,
    eval_freq=50,
    eval_episodes=10,
    seed=42,
    launch_tensorboard=True,
):
    """Train a DQN agent on LunarLander-v3 and return the logged history.

    Args:
        episodes: Number of training episodes.
        max_steps: Step cap per episode (training and evaluation).
        save_freq: Write an episode checkpoint every ``save_freq`` episodes.
        eval_freq: Run a greedy evaluation every ``eval_freq`` episodes.
        eval_episodes: Episodes per evaluation.
        seed: Seed for ``random``, NumPy and torch; it is also handed to the
            environment wrappers (``seed + 1000`` for evaluation).
        launch_tensorboard: When true (the default), start TensorBoard after
            training and block until it exits or is interrupted. Pass
            ``False`` to return as soon as training has finished.

    Returns:
        The history dict held by the ``Logger``.
    """
    # The replay buffer samples with `random`, so seed it along with the others.
    random.seed(seed)
    torch.manual_seed(seed)
    np.random.seed(seed)

    env = Environment("LunarLander-v3", render_mode=None, seed=seed)
    eval_env = Environment("LunarLander-v3", render_mode=None, seed=seed + 1000)

    state_dim = env.get_observation_space_shape()[0]
    action_dim = env.get_action_space_size()
    agent = DQNAgent(
        state_dim=state_dim,
        action_dim=action_dim,
    )

    agent_updater = DQNUpdater(
        agent=agent,
        lr=5e-4,
        gamma=0.99,
        epsilon_min=0.01,
        epsilon_decay=0.995,
        buffer_size=50000,
        batch_size=64,
        target_update_freq=100,
        # Batches must live on the device the agent chose for its networks.
        device=agent.device,
    )

    logger = Logger()
    best_avg_reward = -float('inf')
    best_model_saved = False
    training_info = {
        'start_time': datetime.now().isoformat(),
        'episodes': episodes,
        'seed': seed,
        'device': agent.device
    }

    print(f"Starting DQN training on {agent.device}")
    print(f"State dimension: {state_dim}, Action dimension: {action_dim}")
    print("-" * 50)

    try:
        for episode in range(episodes):
            state, _ = env.reset()
            total_reward = 0
            steps = 0
            losses = []

            for step in range(max_steps):
                action = agent.act(state, training=True)
                next_state, reward, terminated, truncated, _ = env.step(action)
                done = terminated or truncated

                # Store only `terminated` as the done flag: a truncated episode
                # is cut off, not finished, so its TD target should still
                # bootstrap from next_state.
                agent_updater.store_transition(state, action, reward, next_state, terminated)

                loss = agent_updater.update()
                if loss is not None:
                    losses.append(loss)

                total_reward += reward
                steps += 1
                state = next_state

                if done:
                    break

            logger.log_episode(episode, total_reward, steps)

            avg_loss = np.mean(losses) if losses else 0

            # Log additional metrics to TensorBoard
            logger.log_training_metrics(
                episode=episode,
                loss=avg_loss if losses else None,
                epsilon=agent.epsilon,
            )

            if episode % 10 == 0:
                # The slice already copes with fewer than 100 logged episodes.
                avg_reward_100 = np.mean(logger.get_history()['rewards'][-100:])
                print(f"Episode {episode:4d} | Reward: {total_reward:7.2f} | Steps: {steps:3d} | "
                      f"Avg100: {avg_reward_100:7.2f} | Loss: {avg_loss:.4f} | ε: {agent.epsilon:.3f}")

            if episode % eval_freq == 0 and episode > 0:
                avg_eval_reward = _evaluate_policy(agent, eval_env, eval_episodes, max_steps)
                print(f"  [EVAL] Average reward over {eval_episodes} episodes: {avg_eval_reward:.2f}")

                # Log evaluation results to TensorBoard
                logger.log_training_metrics(episode=episode, eval_reward=avg_eval_reward)

                if avg_eval_reward > best_avg_reward:
                    best_avg_reward = avg_eval_reward
                    agent.save()
                    best_model_saved = True
                    print(f"  [SAVE] New best model! Average reward: {best_avg_reward:.2f}")

            if episode % save_freq == 0 and episode > 0:
                agent.save(episode)

        # agent.save() also rewrites dqn_best.pth, so keep the best checkpoint's
        # bytes and put them back after the final save.
        best_path = agent.model_dir / "dqn_best.pth"
        best_snapshot = None
        if best_model_saved and best_path.exists():
            best_snapshot = best_path.read_bytes()
        agent.save()
        if best_snapshot is not None:
            best_path.write_bytes(best_snapshot)

        training_info['end_time'] = datetime.now().isoformat()
        training_info['final_avg_reward'] = float(np.mean(logger.get_history()['rewards'][-100:]))
        training_info['best_avg_reward'] = float(best_avg_reward)

        with open(agent.model_dir / 'training_info.json', 'w') as f:
            json.dump(training_info, f, indent=2)

        logger.save('logs/dqn/training_history.npz')

        print("\n" + "=" * 50)
        print("Training completed!")
        print(f"Final average reward (last 100 episodes): {training_info['final_avg_reward']:.2f}")
        print(f"Best evaluation average reward: {best_avg_reward:.2f}")
        print(f"Models saved in: {agent.model_dir}/")
    finally:
        # Release the environments and flush TensorBoard even if training fails.
        env.close()
        eval_env.close()
        tensorboard_dir = logger.close()

    if launch_tensorboard:
        _launch_tensorboard(tensorboard_dir)

    return logger.get_history()


def _evaluate_policy(agent, eval_env, eval_episodes, max_steps):
    """Run greedy (non-exploring) episodes and return their mean total reward."""
    eval_rewards = []
    for _ in range(eval_episodes):
        state, _ = eval_env.reset()
        eval_reward = 0
        for _ in range(max_steps):
            action = agent.act(state, training=False)
            state, reward, terminated, truncated, _ = eval_env.step(action)
            eval_reward += reward
            if terminated or truncated:
                break
        eval_rewards.append(eval_reward)
    return np.mean(eval_rewards)


def _launch_tensorboard(tensorboard_dir, port=6006):
    """Start TensorBoard on ``tensorboard_dir``, open it in a browser and block until it stops."""
    print("\n" + "=" * 50)
    print("Launching TensorBoard...")
    print(f"TensorBoard will show results from: {tensorboard_dir}")

    try:
        # DEVNULL instead of PIPE: nothing reads the child's output, and an
        # unread pipe can fill up and stall the child while we wait on it.
        tb_process = subprocess.Popen(
            ["tensorboard", "--logdir", str(tensorboard_dir), "--port", str(port)],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL
        )

        # Give TensorBoard time to start
        time.sleep(3)

        # Open in browser
        url = f"http://localhost:{port}"
        print(f"Opening TensorBoard at {url}")
        webbrowser.open(url)

        print("\nTensorBoard is running! Press Ctrl+C to stop it.")
        print(f"You can also manually visit: {url}")

        # Keep the process running
        try:
            tb_process.wait()
        except KeyboardInterrupt:
            print("\nStopping TensorBoard...")
            tb_process.terminate()
            try:
                tb_process.wait(timeout=5)
            except subprocess.TimeoutExpired:
                # terminate() was ignored; force the process down.
                tb_process.kill()
                tb_process.wait()
    except FileNotFoundError:
        print("TensorBoard not found. Please install it with: pip install tensorboard")
        print(f"You can manually view the logs by running: tensorboard --logdir {tensorboard_dir}")
    except Exception as e:
        print(f"Error launching TensorBoard: {e}")
        print(f"You can manually view the logs by running: tensorboard --logdir {tensorboard_dir}")

In [None]:
# Run training for 1000 episodes, leaving every other argument at its default.
train_dqn(1000)

Starting DQN training on cuda
State dimension: 8, Action dimension: 4
--------------------------------------------------
Episode    0 | Reward: -307.65 | Steps: 107 | Avg100: -307.65 | Loss: 1.9213 | ε: 0.802
