In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import random
import numpy as np
from collections import deque
from pathlib import Path
from world.delivery_environment import Environment

def get_device() -> torch.device:
    """Return the preferred torch device: CUDA first, then Apple MPS, else CPU.

    The MPS backend is looked up with ``getattr`` so that torch builds which
    do not expose ``torch.backends.mps`` fall through to the CPU.
    """
    mps_backend = getattr(torch.backends, "mps", None)
    if torch.cuda.is_available():
        backend_name = "cuda"
    elif mps_backend and mps_backend.is_available():
        backend_name = "mps"
    else:
        backend_name = "cpu"
    return torch.device(backend_name)

# Device shared by the networks and by every tensor built in this notebook.
device = get_device()

pygame 2.6.1 (SDL 2.28.4, Python 3.11.11)
Hello from the pygame community. https://www.pygame.org/contribute.html


In [2]:
# Build the delivery environment from the A1 grid file with the GUI disabled.
env = Environment(Path("grid_configs/A1_grid.npy"), no_gui=True)
# NOTE(review): reset() runs before env.grid / env.initial_target_count are read,
# presumably because those attributes are populated on reset — confirm against Environment.
env.reset()
# Grid dimensions; used by encode_state_norm to scale row/column indices.
n_rows, n_cols = env.grid.shape
# Used by encode_state_norm as the denominator for the remaining-deliveries feature.
max_deliveries = env.initial_target_count
# Length of the feature vector produced by encode_state_norm (three scaled components).
state_dim = 3

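# Unused alternative, kept for reference: one-hot encoding of row, column and remaining deliveries.
# NOTE: it is not compatible with state_dim = 3 (the index writes below would go out of bounds);
# it would presumably need state_dim = n_rows + n_cols + max_deliveries + 1.
# def encode_state(state: tuple[int, int, int]) -> torch.Tensor: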
#     i, j, rem = state
#     vec = np.zeros(state_dim, dtype=np.float32)
#     # one-hot for row
#     vec[i] = 1.0
#     # one-hot for column
#     vec[n_rows + j] = 1.0
#     # one-hot for remaining deliveries
#     vec[n_rows + n_cols + rem] = 1.0
#     return torch.tensor(vec, device=device)

# Unused alternative, kept for reference: raw (unnormalised) integer features.
# def encode_state(state: tuple[int, int, int]) -> torch.Tensor:
#     i, j, rem = state
#     return torch.tensor([i, j, rem], device=device, dtype=torch.float32)

def encode_state_norm(raw: tuple[int,int,int]) -> torch.Tensor:
    """Encode a raw environment state as a scaled float32 feature vector.

    Args:
        raw: ``(i, j, rem)`` — row index, column index and number of
            remaining deliveries.

    Returns:
        A 1-D float32 tensor of length 3 on ``device`` holding
        ``i / (n_rows - 1)``, ``j / (n_cols - 1)`` and ``rem / max_deliveries``.
        Values lie in [0, 1] provided the indices are within the grid and
        ``rem`` does not exceed ``max_deliveries`` (not checked here).
    """
    i, j, rem = raw
    # Clamp each denominator to at least 1 so that a single-row/column grid or
    # an environment with no deliveries yields 0.0 instead of ZeroDivisionError.
    row_span = max(n_rows - 1, 1)
    col_span = max(n_cols - 1, 1)
    delivery_span = max(max_deliveries, 1)
    return torch.tensor([
        i / row_span,
        j / col_span,
        rem / delivery_span
    ], device=device, dtype=torch.float32)

  warn("No reward function provided. Using default reward.")
  warn("No initial agent positions given. Randomly placing agents "


In [3]:
class DQN(nn.Module):
    """Fully connected Q-network: state vector in, one Q-value per action out.

    Args:
        input_dim: Number of features in the state vector.
        n_actions: Number of discrete actions (size of the output layer).
        hidden_dim: Width of both hidden layers. Defaults to 128, the value
            that used to be hard-coded, so existing callers and saved
            checkpoints are unaffected.
    """

    def __init__(self, input_dim: int, n_actions: int, hidden_dim: int = 128):
        super().__init__()
        # The attribute name and layer order determine the state_dict keys
        # ("network.0.weight", ...); keep them stable for saved checkpoints.
        self.network = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, n_actions)
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return Q-values with the last dimension of ``x`` mapped to ``n_actions``."""
        return self.network(x)

In [4]:
class ReplayBuffer:
    """Bounded FIFO memory of ``(state, action, reward, next_state, done)`` transitions.

    States are kept as the raw tuples handed to ``push`` and are only turned
    into tensors (via ``encode_state_norm``) when a batch is sampled.
    """

    def __init__(self, capacity: int):
        # A deque with maxlen drops its oldest entry once capacity is reached.
        self.buffer = deque(maxlen=capacity)

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.buffer)

    def push(self, state, action, reward, next_state, done):
        """Append one transition, keeping the states in raw (unencoded) form."""
        transition = (state, action, reward, next_state, done)
        self.buffer.append(transition)

    def sample(self, batch_size: int):
        """Draw ``batch_size`` distinct transitions uniformly at random.

        Returns:
            A 5-tuple ``(states, actions, rewards, next_states, dones)`` of
            tensors on ``device``: encoded states stacked along dim 0,
            int64 actions, float32 rewards, encoded next states, and float32
            done flags.
        """
        drawn = random.sample(self.buffer, batch_size)
        raw_states, action_ids, reward_vals, raw_next_states, done_flags = zip(*drawn)

        def _encode_all(raw_batch):
            # Encode each raw state and stack the results into one batch tensor.
            return torch.stack([encode_state_norm(item) for item in raw_batch])

        def _as_tensor(values, dtype):
            return torch.tensor(values, dtype=dtype, device=device)

        encoded_states = _encode_all(raw_states)
        encoded_next = _encode_all(raw_next_states)
        actions_t = _as_tensor(action_ids, torch.int64)
        rewards_t = _as_tensor(reward_vals, torch.float32)
        dones_t = _as_tensor(done_flags, torch.float32)
        return encoded_states, actions_t, rewards_t, encoded_next, dones_t

In [5]:
# --- Hyperparameters ---------------------------------------------------------
n_actions = 4                  # number of discrete actions (DQN output size / randrange bound)
buffer_capacity = 10000        # maximum number of transitions kept in the replay buffer
batch_size = 128               # transitions per gradient step; training starts once the buffer holds this many
gamma = 0.99                   # discount factor in the bootstrapped target
lr = 1e-3                      # Adam learning rate
epsilon_start = 1.0            # initial exploration probability
epsilon_end = 0.01             # lower bound on the exploration probability
epsilon_decay = 0.997          # multiplicative epsilon decay applied once per episode
target_update_freq = 50        # episodes between target-network syncs
num_episodes = 2000            # total training episodes
max_steps_per_episode = 1000   # step cap per episode

# --- Reproducibility ---------------------------------------------------------
# Seed every RNG this notebook uses so that network initialisation, epsilon-greedy
# action selection and replay sampling are repeatable across Restart & Run All.
# NOTE(review): whether Environment draws from these global RNGs is not visible here.
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

In [6]:
# Online network (the one the optimizer updates) and a second network of identical shape.
policy_net = DQN(state_dim, n_actions).to(device)
target_net = DQN(state_dim, n_actions).to(device)
# Start the target network from the policy network's weights and switch it to eval mode;
# its parameters are not handed to the optimizer below.
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

# Adam over the policy network's parameters only.
optimizer = optim.Adam(policy_net.parameters(), lr=lr)
# Replay memory bounded to buffer_capacity transitions.
replay_buffer = ReplayBuffer(buffer_capacity)

In [None]:
# DQN training: epsilon-greedy interaction with the environment, one gradient
# step per environment step once the replay buffer holds a full batch, and a
# target network that is re-synced every `target_update_freq` episodes.
epsilon = epsilon_start
# Built once; the loss module is stateless, so re-creating it every step was wasted work.
loss_fn = nn.MSELoss()

for episode in range(1, num_episodes + 1):
    raw_state = env.reset()
    state = encode_state_norm(raw_state)
    total_reward = 0

    # Named loop/info variables instead of `_`: assigning `_` in a notebook
    # shadows IPython's "last output" variable.
    for _step in range(max_steps_per_episode):
        # Epsilon-greedy action selection.
        if random.random() < epsilon:
            action = random.randrange(n_actions)
        else:
            with torch.no_grad():
                q_vals = policy_net(state.unsqueeze(0))
                action = q_vals.argmax(dim=1).item()

        raw_next, reward, done, _info = env.step(action)

        # The buffer stores raw states; they are encoded when a batch is sampled.
        replay_buffer.push(raw_state, action, reward, raw_next, done)
        total_reward += reward
        raw_state = raw_next
        state = encode_state_norm(raw_next)

        if len(replay_buffer) >= batch_size:
            s_b, a_b, r_b, ns_b, d_b = replay_buffer.sample(batch_size)
            # Q(s, a) for the actions that were actually taken.
            q_values = policy_net(s_b).gather(1, a_b.unsqueeze(1)).squeeze(1)
            with torch.no_grad():
                # Bootstrapped target r + gamma * max_a' Q_target(s', a');
                # the (1 - done) factor removes the bootstrap term on terminal transitions.
                next_q = target_net(ns_b).max(dim=1)[0]
                target_q = r_b + gamma * next_q * (1 - d_b)
            loss = loss_fn(q_values, target_q)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        if done:
            break

    # decay ε
    epsilon = max(epsilon_end, epsilon * epsilon_decay)
    # update target network
    if episode % target_update_freq == 0:
        target_net.load_state_dict(policy_net.state_dict())
    # log
    if episode % 10 == 0:
        print(f"Episode {episode}, Total Reward: {total_reward:.2f}, Epsilon: {epsilon:.2f}")

Episode 10, Total Reward: -2136.00, Epsilon: 0.97
Episode 20, Total Reward: -2260.00, Epsilon: 0.94
Episode 30, Total Reward: -1969.00, Epsilon: 0.91
Episode 40, Total Reward: -2496.00, Epsilon: 0.89
Episode 50, Total Reward: -2245.00, Epsilon: 0.86
Episode 60, Total Reward: -2160.00, Epsilon: 0.84
Episode 70, Total Reward: -1848.00, Epsilon: 0.81
Episode 80, Total Reward: -1941.00, Epsilon: 0.79
Episode 90, Total Reward: -2564.00, Epsilon: 0.76
Episode 100, Total Reward: -2144.00, Epsilon: 0.74
Episode 110, Total Reward: -2492.00, Epsilon: 0.72
Episode 120, Total Reward: -2256.00, Epsilon: 0.70
Episode 130, Total Reward: -2092.00, Epsilon: 0.68
Episode 140, Total Reward: -2236.00, Epsilon: 0.66
Episode 150, Total Reward: -2268.00, Epsilon: 0.64
Episode 160, Total Reward: -2112.00, Epsilon: 0.62
Episode 170, Total Reward: -2196.00, Epsilon: 0.60
Episode 180, Total Reward: -1637.00, Epsilon: 0.58
Episode 190, Total Reward: -1848.00, Epsilon: 0.57
Episode 200, Total Reward: -2008.00, Eps

In [8]:
# Persist the trained policy weights. torch.save does not create missing parent
# directories, so make sure "models/" exists first.
model_path = Path("models/A1_grid_policy.pt")
model_path.parent.mkdir(parents=True, exist_ok=True)
torch.save(policy_net.state_dict(), model_path)

In [10]:
# Display the grid dimensions (the values unpacked into n_rows, n_cols earlier in the notebook).
env.grid.shape

(15, 15)