In [None]:
from env import Environment

import gymnasium as gym
from gymnasium import spaces

import numpy as np
import torch
import matplotlib.pyplot as plt
import random
import os

from algorithms import DQNAgent

In [2]:
def convert_state(state):
    ret_state = {}
    # state["time_step"] = np.array([state["time_step"]]).astype(np.float32).flatten(0)
    # state["map"] = np.array(state["map"]).astype(np.float32)
    ret_state["robots"] = np.array(state["robots"]).astype(np.float32).flatten()
    ret_state["packages"] = np.array(state["packages"]).astype(np.float32).flatten()[:100]
    if len(ret_state["packages"]) < 1000:
        ret_state["packages"] = np.concatenate((ret_state["packages"], np.zeros(100-len(ret_state["packages"]))))
    return np.concatenate(list(ret_state.values()))

In [3]:
def reward_shaping(r, env, state, action):
    return r

In [None]:
class Env(gym.Env):
    def __init__(self, *args, **kwargs):
        super(Env, self).__init__()
        self.env = Environment(*args, **kwargs)

        self.action_space = spaces.multi_discrete.MultiDiscrete([5, 3]*self.env.n_robots)

        self.prev_state = self.env.reset()
        first_state=convert_state(self.prev_state)
        # Define observation space as a dictionary

        self.observation_space = spaces.Box(low=0, high=100, shape=first_state.shape, dtype=np.float32)

        from sklearn.preprocessing import LabelEncoder
        self.le1, self.le2= LabelEncoder(), LabelEncoder()
        self.le1.fit(['S', 'L', 'R', 'U', 'D'])
        self.le2.fit(['0','1', '2'])

    def reset(self, *args, **kwargs):
        self.prev_state = self.env.reset()
        return convert_state(self.prev_state), {}

    def render(self, *args, **kwargs):
        return self.env.render()

    def step(self, action):
        ret = []
        ret.append(self.le1.inverse_transform(action.reshape(-1, 2).T[0]))
        ret.append(self.le2.inverse_transform(action.reshape(-1, 2).T[1]))
        action = list(zip(*ret))

        # You should not modify the infos object
        s, r, done, infos = self.env.step(action)
        new_r = reward_shaping(r, self.env, self.prev_state, action)
        self.prev_state = s
        return convert_state(s), new_r, \
            done, False, infos

In [5]:
def evaluate_agent(agent, env, num_episode=10):
    total_rewards = []

    original_epsilon = agent.epsilon
    agent.epsilon = 0 
    
    for _ in range(num_episode):
        state, _ = env.reset()
        done = False
        
        while not done:
            state_tensor = torch.FloatTensor(state).unsqueeze(0).to(agent.device)
            action = agent.select_action(state_tensor)
            next_state, reward, done, _, infos = env.step(action)
            state = next_state

        ep_reward = infos.get("total_reward", reward)
        total_rewards.append(ep_reward)

    agent.epsilon = original_epsilon
    avg_reward = np.mean(total_rewards)
    print(f"Average Evaluation reward over {num_episode} episodes: {avg_reward}")
    return avg_reward

In [6]:
def plot_rewards(rewards, filename="dqn_learning_curve.png", window=10):
    plt.figure(figsize=(10, 5))
    rewards = np.array(rewards)
    averages = np.convolve(rewards, np.ones(window)/window, mode="valid")
    plt.plot(rewards, label="Reward per episode")
    plt.plot(range(window-1, len(rewards)), averages, label=f"{window}-episode average")
    plt.xlabel("Episode")
    plt.ylabel("Total Reward")
    plt.legend()
    plt.title("DQN Learning Curve")
    plt.grid()
    plt.savefig(f"plots/{filename}")
    plt.show()

In [7]:
def save_model(agent, filename="dqn", map_name="map"):
    os.makedirs("models", exist_ok=True)
    torch.save(agent.policy_net.state_dict(), f"models/{filename}_{map_name}.pth")
    print(f"Model saved to {filename}")

In [8]:
def load_model(agent, filename="dqn_model.pth"):
    agent.policy_net.load_state_dict(torch.load(f"models/{filename}", map_location=agent.device))
    agent.target_net.load_state_dict(agent.policy_net.state_dict())

In [9]:
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

In [10]:
def train_dqn(agent, env,
              map_name="map",
              num_episodes=500,
              target_update=10,
              eval_every=10, seed=42):
    set_seed(seed)
    rewards_history = []
    
    for ep in range(num_episodes):
        state, _ = env.reset()
        done = False
        
        while not done:
            state_tensor = torch.FloatTensor(state).unsqueeze(0).to(agent.device)
            action = agent.select_action(state_tensor)
            next_state, reward, done, _, infos = env.step(action)

            agent.buffer_push(state, action, reward, next_state, done)
            state = next_state

            agent.update()
        
        # Use total_reward from infos
        total_reward = infos.get("total_reward", reward)
        rewards_history.append(total_reward)

        if ep % target_update == 0:
            agent.update_target_network()
        
        if eval_every and ep % eval_every == 0:
            avg_reward = evaluate_agent(agent, env)
            print(f"Episode {ep}, Average Evaluation Reward: {avg_reward}")
    
    print("Training complete.")
    save_model(agent, filename="dqn", map_name=map_name)
    return rewards_history

### Parameters

In [11]:
seed = 42
map_name = "map"
mapfile = map_name + ".txt"
env = Env(mapfile, seed=seed)


In [12]:
obs_size = env.observation_space.shape[0]
n_robots = env.env.n_robots
agent = DQNAgent(obs_size=obs_size, n_robots=n_robots)
set_seed(seed)

In [13]:
rewards = train_dqn(agent, env, map_name=map_name, num_episodes=100000, eval_every=10, seed=seed)

Average Evaluation reward over 10 episodes: -0.068
Episode 0, Average Evaluation Reward: -0.068
Average Evaluation reward over 10 episodes: -0.076
Episode 10, Average Evaluation Reward: -0.076
Average Evaluation reward over 10 episodes: -0.5980000000000004
Episode 20, Average Evaluation Reward: -0.5980000000000004
Average Evaluation reward over 10 episodes: 1.530000000000001
Episode 30, Average Evaluation Reward: 1.530000000000001
Average Evaluation reward over 10 episodes: -0.2340000000000003
Episode 40, Average Evaluation Reward: -0.2340000000000003
Average Evaluation reward over 10 episodes: -0.3730000000000004
Episode 50, Average Evaluation Reward: -0.3730000000000004
Average Evaluation reward over 10 episodes: -0.37700000000000033
Episode 60, Average Evaluation Reward: -0.37700000000000033
Average Evaluation reward over 10 episodes: -0.41000000000000014
Episode 70, Average Evaluation Reward: -0.41000000000000014
Average Evaluation reward over 10 episodes: -0.24800000000000036
Epis

KeyboardInterrupt: 