In [1]:
from env import Environment

import gymnasium as gym
from gymnasium import spaces

In [2]:
def convert_state(state):
    ret_state = {}
    # state["time_step"] = np.array([state["time_step"]]).astype(np.float32).flatten(0)
    # state["map"] = np.array(state["map"]).astype(np.float32)
    ret_state["robots"] = np.array(state["robots"]).astype(np.float32).flatten()
    ret_state["packages"] = np.array(state["packages"]).astype(np.float32).flatten()[:100]
    if len(ret_state["packages"]) < 1000:
        ret_state["packages"] = np.concatenate((ret_state["packages"], np.zeros(100-len(ret_state["packages"]))))
    return np.concatenate(list(ret_state.values()))

In [3]:
def reward_shaping(r, env, state, action):
    return r

In [4]:
class Env(gym.Env):
    def __init__(self, *args, **kwargs):
        super(Env, self).__init__()
        self.env = Environment(*args, **kwargs)

        self.action_space = spaces.multi_discrete.MultiDiscrete([5, 3]*self.env.n_robots)

        self.prev_state = self.env.reset()
        first_state=convert_state(self.prev_state)
        # Define observation space as a dictionary

        self.observation_space = spaces.Box(low=0, high=100, shape=first_state.shape, dtype=np.float32)

        from sklearn.preprocessing import LabelEncoder
        self.le1, self.le2= LabelEncoder(), LabelEncoder()
        self.le1.fit(['S', 'L', 'R', 'U', 'D'])
        self.le2.fit(['0','1', '2'])

    def reset(self, *args, **kwargs):
        self.prev_state = self.env.reset()
        return convert_state(self.prev_state), {}

    def render(self, *args, **kwargs):
        return self.env.render()

    def step(self, action):
        ret = []
        ret.append(self.le1.inverse_transform(action.reshape(-1, 2).T[0]))
        ret.append(self.le2.inverse_transform(action.reshape(-1, 2).T[1]))
        action = list(zip(*ret))

        # You should not modify the infos object
        s, r, done, infos = self.env.step(action)
        new_r = reward_shaping(r, self.env, self.prev_state, action)
        self.prev_state = s
        return convert_state(s), new_r, \
            done, False, infos

In [6]:

import torch
import numpy as np
from types import SimpleNamespace
from algorithms import MAPPO
from algorithms.replay_buffer import ReplayBufferPPO as ReplayBuffer

# 1. Khởi tạo môi trường wrapper
map_file = "map.txt"  # đường dẫn tới file bản đồ
n_robots = 5
n_packages = 20
max_time_steps = 100
env = Env(
    map_file=map_file,
    n_robots=n_robots,
    n_packages=n_packages,
    max_time_steps=max_time_steps,
    seed=2025
)

# 2. Tính toán kích thước đầu vào/đầu ra tự động từ env
args = SimpleNamespace()
args.N = env.env.n_robots
# Actor input = quan sát vector từ env
args.obs_dim = env.observation_space.shape[0]
# Critic input = global state vector; ở đây wrapper trả về cũng chính là observation
args.state_dim = args.obs_dim
# Action dimension = số kết hợp move × pkg per agent
factor_dims = env.action_space.nvec.reshape(env.env.n_robots, 2)
args.action_dim = int(factor_dims[0, 0] * factor_dims[0, 1])

# Siêu tham số huấn luyện
args.episode_limit = max_time_steps
args.rnn_hidden_dim = 64
args.batch_size = 32
args.mini_batch_size = 8
args.max_train_steps = 100000
args.lr = 3e-4
args.gamma = 0.99
args.lamda = 0.95
args.epsilon = 0.2
args.K_epochs = 4
args.entropy_coef = 0.01
args.set_adam_eps = True
args.use_grad_clip = True
args.use_lr_decay = True
args.use_adv_norm = True
args.use_rnn = False
args.add_agent_id = True
args.use_value_clip = True

args.mlp_hidden_dim = 64
args.rnn_hidden_dim = 64

args.use_relu = False
args.use_orthogonal_init = True


# 3. Khởi tạo agent và replay buffer
agent = MAPPO(args)
buffer = ReplayBuffer(
    args
)

# 4. Vòng lặp huấn luyện
import time

total_steps = 0
while total_steps < args.max_train_steps:
    buffer.reset_buffer()
    for ep in range(args.batch_size):
        obs = env.reset()
        done = False
        t = 0
        obs_batch, state_batch, acts_batch, logps_batch, rews_batch, dones_batch = [], [], [], [], [], []

        # rollout tối đa episode_limit bước
        while not done and t < args.episode_limit:
            # Ở đây obs và state dùng cùng vector
            state = obs.copy()
            actions, logps = agent.choose_action(obs, evaluate=False)
            obs_next, reward, done, info = env.step(actions)

            obs_batch.append(obs)
            state_batch.append(state)
            acts_batch.append(actions)
            logps_batch.append(logps)
            rews_batch.append(reward)
            dones_batch.append(done)

            obs = obs_next
            t += 1

        # Lấy giá trị cho mỗi bước trạng thái
        values = agent.get_value(state_batch)
        buffer.push(
            obs_batch, state_batch, acts_batch,
            logps_batch, rews_batch, dones_batch, values
        )

    # Cập nhật actor-critic
    agent.train(buffer, total_steps)
    total_steps += args.batch_size * args.episode_limit

    # Lưu mô hình định kỳ
    if total_steps % 10000 == 0:
        agent.save_model("EnvCustom", number=1, seed=2025, total_steps=total_steps)


------add agent id------
------use_orthogonal_init------
------use_orthogonal_init------
------set adam eps------


AttributeError: 'tuple' object has no attribute 'copy'