In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.multiprocessing as mp
import torch.nn.functional as F
from torch.distributions import Categorical
import numpy as np

from dummy_gym import DummyGym

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

env = DummyGym()

In [7]:
# 超参数
learning_rate = 0.0003
gamma = 0.95
num_episodes = 200
global_update_freq = 5  # 每隔 5 个步骤更新一次全局模型



# state return self.state（self.visit_count, self.fov_map, self.car.pos）, reward, done, {}
# step()执行完之后自动返回。 observe()返回的self.visit_count, self.fov_map, self.car.pos 直接返回state



In [3]:
class GlobalModel(nn.Module):
    def __init__(self, action_size):
        super(GlobalModel, self).__init__()
        # 修改网络结构以更好地处理地图特征
        self.conv1 = nn.Conv2d(2, 32, kernel_size=3, stride=1, padding=1)  # 输入通道改为2，包含当前状态和探索历史
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1)
        self.conv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1)
        self.fc = nn.Linear(64 * 8 * 8, 512)  # 增加网络容量

        self.policy_logits = nn.Linear(512, action_size)
        self.value = nn.Linear(512, 1)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.relu(self.conv3(x))
        x = x.view(x.size(0), -1)
        x = F.relu(self.fc(x))
        return self.policy_logits(x), self.value(x)

In [4]:
class Worker(mp.Process):
    def __init__(self, global_model, optimizer, action_size, worker_id, gamma=0.95):
        # ... existing code ...
        self.exploration_history = np.zeros((8, 8))  # 添加探索历史记录

    def preprocess_state(self, state):
        # 添加状态预处理
        current_map = state[1]
        # 将当前状态和探索历史组合
        state_tensor = torch.tensor(
            np.stack([current_map, self.exploration_history]), 
            dtype=torch.float32
        ).unsqueeze(0)
        return state_tensor

    def run(self):
        global_episode = 0
        while global_episode < num_episodes:
            state = self.env.reset()
            current_state = self.preprocess_state(state)
            self.exploration_history = np.zeros((8, 8))  # 重置探索历史
            episode_reward = 0
            done = False
            
            while not done:
                action = self.choose_action(current_state)
                next_state, reward, done, _ = self.env.step(action)
                
                # 更新探索历史
                pos = next_state[0]  # 假设返回的状态包含智能体位置
                self.exploration_history[int(pos[0]), int(pos[1])] = 1
                
                # 计算探索奖励
                exploration_reward = 0.5 if self.exploration_history[int(pos[0]), int(pos[1])] == 0 else -0.1
                total_reward = reward + exploration_reward
                
                next_state_tensor = self.preprocess_state(next_state)
                loss = self.compute_loss(done, current_state, action, total_reward, next_state_tensor)
                self.update_global(loss)
                
                episode_reward += total_reward
                current_state = next_state_tensor

                if done:
                    coverage = np.sum(self.exploration_history) / (8 * 8)
                    print(f"Worker: {self.worker_id}, Episode: {global_episode}, "
                          f"Reward: {episode_reward:.2f}, Coverage: {coverage:.2%}")
                    global_episode += 1

In [5]:
# 定义全局模型
class GlobalModel(nn.Module):
    def __init__(self, action_size):
        super(GlobalModel, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, stride=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, stride=1)
        self.fc = nn.Linear(64 * 6 * 6, 256)  # 假设输入是一个 8x8 的网格

        # 策略网络输出每个动作的概率
        self.policy_logits = nn.Linear(256, action_size)
        # 价值网络输出当前状态的价值
        self.value = nn.Linear(256, 1)

    def forward(self, x):
        x = torch.relu(self.conv1(x))
        x = torch.relu(self.conv2(x))
        x = x.view(x.size(0), -1)
        x = torch.relu(self.fc(x))
        return self.policy_logits(x), self.value(x)

# 定义 Worker 类
class Worker(mp.Process):
    def __init__(self, global_model, optimizer, action_size, worker_id, gamma=0.99):
        super(Worker, self).__init__()
        self.global_model = global_model
        self.optimizer = optimizer
        self.worker_id = worker_id
        self.action_size = action_size
        self.env = DummyGym()
        self.gamma = gamma
        self.local_model = GlobalModel(action_size)
        self.local_model.load_state_dict(self.global_model.state_dict())

    def choose_action(self, state):
        logits, _ = self.local_model(state)
        action_prob = torch.softmax(logits, dim=-1)
        action = np.random.choice(self.action_size, p=action_prob.detach().numpy().flatten())
        return action

    def compute_loss(self, done, state, action, reward, next_state):
        logits, value = self.local_model(state)
        _, next_value = self.local_model(next_state)
        target = reward + (1 - done) * self.gamma * next_value.item()
        delta = target - value
        policy_loss = -torch.log(torch.softmax(logits, dim=-1)[0, action]) * delta
        value_loss = delta ** 2
        total_loss = policy_loss + 0.5 * value_loss
        return total_loss

    def update_global(self, loss):
        self.optimizer.zero_grad()
        loss.backward()
        for local_param, global_param in zip(self.local_model.parameters(), self.global_model.parameters()):
            global_param.grad = local_param.grad  # 同步梯度到全局模型
        self.optimizer.step()
        self.local_model.load_state_dict(self.global_model.state_dict())  # 同步模型参数

    def run(self):
        global_episode = 0
        while global_episode < num_episodes:
            current_state = torch.tensor([self.env.reset()[1]], dtype=torch.float32).unsqueeze(0)
            episode_reward = 0
            done = False
            while not done:
                action = self.choose_action(current_state)
                next_state, reward, done, _ = self.env.step(action)
                next_state = torch.tensor([next_state[1]], dtype=torch.float32).unsqueeze(0)

                loss = self.compute_loss(done, current_state, action, reward, next_state)
                self.update_global(loss)
                
                episode_reward += reward
                current_state = next_state

                if done:
                    global_episode += 1
                    print(f"Worker: {self.worker_id}, Episode: {global_episode}, Reward: {episode_reward}")

# 启动多线程 A3C
def main():
    env = DummyGym()  # 创建一次环境以获取动作空间的大小
    action_size = env.action_space.n
    global_model = GlobalModel(action_size)
    global_model.share_memory()  # 使全局模型在不同线程间共享

    optimizer = optim.Adam(global_model.parameters(), lr=learning_rate)
    workers = [Worker(global_model, optimizer, action_size, worker_id=i) for i in range(4)]

    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()

In [8]:
if __name__ == "__main__":
    try:
        mp.set_start_method('spawn', force=True)  # 强制使用 'spawn' 方法
    except RuntimeError:
        pass  # 如果已经设置，跳过此异常
    main()

Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/home/nuplan/miniconda3/envs/me5418-group10/lib/python3.7/multiprocessing/spawn.py", line 105, in spawn_main
    exitcode = _main(fd)
  File "/home/nuplan/miniconda3/envs/me5418-group10/lib/python3.7/multiprocessing/spawn.py", line 115, in _main
    self = reduction.pickle.load(from_parent)
AttributeError: Can't get attribute 'Worker' on <module '__main__' (built-in)>
Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/home/nuplan/miniconda3/envs/me5418-group10/lib/python3.7/multiprocessing/spawn.py", line 105, in spawn_main
    exitcode = _main(fd)
  File "/home/nuplan/miniconda3/envs/me5418-group10/lib/python3.7/multiprocessing/spawn.py", line 115, in _main
    self = reduction.pickle.load(from_parent)
AttributeError: Can't get attribute 'Worker' on <module '__main__' (built-in)>
Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/home/nuplan