## MAN-A2C 方法

### 网络结构

In [1]:
import torch
import torch.nn.functional as F
import numpy as np

# 策略网络
class PolicyNet(torch.nn.Module):

    def __init__(self, state_dim, hidden_dim, action_dim):
        super(PolicyNet, self).__init__()
        self.fc1 = torch.nn.Linear(state_dim, hidden_dim)
        self.fc2 = torch.nn.Linear(hidden_dim, action_dim)

    def forward(self, x):
        x = F.sigmoid(self.fc1(x))
        return F.softmax(self.fc2(x), dim=1)


# 价值网络    
class ValueNet(torch.nn.Module):
    
    def __init__(self, state_dim, hidden_dim):
        super(ValueNet, self).__init__()
        self.fc1 = torch.nn.Linear(state_dim, hidden_dim)
        self.fc2 = torch.nn.Linear(hidden_dim, 1)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        return self.fc2(x)

       
class MutiActorCritic:
 
    def __init__(self, agent_n, observation_dim, state_dim, hidden_dim, action_dim, actor_lr, critic_lr, tau, gamma, device):
        self.agent_n = agent_n
        self.actors = [PolicyNet(observation_dim, hidden_dim, action_dim).to(device) for _ in range(agent_n)] # 策略网络
        self.critics = [ValueNet(state_dim, hidden_dim).to(device) for _ in range(agent_n)] # 价值网络
        self.target_critics = [ValueNet(state_dim, hidden_dim).to(device) for _ in range(agent_n)] # 目标网络
        self.actor_optimizers = [torch.optim.Adam(actor.parameters(), lr=actor_lr) for actor in self.actors] # 策略网络优化器
        self.critic_optimizers = [torch.optim.Adam(critic.parameters(), lr=critic_lr) for critic in self.critics] # 价值网络优化器
        self.actor_schedulers =  [torch.optim.lr_scheduler.StepLR(optimizer, 100, gamma=0.5, last_epoch=-1) for optimizer in self.actor_optimizers] # 学习率优化器
        self.critic_schedulers =  [torch.optim.lr_scheduler.StepLR(optimizer, 100, gamma=0.5, last_epoch=-1) for optimizer in self.critic_optimizers]
        self.tau = tau  # 目标网络软更新参数
        self.gamma = gamma
        self.device = device

    def take_action(self, observations):
        observations_tensor = [torch.tensor([observations[i]], dtype=torch.float).to(self.device) for i in range(self.agent_n)] # 转化为tensor
        probs = [self.actors[i](observations_tensor[i]) for i in range(self.agent_n)] # 动作概率
        actions = [prob.tolist()[0] for prob in probs]
        return actions
    
    def soft_update(self, net, target_net):
        for param_target, param in zip(target_net.parameters(), net.parameters()):
            param_target.data.copy_(param_target.data * (1.0 - self.tau) + param.data * self.tau)
        
    def actor_update(self, agent_i, transition_dict):
        observations = torch.tensor(transition_dict['observations'], dtype=torch.float).to(self.device)
        states = torch.tensor(transition_dict['states'], dtype=torch.float).to(self.device)
        actions = torch.tensor(transition_dict['actions']).view(-1, 1).to(self.device)
        rewards = torch.tensor(transition_dict['rewards'], dtype=torch.float).view(-1, 1).to(self.device)
        next_states = torch.tensor(transition_dict['next_states'], dtype=torch.float).to(self.device)

        td_target = rewards + self.gamma * self.target_critics[agent_i](next_states) # 时序差分目标
        td_delta = td_target - self.critics[agent_i](states) # 时序差分误差
        log_probs = torch.log(self.actors[agent_i](observations).gather(1, actions))
        actor_loss = torch.mean(-log_probs * td_delta.detach())
        self.actor_optimizers[agent_i].zero_grad()
        actor_loss.backward() # 计算策略网络的梯度
        self.actor_optimizers[agent_i].step() # 更新策略网络的参数
        self.actor_schedulers[agent_i].step() # 更新学习率

    def critic_update(self, agent_i, transition_dict):
        states = torch.tensor(transition_dict['states'], dtype=torch.float).to(self.device)
        rewards = torch.tensor(transition_dict['rewards'], dtype=torch.float).view(-1, 1).to(self.device)
        next_states = torch.tensor(transition_dict['next_states'], dtype=torch.float).to(self.device)

        td_target = rewards + self.gamma * self.target_critics[agent_i](next_states) # 时序差分目标
        critic_loss = torch.mean(F.mse_loss(self.critics[agent_i](states), td_target.detach())) # 均方误差损失函数
        self.critic_optimizers[agent_i].zero_grad()
        critic_loss.backward() # 计算价值网络的梯度
        self.critic_optimizers[agent_i].step() # 更新价值网络的参数
        self.critic_schedulers[agent_i].step()
        self.soft_update(self.critics[agent_i], self.target_critics[agent_i])  # 软更新目标价值网络

    def save(self, file):
        for i in range(self.agent_n):
            torch.save(self.actors[i].state_dict(), file + 'actor'+str(i)+'.params')
            torch.save(self.critics[i].state_dict(), file + 'critic'+str(i)+'.params')

    def load(self, file):
        for i in range(self.agent_n):
            self.actors[i].load_state_dict(torch.load(file + 'actor'+str(i)+'.params'))
            self.critics[i].load_state_dict(torch.load(file + 'critic'+str(i)+'.params'))

### 训练模型

In [6]:
from tqdm import tqdm
import simulator
import os

# 训练
def train_on_policy_agent():
    return_list = [[], [], [], [], []]
    for i in range(num_episodes // num_show):
        with tqdm(total=num_show, desc='Iteration %d' % i) as pbar:
            for i_episode in range(num_show):
                episode_return = 0
                transition_dict = [{'observations': [],'states': [], 'actions': [], 'next_states': [], 'rewards': []} for _ in range(env.grid_number)]
                critic_transition_dict = [{'states': [], 'next_states': [], 'rewards': []} for _ in range(env.grid_number)]
                observations, state = env.reset()
                done = False
                while not done:
                    for k in range(env.action_dim):
                        for j in range(env.grid_number):
                            transition_dict[j]['observations'].append(observations[j].copy()) # .copy() 很重要
                            transition_dict[j]['states'].append(state.copy())
                    for j in range(env.grid_number):
                        critic_transition_dict[j]['states'].append(state.copy())

                    actions = agents.take_action(observations)
                    observations, state, raw_reward, rewards, done = env.step(actions)

                    for k in range(env.action_dim):
                        for j in range(env.grid_number):
                            transition_dict[j]['actions'].append(k)
                            transition_dict[j]['next_states'].append(state.copy())
                            transition_dict[j]['rewards'].append(rewards[j][k])
                    critic_reward = 0
                    for j in range(env.grid_number):
                        critic_reward += raw_reward[j]
                        critic_transition_dict[j]['next_states'].append(state.copy())
                        critic_transition_dict[j]['rewards'].append(raw_reward[j])
                    
                    episode_return += critic_reward
                return_list[0].append(episode_return)
                response_rate, response_time, occupied_rate, gmv = env.rate_view()
                return_list[1].append(response_rate)
                return_list[2].append(response_time)
                return_list[3].append(occupied_rate)
                return_list[4].append(gmv)
                
                for j in range(env.grid_number):
                    agents.critic_update(j, critic_transition_dict[j])
                    agents.actor_update(j, transition_dict[j])
                if (i_episode+1) % num_show == 0:
                    pbar.set_postfix({"episode": "%d" % (num_show * i + i_episode+1), "return": "%.2f" % np.mean(return_list[0][-num_show:]), "response_rate": "%.4f" % np.mean(return_list[1][-num_show:]), "response_time": "%.2f" %np.mean(return_list[2][-num_show:]), "occupied_rate": "%.4f" %np.mean(return_list[3][-num_show:]), "gmv": "%.2f" %np.mean(return_list[4][-num_show:])})
                pbar.update(1)
    
    # 参数的变化趋势
    for i in range(len(return_list)):
        rt = []
        for j in range(num_episodes):
            if j % num_show == 0:
                rt.append(return_list[i][j])
        print(rt)


actor_lr = 1e-4 # 学习率
critic_lr = 1e-3
num_episodes = 1000
num_show = 20 # 打印结果时每组的轮数
hidden_dim = 256
tau = 0.005 # 软更新参数
gamma = 0.9 # 折扣因子
device = torch.device('cuda') if torch.cuda.is_available() else torch.device("cpu")

env_param = simulator.CityParam() # 环境参数
env_param.file_path1 = os.path.join(os.path.dirname('__file__'), '../data/shanghai/2015-04-04/')
env_param.file_path2 = os.path.join(os.path.dirname('__file__'), '../data/shanghai/2500m/')
env_param.taxi_number = 6000
env_param.simulated_start_time = 8*60*60
env_param.simulated_end_time = 20*60*60
env = simulator.ManyToManyManner(env_param) # 生成环境

torch.manual_seed(0) # 设置随机数种子，方便复现
agent_n = env.grid_number
observation_dim = env.observation_dim
state_dim = env.state_dim
action_dim = env.action_dim
agents = MutiActorCritic(agent_n, observation_dim, state_dim, hidden_dim, action_dim, actor_lr, critic_lr, tau, gamma, device)

train_on_policy_agent()
model_param_file = os.path.join(os.path.dirname('__file__'), '../data/model-param/shanghai-2015-04-04-2500m/MAN-A2C/')
agents.save(model_param_file) # 保存模型

env.clear()

Finish load grids! 49
Finish load nodes! 11668 710
Finish construct taxis! 6000
Finish load trips! 225532


Iteration 0: 100%|██████████| 20/20 [03:51<00:00, 11.58s/it, episode=20, return=1719.01, response_rate=0.8419, response_time=162.25, occupied_rate=0.3657, gmv=57.50]
Iteration 1: 100%|██████████| 20/20 [03:45<00:00, 11.29s/it, episode=40, return=1876.56, response_rate=0.8978, response_time=111.02, occupied_rate=0.3891, gmv=61.17]
Iteration 2: 100%|██████████| 20/20 [03:33<00:00, 10.67s/it, episode=60, return=2045.85, response_rate=0.9430, response_time=60.06, occupied_rate=0.4076, gmv=64.07]
Iteration 3: 100%|██████████| 20/20 [03:22<00:00, 10.11s/it, episode=80, return=2174.55, response_rate=0.9670, response_time=29.75, occupied_rate=0.4172, gmv=65.55]
Iteration 4: 100%|██████████| 20/20 [03:27<00:00, 10.39s/it, episode=100, return=2243.13, response_rate=0.9764, response_time=15.97, occupied_rate=0.4208, gmv=66.12]
Iteration 5: 100%|██████████| 20/20 [03:29<00:00, 10.46s/it, episode=120, return=2269.67, response_rate=0.9791, response_time=10.95, occupied_rate=0.4218, gmv=66.27]
Iterat

[1652.9439398990653, 1806.0940977248322, 1969.0798147914065, 2124.576994860223, 2212.1116232832433, 2263.950209900352, 2283.903283560456, 2279.8181830141107, 2297.3444024087953, 2289.7862626886713, 2288.93545383476, 2291.295162239753, 2294.822382742729, 2295.1905562784686, 2296.404239828322, 2294.7511384610843, 2297.6092902143314, 2290.322156296516, 2294.643832542076, 2292.721592300613, 2282.013994693593, 2298.0213763112833, 2297.4986205639166, 2297.22695912128, 2290.933818615144, 2301.160170472053, 2289.079961118181, 2299.754456511798, 2300.1352277808633, 2290.3905418966488, 2296.1468049387195, 2304.3914688686264, 2305.7955364940835, 2298.3256823279085, 2301.393859041922, 2298.5882792715884, 2303.9426398262663, 2285.7029318452196, 2298.3477187699627, 2305.6200688003332, 2300.8977017652765, 2299.401489647833, 2293.4183373552687, 2304.451060678657, 2290.7480331953043, 2296.111198875844, 2305.2829827833866, 2295.98568976584, 2300.6024435648246, 2295.6091946195884]
[0.8178801483505759, 0.

### 加载现有模型

In [11]:
import simulator
import os

actor_lr = 1e-4 # 学习率
critic_lr = 1e-3
hidden_dim = 256
tau = 0.005 # 软更新参数
gamma = 0.9 # 折扣因子
device = torch.device('cuda') if torch.cuda.is_available() else torch.device("cpu")

env_param = simulator.CityParam() # 环境参数
env_param.file_path1 = os.path.join(os.path.dirname('__file__'), '../data/shanghai/2015-04-04/')
env_param.file_path2 = os.path.join(os.path.dirname('__file__'), '../data/shanghai/2500m/')
env_param.taxi_number = 2000
env_param.simulated_start_time =8*60*60
env_param.simulated_end_time = 20*60*60
env = simulator.ManyToManyManner(env_param) # 生成环境

agent_n = env.grid_number
observation_dim = env.observation_dim
state_dim = env.state_dim
action_dim = env.action_dim
agents = MutiActorCritic(agent_n, observation_dim, state_dim, hidden_dim, action_dim, actor_lr, critic_lr, tau, gamma, device)
model_param_file = os.path.join(os.path.dirname('__file__'), '../data/model-param/shanghai-2015-04-04-2500m/MAN-A2C/')
agents.load(model_param_file) # 加载模型

N = 10
N_total_reward = 0
N_response_rate = 0
N_response_time = 0
N_occupied_rate = 0
N_gmv = 0
for _ in range(N):
    observations, state = env.reset()
    total_reward = 0
    done = False
    while done == False:
        actions = agents.take_action(observations)
        observations, state, raw_reward, rewards, done = env.step(actions)
        for r in raw_reward:
            total_reward += r
    response_rate, response_time, occupied_rate, gmv = env.rate_view()
    N_total_reward += total_reward
    N_response_rate += response_rate
    N_response_time += response_time
    N_occupied_rate += occupied_rate
    N_gmv += gmv
    print(total_reward, response_rate, response_time, occupied_rate, gmv)

print('ave: ', N_total_reward / N, N_response_rate / N, N_response_time / N, N_occupied_rate / N, N_gmv / N)

env.clear()

Finish load grids! 49
Finish load nodes! 11668 710
Finish construct taxis! 2000
Finish load trips! 225532
1105.577184490375 0.5508757297748124 355.5470871116001 0.7151373726851852 112.72985208333242
1090.1988900064594 0.5498287580075595 355.72276542508826 0.7171296527777777 112.9885595833316
1090.033597642055 0.548276045640871 356.56974783951165 0.714125613425926 112.60811333333123
1097.7454846623127 0.5502768264333753 355.59979060564655 0.7172885416666667 112.99773999999825
1091.499727203479 0.5507914396749064 355.20158642840664 0.7176953009259259 113.11062249999827
1093.4889203127595 0.5512217628165315 354.93753659964864 0.7179322453703704 113.17723291666611
1108.1444852094514 0.5533378879562756 353.9936649335439 0.720612337962963 113.52408624999798
1098.6423931495672 0.5531870530406544 354.0572817773677 0.7196575578703703 113.42191708333134
1099.4519553164755 0.5525925860202651 354.40704132876687 0.7192250925925926 113.44540583333186
1089.2687962432005 0.545614253012262 358.08271076