In [5]:
import random
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from env import load_uavs, load_tasks, initialize_targets
from calculate import calculate_all_voyage_distance
from runEnv import UAVEnv
from model.dqn import DQN, ReplayBuffer
from model.pso import PSO
import matplotlib.pyplot as plt

In [6]:
# Training loop
def train_dqn(
    env,
    episodes=500,
    batch_size=64,
    gamma=0.99,
    lr=1e-3,
    eps_start=1.0,
    eps_end=0.01,
    eps_decay=0.995,
    buffer_capacity=10000,
    target_update=10,
    plot_every=50,
):
    """Train a DQN policy on ``env`` using epsilon-greedy exploration.

    The environment must provide ``reset()`` (returns a state that supports
    ``len()``), ``step(action)`` (returns ``(next_state, reward, done, info)``)
    and a ``uavs`` collection whose length is the number of discrete actions.

    Args:
        env: Environment to train on (see requirements above).
        episodes: Number of training episodes.
        batch_size: Minibatch size sampled from the replay buffer; learning
            starts once the buffer holds at least this many transitions.
        gamma: Discount factor used in the TD target.
        lr: Adam learning rate.
        eps_start: Initial exploration rate.
        eps_end: Lower bound for the exploration rate.
        eps_decay: Multiplicative decay applied to epsilon after each episode.
        buffer_capacity: Capacity passed to ``ReplayBuffer``.
        target_update: Copy the policy weights into the target network every
            this many episodes (must be >= 1).
        plot_every: Plot the reward curve every this many episodes; ``0`` or
            ``None`` disables plotting.

    Returns:
        The trained policy network.

    Raises:
        ValueError: If ``target_update`` is smaller than 1.
    """
    if target_update < 1:
        raise ValueError("target_update must be >= 1")

    state_dim = len(env.reset())
    action_dim = len(env.uavs)
    policy_net = DQN(state_dim, action_dim)
    target_net = DQN(state_dim, action_dim)
    target_net.load_state_dict(policy_net.state_dict())
    optimizer = optim.Adam(policy_net.parameters(), lr=lr)
    buffer = ReplayBuffer(buffer_capacity)
    eps = eps_start

    # Total reward collected in each episode, used for the reward curve.
    rewards_per_episode = []

    for ep in range(episodes):
        state = env.reset()
        # Per-episode statistics used to judge training progress.
        total_reward = 0
        total_success = 0

        done = False
        while not done:
            if random.random() < eps:
                action = random.randrange(action_dim)
            else:
                with torch.no_grad():
                    # Force float32: a float64 state (NumPy's default) would
                    # not match the dtype of the network weights.
                    state_t = torch.as_tensor(state, dtype=torch.float32)
                    q_vals = policy_net(state_t.unsqueeze(0))
                    action = q_vals.argmax().item()
            next_state, reward, done, _ = env.step(action)
            # A step with positive reward is counted as a success.
            if reward > 0:
                total_success += 1
            buffer.push(
                state,
                action,
                reward,
                # The environment may return None as the terminal state; store
                # zeros so every stored transition has the same shape.
                next_state if next_state is not None else np.zeros_like(state),
                done,
            )
            state = next_state
            total_reward += reward

            if len(buffer) >= batch_size:
                s, a, r, s2, d = buffer.sample(batch_size)
                s = torch.as_tensor(np.asarray(s), dtype=torch.float32)
                # gather() requires an int64 index tensor.
                a = torch.as_tensor(np.asarray(a), dtype=torch.long)
                r = torch.as_tensor(np.asarray(r), dtype=torch.float32)
                s2 = torch.as_tensor(np.asarray(s2), dtype=torch.float32)
                d = torch.as_tensor(np.asarray(d), dtype=torch.float32)

                # squeeze(1) keeps the batch dimension even when batch_size == 1.
                q_pred = policy_net(s).gather(1, a.unsqueeze(1)).squeeze(1)
                with torch.no_grad():
                    q_next = target_net(s2).max(1)[0]
                # (1 - d) removes the bootstrap term for terminal transitions.
                q_target = r + gamma * q_next * (1 - d)

                loss = nn.functional.mse_loss(q_pred, q_target)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

        # End of episode: record the total reward and decay epsilon.
        rewards_per_episode.append(total_reward)
        eps = max(eps_end, eps * eps_decay)
        if ep % target_update == 0:
            target_net.load_state_dict(policy_net.state_dict())
        total_distance = calculate_all_voyage_distance(env.uavs)
        print(
            f"Episode {ep} | Total Reward: {total_reward:.2f} | "
            f"Total Distance: {total_distance:.2f} | "
            f"Total Success : {total_success} | Epsilon: {eps:.3f}"
        )

        # Periodically plot the reward curve collected so far.
        if plot_every and (ep + 1) % plot_every == 0:
            fig, ax = plt.subplots(figsize=(8, 4))
            ax.plot(range(1, ep + 2), rewards_per_episode, marker='o')
            ax.set_xlabel('Episode')
            ax.set_ylabel('Total Reward')
            ax.set_title(f'Total Reward up to Episode {ep+1}')
            ax.grid(True)
            plt.show()
            # Release the figure so repeated plots do not accumulate in memory.
            plt.close(fig)

    return policy_net

In [None]:
# Set the data file locations (relative paths, resolved against the kernel's
# working directory).
uav_csv = "data/uav.csv"
task_csv = "data/task.csv"
# Load the UAVs, then load the tasks and turn them into targets.
uavs = load_uavs(uav_csv)
target = initialize_targets(load_tasks(task_csv))
# Build the environment; `env` is also consumed by the PSO cell further down.
env = UAVEnv(uavs, target)

# DQN training is currently disabled; uncomment the next line to train.
# model = train_dqn(env)

In [None]:
"""
Run the PSO optimisation and print the results.
"""
# Uses the `env` built in the setup cell; 30 particles, 100 iterations.
pso = PSO(env, num_particles=30, max_iter=100)
best_position, best_fitness = pso.optimize()
# The metrics dict is read below with the keys 'total_voyage',
# 'completion_rate' and 'avg_fitness'.
metrics = pso.get_metrics()
# NOTE(review): the recorded output of this cell shows "Total Voyage: 0.00"
# together with a 100% completion rate, which looks inconsistent — confirm
# that get_metrics() reports the voyage of the best solution.
print("\nPSO Optimization Results:")
print(f"Total Voyage: {metrics['total_voyage']:.2f}")
print(f"Task Completion Rate: {metrics['completion_rate']:.2%}")
print(f"Average Task-UAV Fitness: {metrics['avg_fitness']:.4f}")
print(f"Overall Fitness: {best_fitness:.4f}")

Iteration 10/100 - Best Fitness: 0.8927
Iteration 20/100 - Best Fitness: 0.9188
Iteration 30/100 - Best Fitness: 0.9197
Iteration 40/100 - Best Fitness: 0.9208
Iteration 50/100 - Best Fitness: 0.9208
Iteration 60/100 - Best Fitness: 0.9208
Iteration 70/100 - Best Fitness: 0.9208
Iteration 80/100 - Best Fitness: 0.9208
Iteration 90/100 - Best Fitness: 0.9208
Iteration 100/100 - Best Fitness: 0.9208

PSO Optimization Results:
Total Voyage: 0.00
Task Completion Rate: 100.00%
Average Task-UAV Fitness: 0.8021
Overall Fitness: 0.9208
