In [1]:
import random
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from env import load_uavs, load_tasks, initialize_targets
from runEnv import UAVEnv
from model.dqn import DQN, ReplayBuffer

In [2]:
# 训练流程
def train_dqn(
    env,
    episodes=500,
    batch_size=64,
    gamma=0.99,
    lr=1e-3,
    eps_start=1.0,
    eps_end=0.01,
    eps_decay=0.995,
):
    state_dim = len(env.reset())
    action_dim = len(env.uavs)
    policy_net = DQN(state_dim, action_dim)
    target_net = DQN(state_dim, action_dim)
    target_net.load_state_dict(policy_net.state_dict())
    optimizer = optim.Adam(policy_net.parameters(), lr=lr)
    buffer = ReplayBuffer(10000)
    eps = eps_start

    for ep in range(episodes):
        state = env.reset()
        total_r = 0
        done = False
        while not done:
            if random.random() < eps:
                action = random.randrange(action_dim)
            else:
                with torch.no_grad():
                    q_vals = policy_net(torch.tensor(state).unsqueeze(0))
                    action = q_vals.argmax().item()
            next_state, reward, done, _ = env.step(action)
            buffer.push(
                state,
                action,
                reward,
                next_state if next_state is not None else np.zeros_like(state),
                done,
            )
            state = next_state
            total_r += reward

            if len(buffer) >= batch_size:
                s, a, r, s2, d = buffer.sample(batch_size)
                s = torch.tensor(s, dtype=torch.float32)
                a = torch.tensor(a)
                r = torch.tensor(r, dtype=torch.float32)
                s2 = torch.tensor(s2, dtype=torch.float32)
                d = torch.tensor(d, dtype=torch.float32)

                q_pred = policy_net(s).gather(1, a.unsqueeze(1)).squeeze()
                with torch.no_grad():
                    q_next = target_net(s2).max(1)[0]
                q_target = r + gamma * q_next * (1 - d)

                loss = nn.functional.mse_loss(q_pred, q_target)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

        eps = max(eps_end, eps * eps_decay)
        if ep % 10 == 0:
            target_net.load_state_dict(policy_net.state_dict())
        print(f"Episode {ep} | Total Reward: {total_r:.2f} | Epsilon: {eps:.3f}")

    return policy_net

In [None]:
# 设置数据目录
uav_csv = "data/uav.csv"
task_csv = "data/task.csv"

env = UAVEnv(load_uavs(uav_csv), initialize_targets(load_tasks(task_csv)))

model = train_dqn(env)

Episode 0 | Total Reward: -9.93 | Epsilon: 0.995
Episode 1 | Total Reward: -16.92 | Epsilon: 0.990
Episode 2 | Total Reward: -21.07 | Epsilon: 0.985
Episode 3 | Total Reward: -22.35 | Epsilon: 0.980


  s2 = torch.tensor(s2, dtype=torch.float32)


Episode 4 | Total Reward: -25.52 | Epsilon: 0.975
Episode 5 | Total Reward: -26.29 | Epsilon: 0.970
Episode 6 | Total Reward: -26.85 | Epsilon: 0.966
Episode 7 | Total Reward: -30.00 | Epsilon: 0.961
Episode 8 | Total Reward: -30.00 | Epsilon: 0.956
Episode 9 | Total Reward: -30.00 | Epsilon: 0.951
Episode 10 | Total Reward: -30.00 | Epsilon: 0.946
Episode 11 | Total Reward: -30.00 | Epsilon: 0.942
Episode 12 | Total Reward: -30.00 | Epsilon: 0.937
Episode 13 | Total Reward: -30.00 | Epsilon: 0.932
Episode 14 | Total Reward: -30.00 | Epsilon: 0.928
Episode 15 | Total Reward: -30.00 | Epsilon: 0.923
Episode 16 | Total Reward: -30.00 | Epsilon: 0.918
Episode 17 | Total Reward: -30.00 | Epsilon: 0.914
Episode 18 | Total Reward: -30.00 | Epsilon: 0.909
Episode 19 | Total Reward: -30.00 | Epsilon: 0.905
Episode 20 | Total Reward: -30.00 | Epsilon: 0.900
Episode 21 | Total Reward: -30.00 | Epsilon: 0.896
Episode 22 | Total Reward: -30.00 | Epsilon: 0.891
Episode 23 | Total Reward: -30.00 | E