DDPG（Deep Deterministic Policy Gradient）是一种结合了策略梯度方法和Q学习的算法，特别适用于连续动作空间的问题。

使用DDPG方法处理具有连续动作空间的Box2D Car Racing环境是一个更合适的选择。

## 导入必要的模块

In [17]:
import gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
from collections import deque
import random


## 初始化环境

In [18]:
# 环境初始化和参数设定
import gym
import numpy as np

env = gym.make('CarRacing-v2')

# 对于CarRacing-v0，状态是一幅图像，我们使用图像的维度来定义状态空间的大小
# 这里我们简化处理，直接将状态维度设置为图像的像素数量
# 注意，这种处理方式仅为示例，实际应用中需要根据网络输入调整
state_dim = env.observation_space.shape[0] * env.observation_space.shape[1] * env.observation_space.shape[2]

action_dim = env.action_space.shape[0]  # CarRacing的动作空间是连续的，具有3个动作（加速、转向、刹车）
max_action = float(env.action_space.high[0])  # 假设所有动作的最大值相同


## 定义Actor和Critic模型

In [19]:
class ActorCNN(nn.Module):
    def __init__(self, action_dim, max_action):
        super(ActorCNN, self).__init__()
        # 定义卷积层
        self.conv_layers = nn.Sequential(
            nn.Conv2d(3, 16, kernel_size=5, stride=2),
            nn.ReLU(),
            nn.Conv2d(16, 32, kernel_size=5, stride=2),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=5, stride=2),
            nn.ReLU()
        )
        
        # 定义全连接层
        self.fc_layers = nn.Sequential(
            nn.Linear(64 * 7 * 7, 512),  # 注意：这里的尺寸取决于卷积层的输出
            nn.ReLU(),
            nn.Linear(512, action_dim)
        )
        
        self.max_action = max_action

    def forward(self, x):
        x = self.conv_layers(x)
        x = x.view(x.size(0), -1)  # 将卷积层的输出展平
        x = self.fc_layers(x)
        return self.max_action * torch.tanh(x)


class CriticCNN(nn.Module):
    def __init__(self, action_dim):
        super(CriticCNN, self).__init__()
        # 定义针对状态的卷积层
        self.conv_layers = nn.Sequential(
            nn.Conv2d(3, 16, kernel_size=5, stride=2),
            nn.ReLU(),
            nn.Conv2d(16, 32, kernel_size=5, stride=2),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=5, stride=2),
            nn.ReLU()
        )
        
        # 定义全连接层，同时处理状态和动作
        self.fc_layers = nn.Sequential(
            nn.Linear(64 * 7 * 7 + action_dim, 512),  # 注意：尺寸取决于卷积层输出和动作维度
            nn.ReLU(),
            nn.Linear(512, 1)
        )

    def forward(self, x, u):
        x = self.conv_layers(x)
        x = x.view(x.size(0), -1)  # 将卷积层输出展平
        x = torch.cat([x, u], 1)  # 将状态和动作融合
        x = self.fc_layers(x)
        return x



## DDPG算法

In [20]:
# 经验回放

class ReplayBuffer:
    def __init__(self, capacity):
        self.capacity = capacity
        self.buffer = deque(maxlen=capacity)

    def add(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        state, action, reward, next_state, done = zip(*random.sample(self.buffer, batch_size))
        return np.array(state), np.array(action), np.array(reward), np.array(next_state), np.array(done)


In [25]:
# DDPG算法类将包含初始化、选择动作、存储转换、从经验回放中采样、更新Critic和Actor模型等方法。

import torch
import torch.nn.functional as F
import numpy as np
import copy

class DDPG(object):
    def __init__(self, state_dim, action_dim, max_action):
        self.actor = ActorCNN(action_dim, max_action)
        self.actor_target = copy.deepcopy(self.actor)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters())

        self.critic = CriticCNN(action_dim)
        self.critic_target = copy.deepcopy(self.critic)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters())


        self.replay_buffer = ReplayBuffer(1000000)
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.max_action = max_action

        self.gamma = 0.99
        self.tau = 0.005
        self.batch_size = 100

    def select_action(self, state):
        # 预处理状态数据，例如，调整通道顺序和添加批量维度
        state = np.transpose(state, (2, 0, 1))
        state = np.expand_dims(state, axis=0)
        state = torch.FloatTensor(state)
        action = self.actor(state).cpu().data.numpy().flatten()
        return action

    def train(self):
        if len(self.replay_buffer.buffer) < self.batch_size:
            return
        state, action, reward, next_state, done = self.replay_buffer.sample(self.batch_size)

        state = torch.FloatTensor(state)
        action = torch.FloatTensor(action)
        reward = torch.FloatTensor(reward).unsqueeze(1)
        next_state = torch.FloatTensor(next_state)
        done = torch.FloatTensor(done).unsqueeze(1)

        # Critic loss
        target_Q = self.critic_target(next_state, self.actor_target(next_state))
        target_Q = reward + ((1 - done) * self.gamma * target_Q).detach()
        current_Q = self.critic(state, action)
        critic_loss = F.mse_loss(current_Q, target_Q)

        # Critic update
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # Actor loss
        actor_loss = -self.critic(state, self.actor(state)).mean()

        # Actor update
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # Update the frozen target models
        for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()):
            target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)

        for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()):
            target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)

    def save(self, filename):
        torch.save(self.actor.state_dict(), filename + "_actor")
        torch.save(self.critic.state_dict(), filename + "_critic")

    def load(self, filename):
        self.actor.load_state_dict(torch.load(filename + "_actor"))
        self.critic.load_state_dict(torch.load(filename + "_critic"))



In [26]:
# 初始化和训练模型

# Assuming the environment is correctly initialized

ddpg = DDPG(state_dim, action_dim, max_action)
episodes = 100  # Define the number of episodes for training

for episode in range(episodes):
    state = env.reset()
    episode_reward = 0
    done = False

    while not done:
        action = ddpg.select_action(np.array(state))
        next_state, reward, done, _ = env.step(action)
        ddpg.replay_buffer.add(state, action, reward, next_state, done)

        ddpg.train()
        state = next_state
        episode_reward += reward

    print(f"Episode {episode} Reward: {episode_reward}")
    if episode % 10 == 0:
        ddpg.save(f"ddpg_{episode}")


ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (2,) + inhomogeneous part.