In [1]:
from schorl_utils.envs import *
from schorl_utils.functions import Train, Agent
from schorl_utils.net import generate_mlpnet, show_net_structure, ContinuousPolicyMlp
import gym

# 以Pendulum环境为例 连续状态连续动作
env = gym.make('Pendulum-v1')
print(f"obs : {env.observation_space}\n obs shape : {env.observation_space.shape}")
print(f"aciton : {env.action_space}\n aciton shape : {env.action_space.shape}")

obs : Box([-1. -1. -8.], [1. 1. 8.], (3,), float32)
 obs shape : (3,)
aciton : Box(-2.0, 2.0, (1,), float32)
 aciton shape : (1,)


  deprecation(
  deprecation(


## Actor-Critic：结合价值学习和策略学习的算法
Actor：策略网络，用于动作选择<br>
Critic：价值网络，给动作打分

$V_\pi(S)=\sum_a \pi(a|s) \cdot Q_\pi(s,a)$

使用神经网络 $\pi(a|s;\theta)$ 近似策略 $\pi(a|s)$ ，$\theta$为神经网络参数<br>
使用神经网络 $q(s,a;w)$ 近似动作价值函数 $Q_\pi(s,a)$，$w$神经网络参数

### 连续动作空间 策略网络
连续动作空间，policy网络预测一组 均值和std标准差来 描述 每个动作的分布<br>
依照此分布采样获得动作和该动作的log_prob

In [2]:
# 搭建策略网络
# 如果是离散动作空间，输出增加一个softmax层，将输出的动作向量变为概率分布；后续可以使用epsilon贪心平衡探索利用
# 如果是连续动作空间，可以增加一个tahn层，将输出变为(-1,1)内的值，再根据动作的范围进行放缩，
#       后续动作选择的时候通过增加噪声来平衡探索利用
policyNet = ContinuousPolicyMlp([env.observation_space.shape[0], 
                128, env.action_space.shape[0]])

# 价值网络
# 输入是 状态和actor选择的动作，输出是一个q值，表对决策的评价
valueNet = generate_mlpnet(mlp_layers=[env.observation_space.shape[0]+env.action_space.shape[0], 128, 1])

print("策略网络")
print(policyNet)
print("价值网络")
show_net_structure(valueNet, (env.observation_space.shape[0]+env.action_space.shape[0],))

策略网络
ContinuousPolicyMlp(
  (mlp): Sequential(
    (0): Flatten(start_dim=1, end_dim=-1)
    (1): Linear(in_features=3, out_features=128, bias=True)
    (2): Tanh()
  )
  (fc_mean): Linear(in_features=128, out_features=1, bias=True)
  (fc_std): Linear(in_features=128, out_features=1, bias=True)
)
价值网络
The structure of the net:
Flatten output shape : torch.Size([5, 4])
Linear output shape : torch.Size([5, 128])
Tanh output shape : torch.Size([5, 128])
Linear output shape : torch.Size([5, 1])


### 训练过程
1. 观测状态获取状态state
2. 根据策略随机采样动作
3. 执行动作，获得下一个状态和奖励
4. 根据奖励时序差分方法更新价值网络
5. 使用策略梯度方法更新策略网络

#### TD更新价值网络
1. 计算$q(s_t,a_t;w_t)$和$q(s_{t+1},a_{t+1};w_t)$
2. TD target: $y_t=r_t + \gamma \cdot q(s_{t+1},a_{t+1};w_t)$  所以AC算法是on policy的方法
3. loss: $L(W)=1/2(q(s_t,a_t;w_t) - y_t)$

#### 策略梯度更新策略网络
策略网络根据价值网络的打分进行梯度上升更新<br>
即  $-log\_prob*q_t$

In [3]:
class ACagent(Agent):
    def __init__(self, policyNet, valueNet, action_scale, exploration_noise=0.1, device=get_device(), gamma=0.9, lr=0.002, optim=torch.optim.Adam, loss=F.mse_loss, datatype=torch.float) -> None:
        self.policy = policyNet.to(device)
        self.value = valueNet.to(device)
        self.device = device
        self.gamma_q = gamma
        self.lr = 0.002
        self.policy_optim = optim(self.policy.parameters(), lr=lr)
        self.value_optim = optim(self.value.parameters(), lr=lr)
        self.type = datatype
        self.action_scale = action_scale
        self.exploration_noise = exploration_noise
        self.qloss = loss
    
    def __call__(self, state:np.ndarray):
        state = torch.tensor([state], dtype=self.type).to(self.device)
        mean, std = self.policy(state)
        dist = torch.distributions.Normal(mean, std)
        normal_sample = dist.rsample()  # rsample()是重参数化采样
        log_prob = dist.log_prob(normal_sample)
        action = torch.tanh(normal_sample)
        # 计算tanh_normal分布的对数概率密度
        log_prob = log_prob - torch.log(1 - torch.tanh(action).pow(2) + 1e-7)
        return action * self.action_scale, log_prob

    def update(self, state, action, log_prob, reward, next_state, done):
        # 单步更新
        state = torch.tensor([state], dtype=self.type).to(self.device)
        action = torch.tensor(action, dtype=self.type).to(self.device)
        next_state_t = torch.tensor([next_state], dtype=self.type).to(self.device)
        reward = torch.tensor(reward, dtype=self.type).to(self.device)

        # print(state.shape)    #torch.Size([1, 3])
        # print(action.shape)   #torch.Size([1, 1])

        Q = self.value(torch.cat((state, action),1))    #价值网络对策略选择的动作进行评价
        
        # 计算价值梯度，更新价值网络
        self.value_optim.zero_grad()

        action_next, _ = self.__call__(next_state)  # action, prob
        action_next = torch.tensor(action_next, dtype=self.type).to(self.device)
        
        # print(next_state_t.shape)    #torch.Size([1, 3])
        # print(action_next.shape)   #torch.Size([1, 1])

        Qnext = self.value(torch.cat((next_state_t, action_next), 1))
        y_i = reward + self.gamma_q * Qnext
        loss_q = self.qloss(Q, y_i)
        loss_q.backward()

        self.value_optim.step()
        

        # 计算策略梯度, 更新策略网络
        self.policy_optim.zero_grad()

        q = Q.item()
        loss_p = -log_prob * q
        loss_p.backward()

        self.policy_optim.step()

        return loss_p, loss_q
        


In [4]:
class ACTrain(Train):
    def __init__(self, env, agent, tblogpath) -> None:
        super().__init__(env, agent, tblogpath)

    def run_episode(self):
        reward_list = []
        loss_list_p = []
        loss_list_q = []

        done = False
        reward_list = []
        prob_list = []
        state = self.env.reset()    # 这个场景有问题，reset获得的是 [ 0.31281108  0.9498154  -0.63081366] (3,)
        while not done:
            action, log_prob = self.agent(state)
            action = action.detach().numpy()        # 这个env需要转成numpy才能跑
            got = self.env.step(action)     #这里的state是 [[ 0.31505278]
                                            #                [ 0.94907415]
                                            #                [-0.04722166]] (3, 1)
            reward = got[1]
            next_state = got[0].transpose()[0]
            done = got[2]
            # print("next",state, next_state)
            loss_policy, loss_value = self.agent.update(state, action, log_prob, reward, next_state, done)
            loss_list_p.append(loss_policy.item())
            loss_list_q.append(loss_value.item())
            reward_list.append(reward)
            state = next_state
        
        self.env.close()
        return {'accumulated reward':sum(reward_list), 'policy_loss':np.mean(loss_list_p), 'qvalue_loss':np.mean(loss_list_q)}

In [5]:
num_episodes = 500

policyNet = policyNet
valueNet = valueNet

agent = ACagent(policyNet=policyNet, valueNet=valueNet, action_scale=2)

train = ACTrain(env=env, agent=agent, tblogpath='./aclog')

train.train(num_episodes)

  state = torch.tensor([state], dtype=self.type).to(self.device)
  logger.warn(f"{pre} is not within the observation space.")
  logger.warn(
  action_next = torch.tensor(action_next, dtype=self.type).to(self.device)
100%|██████████| 500/500 [03:57<00:00,  2.11it/s, episode=499]


In [6]:
agent.save_net(agent.policy ,'./model/ACPolicyPendulum.pt')
# tensorboard --logdir=./tensorlog --port 8123

In [7]:
import gym
import time

# env = gym.make('Pendulum-v1', new_step_api=True)

# agent = ACagent(policyNet=policyNet, valueNet=valueNet, action_scale=2)
net = policyNet = ContinuousPolicyMlp([env.observation_space.shape[0], 
                128, env.action_space.shape[0]])
model = torch.load('./model/ACPolicyPendulum.pt')
net.load_state_dict(model)

done = False
state = env.reset()
reward_list = []
while not done:
    action, log_prob = agent(state)
    action = action.detach().numpy()        # 这个env需要转成numpy才能跑
    got = env.step(action)     #这里的state是 [[ 0.31505278]
                                    #                [ 0.94907415]
                                    #                [-0.04722166]] (3, 1)
    reward = got[1]
    next_state = got[0].transpose()[0]
    done = got[2]
    loss_policy, loss_value = agent.update(state, action, log_prob, reward, next_state)
    state = next_state
    reward_list.append(reward)
env.close()

print(sum(reward_list))

  action_next = torch.tensor(action_next, dtype=self.type).to(self.device)


[-1591.1426]
