- references
    - https://github.com/DeepRLChinese/DeepRL-Chinese/blob/master/09_trpo.py
- 对于 DRL 而言
    - 神经网络反而是简单的，就是一个超强的 function approximator；训练一个 deep neural network，就是学习一个函数近似器
        - $\pi_\theta(\cdot|s)$：策略，即状态 $s$ 下的动作概率分布；$\pi_\theta(a|s)$ 是其中动作 $a$ 的概率
        - $V(s)$
    - 且在 DRL 的问题及应用里，我们需要更灵活多样地组织 learning/training 的 pipeline；

In [1]:
import torch
from torch import nn
from collections import Counter
from torch.distributions.categorical import Categorical

In [2]:
# Sampling from a categorical distribution: draws follow the class probabilities.
c = Counter()
m = Categorical(probs=torch.full((4,), 0.25))
# Tally 500 draws taken one at a time.
for _ in range(500):
    c[m.sample().item()] += 1
print(c)
# Tally another 500 draws taken in a single batched call.
print(Counter(m.sample((500,)).tolist()))

Counter({3: 147, 0: 121, 1: 119, 2: 113})
Counter({0: 137, 2: 130, 3: 117, 1: 116})


In [4]:
class CategoricalLayer(nn.Module):
    """
    Implements a parameter-free layer that outputs a categorical distribution

    Methods
    ------
    forward(log_action_probs)
        Takes as input log probabilities and outputs a pytorch
        ``Categorical`` distribution built from them as logits
    """

    def forward(self, log_action_probs):
        """Return ``Categorical(logits=log_action_probs)``.

        This is defined as ``forward`` instead of overriding ``__call__`` so
        that calling the layer goes through ``nn.Module.__call__`` (which
        runs any registered hooks) and ``layer.forward(...)`` also works.
        """
        return Categorical(logits=log_action_probs)

In [3]:
class PolicyNet(nn.Module):
    """Three-layer MLP that maps an observation to an action distribution.

    Parameters
    ----------
    dim_obs : size of the last dimension of the observation tensor
    num_act : number of discrete actions (size of the output logits)
    """

    def __init__(self, dim_obs, num_act):
        super().__init__()
        self.fc1 = nn.Linear(dim_obs, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, num_act)
        self.log_softmax = nn.LogSoftmax(dim=-1)
        self.categorical = CategoricalLayer()

    def forward(self, obs):
        """Return the distribution object produced by ``self.categorical``."""
        # `nn.functional` is reachable through the already-imported `nn`;
        # the bare alias `F` is not imported in this file.
        x = nn.functional.relu(self.fc1(obs))
        x = nn.functional.relu(self.fc2(x))
        x = self.fc3(x)  # logits
        x = self.log_softmax(x)  # normalized log-probabilities
        x = self.categorical(x)
        return x

In [5]:
class ValueNet(nn.Module):
    """Three-layer MLP that maps an observation to a single scalar output.

    Parameters
    ----------
    dim_obs : size of the last dimension of the observation tensor
    """

    def __init__(self, dim_obs):
        super().__init__()
        self.fc1 = nn.Linear(dim_obs, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 1)

    def forward(self, obs):
        """Return a tensor whose last dimension has size 1."""
        # `nn.functional` is reachable through the already-imported `nn`;
        # the bare alias `F` is not imported in this file.
        x = nn.functional.relu(self.fc1(obs))
        x = nn.functional.relu(self.fc2(x))
        x = self.fc3(x)
        return x

In [9]:
# TRPO 实例，不再是一个纯 nn 的 model
# 而是一个 agent
# pi: s -> a
class TRPO:
    def __init__(self, args):
        """Create the policy/value networks and set the fixed hyperparameters.

        Reads ``discount``, ``dim_obs``, ``num_act`` and ``lr_value_net``
        from ``args``.
        """
        self.discount = args.discount
        obs_dim = args.dim_obs

        # Hyperparameters that the maximization procedure relies on.
        self.max_kl_div, self.cg_max_iters = 0.01, 10
        self.line_search_accept_ratio = 0.1

        # The policy net is built before the value net.
        self.policy_net = PolicyNet(obs_dim, args.num_act)
        self.value_net = ValueNet(obs_dim)

        # One visible difference between training the two networks: only the
        # value net is given a gradient optimizer here.
        self.value_optimizer = torch.optim.AdamW(
            self.value_net.parameters(),
            lr=args.lr_value_net,
        )

    def get_action(self, obs):
        # pi(a|s)：这是一个概率模型，
        action_dist = self.policy_net(obs)
        # 依概率采样
        act = action_dist.sample()
        return act