# 策略梯度

### 模型设计

网络的最后一层使用 sigmoid，保证输出落在 (0, 1) 区间内，可以直接作为 Bernoulli 分布的概率参数。

In [1]:
import torch
from torch import nn
import numpy as np

def getnet(input_size, output_size):
    """Build a three-layer MLP whose outputs are squashed into (0, 1).

    Both hidden layers are ``input_size * 4`` units wide and use ReLU; the
    final linear layer is followed by a sigmoid.

    Args:
        input_size: Number of input features.
        output_size: Number of output units.

    Returns:
        An ``nn.Sequential`` holding the six modules in order.
    """
    hidden_size = input_size * 4
    layers = [
        nn.Linear(input_size, hidden_size),
        nn.ReLU(),
        nn.Linear(hidden_size, hidden_size),
        nn.ReLU(),
        nn.Linear(hidden_size, output_size),
        nn.Sigmoid(),  # keeps every output strictly between 0 and 1
    ]
    return nn.Sequential(*layers)

In [2]:
class policy_gradient:
    """REINFORCE-style policy-gradient agent with Bernoulli (0/1) actions.

    The policy network's output is used as the ``probs`` parameter of a
    Bernoulli distribution from which actions are sampled.

    Args:
        model: ``torch.nn.Module`` mapping a state to probabilities in [0, 1].
        memory: Buffer exposing ``sample()`` (returning states, actions and
            rewards as three parallel iterables) and ``clear()``.
        config: Object providing ``gamma``, ``device`` and ``lr`` attributes.
    """

    def __init__(self, model, memory, config):
        self.gamma = config.gamma
        self.device = config.device
        self.policy_net = model.to(self.device)
        self.memory = memory
        # RMSprop scales each parameter's step by a running gradient
        # magnitude, which eases unbalanced descent speeds across parameters.
        self.optimizer = torch.optim.RMSprop(self.policy_net.parameters(), lr=config.lr)

    @staticmethod
    def _discount_rewards(rewards, gamma):
        """Return the discounted return of every step as a list of floats.

        A reward of exactly 0 keeps a return of 0 and resets the running sum,
        so returns do not leak across it.
        NOTE(review): this assumes the caller stores 0 as the reward of an
        episode's final step -- confirm against the code filling the memory.
        """
        returns = [0.0] * len(rewards)
        running_add = 0.0
        for i in reversed(range(len(rewards))):
            if rewards[i] == 0:
                running_add = 0.0
            else:
                running_add = running_add * gamma + rewards[i]
                returns[i] = running_add
        return returns

    def sample_action(self, state):
        """Sample an action for ``state`` from the current policy.

        Args:
            state: Array-like state accepted by the policy network.

        Returns:
            The first element of the sampled 0/1 action array, cast to int.
        """
        # torch.Tensor(...) rejects a dtype argument; as_tensor accepts it and
        # also places the data on the same device as the network.
        state = torch.as_tensor(state, dtype=torch.float32, device=self.device)
        with torch.no_grad():  # pure inference, no graph needed
            probs = self.policy_net(state)
        # Draw from Bernoulli(probs): same shape as probs, each entry 0 or 1.
        action = torch.distributions.Bernoulli(probs).sample()
        # Move to host memory first so this also works on non-CPU devices.
        return action.cpu().numpy().astype(int)[0]

    def update(self):
        """Run one policy-gradient step on the stored transitions.

        Rewards are turned into discounted returns, standardised, and used to
        weight the negative log-probability of each stored action. Gradients
        are accumulated over all transitions before a single optimizer step,
        after which the memory is cleared. Does nothing if memory is empty.
        """
        state_pool, action_pool, reward_pool = self.memory.sample()
        state_pool, action_pool, reward_pool = list(state_pool), list(action_pool), list(reward_pool)
        if not reward_pool:
            return

        returns = np.asarray(self._discount_rewards(reward_pool, self.gamma), dtype=np.float64)
        # Standardise; skip the division when all returns are equal, which
        # would otherwise turn every weight (and then every parameter) to NaN.
        spread = returns.std()
        returns = returns - returns.mean()
        if spread > 0:
            returns = returns / spread

        self.optimizer.zero_grad()
        for state, action, weight in zip(state_pool, action_pool, returns):
            state = torch.as_tensor(state, dtype=torch.float32, device=self.device)
            action = torch.as_tensor(action, dtype=torch.float32, device=self.device)
            probs = self.policy_net(state)
            # Log-probability of the stored action under Bernoulli(probs).
            log_prob = torch.distributions.Bernoulli(probs).log_prob(action)
            # Sum so backward() gets a scalar whatever the output width is.
            loss = -(log_prob * float(weight)).sum()
            loss.backward()
        self.optimizer.step()
        self.memory.clear()