# Using CleanRL PPO to try to train Villagers

In [1]:
import numpy as np
import torch
import sys
sys.path.append('../')
from voting_games.werewolf_env_v0 import raw_env
import random
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Build the werewolf environment with 10 agents, 2 of which are werewolves,
# then reset it so its spaces can be inspected in the cells below.
env = raw_env(num_agents=10, werewolves=2)
env.reset()

In [3]:
# `.n` of player_1's action space (presumably a Discrete space — confirm).
num_actions = env.action_spaces['player_1'].n
# NOTE(review): despite the name, this stores the 'observation' sub-space
# object itself, not an integer length — confirm before using it as a layer width.
observation_size = env.observation_spaces['player_1']['observation']

In [4]:
# Look up the `shape` attribute of player_1's 'observation' sub-space
# (no output was recorded for this cell).
env.observation_spaces['player_1']['observation'].shape

In [6]:
# Draw a random sample from player_1's full observation space; the result
# holds an 'action_mask' entry and a nested 'observation' dict.
env.observation_spaces['player_1'].sample()

OrderedDict([('action_mask',
              array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
                      True])),
             ('observation',
              OrderedDict([('day', 2),
                           ('phase', 2),
                           ('player_status',
                            array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
                                    True])),
                           ('roles', array([1, 1, 0, 1, 1, 1, 0, 0, 1, 0])),
                           ('votes',
                            array([4.7560782, 4.1142387, 3.356251 , 6.512478 , 2.2719352, 4.3874006,
                                   6.4364295, 7.793982 , 7.5660954, 4.355758 ], dtype=float32))]))])

In [7]:
# Show only the values of the sampled 'observation' dict
# (day, phase, player_status, roles, votes).
env.observation_spaces['player_1'].sample()['observation'].values()

odict_values([3, 2, array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True]), array([0, 1, 0, 0, 0, 1, 1, 1, 0, 0]), array([3.8632665, 1.7954935, 6.4708276, 9.651167 , 7.759436 , 2.4337952,
       3.148949 , 1.946332 , 4.701349 , 4.30324  ], dtype=float32)])

In [33]:
def flat_obs(observation):
    """Flatten an observation mapping into a single flat list.

    The output order is: ``day``, ``phase``, every ``player_status`` entry
    cast to ``int``, every ``roles`` entry, then every ``votes`` entry.

    Args:
        observation: Mapping with the keys ``day``, ``phase``,
            ``player_status``, ``roles`` and ``votes``; the last three must
            be iterable.

    Returns:
        A new list holding the values in the order described above.
    """
    flattened = [observation['day'], observation['phase']]
    # Status flags are stored as booleans; the flat vector uses 0/1 integers.
    flattened.extend(int(alive) for alive in observation['player_status'])
    flattened.extend(observation['roles'])
    flattened.extend(observation['votes'])
    return flattened

In [34]:
# Flatten a randomly sampled player_1 observation to check the resulting layout.
flat_obs(env.observation_spaces['player_1'].sample()['observation'])

[2,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 6.659124,
 3.6401567,
 7.3678255,
 9.407521,
 1.4717635,
 0.13030477,
 3.8706512,
 5.379664,
 9.285832,
 0.37486723]

In [None]:
class Agent(torch.nn.Module):
    """Actor-critic model: a shared two-layer MLP trunk with linear heads.

    ``network`` maps a flat observation vector to a hidden feature vector;
    ``actor`` maps those features to one logit per action and ``critic``
    maps them to a single scalar.

    Args:
        num_actions: Number of outputs of the actor head.
        obs_size: Length of the flat observation vector fed to the trunk.
            When omitted, a module-level ``obs_size`` variable is used, which
            is what the constructor originally relied on.
        hidden_size: Width of the trunk's hidden layers and of the features
            consumed by both heads.

    Raises:
        ValueError: If ``obs_size`` is not given and no module-level
            ``obs_size`` exists.
    """

    def __init__(self, num_actions, obs_size=None, hidden_size=64):
        super().__init__()

        if obs_size is None:
            # Backward-compatible fallback to the global the original code read.
            try:
                obs_size = globals()["obs_size"]
            except KeyError:
                raise ValueError(
                    "obs_size must be passed to Agent (or defined at module level)"
                ) from None

        self.network = torch.nn.Sequential(
            self._layer_init(torch.nn.Linear(obs_size, hidden_size)),
            torch.nn.ReLU(),
            self._layer_init(torch.nn.Linear(hidden_size, hidden_size)),
            torch.nn.ReLU(),
        )

        # The heads must accept exactly what the trunk emits; they previously
        # expected 512 features while the trunk produced 64.
        self.actor = self._layer_init(torch.nn.Linear(hidden_size, num_actions))
        self.critic = self._layer_init(torch.nn.Linear(hidden_size, 1))

    def _layer_init(self, layer, std=np.sqrt(2), bias_const=0.0):
        """Orthogonally initialise ``layer.weight`` and fill ``layer.bias``.

        Args:
            layer: Module exposing ``weight`` and ``bias`` tensors.
            std: Gain passed to the orthogonal initialiser.
            bias_const: Constant written into every bias element.

        Returns:
            The same ``layer`` object, initialised in place.
        """
        # The initialisers live in torch.nn.init, not directly in torch.nn.
        torch.nn.init.orthogonal_(layer.weight, std)
        torch.nn.init.constant_(layer.bias, bias_const)
        return layer

## Self-rolled PPO

In [None]:
class ActorCriticNetworkt(torch.nn.Module):
    """Actor-critic MLP with a shared trunk and separate policy/value heads.

    Args:
        obs_space_size: Number of input features per observation.
        action_space_size: Number of logits produced by the policy head.
    """

    def __init__(self, obs_space_size, action_space_size):
        super().__init__()
        nn = torch.nn
        width = 64

        # Trunk shared by both heads.
        self.shared_layers = nn.Sequential(
            nn.Linear(obs_space_size, width),
            nn.ReLU(),
            nn.Linear(width, width),
            nn.ReLU(),
        )

        # Policy head: one logit per action.
        self.policy_layers = nn.Sequential(
            nn.Linear(width, width),
            nn.ReLU(),
            nn.Linear(width, action_space_size),
        )

        # Value head: a single scalar per observation.
        self.value_layers = nn.Sequential(
            nn.Linear(width, width),
            nn.ReLU(),
            nn.Linear(width, 1),
        )

    def value(self, obs):
        """Return the value head's output for ``obs``."""
        return self.value_layers(self.shared_layers(obs))

    def policy(self, obs):
        """Return the policy head's logits for ``obs``."""
        return self.policy_layers(self.shared_layers(obs))

    def forward(self, obs):
        """Return ``(policy_logits, value)`` from a single pass through the trunk."""
        features = self.shared_layers(obs)
        logits = self.policy_layers(features)
        return logits, self.value_layers(features)

In [None]:
def rollout(model, env, max_steps=100, device=None):
    """Play one episode with ``model`` and collect per-step training data.

    Args:
        model: Callable that takes a batched float32 observation tensor and
            returns ``(logits, value)``.
        env: Environment whose ``reset()`` returns the first observation and
            whose ``step(action)`` returns ``(next_obs, reward, done, info)``.
        max_steps: Upper bound on the number of environment steps taken.
        device: Device the observation tensor is created on. Defaults to the
            device of the model's first parameter, or CPU if the model
            exposes no parameters.

    Returns:
        A pair ``(train_data, ep_reward)``. ``train_data`` is a list of five
        parallel lists: observations, actions, rewards, values and action
        log-probabilities. Actions and values are stored as the tensors the
        model/distribution produced (without autograd history); log-probs
        are Python floats. ``ep_reward`` is the sum of the rewards.
    """
    # Local import so the function does not rely on a name imported elsewhere.
    from torch.distributions import Categorical

    if device is None:
        # Use the model's own device instead of depending on a global constant.
        try:
            device = next(model.parameters()).device
        except (AttributeError, StopIteration):
            device = torch.device("cpu")

    train_data = [[], [], [], [], []]  # obs, action, reward, values, act_log_probs
    obs = env.reset()

    ep_reward = 0
    for _ in range(max_steps):
        obs_batch = torch.as_tensor(
            obs, dtype=torch.float32, device=device
        ).unsqueeze(0)

        # Rollouts only collect data; skipping autograd keeps the stored
        # values from holding a computation graph alive for every step.
        with torch.no_grad():
            logits, val = model(obs_batch)

            # Build the action distribution and sample one action from it.
            act_distribution = Categorical(logits=logits)
            act = act_distribution.sample()
            act_log_prob = act_distribution.log_prob(act).item()

        next_obs, reward, done, _ = env.step(act.item())

        # Record data for training.
        for column, item in zip(train_data, (obs, act, reward, val, act_log_prob)):
            column.append(item)

        obs = next_obs
        ep_reward += reward
        if done:
            break

    # TODO: train-data filtering is not implemented yet.
    return train_data, ep_reward

In [None]:
class PPOTrainer():
    def __init__(self, actor_critic, ppo_clip_val=0.2, target_kl_div=0.01, max_policy_train_iters=80, value_train_iters=80, policy_lr=3e-4, value_lr=1e-2):
        self.ac = actor_critic
        self.ppo_clip_val = ppo_clip_val
        self.target_kl_div = target_kl_div
        self.max_policy_train_iters = max_policy_train_iters
        self.value_train_iters = value_train_iters

        # different optimizers for policy and value fucntions
        policy_params = list(self.ac.shared_layers.parameters()) + list(self.ac.policy_layers.parameters())
        self.policy_optim = torch.optim.Adam(policy_params, lr=policy_lr)

        value_params = list(self.ac.shared_layers.parameters()) + list(self.ac.value_layers.parameters())
        self.value_optim = torch.optim.Adam(value_params, lr=value_lr)

    def train_policy(self, obs, act, old_log_probs, gaes):
        self.policy_optim.zero_grad()

        new_logits = self.ac.policy(obs)

