# Using CleanRL PPO to try and train Villagers

In [1]:
import numpy as np
import torch
import sys
sys.path.append('../')
from voting_games.werewolf_env_v0 import raw_env
import random
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


Loosely based on the [PettingZoo CleanRL PPO tutorial](https://pettingzoo.farama.org/tutorials/cleanrl/implementing_PPO/) and the [ICLR blog post on PPO implementation details](https://iclr-blog-track.github.io/2022/03/25/ppo-implementation-details/). Another PettingZoo implementation referenced is [CleanRL's multi-agent Atari PPO script](https://github.com/vwxyzjn/cleanrl/blob/master/cleanrl/ppo_pettingzoo_ma_atari.py).

One more resource I plan to read, at least for PPO, is [this ElegantRL article on mastering the PPO algorithm](https://towardsdatascience.com/elegantrl-mastering-the-ppo-algorithm-part-i-9f36bc47b791).

In [2]:
# Build a 10-player werewolf environment with 2 werewolves and reset it so the
# cells below can inspect its action/observation spaces.
env = raw_env(num_agents=10, werewolves=2)
env.reset()

In [3]:
# Number of discrete actions available to 'player_1'.
num_actions = env.action_spaces['player_1'].n
# NOTE(review): despite its name, this holds the 'observation' sub-space object
# itself, not a size. The "Env Setup" cell further down reassigns
# `observation_size` to the length of a flattened observation.
observation_size = env.observation_spaces['player_1']['observation']

In [4]:
env.observation_spaces['player_1']['observation'].shape

In [4]:
env.observation_spaces['player_1'].sample()

OrderedDict([('action_mask',
              array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
                      True])),
             ('observation',
              OrderedDict([('day', 1),
                           ('phase', 2),
                           ('player_status',
                            array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
                                    True])),
                           ('roles', array([1, 1, 0, 1, 0, 0, 1, 1, 1, 1])),
                           ('votes',
                            array([10.892954 ,  0.8731968,  8.033873 ,  2.277371 ,  1.8526876,
                                   10.99731  ,  7.982339 ,  5.003656 ,  2.5943236,  2.754004 ],
                                  dtype=float32))]))])

In [27]:
env.observation_spaces['player_1'].sample()['observation'].values()

odict_values([1, 1, array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True]), array([0, 0, 1, 1, 1, 1, 1, 1, 0, 0]), array([10.48173  ,  2.2281246,  2.331817 ,  4.617806 ,  5.0304337,
        1.4088393,  2.493021 , 10.6729   ,  5.5346136,  6.636181 ],
      dtype=float32)])

In [28]:
def flat_obs(observation):
    """Flatten one observation dict into a single 1-D numpy array.

    The output is laid out as: 'day', 'phase', one int-cast entry per element
    of 'player_status', then every element of 'roles', then every element of
    'votes'.

    Args:
        observation: mapping with the keys 'day', 'phase', 'player_status',
            'roles' and 'votes'.

    Returns:
        np.ndarray built from the concatenated values; numpy infers the dtype.
    """
    flattened = [observation['day'], observation['phase']]
    # Status flags are cast to plain ints; roles and votes are copied as-is.
    flattened.extend(int(flag) for flag in observation['player_status'])
    flattened.extend(observation['roles'])
    flattened.extend(observation['votes'])
    return np.asarray(flattened)

In [29]:
flat_obs(env.observation_spaces['player_1'].sample()['observation']).shape[-1]

32

## PPO Training

In [18]:
class Agent(torch.nn.Module):
    """Actor-critic model built from two independent tanh MLPs.

    `critic` maps an observation vector to one scalar value; `actor` maps the
    same vector to `num_actions` logits. Both use two hidden layers of 64 units
    with orthogonal weight initialisation and zero biases.
    """

    def __init__(self, num_actions, obs_size):
        """Create the critic and actor networks.

        Args:
            num_actions: number of discrete actions (width of the actor output).
            obs_size: length of the flattened observation vector.
        """
        super().__init__()
        hidden_units = 64

        # Layers are created critic-first and in order, so a fixed torch seed
        # produces the same initial weights as before.
        critic_layers = [
            self._layer_init(torch.nn.Linear(obs_size, hidden_units)),
            torch.nn.Tanh(),
            self._layer_init(torch.nn.Linear(hidden_units, hidden_units)),
            torch.nn.Tanh(),
            self._layer_init(torch.nn.Linear(hidden_units, 1), std=1.0),
        ]
        self.critic = torch.nn.Sequential(*critic_layers)

        actor_layers = [
            self._layer_init(torch.nn.Linear(obs_size, hidden_units)),
            torch.nn.Tanh(),
            self._layer_init(torch.nn.Linear(hidden_units, hidden_units)),
            torch.nn.Tanh(),
            # Small gain on the last layer keeps the initial logits close to zero.
            self._layer_init(torch.nn.Linear(hidden_units, num_actions), std=0.01),
        ]
        self.actor = torch.nn.Sequential(*actor_layers)

    def get_value(self, x):
        """Return the critic output for observation tensor `x`."""
        return self.critic(x)

    def get_action_and_value(self, x, action=None):
        """Evaluate the policy and the critic on `x`.

        Args:
            x: observation tensor fed to both networks.
            action: optional action tensor to score; when None an action is
                sampled from the categorical policy.

        Returns:
            Tuple of (action, log-probability of that action, policy entropy,
            critic output).
        """
        policy = torch.distributions.categorical.Categorical(logits=self.actor(x))
        chosen = policy.sample() if action is None else action
        return chosen, policy.log_prob(chosen), policy.entropy(), self.critic(x)

    def _layer_init(self, layer, std=np.sqrt(2), bias_const=0.0):
        """Orthogonally initialise `layer.weight` with gain `std`, fill the bias
        with `bias_const`, and return the same layer."""
        torch.nn.init.orthogonal_(layer.weight, gain=std)
        torch.nn.init.constant_(layer.bias, val=bias_const)
        return layer

def batchify_obs(obs, device):
    """Converts PZ style observations to batch of torch arrays.

    Args:
        obs: dict keyed by agent name. Each value must be stackable by
            `np.stack` (assumes equal-shaped numeric arrays - TODO confirm
            against the env's nested dict observations).
        device: torch device (or device string) the batch is moved to.

    Returns:
        torch.Tensor whose first axis follows the dict's iteration order.
    """
    # convert to list of np arrays
    stacked = np.stack([obs[a] for a in obs], axis=0)
    # convert to torch; the original built this tensor but never returned it,
    # so callers always received None
    return torch.tensor(stacked).to(device)

def batchify(x, device):
    """Stack the values of a per-agent dict into one torch tensor.

    Args:
        x: dict keyed by agent name whose values can be stacked by `np.stack`.
        device: torch device (or device string) the tensor is moved to.

    Returns:
        torch.Tensor with one leading entry per dict key, in iteration order.
    """
    per_agent = [x[name] for name in x]
    return torch.tensor(np.stack(per_agent, axis=0)).to(device)

def unbatchify(x, env):
    """Split a batched tensor back into a dict keyed by agent name.

    Args:
        x: torch tensor whose first axis lines up with `env.possible_agents`.
        env: object exposing a `possible_agents` sequence.

    Returns:
        dict mapping each name in `env.possible_agents` to the numpy entry at
        the matching index of `x`.
    """
    values = x.cpu().numpy()
    per_agent = {}
    for index, name in enumerate(env.possible_agents):
        per_agent[name] = values[index]
    return per_agent


In [None]:
# ALGORITHM PARAMETERS
# TODO: What is really necessary here?
# NOTE(review): apart from `total_episodes` (which the training cell reassigns),
# none of these names are read by the cells visible in this notebook; the
# per-line descriptions below are assumptions based on the names - confirm
# before relying on them.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
ent_coef = 0.1 # presumably the entropy-bonus weight in the PPO loss
vf_coef = 0.1 # presumably the value-function loss weight
clip_coef = 0.1 # presumably the PPO clipping range
gamma = 0.99 # presumably the reward discount factor
batch_size = 32 # presumably the minibatch size for updates
stack_size = 4 # looks like an image frame-stacking setting - likely unused for this env
frame_size = (64, 64) # looks like an image-resize setting - likely unused for this env
max_cycles = 125 # presumably a cap on steps per episode
total_episodes = 2 # number of episodes; overwritten in the training cell

In [22]:
### Env Setup
# 10 players, 2 of them werewolves.
num_agents = 10
env = raw_env(num_agents=num_agents, werewolves=2)
# env.reset()

# 'player_1' is used as the reference for the action count and for the length
# of a flattened observation.
num_actions = env.action_spaces['player_1'].n
observation_size = flat_obs(
    env.observation_spaces['player_1'].sample()['observation']
).shape[-1]

# Learner Setup
agent = Agent(obs_size=observation_size, num_actions=num_actions)
optimizer = torch.optim.Adam(agent.parameters(), eps=1e-5, lr=0.001)
# for agents in 
# # Algorithm Logic : Episode Storage

# # rb = rollback
# end_step = 0
# total_episodic_return = 0
# rb_obs = 
# rb_actions =
# rb_logprobs = 
# rb_rewards = 
# rb_terms = 
# rb_values =


In [24]:
def random_policy(observation, agent):
    """Pick a uniformly random legal vote target.

    A player index is a candidate when its 'action_mask' entry is truthy and
    its entry in the observed 'roles' array is falsy (so an agent that can see
    wolf roles never targets a wolf).

    Args:
        observation: dict with an 'action_mask' entry and an 'observation'
            dict holding 'player_status' and 'roles'.
        agent: name of the acting agent; accepted but not used.

    Returns:
        int index of the chosen player.

    Raises:
        IndexError: from `random.choice` when no candidate remains.
    """
    inner = observation['observation']
    player_ids = range(len(inner['player_status']))

    candidates = []
    for player_id, allowed, is_wolf in zip(player_ids, observation['action_mask'], inner['roles']):
        if allowed and not is_wolf:
            candidates.append(player_id)

    return random.choice(candidates)

In [48]:
# Training Logic
# Roll out episodes without gradients: agents that observe at least one wolf
# role act with `random_policy`; all other agents act with the learned `agent`
# network and have their transitions recorded in `magent_list`.
total_episodes = 1
for episode in range(total_episodes):
    with torch.no_grad():
        env.reset()

        # magent_list = {agent: [] for agent in env.agents}
        # One transition list per agent whose `env.agent_roles` entry is falsy.
        magent_list = {agent : [] for agent in env.agents if not env.agent_roles[agent]}
        print(magent_list.keys())
        for magent in env.agent_iter():
            observation, reward, termination, truncation, info = env.last()

            # An agent that is terminated OR truncated must be stepped with None.
            # The previous `not termination or truncation` parsed as
            # `(not termination) or truncation`, so truncated agents still acted.
            done = termination or truncation

            # werewolves have full role visibility. TODO: add logic for wolves here
            if sum(observation['observation']['roles']):
                # TODO: find a cleaner way to identify a wolf
                action = None if done else random_policy(observation, magent)
            else:
                obs = torch.Tensor(flat_obs(observation['observation']))
                if done:
                    # Nothing was sampled on this step, so record explicit Nones
                    # rather than undefined or stale values from an earlier agent.
                    action, logprobs, value = None, None, None
                else:
                    action, logprobs, _, value = agent.get_action_and_value(obs)

                magent_list[magent].append({
                    "obs": obs, 
                    "action": action,
                    "prev_reward": reward,
                    "logprobs": logprobs,
                    "value": value
                    })
                
            env.step(action)

            


        

dict_keys(['player_1', 'player_2', 'player_3', 'player_4', 'player_6', 'player_7', 'player_8', 'player_9'])


ValueError: list.remove(x): x not in list

In [39]:
magent_list.keys()

dict_keys(['player_1', 'player_2', 'player_3', 'player_4', 'player_5', 'player_6', 'player_7', 'player_8', 'player_9', 'player_10'])

In [None]:
# Reproducibility settings.
# This cell originally read `args.seed`, `args.torch_deterministic` and
# `args.cuda`, but no visible cell defines an `args` object, so running it
# raised NameError. The values are now explicit constants.
SEED = 1
TORCH_DETERMINISTIC = True
USE_CUDA = True

# Seed every RNG the notebook uses: Python's `random`, numpy, and torch.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = TORCH_DETERMINISTIC

# Use the GPU only when one is available and USE_CUDA allows it.
device = torch.device("cuda" if torch.cuda.is_available() and USE_CUDA else "cpu")