# Using CleanRL PPO to try and train Villagers

In [1]:
import numpy as np
import torch
import sys
sys.path.append('../')
from voting_games.werewolf_env_v0 import raw_env
import random
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


Loosely based on the [PettingZoo CleanRL PPO tutorial](https://pettingzoo.farama.org/tutorials/cleanrl/implementing_PPO/) and the blog post [The 37 Implementation Details of PPO](https://iclr-blog-track.github.io/2022/03/25/ppo-implementation-details/). Another PettingZoo implementation used as a reference is CleanRL's [`ppo_pettingzoo_ma_atari.py`](https://github.com/vwxyzjn/cleanrl/blob/master/cleanrl/ppo_pettingzoo_ma_atari.py).

One more article I plan to read on PPO: [ElegantRL: Mastering the PPO Algorithm, Part I](https://towardsdatascience.com/elegantrl-mastering-the-ppo-algorithm-part-i-9f36bc47b791).

In [2]:
# Build the Werewolf environment (keyword arguments: 10 agents, 2 werewolves)
# and reset it so it is ready to be inspected.
env = raw_env(num_agents=10, werewolves=2)
env.reset()

In [3]:
# Number of actions in player_1's action space (its `.n` attribute).
num_actions = env.action_spaces['player_1'].n
# NOTE(review): despite its name, this holds the observation *space* object,
# not a size. `observation_size` is reassigned to the flattened observation
# length in the "Env Setup" cell further down.
observation_size = env.observation_spaces['player_1']['observation']

In [4]:
# Inspect the `shape` attribute of player_1's observation space.
env.observation_spaces['player_1']['observation'].shape

In [5]:
# Draw a random sample from player_1's observation space to see its structure
# (an `action_mask` entry plus a nested `observation` dict).
env.observation_spaces['player_1'].sample()

OrderedDict([('action_mask',
              array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
                      True])),
             ('observation',
              OrderedDict([('day', 5),
                           ('phase', 0),
                           ('player_status',
                            array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
                                    True])),
                           ('roles', array([1, 1, 1, 1, 0, 1, 0, 1, 0, 0])),
                           ('votes',
                            array([ 6.978397 ,  7.0205984,  7.6711574,  2.0655217,  9.694451 ,
                                    2.2381415,  3.993107 , 10.417962 , 10.78285  , 10.3780365],
                                  dtype=float32))]))])

In [6]:
# Values of a sampled observation, in key order: day, phase, player_status, roles, votes.
env.observation_spaces['player_1'].sample()['observation'].values()

odict_values([2, 1, array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True]), array([0, 0, 1, 0, 1, 1, 0, 1, 0, 1]), array([1.1784745, 6.6172085, 4.434359 , 2.475156 , 3.2237024, 3.2628539,
       7.9956994, 6.5257916, 4.5016713, 6.69284  ], dtype=float32)])

In [7]:
def flat_obs(observation):
    """Flatten one observation dict into a single 1-D numpy array.

    The output layout is ``[day, phase, *player_status, *roles, *votes]``,
    with every ``player_status`` entry cast to ``int``.

    Args:
        observation: mapping with the keys ``'day'``, ``'phase'``,
            ``'player_status'``, ``'roles'`` and ``'votes'``; the last three
            must be iterable.

    Returns:
        ``np.ndarray`` containing the concatenated values.
    """
    flat = [observation['day'], observation['phase']]
    flat.extend(int(status) for status in observation['player_status'])
    flat.extend(observation['roles'])
    flat.extend(observation['votes'])
    return np.asarray(flat)

In [8]:
# Length of the flattened observation vector for a sampled observation.
flat_obs(env.observation_spaces['player_1'].sample()['observation']).shape[-1]

32

## PPO Training

In [9]:
class Agent(torch.nn.Module):
    """Actor-critic network with separate policy and value MLPs.

    Both heads are ``obs_size -> 64 -> 64 -> out`` stacks with Tanh
    activations. Every linear layer gets orthogonal weights and a constant
    (zero by default) bias; the orthogonal gain is sqrt(2) for the hidden
    layers, 1.0 for the value output and 0.01 for the policy output.

    Args:
        num_actions: width of the actor's output (one logit per action).
        obs_size: length of the flat observation vector fed to both heads.
    """

    def __init__(self, num_actions, obs_size):
        super().__init__()
        # The critic is built first, then the actor, so parameters are
        # created and initialised in a fixed order.
        self.critic = self._mlp(obs_size, 1, head_std=1.0)
        self.actor = self._mlp(obs_size, num_actions, head_std=0.01)

    def _mlp(self, in_features, out_features, head_std, hidden=64):
        """Build a two-hidden-layer Tanh MLP whose output layer uses gain ``head_std``."""
        modules = []
        width = in_features
        for _ in range(2):
            modules.append(self._layer_init(torch.nn.Linear(width, hidden)))
            modules.append(torch.nn.Tanh())
            width = hidden
        modules.append(
            self._layer_init(torch.nn.Linear(hidden, out_features), std=head_std)
        )
        return torch.nn.Sequential(*modules)

    def get_value(self, x):
        """Return the critic's output for observation(s) ``x``."""
        return self.critic(x)

    def get_action_and_value(self, x, action=None):
        """Evaluate the policy and the critic on ``x``.

        Args:
            x: observation tensor accepted by the actor and critic.
            action: optional action(s) to score; when ``None`` an action is
                sampled from the categorical policy.

        Returns:
            Tuple ``(action, log_prob, entropy, value)``.
        """
        dist = torch.distributions.categorical.Categorical(logits=self.actor(x))
        chosen = dist.sample() if action is None else action
        return chosen, dist.log_prob(chosen), dist.entropy(), self.critic(x)

    def _layer_init(self, layer, std=np.sqrt(2), bias_const=0.0):
        """Orthogonally initialise ``layer.weight`` (gain ``std``), fill the bias with ``bias_const``, return the layer."""
        torch.nn.init.orthogonal_(layer.weight, gain=std)
        torch.nn.init.constant_(layer.bias, bias_const)
        return layer

def batchify_obs(obs, device):
    """Converts PZ style observations to batch of torch arrays.

    Args:
        obs: mapping from agent name to that agent's observation array; all
            arrays must share a shape so they can be stacked.
        device: torch device the resulting tensor is moved to.

    Returns:
        Tensor whose leading axis indexes the agents in ``obs`` iteration order.
    """
    # stack the per-agent observations along a new leading (agent) axis
    stacked = np.stack([obs[a] for a in obs], axis=0)
    # convert to torch and hand the batch back to the caller (the result was
    # previously computed but never returned, so callers received None)
    return torch.tensor(stacked).to(device)

def batchify(x, device):
    """Stack a PZ style per-agent mapping into one torch tensor on ``device``.

    Args:
        x: mapping from agent name to a value or array (rewards, terminations, ...).
        device: torch device the resulting tensor is moved to.

    Returns:
        Tensor whose leading axis follows the iteration order of ``x``.
    """
    per_agent = [x[name] for name in x]
    return torch.tensor(np.stack(per_agent, axis=0)).to(device)

def unbatchify(x, env):
    """Split a batched tensor into a PZ style per-agent dict.

    Args:
        x: torch tensor whose leading axis is indexed by agent position.
        env: object exposing ``possible_agents``, the ordered agent names.

    Returns:
        Dict mapping each name in ``env.possible_agents`` to the matching
        row of ``x`` as a numpy value.
    """
    rows = x.cpu().numpy()
    per_agent = {}
    for index, name in enumerate(env.possible_agents):
        per_agent[name] = rows[index]
    return per_agent


In [10]:
# ALGORITHM PARAMETERS
# TODO: What is really necessary here? Several values below are not referenced
# by any cell visible in this notebook yet.
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
ent_coef = 0.1 # presumably the entropy-bonus coefficient of the PPO loss; not referenced yet
vf_coef = 0.1 # presumably the value-loss coefficient of the PPO loss; not referenced yet
clip_coef = 0.1 # presumably the PPO clipping range; not referenced yet
gamma = 0.99 # discount factor; read by the advantage computation in the training cell
# presumably the GAE lambda (bias/variance trade-off of the advantage estimate)
gae_lambda = 0.95
batch_size = 32 # presumably the minibatch size for the (not yet written) update step
stack_size = 4 # not referenced in any visible cell - TODO confirm it can be removed
frame_size = (64, 64) # not referenced in any visible cell - TODO confirm it can be removed
max_cycles = 125 # not referenced in any visible cell - TODO confirm it can be removed
total_episodes = 2 # number of training episodes; reassigned at the top of the training cell

In [11]:
### Env Setup
# A fresh environment replaces the one created earlier in the notebook.
env = raw_env(num_agents=10, werewolves=2)
# env.reset()
# NOTE(review): keep in sync with the `num_agents=10` passed to raw_env above.
num_agents = 10
num_actions = env.action_spaces['player_1'].n
# Length of the flattened observation vector, measured on a sampled observation.
observation_size = flat_obs(env.observation_spaces['player_1'].sample()['observation']).shape[-1]

# Learner Setup
# One shared network; the training cell queries it for every villager.
# NOTE(review): the network is not moved to `device` here.
agent = Agent(num_actions=num_actions, obs_size=observation_size)
optimizer = torch.optim.Adam(agent.parameters(), lr=0.001, eps=1e-5)

# Unfinished placeholder for episode storage, kept commented out:
# for agents in
# # Algorithm Logic : Episode Storage

# # rb = rollback
# end_step = 0
# total_episodic_return = 0
# rb_obs =
# rb_actions =
# rb_logprobs =
# rb_rewards =
# rb_terms =
# rb_values =


In [12]:
def random_policy(observation, agent):
    """Pick a uniformly random legal vote target.

    A target index is legal when its ``action_mask`` entry is truthy and its
    ``roles`` entry is falsy. Per the notebook's notes, the mask rules out
    dead players and the role flag marks fellow wolves, so wolves never vote
    for each other.

    Args:
        observation: dict with an ``'action_mask'`` entry and a nested
            ``'observation'`` dict holding ``'player_status'`` and ``'roles'``.
        agent: name of the acting agent (currently unused).

    Returns:
        Index of the chosen player.

    Raises:
        IndexError: if no legal target exists (``random.choice`` on an empty list).
    """
    player_count = len(observation['observation']['player_status'])
    mask = observation['action_mask']
    roles = observation['observation']['roles']

    candidates = []
    for target, allowed, is_wolf in zip(range(player_count), mask, roles):
        if allowed and not is_wolf:
            candidates.append(target)

    return random.choice(candidates)

In [52]:
# Training Logic
# NOTE: this reassigns the `total_episodes` set in the parameters cell.
total_episodes = 1
for episode in range(total_episodes):
    # ---- Rollout: play one full game; no gradients are needed while acting ----
    with torch.no_grad():
        env.reset()

        # Per-step records are kept only for agents whose entry in
        # env.agent_roles is falsy (the villagers being trained).
        magent_list = {name: [] for name in env.agents if not env.agent_roles[name]}

        for magent in env.agent_iter():
            observation, reward, termination, truncation, info = env.last()

            # A finished agent must step with None. Evaluated as one flag so
            # that truncation is honoured too: the previous
            # `not termination or truncation` parsed as
            # `(not termination) or truncation` and kept acting when truncated.
            done = termination or truncation

            # Werewolves have full role visibility, so a non-zero role vector
            # identifies a wolf. TODO: add logic for wolves here
            if sum(observation['observation']['roles']):
                # TODO: find a cleaner way to identify a wolf
                action = None if done else random_policy(observation, magent)
            else:
                obs = torch.Tensor(flat_obs(observation['observation']))
                if done:
                    # No decision is taken on the final observation; store
                    # explicit Nones instead of stale values from an earlier step.
                    action, logprobs, critic_value = None, None, None
                else:
                    action, logprobs, _, critic_value = agent.get_action_and_value(obs)

                magent_list[magent].append({
                    "obs": obs,
                    "action": action,
                    "prev_reward": reward,
                    "logprobs": logprobs,
                    "term": termination,
                    "value": critic_value
                    })

            env.step(action)

        # Turn each agent's sequence of records into transitions: the reward
        # and termination flag observed at step k+1 belong to the action
        # taken at step k.
        magent_obs = {
            name: {'obs': [], 'rewards': [], 'actions': [], 'logprobs': [], 'values': [], 'terms': []}
            for name in magent_list
        }
        for name, steps in magent_list.items():
            for s1, s2 in zip(steps, steps[1:]):
                magent_obs[name]['obs'].append(s1['obs'])
                magent_obs[name]['rewards'].append(s2['prev_reward'])
                magent_obs[name]['actions'].append(s1['action'])
                magent_obs[name]['logprobs'].append(s1['logprobs'])
                magent_obs[name]['values'].append(s1['value'])
                magent_obs[name]['terms'].append(s2['term'])

    # ---- Advantages and returns (GAE), computed separately per agent ----
    with torch.no_grad():
        for player, records in magent_obs.items():
            print(f'{player}')
            print(f'{[v.item() for v in records["actions"]]}')
            print(f'{[v for v in records["terms"]]}')
            print(f'{[v for v in records["rewards"]]}')
            print(list(reversed(range(len(records['obs'])))))

            # Use float tensors throughout: the rewards are integers, and an
            # integer `advantages` buffer would silently truncate every value.
            rewards = torch.tensor(records['rewards'], dtype=torch.float32)
            values = torch.tensor([v.item() for v in records['values']], dtype=torch.float32)
            advantages = torch.zeros_like(rewards)

            last_t = len(records['obs']) - 1
            for t in reversed(range(len(records['obs']))):
                # `terms` is not consulted: episodes are played to the end, so
                # the last transition is treated as terminal.
                if t == last_t:
                    # Terminal transition: nothing to bootstrap from.
                    delta = rewards[t] - values[t]
                    advantages[t] = delta
                else:
                    delta = rewards[t] + gamma * values[t + 1] - values[t]
                    # GAE recursion decays by gamma * lambda (was gamma * gamma).
                    advantages[t] = delta + gamma * gae_lambda * advantages[t + 1]

            magent_obs[player]["advantages"] = advantages
            magent_obs[player]["returns"] = advantages + values
    
    
    # optimize the policy and the value network now
            
        

player_1
[5, 4, 8]
[False, False, True]
[0, 0, -33]
[2, 1, 0]
player_2
[2, 1, 6, 2, 3, 1, 4, 3, 7]
[False, False, False, False, False, False, False, False, True]
[0, -4, -1, 0, 0, -1, 0, -6, -31]
[8, 7, 6, 5, 4, 3, 2, 1, 0]
player_3
[5, 4, 9, 5, 4, 9, 6, 5, 5, 1, 7, 5]
[False, False, False, False, False, False, False, False, False, False, False, True]
[0, 0, -1, 0, -6, -1, 0, 0, -1, 0, 0, -25]
[11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]
player_4
[4, 5, 5, 6, 4]
[False, False, False, False, True]
[0, -1, -1, 0, -37]
[4, 3, 2, 1, 0]
player_6
[8, 2, 1, 1, 2, 0, 1, 9]
[False, False, False, False, False, False, False, True]
[0, -1, -1, 0, -1, -1, 0, -31]
[7, 6, 5, 4, 3, 2, 1, 0]
player_7
[7, 6, 8, 8, 4, 1]
[False, False, False, False, False, True]
[0, -4, -1, 0, -6, -32]
[5, 4, 3, 2, 1, 0]
player_8
[7, 3, 1, 5, 9, 4, 5, 5, 2, 8, 4]
[False, False, False, False, False, False, False, False, False, False, True]
[0, -1, -1, 0, -1, -1, 0, 0, -1, 0, -35]
[10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]
player_9
[7, 

In [21]:
# Scratch check: `None == 0` evaluates to False.
None == 0

False

In [28]:
# Inspect player_4's second-to-last recorded action (requires the training cell to have run).
magent_obs['player_4']['actions'][-2]

tensor(9)

In [15]:
# Inspect the raw per-step records collected for player_2 during the rollout.
magent_list['player_2']

[{'obs': tensor([1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
  'action': tensor(4),
  'prev_reward': 0,
  'logprobs': tensor(-2.3014),
  'value': tensor([0.8297])},
 {'obs': tensor([1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 6., 4., 7., 3., 1., 1., 7., 1., 4., 2.]),
  'action': tensor(0),
  'prev_reward': 0,
  'logprobs': tensor(-2.2969),
  'value': tensor([1.4532])},
 {'obs': tensor([1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 3., 0., 3., 1., 6., 5., 7., 2., 7., 2.]),
  'action': tensor(5),
  'prev_reward': -1,
  'logprobs': tensor(-2.3052),
  'value': tensor([0.6734])},
 {'obs': tensor([1., 2., 1., 1., 1., 0., 1., 1., 0., 1., 1., 1., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
  'action': tensor(2),
  'prev_reward': -1,
  'logprobs': t

In [None]:
# Reproducibility: seed every RNG this notebook uses (python, numpy, torch).
# Plain constants are used because the notebook defines no `args` namespace;
# reading `args.seed` here raised NameError on a fresh kernel.
# NOTE(review): the values below are placeholders - adjust as needed, and run
# this cell before the training cell for the seeds to take effect.
SEED = 1
TORCH_DETERMINISTIC = True
USE_CUDA = True

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = TORCH_DETERMINISTIC

# Use the GPU only when it is both available and enabled via USE_CUDA.
device = torch.device("cuda" if torch.cuda.is_available() and USE_CUDA else "cpu")