In [1]:
import numpy as np
import torch
import sys
sys.path.append('../')
from voting_games.werewolf_env_v0 import raw_env, Roles
import random
from tqdm import tqdm
import enum

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# class Roles(enum.IntEnum):
#     VILLAGER = 0
#     WEREWOLF = 1

In [3]:
class Agent(torch.nn.Module):
    def __init__(self, num_actions, obs_size):
        super().__init__()

        self.critic = torch.nn.Sequential(
            self._layer_init(torch.nn.Linear(obs_size, 64)),
            torch.nn.Tanh(),
            self._layer_init(torch.nn.Linear(64,64)),
            torch.nn.Tanh(),
            self._layer_init(torch.nn.Linear(64,1), std=1.0),
        )

        self.actor = torch.nn.Sequential(
            self._layer_init(torch.nn.Linear(obs_size, 64)),
            torch.nn.Tanh(),
            self._layer_init(torch.nn.Linear(64,64)),
            torch.nn.Tanh(),
            self._layer_init(torch.nn.Linear(64, num_actions), std=0.01),
        )
    
    def get_value(self, x):
        return self.critic(x)
    
    def get_action_and_value(self, x, action=None):
        logits = self.actor(x)

        probs = torch.distributions.categorical.Categorical(logits=logits)
        if action is None:
            action = probs.sample()
        return action, probs.log_prob(action), probs.entropy(), self.critic(x)

    def _layer_init(self, layer, std=np.sqrt(2), bias_const=0.0):
        torch.nn.init.orthogonal_(layer.weight, std)
        torch.nn.init.constant_(layer.bias, bias_const)
        return layer

def batchify_obs(obs, device):
    """Converts PZ style observations to batch of torch arrays."""
    # convert to list of np arrays
    obs = np.stack([obs[a] for a in obs], axis=0)
    obs = torch.tensor(obs).to(device)

def batchify(x, device):
    """Converts PZ style returns to batch of torch arrays."""
    # convert to list of np arrays
    x = np.stack([x[a] for a in x], axis=0)
    # convert to torch
    x = torch.tensor(x).to(device)

    return x

def unbatchify(x, env):
    """Converts np array to PZ style arguments."""
    x = x.cpu().numpy()
    x = {a: x[i] for i, a in enumerate(env.possible_agents)}

    return x

In [4]:
def random_policy(observation, agent):
    # these are the other wolves. we cannot vote for them either
    available_actions = list(range(len(observation['observation']['player_status'])))
    # dead players
    action_mask = observation['action_mask']

    legal_actions = [action for action,is_alive,is_wolf in zip(available_actions, action_mask, observation['observation']['roles']) if is_alive and not is_wolf]
    # wolves don't vote for other wolves. will select another villager at random
    action = random.choice(legal_actions)
    return action

def true_random_policy(observation, agent):
    return random.choice(list(range(len(observation['observation']['player_status']))))


def revenge_wolf_policy(observation, agent, action=None):
    # we already know the agent is a werewolf
    me = observation['observation']['self_id']

    # who voted for me 
    votes_against_me = [i for i, x in enumerate(observation['observation']['votes']) if x == me and i != me]

    # remove any wolves who voted for me (they should not have)
    wolf_ids = [i for i, x in enumerate(observation['observation']['roles']) if x == 1 and i != me]
    votes_against_me = list(set(votes_against_me)^set(wolf_ids))

    # remove any players who voted for me but are dead now
    votes_against_me = [i for i in votes_against_me if observation['observation']['player_status'][i] == True]

    villagers_alive = [i for i, x in enumerate(observation['observation']['roles']) \
        if observation['observation']['player_status'][i] == True and x == 0]

    # if there are no votes against me, pick a random villager that is alive
    choice = random.choice(votes_against_me) if len(votes_against_me) > 0 else random.choice(villagers_alive)

    return action if action != None else choice

def random_wolf_policy(observation, agent, action=None):
    # pick a villager to vote for that is alive
    villagers_alive = [i for i, x in enumerate(observation['observation']['roles']) \
        if observation['observation']['player_status'][i] == True and x == 0]
    return action if action != None else random.choice(villagers_alive)



## Setting up our mlflow tracking

In [8]:
import mlflow

#mlflow.set_tracking_uri("http://mlflow:5000")

# with mlflow.start_run(run_name='Wolf Experiment'):

### Coordinate wolf revenge

In [7]:
env = raw_env(num_agents=10, werewolves=2)
env.reset()

num_games = 1000
avg_game_length = 0
wolf_wins = 0
villager_wins = 0

def revenge_wolf_policy(observation, agent, action=None):
    # we already know the agent is a werewolf
    me = observation['observation']['self_id']

    # who voted for me 
    votes_against_me = [i for i, x in enumerate(observation['observation']['votes']) if x == me and i != me]

    # remove any wolves who voted for me (they should not have)
    wolf_ids = [i for i, x in enumerate(observation['observation']['roles']) if x == 1 and i != me]
    votes_against_me = list(set(votes_against_me)^set(wolf_ids))

    # remove any players who voted for me but are dead now
    votes_against_me = [i for i in votes_against_me if observation['observation']['player_status'][i] == True]

    villagers_alive = [i for i, x in enumerate(observation['observation']['roles']) \
        if observation['observation']['player_status'][i] == True and x == 0]

    # if there are no votes against me, pick a random villager that is alive
    choice = random.choice(votes_against_me) if len(votes_against_me) > 0 else random.choice(villagers_alive)

    return action if action != None else choice


self_voting = []
dead_voting = []

with mlflow.start_run(run_name='Coordinated Wolf Revenge'):
    for _ in tqdm(range(num_games)):
        env.reset()
        wolf_brain = {'day': 1, 'phase': 0, 'action': None}
        
        self_votes = 0
        dead_votes = 0

        for agent in env.agent_iter():
            observation, reward, termination, truncation, info = env.last()
            
            day = observation['observation']['day']
            phase = observation['observation']['phase']

            if wolf_brain['day'] != day or wolf_brain['phase'] != phase:
                wolf_brain = {'day': day, 'phase': phase, 'action': None}

            role = observation['observation']['roles'][observation['observation']['self_id']]

            if role == Roles.WEREWOLF:
                action = revenge_wolf_policy(observation, agent, action=wolf_brain['action']) if not termination or truncation else None
                wolf_brain['action'] = action
            else:
                action = true_random_policy(observation, agent) if not termination or truncation else None

                # check how many times the action lines up with the agent
                if action == observation['observation']['self_id']:
                    self_votes += 1
                
                if action in [i for i, status in enumerate(observation['observation']['player_status']) if status == False]:
                    dead_votes += 1

            env.step(action)

        # get some stats
        winner = env.world_state['winners']
        day = env.world_state['day']

        self_voting.append(self_votes)
        dead_voting.append(dead_votes)

        if winner:
            wolf_wins += 1
        else:
            villager_wins += 1
        
        avg_game_length += (day * 1.0)/num_games

        if num_games % 20 == 0:
            mlflow.log_metric("avg_game_len", f'{avg_game_length:.2f}')
            mlflow.log_metric("avg_self_votes", f'{sum(self_voting)/len(self_voting)}')
            mlflow.log_metric("avg_dead_votes", f'{sum(dead_voting)/len(dead_voting)}')
            # mlflow.log_param("avg_game_len", f'{avg_game_length:.2f}')
            # mlflow.log_param("avg_self_votes", f'{sum(self_voting)/len(self_voting)}')
            # mlflow.log_param("avg_dead_votes", f'{sum(dead_voting)/len(dead_voting)}')


print(f'Average game length = {avg_game_length:.2f}')
print(f'Wolf wins : {wolf_wins}')
print(f'Villager wins: {villager_wins}')
print(f'Avg amount of self votes a game across villagers: {sum(self_voting)/len(self_voting)}')
print(f'Avg amount of dead votes a game across villagers: {sum(dead_voting)/len(dead_voting)}')

MlflowException: API request to http://mlflow:5000/api/2.0/mlflow/runs/create failed with exception HTTPConnectionPool(host='mlflow', port=5000): Max retries exceeded with url: /api/2.0/mlflow/runs/create (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7fbb2a63f820>: Failed to establish a new connection: [Errno -2] Name or service not known'))

Lets train a policy on the given reward structure we currently have, and see what happens

In [5]:
def flat_obs(observation):
    return  np.asarray([observation['day']] + \
            [observation['phase']] + \
            [int(status) for status in observation['player_status']] + \
            [role for role in observation['roles']] + \
            [vote for vote in observation['votes']])

In [10]:
env = raw_env(num_agents=10, werewolves=2)
env.reset()
num_agents = 10
num_actions = env.action_spaces['player_1'].n
observation_size = flat_obs(env.observation_spaces['player_1'].sample()['observation']).shape[-1]

# Learner Setup
ppo_agent = Agent(num_actions=num_actions, obs_size=observation_size)
optimizer = torch.optim.Adam(ppo_agent.parameters(), lr=0.001, eps=1e-5)

In [11]:
ent_coef = 0.1 #
vf_coef = 0.1 #
clip_coef = 0.1 #
gamma = 0.99 #
gae_lambda = 0.95
batch_size = 16 #
max_cycles = 125 #
total_episodes = 9000 #
update_epochs = 3 #

In [7]:
def play_and_return_stats(env, wolf_policy, agent_policy, num_games=1000):
    wolf_wins = 0
    villager_wins = 0

    game_replays = []
    for _ in range(num_games):
        with torch.no_grad():
            env.reset()

            # brain and extra stats 
            wolf_brain = {'day': 1, 'phase': 0, 'action': None}
            
            # magent_list = {agent: [] for agent in env.agents}
            magent_list = {agent : {"self_votes": 0, "dead_votes": 0, "lasted_for": 0} for agent in env.agents if not env.agent_roles[agent]}

            # print(magent_list.keys())
            for magent in env.agent_iter():
                observation, reward, termination, truncation, info = env.last()

                day = observation['observation']['day']
                phase = observation['observation']['phase']

                if wolf_brain['day'] != day or wolf_brain['phase'] != phase:
                    wolf_brain = {'day': day, 'phase': phase, 'action': None}

                role = observation['observation']['roles'][observation['observation']['self_id']]

                # werewolves have full role TODO: add logic for wolves herevisibility
                if role == Roles.WEREWOLF:
                    # action = revenge_wolf_policy(observation, magent, action=wolf_brain['action']) if not termination or truncation else None
                    # wolf_brain['action'] = action
                    # action = random_policy(observation, magent) if not termination or truncation else None

                    action = wolf_policy(observation, None, action=wolf_brain['action']) if not termination or truncation else None
                    wolf_brain['action'] = action
                else:
                    # action = true_random_policy(observation, agent) if not termination or truncation else None
                    obs = torch.Tensor(flat_obs(observation['observation']))
                    if not termination or truncation:
                        action, logprobs, _, value = agent_policy.get_action_and_value(obs)
                        action = action.item()
                    else:
                        action = None

                    # grab some villager stats we think are useful
                    # TODO : maybe make these callbacks?
                    if action == observation['observation']['self_id']:
                        magent_list[magent]['self_votes'] += 1
                
                    if action in [i for i, status in enumerate(observation['observation']['player_status']) if status == False]:
                        magent_list[magent]['dead_votes'] += 1
                        
                env.step(action)
            
        game_replays.append(env.history)


        # POST GAME STATS #
        winner = env.world_state['winners']
        day = env.world_state['day']

        if winner:
            wolf_wins += 1
        else:
            villager_wins += 1
        
    return wolf_wins, villager_wins, game_replays


env = raw_env(num_agents=10, werewolves=2)
env.reset()
num_agents = 10
num_actions = env.action_spaces['player_1'].n
observation_size = flat_obs(env.observation_spaces['player_1'].sample()['observation']).shape[-1]

# Learner Setup
ppo_agent = Agent(num_actions=num_actions, obs_size=observation_size)
ppo_agent = torch.load("ppo_agent")

wwins, vwins, game_replay = play_and_return_stats(env, random_wolf_policy, ppo_agent, num_games=10)
print(f'Wolf wins : {wwins}, Villager wins : {vwins}')

    

Wolf wins : 6, Villager wins : 4


In [9]:
game_replay[0][0]

{'day': 1,
 'phase': <Phase.ACCUSATION: 0>,
 'alive': ['player_3', 'player_5'],
 'killed': ['player_4', 'player_1', 'player_2', 'player_8'],
 'executed': ['player_7', 'player_6', 'player_10', 'player_9'],
 'werewolves': ['player_3', 'player_10'],
 'villagers': ['player_1',
  'player_2',
  'player_4',
  'player_5',
  'player_6',
  'player_7',
  'player_8',
  'player_9'],
 'votes': {'player_1': 0,
  'player_2': 6,
  'player_3': 6,
  'player_4': 0,
  'player_5': 6,
  'player_6': 0,
  'player_7': 9,
  'player_8': 7,
  'player_9': 5,
  'player_10': 6},
 'winners': None}

In [10]:
game_replay[0][-1]

{'day': 4,
 'phase': 2,
 'alive': ['player_3', 'player_5'],
 'killed': ['player_4', 'player_1', 'player_2', 'player_8'],
 'executed': ['player_7', 'player_6', 'player_10', 'player_9'],
 'werewolves': ['player_3', 'player_10'],
 'villagers': ['player_1',
  'player_2',
  'player_4',
  'player_5',
  'player_6',
  'player_7',
  'player_8',
  'player_9'],
 'votes': {'player_3': 7},
 'winners': <Roles.WEREWOLF: 1>}

In [35]:
ent_coef = 0.1 #
vf_coef = 0.1 #
clip_coef = 0.1 #
gamma = 0.99 #
gae_lambda = 0.95
batch_size = 16 #
max_cycles = 125 #
total_episodes = 50000 #
update_epochs = 3 #

# stats to keep track of for custom metrics
self_voting = []
dead_voting = []

avg_game_length = 0
wolf_wins = 0
villager_wins = 0

env = raw_env(num_agents=10, werewolves=2)
env.reset()
num_agents = 10
num_actions = env.action_spaces['player_1'].n
observation_size = flat_obs(env.observation_spaces['player_1'].sample()['observation']).shape[-1]

# Learner Setup
ppo_agent = Agent(num_actions=num_actions, obs_size=observation_size)
optimizer = torch.optim.Adam(ppo_agent.parameters(), lr=0.001, eps=1e-5)


with mlflow.start_run(run_name='Random Wolf behavior'):
    for episode in tqdm(range(total_episodes)):
        with torch.no_grad():
            env.reset()

            # brain and extra stats 
            wolf_brain = {'day': 1, 'phase': 0, 'action': None}
            self_votes = 0
            dead_votes = 0
            
            # magent_list = {agent: [] for agent in env.agents}
            magent_list = {agent : [] for agent in env.agents if not env.agent_roles[agent]}

            # print(magent_list.keys())
            for magent in env.agent_iter():
                observation, reward, termination, truncation, info = env.last()

                day = observation['observation']['day']
                phase = observation['observation']['phase']

                if wolf_brain['day'] != day or wolf_brain['phase'] != phase:
                    wolf_brain = {'day': day, 'phase': phase, 'action': None}

                role = observation['observation']['roles'][observation['observation']['self_id']]

                # werewolves have full role TODO: add logic for wolves herevisibility
                if role == Roles.WEREWOLF:
                    # action = revenge_wolf_policy(observation, magent, action=wolf_brain['action']) if not termination or truncation else None
                    # wolf_brain['action'] = action
                    # action = random_policy(observation, magent) if not termination or truncation else None

                    action = random_wolf_policy(observation, agent, action=wolf_brain['action']) if not termination or truncation else None
                    wolf_brain['action'] = action
                else:
                    obs = torch.Tensor(flat_obs(observation['observation']))
                    if not termination or truncation:
                        action, logprobs, _, value = ppo_agent.get_action_and_value(obs)
                    else:
                        action = None

                    # grab some villager stats we think are useful
                    # TODO : maybe make these callbacks?
                    if action == observation['observation']['self_id']:
                        self_votes += 1
                
                    if action in [i for i, status in enumerate(observation['observation']['player_status']) if status == False]:
                        dead_votes += 1

                    magent_list[magent].append({
                        "obs": obs, 
                        "action": action,
                        "prev_reward": reward,
                        "logprobs": logprobs,
                        "term": termination,
                        "value": value
                        })

                env.step(action)
            
            # take the sequential observations of each agent, and store them appropriately
            magent_obs = {agent: {'obs': [], 'rewards': [], 'actions': [], 'logprobs': [], 'values': [], 'terms': []} for agent in magent_list}
            for key, value in magent_list.items():
                # print(f'-- {key} --')
                for s1, s2 in zip(value, value[1:]):
                    magent_obs[key]['obs'].append(s1['obs'])
                    magent_obs[key]['rewards'].append(s2['prev_reward'])
                    magent_obs[key]['actions'].append(s1['action'])
                    magent_obs[key]['logprobs'].append(s1['logprobs'])
                    magent_obs[key]['values'].append(s1['value'])
                    magent_obs[key]['terms'].append(s2['term'])


        # POST GAME STATS #
        winner = env.world_state['winners']
        day = env.world_state['day']

        self_voting.append(self_votes)
        dead_voting.append(dead_votes)

        if winner:
            wolf_wins += 1
        else:
            villager_wins += 1
        
        avg_game_length += (day * 1.0)/total_episodes
        # END OF POST GAME STATS #

        mlflow.log_metric("avg_game_len", f'{avg_game_length:.2f}')
        mlflow.log_metric("avg_self_votes", f'{sum(self_voting)/len(self_voting)}')
        mlflow.log_metric("avg_dead_votes", f'{sum(dead_voting)/len(dead_voting)}')
        mlflow.log_metric("wolf wins", wolf_wins)
        mlflow.log_metric("villager wins", villager_wins)
        if episode % 50 == 0:
            #wwins, vwins = play_and_return_stats(env, random_wolf_policy, ppo_agent, num_games=100)
            #mlflow.log_metric("wwins", wwins)
            #mlflow.log_metric("vwins", vwins)
            continue
        
        

        # We will do this for each agent in the episode
        # essentially we are calculating advantages and returns
        with torch.no_grad():
            for player, records in magent_obs.items():
                # print(f'{records}')
                advantages = torch.zeros_like(torch.tensor(records['rewards']))

                for t in reversed(range(len(records['obs']))):
                    # print(f'T: {t+1} - Rewards : {torch.tensor(records["rewards"])[t+1]} ')
                    # not using terms, as these are episodic

                    ## this was the last one. We are not using any terminal states in a good way

                    if t == len(records['obs']) - 1:
                        #print(f'T: {t} - Rewards at end : {torch.tensor(records["rewards"])[t]} ')
                        #print(f'T: {t} - Actions at end : {torch.tensor(records["actions"])[t]} ')
                        delta = records["rewards"][t] - records["values"][t]
                        advantages[t]  = delta
                    else:
                        #print(f'T: {t} - Rewards : {torch.tensor(records["rewards"])[t]} ')
                        #print(f'T: {t} - Actions : {torch.tensor(records["actions"])[t]} ')                    
                        delta = records["rewards"][t] + gamma * records["values"][t+1] - records["values"][t]
                        advantages[t]  = delta + gamma * gamma * advantages[t+1]

                    #delta = records['rewards'][t] + gamma * records['values'][t+1] - records['values'][t]
                magent_obs[player]["advantages"] = advantages
                magent_obs[player]["returns"] = advantages + torch.tensor(records["values"])
                    #advantages[t] = delta + gamma * gamma * advantages[t+1]
        

        # new logic, maybe we do this after a couple of games, so we get more data overall?
        

        # optimize the policy and the value network now
        # we can take all our observations now and flatten them into one bigger list of individual transitions
        # TODO: could make this setting into a single loop, but maybe this is clearer. ALso could make all these tensors earlier
        b_observations = torch.cat([torch.stack(item['obs']) for item in magent_obs.values()])
        b_logprobs = torch.cat([torch.stack(item['logprobs']) for item in magent_obs.values()])
        b_actions = torch.cat([torch.stack(item['actions']) for item in magent_obs.values()])
        b_returns = torch.cat([item['returns'] for item in magent_obs.values()])
        b_values = torch.cat([torch.stack(item['values']) for item in magent_obs.values()])
        b_advantages =  torch.cat([item['advantages'] for item in magent_obs.values()])



        # b_index stands for batch index
        b_index = np.arange(len(b_observations))
        clip_fracs = []
        for epoch in range(update_epochs):
            np.random.shuffle(b_index)
            for start in range(0, len(b_observations), batch_size):
                end = start + batch_size
                batch_index = b_index[start:end]

                _, newlogprob, entropy, value = ppo_agent.get_action_and_value(
                    b_observations[batch_index], b_actions.long()[batch_index])
                
                logratio = newlogprob - b_logprobs[batch_index]
                ratio = logratio.exp()

                with torch.no_grad():
                    # calculate approx_kl http://joschu.net/blog/kl-approx.html
                    old_approx_kl = (-logratio).mean()
                    approx_kl = ((ratio - 1) - logratio).mean()
                    clip_fracs += [
                        ((ratio - 1.0).abs() > clip_coef).float().mean().item()
                    ]
                
                # normalizing advantages
                advantages = b_advantages[batch_index]
                advantages = advantages.float()
                advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)

                # policy loss
                pg_loss1 = -advantages * ratio
                pg_loss2 = -advantages * torch.clamp(ratio, 1 - clip_coef, 1 + clip_coef)
                pg_loss = torch.max(pg_loss1, pg_loss2).mean()

                # value loss
                value = value.flatten()
                v_loss_unclipped = (value - b_returns[batch_index]) ** 2
                v_clipped = b_values[batch_index] + torch.clamp(
                    value - b_values[batch_index],
                    -clip_coef,
                    clip_coef,
                )
                v_loss_clipped = (v_clipped - b_returns[batch_index]) ** 2
                v_loss_max = torch.max(v_loss_unclipped, v_loss_clipped)
                v_loss = 0.5 * v_loss_max.mean()

                entropy_loss = entropy.mean()
                loss = pg_loss - ent_coef * entropy_loss + v_loss * vf_coef

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
        
        # could move them from GPU here
        y_pred, y_true = b_values.numpy(), b_returns.numpy()
        var_y = np.var(y_true)
        explained_var = np.nan if var_y == 0 else 1 - np.var(y_true - y_pred) / var_y
        
        # if episode % 20 == 0:
            # print(f"Training episode {episode}")
            # #print(f"Episodic Return: {np.mean(total_episodic_return)}")
            # #print(f"Episode Length: {end_step}")
            # print("")
            # print(f"Value Loss: {v_loss.item()}")
            # print(f"Policy Loss: {pg_loss.item()}")
            # print(f"Old Approx KL: {old_approx_kl.item()}")
            # print(f"Approx KL: {approx_kl.item()}")
            # print(f"Clip Fraction: {np.mean(clip_fracs)}")
            # print(f"Explained Variance: {explained_var.item()}")
            # print("\n-------------------------------------------\n")

            # also check some stats and try to log these

    # At the end, print some stuff here for overall stats

    print(f'Average game length = {avg_game_length:.2f}')
    print(f'Wolf wins : {wolf_wins}')
    print(f'Villager wins: {villager_wins}')
    print(f'Avg amount of self votes a game across villagers: {sum(self_voting)/len(self_voting)}')
    print(f'Avg amount of dead votes a game across villagers: {sum(dead_voting)/len(dead_voting)}')

100%|██████████| 50000/50000 [1:21:10<00:00, 10.27it/s]

Average game length = 4.00
Wolf wins : 50000
Villager wins: 0
Avg amount of self votes a game across villagers: 5.38126
Avg amount of dead votes a game across villagers: 5.52664





In [68]:
env = raw_env(num_agents=10, werewolves=1)
env.reset()
num_agents = 5
num_actions = env.action_spaces['player_1'].n
observation_size = flat_obs(env.observation_spaces['player_1'].sample()['observation']).shape[-1]

# Learner Setup
# ppo_agent = Agent(num_actions=num_actions, obs_size=observation_size)

wwins, vwins = play_and_return_stats(env, random_wolf_policy, ppo_agent, num_games=20)
print(f'Wolf wins : {wwins}, Villager wins : {vwins}')

Wolf wins : 8, Villager wins : 12


In [70]:
torch.save(ppo_agent, "ppo_agent")

### Coordinated wolf execution

In [31]:
env = raw_env(num_agents=10, werewolves=2)
env.reset()

num_games = 1000
avg_game_length = 0
wolf_wins = 0
villager_wins = 0

for _ in tqdm(range(num_games)):
    env.reset()
    wolf_brain = {'day': 1, 'phase': 0, 'action': None}
    
    for agent in env.agent_iter():
        observation, reward, termination, truncation, info = env.last()
        
        day = observation['observation']['day']
        phase = observation['observation']['phase']

        if wolf_brain['day'] != day or wolf_brain['phase'] != phase:
            wolf_brain = {'day': day, 'phase': phase, 'action': None}

        role = observation['observation']['roles'][observation['observation']['self_id']]

        if role == Roles.WEREWOLF:
            action = random_wolf_policy(observation, agent, action=wolf_brain['action']) if not termination or truncation else None
            wolf_brain['action'] = action
        else:
            action = random_policy(observation, agent) if not termination or truncation else None

        env.step(action)

    # get some stats
    winner = env.world_state['winners']
    day = env.world_state['day']

    if winner:
        wolf_wins += 1
    else:
        villager_wins += 1
    
    avg_game_length += (day * 1.0)/num_games 

print(f'Average game length = {avg_game_length:.2f}')
print(f'Wolf wins : {wolf_wins}')
print(f'Villager wins: {villager_wins}')

100%|██████████| 1000/1000 [00:01<00:00, 843.99it/s]

Average game length = 4.15
Wolf wins : 925
Villager wins: 75





### Random Wolves, not coordinated

In [60]:
ten_player_env = raw_env(num_agents=10, werewolves=1)

avg_game_length = 0
wolf_wins = 0
villager_wins = 0

num_games = 20

ten_player_env.reset()

for _ in tqdm(range(num_games)):

    for agent in ten_player_env.agent_iter():
        observation, reward, termination, truncation, info = ten_player_env.last()
        action = random_policy(observation, agent) if not termination or truncation else None
        ten_player_env.step(action)
    
    # get some stats
    winner = ten_player_env.world_state['winners']
    day = ten_player_env.world_state['day']

    if winner:
        wolf_wins += 1
    else:
        villager_wins += 1
    
    avg_game_length += (day * 1.0)/num_games 

    # reset 
    ten_player_env.reset()

print(f'Average game length = {avg_game_length:.2f}')
print(f'Wolf wins : {wolf_wins}')
print(f'Villager wins: {villager_wins}')

100%|██████████| 20/20 [00:00<00:00, 390.35it/s]

Average game length = 4.15
Wolf wins : 13
Villager wins: 7





In [1]:
import mlflow

In [2]:
mlflow.set_experiment(experiment_name="mlflow-test")
mlflow.log_metric("test", 5)

2023/02/26 01:02:02 INFO mlflow.tracking.fluent: Experiment with name 'mlflow-test' does not exist. Creating a new experiment.
