In [2]:
import numpy as np
import torch
import sys
sys.path.append('../')
from voting_games.werewolf_env_v0 import parrallel_raw_env, Roles
import random
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [23]:
class Agent(torch.nn.Module):
    """Actor-critic network that produces one categorical decision per player.

    The flat observation is evaluated ``num_players`` times, each time with a
    different player index prepended, so one shared actor MLP yields a
    categorical distribution per player and one shared critic MLP yields a
    value estimate per player.

    Args:
        approval_states: Number of categories the actor chooses between for
            each player (width of the actor's output layer).
        num_players: Number of player indices the observation is paired with.
        obs_size: Length of the flat observation vector. Both networks take
            ``obs_size + 1`` inputs because the player index is prepended.
    """

    def __init__(self, approval_states, num_players, obs_size):
        super().__init__()

        self.critic = torch.nn.Sequential(
            self._layer_init(torch.nn.Linear(obs_size + 1, 64)),
            torch.nn.Tanh(),
            self._layer_init(torch.nn.Linear(64, 64)),
            torch.nn.Tanh(),
            self._layer_init(torch.nn.Linear(64, 1), std=1.0),
        )

        self.actor = torch.nn.Sequential(
            self._layer_init(torch.nn.Linear(obs_size + 1, 64)),
            torch.nn.Tanh(),
            self._layer_init(torch.nn.Linear(64, 64)),
            torch.nn.Tanh(),
            # Small gain keeps the initial logits close to uniform.
            self._layer_init(torch.nn.Linear(64, approval_states), std=0.01),
        )

        self.num_players = num_players

    def _per_player_inputs(self, x):
        """Build the ``(num_players, len(x) + 1)`` batch whose row ``i`` is ``[i, *x]``.

        ``x`` must be a 1-D tensor. The index column is created with the dtype
        and device of ``x`` so the batch never mixes devices.
        """
        player_ids = torch.arange(
            self.num_players, dtype=x.dtype, device=x.device
        ).unsqueeze(1)
        repeated_obs = x.unsqueeze(0).expand(self.num_players, -1)
        return torch.cat((player_ids, repeated_obs), dim=1)

    def get_value(self, x):
        """Return the critic output for every player, shape ``(num_players, 1)``."""
        return self.critic(self._per_player_inputs(x))

    def get_action_and_value(self, x, action=None):
        """Sample (or score) one category per player for the flat observation ``x``.

        Args:
            x: 1-D observation tensor of length ``obs_size``.
            action: Optional tensor with one category index per player. When
                given, it is scored instead of sampling a new action.

        Returns:
            Tuple ``(action, log_prob, entropy, value)`` where ``action`` has
            one entry per player, ``log_prob`` is the summed log-probability
            of that joint action, ``entropy`` is the summed per-player
            entropy, and ``value`` is the critic output of shape
            ``(num_players, 1)``.
        """
        inputs = self._per_player_inputs(x)
        dist = torch.distributions.categorical.Categorical(logits=self.actor(inputs))

        if action is None:
            action = dist.sample()

        # The per-player decisions are sampled independently, so both the joint
        # log-probability and the joint entropy are sums over players. A product
        # of entropies would drop to zero as soon as one head became
        # deterministic.
        # TODO: the critic returns one value per player; decide whether to average.
        return action, dist.log_prob(action).sum(), dist.entropy().sum(), self.critic(inputs)

    def _layer_init(self, layer, std=np.sqrt(2), bias_const=0.0):
        """Orthogonally initialise ``layer.weight`` with gain ``std``, fill the bias with ``bias_const``, and return the layer."""
        torch.nn.init.orthogonal_(layer.weight, std)
        torch.nn.init.constant_(layer.bias, bias_const)
        return layer

In [4]:
def random_policy(observation, agent):
    """Approve of yourself and disapprove of one randomly chosen eligible player.

    A player index is eligible when its ``action_mask`` entry is truthy
    (presumably: the player is still alive) and its ``roles`` entry is falsy
    (not a known wolf). The acting player's own index is not filtered out, so
    the random target may be the acting player; the ``-1`` then overwrites the
    self-approval.

    Args:
        observation: Dict with an ``'action_mask'`` sequence and an
            ``'observation'`` dict holding ``'player_status'``, ``'roles'``
            and ``'self_id'``.
        agent: Unused; kept so all policies share the same call signature.

    Returns:
        A list with one entry per ``action_mask`` slot: ``1`` for the acting
        player, ``-1`` for the chosen target, ``0`` everywhere else.
    """
    state = observation['observation']
    mask = observation['action_mask']
    own_id = state['self_id']

    # Walk player indices alongside the mask and the role flags.
    candidates = []
    for idx, allowed, wolf_flag in zip(range(len(state['player_status'])), mask, state['roles']):
        if not allowed or wolf_flag:
            continue
        candidates.append(idx)

    target = random.choice(candidates)

    approvals = [0] * len(mask)
    approvals[own_id] = 1
    approvals[target] = -1
    return approvals

def revenge_wolf_policy(observation, agent, action=None):
    """Wolf policy that approves of every wolf and disapproves of everyone else.

    A "revenge" target is drawn at random, but the returned vector does not
    depend on it.

    Args:
        observation: Dict with an ``'action_mask'`` sequence and an
            ``'observation'`` dict holding ``'self_id'``, ``'votes'``,
            ``'roles'`` and ``'player_status'``.
        agent: Unused; kept so all policies share the same call signature.
        action: Optional action already chosen by another wolf. When it is not
            ``None`` it is returned unchanged.

    Returns:
        ``action`` if one was supplied; otherwise a list with one entry per
        ``action_mask`` slot holding ``1`` for the acting wolf and the other
        wolves and ``-1`` for everybody else.
    """
    # The caller has already established that the acting agent is a werewolf.
    state = observation['observation']
    me = state['self_id']
    alive = state['player_status']

    # Candidate "revenge" targets. As written, only the acting wolf's own index
    # can match this filter.
    # NOTE(review): the schema of 'votes' is not visible here - confirm what
    # "voted against me" should look like.
    grudges = []
    for idx, vote in enumerate(state['votes']):
        if vote == -1 and idx == me:
            grudges.append(idx)

    other_wolves = [idx for idx, role in enumerate(state['roles']) if role == 1 and idx != me]

    # NOTE(review): a symmetric difference also ADDS every wolf that is not yet
    # in the list; a plain difference looks like what was meant - confirm.
    grudges = list(set(grudges).symmetric_difference(set(other_wolves)))

    # Drop candidates that have died in the meantime.
    grudges = [idx for idx in grudges if alive[idx] == True]

    living_villagers = [
        idx for idx, role in enumerate(state['roles'])
        if alive[idx] == True and role == 0
    ]

    # Fall back to a random living villager when nobody is left to take revenge on.
    # NOTE(review): the drawn target is never written into the result below.
    if len(grudges) > 0:
        player_selected = random.choice(grudges)
    else:
        player_selected = random.choice(living_villagers)

    choice = [-1] * len(observation['action_mask'])
    choice[me] = 1
    for wid in other_wolves:
        choice[wid] = 1

    if action != None:
        return action
    return choice

def random_wolf_policy(observation, agent, action=None):
    """Wolf policy: approve of every wolf and disapprove of one random living villager.

    Args:
        observation: Dict whose ``'observation'`` entry holds ``'roles'``
            (``1`` marks a wolf, ``0`` a villager) and ``'player_status'``
            (``True`` marks a living player).
        agent: Unused; kept so all policies share the same call signature.
        action: Optional action already chosen by another wolf. When it is not
            ``None`` it is returned unchanged, before any random draw is made.

    Returns:
        ``action`` if one was supplied; otherwise a list with one entry per
        player holding ``1`` for each wolf, ``-1`` for the chosen villager and
        ``0`` elsewhere.

    Raises:
        IndexError: If a fresh action is needed but no villager is alive.
    """
    # Reuse the pack's decision first: no random draw is wasted, and a cached
    # action still works when no villager is left alive.
    if action is not None:
        return action

    # The caller has already established that the acting agent is a werewolf.
    state = observation['observation']
    roles = state['roles']
    status = state['player_status']

    wolf_ids = [i for i, role in enumerate(roles) if role == 1]
    villagers_alive = [i for i, role in enumerate(roles) if status[i] == True and role == 0]

    player_selected = random.choice(villagers_alive)

    choice = [0] * len(status)
    for wid in wolf_ids:
        choice[wid] = 1
    choice[player_selected] = -1

    return choice

In [18]:
# Baseline: two wolves sharing one random target per phase vs. randomly voting villagers.
env = parrallel_raw_env(num_agents=10, werewolves=2)
env.reset()

num_games = 1000
avg_game_length = 0
wolf_wins = 0
villager_wins = 0

for _ in tqdm(range(num_games)):
    env.reset()
    # Shared wolf memory: the first wolf to act in a (day, phase) stores its
    # action here and every later wolf in that phase repeats it.
    wolf_brain = {'day': 1, 'phase': 0, 'action': None}

    for agent in env.agent_iter():
        observation, reward, termination, truncation, info = env.last()

        day = observation['observation']['day']
        phase = observation['observation']['phase']

        # A new day or phase invalidates the cached wolf action.
        if wolf_brain['day'] != day or wolf_brain['phase'] != phase:
            wolf_brain = {'day': day, 'phase': phase, 'action': None}

        role = observation['observation']['roles'][observation['observation']['self_id']]

        # A finished agent must step with None. The previous
        # `policy(...) if not termination or truncation else None` parsed as
        # `(not termination) or truncation`, so a truncated agent still got a
        # real action.
        if termination or truncation:
            action = None
        elif role == Roles.WEREWOLF:
            action = random_wolf_policy(observation, agent, action=wolf_brain['action'])
            wolf_brain['action'] = action
        else:
            action = random_policy(observation, agent)

        env.step(action)

    # get some stats
    winner = env.world_state['winners']
    day = env.world_state['day']

    # A truthy `winners` entry is counted as a wolf victory.
    if winner:
        wolf_wins += 1
    else:
        villager_wins += 1

    avg_game_length += (day * 1.0)/num_games

print(f'Average game length = {avg_game_length:.2f}')
print(f'Wolf wins : {wolf_wins}')
print(f'Villager wins: {villager_wins}')

100%|██████████| 1000/1000 [00:02<00:00, 439.63it/s]

Average game length = 4.16
Wolf wins : 943
Villager wins: 57





In [20]:
# Same experiment as the random-wolf baseline, with wolves using revenge_wolf_policy.
env = parrallel_raw_env(num_agents=10, werewolves=2)
env.reset()

num_games = 1000
avg_game_length = 0
wolf_wins = 0
villager_wins = 0

for _ in tqdm(range(num_games)):
    env.reset()
    # Shared wolf memory: the first wolf to act in a (day, phase) stores its
    # action here and every later wolf in that phase repeats it.
    wolf_brain = {'day': 1, 'phase': 0, 'action': None}

    for agent in env.agent_iter():
        observation, reward, termination, truncation, info = env.last()

        day = observation['observation']['day']
        phase = observation['observation']['phase']

        # A new day or phase invalidates the cached wolf action.
        if wolf_brain['day'] != day or wolf_brain['phase'] != phase:
            wolf_brain = {'day': day, 'phase': phase, 'action': None}

        role = observation['observation']['roles'][observation['observation']['self_id']]

        # A finished agent must step with None. The previous
        # `policy(...) if not termination or truncation else None` parsed as
        # `(not termination) or truncation`, so a truncated agent still got a
        # real action.
        if termination or truncation:
            action = None
        elif role == Roles.WEREWOLF:
            action = revenge_wolf_policy(observation, agent, action=wolf_brain['action'])
            wolf_brain['action'] = action
        else:
            action = random_policy(observation, agent)

        env.step(action)

    # get some stats
    winner = env.world_state['winners']
    day = env.world_state['day']

    # A truthy `winners` entry is counted as a wolf victory.
    if winner:
        wolf_wins += 1
    else:
        villager_wins += 1

    avg_game_length += (day * 1.0)/num_games

print(f'Average game length = {avg_game_length:.2f}')
print(f'Wolf wins : {wolf_wins}')
print(f'Villager wins: {villager_wins}')

100%|██████████| 1000/1000 [00:02<00:00, 430.65it/s]

Average game length = 4.04
Wolf wins : 997
Villager wins: 3





In [19]:
# Single-wolf baseline: every agent, the wolf included, follows random_policy.
ten_player_env = parrallel_raw_env(num_agents=10, werewolves=1)

avg_game_length = 0
wolf_wins = 0
villager_wins = 0

num_games = 1000

ten_player_env.reset()

for _ in tqdm(range(num_games)):

    for agent in ten_player_env.agent_iter():
        observation, reward, termination, truncation, info = ten_player_env.last()
        # A finished agent must step with None. The previous
        # `policy(...) if not termination or truncation else None` parsed as
        # `(not termination) or truncation`, so a truncated agent still got a
        # real action.
        if termination or truncation:
            action = None
        else:
            action = random_policy(observation, agent)
        ten_player_env.step(action)

    # get some stats
    winner = ten_player_env.world_state['winners']
    day = ten_player_env.world_state['day']

    # A truthy `winners` entry is counted as a wolf victory.
    if winner:
        wolf_wins += 1
    else:
        villager_wins += 1

    avg_game_length += (day * 1.0)/num_games

    # reset for the next game (the statistics above were read first)
    ten_player_env.reset()

print(f'Average game length = {avg_game_length:.2f}')
print(f'Wolf wins : {wolf_wins}')
print(f'Villager wins: {villager_wins}')

100%|██████████| 1000/1000 [00:02<00:00, 451.78it/s]

Average game length = 4.15
Wolf wins : 643
Villager wins: 357





## Training on approval

Because we need to generate an approval for every other agent, we will have to call the neural network n-1 times. We should look at batching these calls, as well as at generating the proper observation for each of them.

Maybe we have to call the model n times, but then add the losses together?

In [28]:
def play_and_return_stats(env, wolf_policy, agent_policy, num_games=1000):
    """Play ``num_games`` games and count how often each side wins.

    Wolves act through ``wolf_policy`` and share one action per (day, phase);
    every other agent acts through ``agent_policy.get_action_and_value``.

    Args:
        env: Environment exposing ``reset``, ``agents``, ``agent_roles``,
            ``agent_iter``, ``last``, ``step``, ``convert_obs``, ``history``
            and ``world_state``.
        wolf_policy: Callable ``(observation, agent, action=...)`` returning
            the wolves' action.
        agent_policy: Object whose ``get_action_and_value(obs)`` returns a
            tuple with the sampled action tensor as its first element.
        num_games: Number of games to play.

    Returns:
        Tuple ``(wolf_wins, villager_wins, game_replays)`` where
        ``game_replays`` holds ``env.history`` as read after each game.
    """
    wolf_wins = 0
    villager_wins = 0

    game_replays = []
    for _ in range(num_games):
        # Pure evaluation: no gradients are needed while rolling out games.
        with torch.no_grad():
            env.reset()

            # Shared wolf memory: the first wolf to act in a (day, phase)
            # stores its action here and later wolves receive it back.
            wolf_brain = {'day': 1, 'phase': 0, 'action': None}

            # Per-agent counters for agents whose role entry is falsy.
            # NOTE(review): these counters are collected but never returned.
            magent_list = {
                agent: {"self_votes": 0, "dead_votes": 0, "lasted_for": 0}
                for agent in env.agents if not env.agent_roles[agent]
            }

            for magent in env.agent_iter():
                observation, reward, termination, truncation, info = env.last()

                day = observation['observation']['day']
                phase = observation['observation']['phase']

                # A new day or phase invalidates the cached wolf action.
                if wolf_brain['day'] != day or wolf_brain['phase'] != phase:
                    wolf_brain = {'day': day, 'phase': phase, 'action': None}

                role = observation['observation']['roles'][observation['observation']['self_id']]

                # A finished agent must step with None. The previous
                # `... if not termination or truncation else None` parsed as
                # `(not termination) or truncation`, so a truncated agent
                # still got a real action.
                if termination or truncation:
                    action = None
                elif role == Roles.WEREWOLF:
                    # TODO: add dedicated logic for wolves here (they have full role visibility).
                    action = wolf_policy(observation, None, action=wolf_brain['action'])
                    wolf_brain['action'] = action
                else:
                    obs = torch.Tensor(env.convert_obs(observation['observation']))
                    action, _, _, _ = agent_policy.get_action_and_value(obs)
                    action = action.tolist()

                    # grab some villager stats we think are useful
                    # TODO : maybe make these callbacks?
                    # NOTE(review): when the policy returns a per-player vector,
                    # `action` is a list here, so neither comparison below can
                    # match an integer player index - confirm the intended stat.
                    if action == observation['observation']['self_id']:
                        magent_list[magent]['self_votes'] += 1

                    if action in [i for i, status in enumerate(observation['observation']['player_status']) if status == False]:
                        magent_list[magent]['dead_votes'] += 1

                env.step(action)

        game_replays.append(env.history)

        # POST GAME STATS #
        # A truthy `winners` entry is counted as a wolf victory.
        if env.world_state['winners']:
            wolf_wins += 1
        else:
            villager_wins += 1

    return wolf_wins, villager_wins, game_replays


# Smoke test: let a freshly initialised learner play two games against random wolves.
num_agents = 10
env = parrallel_raw_env(num_agents=num_agents, werewolves=2)
env.reset()

# Length of the flat observation, measured on one sampled observation.
sample_observation = env.observation_spaces['player_1'].sample()['observation']
observation_size = env.convert_obs(sample_observation).shape[-1]

# Learner setup (a saved learner could be restored instead with
# torch.load("ppo_agent")).
ppo_agent = Agent(approval_states=3, num_players=num_agents, obs_size=observation_size)

wwins, vwins, game_replay = play_and_return_stats(
    env, random_wolf_policy, ppo_agent, num_games=2
)
print(f'Wolf wins : {wwins}, Villager wins : {vwins}')


Wolf wins : 2, Villager wins : 0
