In [1]:
import numpy as np
import torch
import sys
sys.path.append('../')
from voting_games.werewolf_env_v0 import pare, pare_Role, pare_Phase
import random
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


Done


In [2]:
# Create a 10-agent game with 2 werewolves and reset it to its initial state.
env = pare(num_agents=10, werewolves=2)
env.reset()

def random_coordinated_wolf(env):
    """Scripted wolf policy: every living wolf casts the same ballot.

    One living villager is drawn at random and receives -1 from every wolf;
    each living wolf receives +1; every other slot stays 0.

    Args:
        env: game environment exposing ``world_state`` (with "villagers",
            "werewolves" and "alive" collections of agent names) and
            ``possible_agents``. Agent names must end in ``_<index>``.

    Returns:
        dict mapping each living wolf's name to its own ballot list of
        length ``len(env.possible_agents)``.
    """
    villagers_remaining = set(env.world_state["villagers"]) & set(env.world_state['alive'])
    wolves_remaining = set(env.world_state["werewolves"]) & set(env.world_state['alive'])

    # A single shared target is what makes the wolves "coordinated".
    target = random.choice(list(villagers_remaining))

    def seat_of(agent_name):
        # agent names look like "<prefix>_<index>"; the index is the ballot slot
        return int(agent_name.split("_")[-1])

    def build_ballot():
        ballot = [0] * len(env.possible_agents)
        ballot[seat_of(target)] = -1
        for ally in wolves_remaining:
            ballot[seat_of(ally)] = 1
        return ballot

    # Each wolf gets an independent list object so later mutation cannot alias.
    return {wolf: build_ballot() for wolf in wolves_remaining}

def aggressive_wolf(env):
    """Scripted wolf policy: disapprove of everyone except the living wolves.

    Args:
        env: game environment exposing ``world_state`` (with "werewolves" and
            "alive" collections of agent names) and ``possible_agents``.
            Agent names must end in ``_<index>``.

    Returns:
        dict mapping each living wolf's name to a ballot list of length
        ``len(env.possible_agents)`` holding +1 at every living wolf's slot
        and -1 everywhere else.
    """
    wolves_remaining = set(env.world_state["werewolves"]) & set(env.world_state['alive'])

    def build_ballot():
        # start from blanket disapproval, then whitelist the pack
        ballot = [-1] * len(env.possible_agents)
        for ally in wolves_remaining:
            ballot[int(ally.split("_")[-1])] = 1
        return ballot

    return {wolf: build_ballot() for wolf in wolves_remaining}

def random_wolfs(env):
    """Uncoordinated wolf policy: each living wolf samples its own action.

    Args:
        env: game environment exposing ``world_state`` (with "werewolves" and
            "alive") and ``action_space(agent)``, whose ``sample()`` result
            must provide ``tolist()``.

    Returns:
        dict mapping each living wolf's name to its sampled action as a list.
    """
    living_wolves = set(env.world_state["werewolves"]) & set(env.world_state['alive'])

    sampled_actions = {}
    for wolf in living_wolves:
        space = env.action_space(wolf)
        sampled_actions[wolf] = space.sample().tolist()
    return sampled_actions


def revenge_coordinated_wolf(env, actions = None):
    """Coordinated wolf policy intended to target villagers who voted against wolves.

    The "revenge" targeting is not implemented yet: the target is currently a
    random living villager, exactly as in ``random_coordinated_wolf``.

    Args:
        env: game environment exposing ``world_state`` (with "villagers",
            "werewolves" and "alive" collections of agent names) and
            ``possible_agents``. Agent names must end in ``_<index>``.
        actions: optional dict to add the wolves' ballots to. It is updated
            in place when given; a fresh dict is created when omitted.

    Returns:
        The ``actions`` dict, with one ballot list per living wolf
        (-1 at the target's slot, +1 at every living wolf's slot, 0 elsewhere).
    """
    # The original subscripted the None default (TypeError) and never returned.
    if actions is None:
        actions = {}

    villagers_remaining = set(env.world_state["villagers"]) & set(env.world_state['alive'])
    wolves_remaining = set(env.world_state["werewolves"]) & set(env.world_state['alive'])

    # TODO: pick the target from whoever tried to vote out a wolf last time.
    target = random.choice(list(villagers_remaining))

    for wolf in wolves_remaining:
        actions[wolf] = [0] * len(env.possible_agents)
        actions[wolf][int(target.split("_")[-1])] = -1
        for curr_wolf in wolves_remaining:
            actions[wolf][int(curr_wolf.split("_")[-1])] = 1

    return actions

def random_single_target_villager(env, agent):
    """Villager policy: approve of yourself and disapprove of one random living agent.

    Args:
        env: game environment exposing ``world_state["alive"]`` and
            ``possible_agents``. Agent names must end in ``_<index>``.
        agent: name of the acting agent.

    Returns:
        Ballot list of length ``len(env.possible_agents)`` with +1 at the
        agent's own slot, -1 at the chosen target's slot and 0 elsewhere.
    """
    def seat_of(name):
        return int(name.split("_")[-1])

    # anyone alive except the voter is a candidate
    candidates = set(env.world_state["alive"]) - set([agent])

    ballot = [0] * len(env.possible_agents)
    ballot[seat_of(agent)] = 1

    victim = random.choice(list(candidates))
    ballot[seat_of(victim)] = -1
    return ballot

# random_coordinated_wolf(env)
def random_agent_action(env, agent):
    """Sample one action from ``agent``'s action space and return it as a list."""
    space = env.action_space(agent)
    sampled = space.sample()
    return sampled.tolist()

# def random_wolf(env, action=None):


In [3]:
def play_static_wolf_game(env, wolf_policy, villager_agent, num_times=100) -> int:
    """Play ``num_times`` games of scripted wolves vs. scripted villagers.

    Villagers only act outside the NIGHT phase; the wolf policy is queried on
    every step. Progress and the running villager win count are shown on a
    tqdm bar.

    Args:
        env: game environment (``reset``, ``step``, ``agents``, ``world_state``).
        wolf_policy: callable ``wolf_policy(env) -> dict`` of wolf actions.
        villager_agent: callable ``villager_agent(env, villager) -> action``.
        num_times: number of games to play.

    Returns:
        Number of games won by the villagers. (The original returned nothing
        and carried an invalid ``tuple(pare_Role)`` annotation, so the result
        was only visible in the progress-bar text.)
    """
    villager_wins = 0
    loop = tqdm(range(num_times))

    for _ in loop:
        env.reset()
        while env.agents:
            actions = {}

            # villager steps: they have no move during the night
            if env.world_state["phase"] != pare_Phase.NIGHT:
                villagers = set(env.agents) & set(env.world_state["villagers"])
                for villager in villagers:
                    actions[villager] = villager_agent(env, villager)

            # wolf steps
            actions = actions | wolf_policy(env)

            env.step(actions)

        winner = env.world_state['winners']
        if winner == pare_Role.VILLAGER:
            villager_wins += 1

        loop.set_description(f"Villagers won {villager_wins} out of a total of {num_times} games")

    return villager_wins

env = pare(num_agents=10, werewolves=2)
env.reset()

# Every scripted wolf policy is played against every scripted villager policy.
# The banner strings are printed verbatim before each batch of games.
wolf_matchups = [
    ("Random Coordinated Wolves", random_coordinated_wolf),
    ("Aggresive Wolves", aggressive_wolf),
    ("Random Wolves", random_wolfs),
]
villager_matchups = [
    ("\t vs. Single Target Random Villagers", random_single_target_villager),
    ("\t vs. Random Villagers", random_agent_action),
]

for wolf_banner, wolf_policy in wolf_matchups:
    print(wolf_banner)
    for villager_banner, villager_policy in villager_matchups:
        print(villager_banner)
        play_static_wolf_game(env, wolf_policy, villager_policy, num_times=1000)
    print("------------------------------------\n")


Random Coordinated Wolves
	 vs. Single Target Random Villagers


Villagers won 120 out of a total of 1000 games: 100%|██████████| 1000/1000 [00:03<00:00, 260.31it/s]


	 vs. Random Villagers


Villagers won 78 out of a total of 1000 games: 100%|██████████| 1000/1000 [00:05<00:00, 178.42it/s]


------------------------------------

Aggresive Wolves
	 vs. Single Target Random Villagers


Villagers won 21 out of a total of 1000 games: 100%|██████████| 1000/1000 [00:03<00:00, 268.10it/s]


	 vs. Random Villagers


Villagers won 8 out of a total of 1000 games: 100%|██████████| 1000/1000 [00:05<00:00, 182.02it/s]


------------------------------------

Random Wolves
	 vs. Single Target Random Villagers


Villagers won 662 out of a total of 1000 games: 100%|██████████| 1000/1000 [00:04<00:00, 209.78it/s]


	 vs. Random Villagers


Villagers won 617 out of a total of 1000 games: 100%|██████████| 1000/1000 [00:06<00:00, 154.55it/s]

------------------------------------






## Training Agents

The agent structure will be much the same as the agent structure in

In [5]:
from learning_agents.approval_agents import SequentialAgent
import mlflow

In [6]:
# PPO hyperparameters used by the training loop below.
ent_coef = 0.1 # weight of the entropy term in the total loss
vf_coef = 0.1 # weight of the value-function loss in the total loss
clip_coef = 0.1 # clipping range for the policy ratio and for the value update
gamma = 0.99 # discount factor used when computing TD errors / advantages
gae_lambda = 0.95  # GAE lambda (smoothing factor for advantage estimation)
batch_size = 32 # minibatch size for each optimizer step
max_cycles = 125 # NOTE(review): not referenced in the visible cells
total_episodes = 10000 # number of games played during training
update_epochs = 3 # passes over one episode's transitions per update

# stats to keep track of for custom metrics
# NOTE(review): self_voting, dead_voting, avg_game_length and wolf_wins are
# initialised here but never updated in the visible cells.
self_voting = []
dead_voting = []

avg_game_length = 0
wolf_wins = 0
villager_wins = 0  # incremented by the training loop after each villager win

num_agents = 10
env = pare(num_agents=num_agents, werewolves=2)
env.reset()
# Size of the last dimension of a converted observation, taken from a sampled
# 'player_1' observation (assumes an agent with that name exists — TODO confirm).
observation_size = env.convert_obs(env.observation_spaces['player_1'].sample()['observation']).shape[-1]

# Learner Setup
training_agent = SequentialAgent(num_players=num_agents, approval_states= 3, obs_size=observation_size)
optimizer = torch.optim.Adam(training_agent.parameters(), lr=0.001, eps=1e-5)


# PPO training of the villager policy against the scripted `random_coordinated_wolf` wolves.
# Per episode: (1) play one game WITHOUT tracking gradients and record every villager's
# trajectory, (2) compute GAE advantages and returns, (3) run clipped-PPO minibatch
# updates WITH gradients enabled.
#
# Fixes relative to the previous version of this cell:
#   * the optimisation phase was nested inside `torch.no_grad()`, so the loss had no graph
#     and `loss.requires_grad = True` only masked the error — no parameter was ever updated;
#   * `continue` on every 50th episode skipped that episode's update;
#   * GAE used `gamma * gamma` instead of `gamma * gae_lambda`;
#   * the advantage buffer inherited the rewards' dtype, truncating values for int rewards;
#   * normalising a single-element minibatch produced NaNs (std of one sample).
with mlflow.start_run(run_name='Training our agents'):

    for episode in tqdm(range(total_episodes)):
        observations, rewards, terminations, truncations, infos = env.reset()
        # One trajectory buffer per agent whose role is falsy in env.agent_roles
        # (presumably the villagers — TODO confirm against pare_Role).
        magent_obs = {agent: {'obs': [], 'rewards': [], 'actions': [], 'logprobs': [], 'values': [], 'terms': []} for agent in env.agents if not env.agent_roles[agent]}

        # ---- 1. Rollout: collecting experience needs no autograd graph ----
        with torch.no_grad():
            while env.agents:
                actions = {}
                villagers = set(env.agents) & set(env.world_state["villagers"])

                if env.world_state["phase"] != pare_Phase.NIGHT:
                    # villagers actions
                    for villager in villagers:
                        # only cares about observations
                        obs = torch.Tensor(env.convert_obs(observations[villager]['observation']))
                        ml_action, logprobs, _, value = training_agent.get_action_and_value(obs)

                        actions[villager] = training_agent.convert_actions_to_approvals(ml_action)

                        # rewards and terms are filled in after the env step below
                        magent_obs[villager]["obs"].append(obs)
                        magent_obs[villager]["actions"].append(ml_action)
                        magent_obs[villager]["logprobs"].append(logprobs)
                        magent_obs[villager]["values"].append(value)

                # TODO : pass in a wolf policy possibly
                actions = actions | random_coordinated_wolf(env)

                observations, rewards, terminations, truncations, infos = env.step(actions)

                # lets add the post step information now
                # TODO: Should we not care about villagers losing here? What if they die at night, we want them to get a death reward
                # TODO: What if the game ends on a night kill? We want the villagers to still get their rewards
                for villager in villagers:
                    if env.history[-1]["phase"] == pare_Phase.NIGHT:
                        # villagers took no action at night: fold the night outcome
                        # into their most recent recorded transition
                        magent_obs[villager]["rewards"][-1] += rewards[villager]
                        magent_obs[villager]["terms"][-1] += terminations[villager]
                    else:
                        magent_obs[villager]["rewards"].append(rewards[villager])
                        magent_obs[villager]["terms"].append(terminations[villager])

        # ---- Post game stats ----
        winner = env.world_state['winners']

        if winner == pare_Role.VILLAGER:
            villager_wins += 1

        mlflow.log_metric("villager wins", villager_wins, step=episode)
        # TODO: every N episodes, evaluate against a fixed wolf policy and log the
        #       win rates here (must NOT skip the update below).

        # ---- 2. Advantages and returns (GAE), computed per villager ----
        with torch.no_grad():
            for player, records in magent_obs.items():
                # float buffer regardless of the reward dtype
                advantages = torch.zeros(len(records['rewards']), dtype=torch.float32)
                last_step = len(records['obs']) - 1

                for t in reversed(range(len(records['obs']))):
                    if t == last_step:
                        # games are episodic: no bootstrap value after the last step
                        delta = records["rewards"][t] - records["values"][t]
                        advantages[t] = delta
                    else:
                        delta = records["rewards"][t] + gamma * records["values"][t+1] - records["values"][t]
                        advantages[t] = delta + gamma * gae_lambda * advantages[t+1]

                magent_obs[player]["advantages"] = advantages
                magent_obs[player]["returns"] = advantages + torch.tensor(records["values"])

        # new logic, maybe we do this after a couple of games, so we get more data overall?

        # ---- 3. Optimise the policy and the value network ----
        # Flatten every villager's trajectory into one list of individual transitions.
        # TODO: could make this setting into a single loop, but maybe this is clearer. Also could make all these tensors earlier
        b_observations = torch.cat([torch.stack(item['obs']) for item in magent_obs.values()])
        b_logprobs = torch.cat([torch.stack(item['logprobs']) for item in magent_obs.values()])
        b_actions = torch.cat([torch.stack(item['actions']) for item in magent_obs.values()])
        b_returns = torch.cat([item['returns'] for item in magent_obs.values()])
        b_values = torch.cat([torch.stack(item['values']) for item in magent_obs.values()])
        b_advantages = torch.cat([item['advantages'] for item in magent_obs.values()])

        # b_index stands for batch index
        b_index = np.arange(len(b_observations))
        clip_fracs = []
        for epoch in range(update_epochs):
            np.random.shuffle(b_index)
            for start in range(0, len(b_observations), batch_size):
                end = start + batch_size
                batch_index = b_index[start:end]

                # newlogprob is one summed log-prob per transition in the minibatch
                _, newlogprob, entropy, value = training_agent.get_batched_action_and_value(
                    b_observations[batch_index], b_actions[batch_index])

                logratio = newlogprob - b_logprobs[batch_index]
                ratio = logratio.exp()

                with torch.no_grad():
                    # diagnostics only: approx_kl, see http://joschu.net/blog/kl-approx.html
                    old_approx_kl = (-logratio).mean()
                    approx_kl = ((ratio - 1) - logratio).mean()
                    clip_fracs += [
                        ((ratio - 1.0).abs() > clip_coef).float().mean().item()
                    ]

                # normalizing advantages (skipped for a single sample, whose std is NaN)
                advantages = b_advantages[batch_index].float()
                if advantages.numel() > 1:
                    advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)

                # policy loss
                pg_loss1 = -advantages * ratio
                pg_loss2 = -advantages * torch.clamp(ratio, 1 - clip_coef, 1 + clip_coef)
                pg_loss = torch.max(pg_loss1, pg_loss2).mean()

                # value loss
                value = value.flatten()
                v_loss_unclipped = (value - b_returns[batch_index]) ** 2
                v_clipped = b_values[batch_index] + torch.clamp(
                    value - b_values[batch_index],
                    -clip_coef,
                    clip_coef,
                )
                v_loss_clipped = (v_clipped - b_returns[batch_index]) ** 2
                v_loss_max = torch.max(v_loss_unclipped, v_loss_clipped)
                v_loss = 0.5 * v_loss_max.mean()

                entropy_loss = entropy.mean()
                loss = pg_loss - ent_coef * entropy_loss + v_loss * vf_coef

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

        # could move them from GPU here
        y_pred, y_true = b_values.numpy(), b_returns.numpy()
        var_y = np.var(y_true)
        explained_var = np.nan if var_y == 0 else 1 - np.var(y_true - y_pred) / var_y

        # At the end, print some stuff here for overall stats

torch.save(training_agent, "long_pare_bigger_model_added_night")

100%|██████████| 10000/10000 [35:34<00:00,  4.69it/s] 


In [None]:
class Agent(torch.nn.Module):
    """Actor-critic network that produces one approval decision per player.

    The same actor and critic are evaluated once per player: the player's
    index is prepended to the flat observation, giving ``num_players`` rows of
    ``obs_size + 1`` features. The per-player decisions are treated as
    independent categorical distributions, so the joint log-probability is the
    sum of per-player log-probabilities and the joint entropy is the sum of
    per-player entropies (the previous version multiplied the entropies).

    Args:
        approval_states: number of logits the actor emits per player.
        num_players: number of players a decision is produced for.
        obs_size: length of the flat observation vector.
    """

    def __init__(self, approval_states, num_players, obs_size):
        super().__init__()

        # obs_size + 1: the extra input feature is the prepended player index
        self.critic = torch.nn.Sequential(
            self._layer_init(torch.nn.Linear(obs_size+1, 256)),
            torch.nn.Tanh(),
            self._layer_init(torch.nn.Linear(256,256)),
            torch.nn.Tanh(),
            self._layer_init(torch.nn.Linear(256,1), std=1.0),
        )

        self.actor = torch.nn.Sequential(
            self._layer_init(torch.nn.Linear(obs_size+1, 256)),
            torch.nn.Tanh(),
            self._layer_init(torch.nn.Linear(256,256)),
            torch.nn.Tanh(),
            self._layer_init(torch.nn.Linear(256, approval_states), std=0.01),
        )

        self.num_players = num_players

    def _with_player_ids(self, x):
        """Prepend each player index to the observation(s).

        Maps ``x`` of shape ``(..., obs_size)`` to
        ``(..., num_players, obs_size + 1)``, where row ``i`` is ``[i, *x]``.
        """
        lead_shape = x.shape[:-1]
        ids = torch.arange(self.num_players, dtype=x.dtype, device=x.device)
        ids = ids.expand(*lead_shape, self.num_players).unsqueeze(-1)
        repeated = x.unsqueeze(-2).expand(*lead_shape, self.num_players, x.shape[-1])
        return torch.cat((ids, repeated), dim=-1)

    def get_value(self, x):
        """Return the critic's value for one observation, averaged over players."""
        # TODO: We need torch.mean because PPO will use value, and we have a bunch here.
        #       Do we need to change PPO here?
        return torch.mean(self.critic(self._with_player_ids(x)))

    # only doing this for the PPO batched call so I don't need extra logic in the regular get action and value
    def get_batched_action_and_value(self, x, actions=None):
        """Evaluate stored actions for a batch of observations.

        Args:
            x: batch of observations, shape ``(batch, obs_size)`` (a sequence
                of 1-D tensors is also accepted).
            actions: batch of per-player actions, shape ``(batch, num_players)``.

        Returns:
            ``(actions, log_probs, entropies, values)`` where the last three
            are 1-D tensors with one entry per batch element.

        Raises:
            ValueError: if ``actions`` is not provided.
        """
        if actions is None:
            raise ValueError("We need batched actions here")

        batch_obs = x if torch.is_tensor(x) else torch.stack(list(x))
        batch_actions = actions if torch.is_tensor(actions) else torch.stack(list(actions))

        # one forward pass for the whole minibatch instead of a Python loop per sample
        per_player = self._with_player_ids(batch_obs)
        probs = torch.distributions.categorical.Categorical(logits=self.actor(per_player))

        log_probs = probs.log_prob(batch_actions).sum(dim=-1)
        entropies = probs.entropy().sum(dim=-1)
        values = self.critic(per_player).mean(dim=(-2, -1))

        return actions, log_probs, entropies, values

    def convert_actions_to_approvals(self, actions):
        """Map sampled class indices to approval values: 2 becomes -1, others are kept."""
        return [-1 if a == 2 else a.item() for a in actions]

    def get_action_and_value(self, x, action=None):
        """Sample (or evaluate) one action per player for a single observation.

        Args:
            x: flat observation, shape ``(obs_size,)``.
            action: optional tensor of per-player actions to evaluate; a fresh
                action is sampled when omitted.

        Returns:
            ``(action, log_prob, entropy, value)``: the per-player action
            tensor, the summed log-probability, the summed entropy and the
            critic value averaged over players.
        """
        # option to have critic/actors for every single player?
        # get logits for every single player in the game.
        per_player = self._with_player_ids(x)
        logits = self.actor(per_player)
        probs = torch.distributions.categorical.Categorical(logits=logits)

        if action is None:
            action = probs.sample()

        # independent per-player decisions: log-probs and entropies both add
        # TODO: multiple values for critic. should I average?
        return action, torch.sum(probs.log_prob(action)), torch.sum(probs.entropy()), torch.mean(self.critic(per_player))

    def _layer_init(self, layer, std=np.sqrt(2), bias_const=0.0):
        """Orthogonal weight init with gain ``std`` and constant bias; returns the layer."""
        torch.nn.init.orthogonal_(layer.weight, std)
        torch.nn.init.constant_(layer.bias, bias_const)
        return layer

In [18]:
def collect_and_report_stats(env, information, ignore_wolf=True, mlflow_uri=None):
    """Aggregate per-player voting statistics from one step's ``information``.

    Args:
        env: game environment; ``env.agent_roles[player]`` is compared against
            ``pare_Role.WEREWOLF`` to identify wolves.
        information: dict mapping player name to a dict with the keys
            "self_vote", "dead_vote" and "viable_vote".
        ignore_wolf: when True, werewolves are left out of every statistic;
            when False, every player is counted. (Previously ``False``
            filtered out *everyone*, so all totals were 0.)
        mlflow_uri: unused; kept for interface compatibility.

    Returns:
        dict of metrics. Totals and averages are computed over the counted
        players only; averages are 0 when no player is counted (previously
        they were divided by ``len(information)``, which included excluded
        wolves and raised ZeroDivisionError on empty input).
    """
    counted = [
        vals for player, vals in information.items()
        if not ignore_wolf or env.agent_roles[player] != pare_Role.WEREWOLF
    ]

    # number of counted players with a truthy "self_vote" entry
    total_self_votes = len([vals for vals in counted if vals["self_vote"]])
    total_dead_votes = sum(vals["dead_vote"] for vals in counted)
    total_viable_votes = sum(vals["viable_vote"] for vals in counted)

    num_counted = len(counted)
    denominator = num_counted if num_counted else 1  # avoid dividing by zero

    return {
        "total_self_votes": total_self_votes,
        "total_dead_votes": total_dead_votes,
        "total_viable_votes": total_viable_votes,
        "avg_self_votes": total_self_votes / denominator,
        "avg_dead_votes": total_dead_votes / denominator,
        "avg_viable_votes": total_viable_votes / denominator,
        # NOTE(review): despite its name this is the number of counted players,
        # not the number with a non-zero viable vote — kept as-is; confirm intent.
        "players_with_viable_votes": num_counted,
    }

In [20]:
# Send MLflow runs started from here on to the tracking server at this address.
# NOTE(review): this is set after the training cell above has already called
# mlflow.start_run, so when cells run in order the training run is not covered
# by this setting — consider moving it to the top of the notebook.
mlflow.set_tracking_uri("http://mlflow:5000")

def play_static_wolf_game_w_agents(env, wolf_policy, trained_villager, num_times=2) -> int:
    """Play ``num_times`` games of scripted wolves vs. a trained villager model.

    Every environment step's voting statistics are logged to an MLflow run
    named 'Gameplay stats'. Villagers only act outside the NIGHT phase.

    Args:
        env: game environment (``reset``, ``step``, ``agents``, ``world_state``,
            ``convert_obs``).
        wolf_policy: callable ``wolf_policy(env) -> dict`` of wolf actions.
        trained_villager: model exposing ``get_action_and_value(obs)`` and
            ``convert_actions_to_approvals(action)``.
        num_times: number of games to play.

    Returns:
        Number of games won by the villagers. (The original returned nothing
        and carried an invalid ``tuple(pare_Role)`` annotation.)
    """
    villager_wins = 0
    loop = tqdm(range(num_times))

    # Pure inference: no autograd graph is needed for any forward pass here.
    with mlflow.start_run(run_name='Gameplay stats'), torch.no_grad():
        for _ in loop:
            observations, rewards, terminations, truncations, infos = env.reset()
            while env.agents:
                actions = {}

                # villager steps: they have no move during the night
                if env.world_state["phase"] != pare_Phase.NIGHT:
                    villagers = set(env.agents) & set(env.world_state["villagers"])
                    for villager in villagers:
                        obs = torch.Tensor(env.convert_obs(observations[villager]['observation']))
                        action, _, _, _ = trained_villager.get_action_and_value(obs)
                        actions[villager] = trained_villager.convert_actions_to_approvals(action)

                # wolf steps
                actions = actions | wolf_policy(env)

                observations, rewards, terminations, truncations, infos = env.step(actions)
                mlflow.log_metrics(collect_and_report_stats(env, infos))

            winner = env.world_state['winners']
            if winner == pare_Role.VILLAGER:
                villager_wins += 1

            loop.set_description(f"Villagers won {villager_wins} out of a total of {num_times} games")

    return villager_wins

# Fresh 10-agent / 2-werewolf environment for evaluating the trained villagers.
env = pare(num_agents=10, werewolves=2)
env.reset()

# trained_agent = torch.load("long_approval_agent")
# Loads the whole module object that the training cell wrote with torch.save.
# NOTE(review): torch.load unpickles arbitrary objects, so only load files you
# trust; newer PyTorch releases may need weights_only=False to load a full
# module — confirm against the installed version.
test_agent = torch.load("long_pare_bigger_model_added_night")
play_static_wolf_game_w_agents(env, random_coordinated_wolf, test_agent, num_times=10)

Villagers won 63 out of a total of 1000 games:  89%|████████▉ | 891/1000 [03:45<00:27,  3.96it/s]


KeyboardInterrupt: 