In [1]:
import numpy as np
import torch
import sys
sys.path.append('../')
from voting_games.werewolf_env_v0 import pare, pare_Role, pare_Phase
import random
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


Done


In [2]:
# Create a 10-agent environment with 2 werewolves and reset it to its initial state.
env = pare(num_agents=10, werewolves=2)
env.reset()

def random_coordinated_wolf(env):
    """Give every living wolf the same ballot against one random living villager.

    One villager is drawn uniformly from those listed in both
    ``env.world_state["villagers"]`` and ``env.world_state["alive"]``. Each
    living wolf then gets a list with one entry per ``env.possible_agents``:
    0 everywhere, -1 at the chosen villager's index and 1 at the index of every
    living wolf. Indices are parsed from the trailing ``_<n>`` of agent names.

    Returns:
        dict mapping each living wolf's name to its ballot list.
    """
    def seat(agent_name):
        # "player_7" -> 7
        return int(agent_name.split("_")[-1])

    living_villagers = set(env.world_state["villagers"]) & set(env.world_state['alive'])
    living_wolves = set(env.world_state["werewolves"]) & set(env.world_state['alive'])

    # a single shared victim is what makes the pack "coordinated"
    victim = random.choice(list(living_villagers))

    ballots = {}
    for voter in living_wolves:
        ballot = [0 for _ in env.possible_agents]
        ballot[seat(victim)] = -1
        for packmate in living_wolves:
            ballot[seat(packmate)] = 1
        ballots[voter] = ballot

    return ballots

def aggressive_wolf(env):
    """Give every living wolf a ballot of -1 for all non-wolves and 1 for the pack.

    Each living wolf (listed in both ``env.world_state["werewolves"]`` and
    ``env.world_state["alive"]``) gets a list with one entry per
    ``env.possible_agents``: -1 everywhere except 1 at the index of every
    living wolf. Indices are parsed from the trailing ``_<n>`` of agent names.

    Returns:
        dict mapping each living wolf's name to its ballot list.
    """
    living_wolves = set(env.world_state["werewolves"]) & set(env.world_state['alive'])

    ballots = {}
    for voter in living_wolves:
        ballot = [-1 for _ in env.possible_agents]
        for packmate in living_wolves:
            ballot[int(packmate.split("_")[-1])] = 1
        ballots[voter] = ballot

    return ballots

def random_wolfs(env):
    """Sample an independent random action for every living wolf.

    For each wolf listed in both ``env.world_state["werewolves"]`` and
    ``env.world_state["alive"]``, draws ``env.action_space(wolf).sample()`` and
    converts it with ``.tolist()``.

    Returns:
        dict mapping each living wolf's name to its sampled action list.
    """
    living_wolves = set(env.world_state["werewolves"]) & set(env.world_state['alive'])

    sampled = {}
    for wolf in living_wolves:
        space = env.action_space(wolf)
        sampled[wolf] = space.sample().tolist()
    return sampled


def revenge_coordinated_wolf(env, actions = None):
    """Give every living wolf the same ballot against one living villager.

    The target is currently drawn uniformly at random from the living
    villagers; the intended "revenge" targeting is not implemented yet (see
    TODO below).

    Each living wolf gets a list with one entry per ``env.possible_agents``:
    0 everywhere, -1 at the target's index and 1 at the index of every living
    wolf. Indices are parsed from the trailing ``_<n>`` of agent names.

    Args:
        env: object exposing ``world_state`` (with "villagers", "werewolves"
            and "alive" entries) and ``possible_agents``.
        actions: optional dict to fill in. When given it is updated in place
            (existing entries for other agents are kept); when ``None`` a new
            dict is created.

    Returns:
        The ``actions`` dict containing one ballot per living wolf.

    Raises:
        IndexError: if no villager is alive (``random.choice`` on an empty list).
    """
    # The original crashed with the default actions=None and never returned
    # the ballots it built.
    if actions is None:
        actions = {}

    villagers_remaining = set(env.world_state["villagers"]) & set(env.world_state['alive'])
    wolves_remaining = set(env.world_state["werewolves"]) & set(env.world_state['alive'])

    # TODO: who tried to vote out a wolf last time? Target them instead of a
    # random villager.
    target = random.choice(list(villagers_remaining))
    target_index = int(target.split("_")[-1])

    for wolf in wolves_remaining:
        ballot = [0] * len(env.possible_agents)
        ballot[target_index] = -1
        for curr_wolf in wolves_remaining:
            ballot[int(curr_wolf.split("_")[-1])] = 1
        actions[wolf] = ballot

    return actions

def random_single_target_villager(env, agent):
    """Build a ballot where ``agent`` marks itself 1 and one random other living agent -1.

    The ballot has one entry per ``env.possible_agents`` and is 0 everywhere
    else. The target is drawn uniformly from ``env.world_state["alive"]``
    excluding ``agent``. Indices are parsed from the trailing ``_<n>`` of
    agent names.

    Returns:
        list of ints, the ballot for ``agent``.
    """
    def seat(agent_name):
        # "player_7" -> 7
        return int(agent_name.split("_")[-1])

    candidates = set(env.world_state["alive"]) - {agent}

    ballot = [0 for _ in env.possible_agents]
    ballot[seat(agent)] = 1

    victim = random.choice(list(candidates))
    ballot[seat(victim)] = -1

    return ballot

# random_coordinated_wolf(env)
def random_agent_action(env, agent):
    """Return one random sample from ``agent``'s action space, converted with ``.tolist()``."""
    space = env.action_space(agent)
    sampled = space.sample()
    return sampled.tolist()

# def random_wolf(env, action=None):


In [85]:
def play_static_wolf_game(env, wolf_policy, villager_agent, num_times=100) -> int:
    """Play ``num_times`` games with fixed policies and count villager wins.

    On every step the wolf policy supplies actions for the wolves. Villagers
    only act when ``env.world_state["phase"]`` is not ``pare_Phase.NIGHT``.
    If both sides provide an action for the same agent, the wolf policy's
    entry wins.

    Args:
        env: environment exposing ``reset()``, ``step(actions)``, ``agents``
            and ``world_state`` as used below.
        wolf_policy: callable ``wolf_policy(env) -> dict`` of wolf actions.
        villager_agent: callable ``villager_agent(env, agent_name) -> action``.
        num_times: number of games to play.

    Returns:
        Number of games in which ``env.world_state['winners']`` equals
        ``pare_Role.VILLAGER``. The running count is also shown in the tqdm
        description. (The original returned nothing despite its annotation.)
    """
    villager_wins = 0
    loop = tqdm(range(num_times))

    for _ in loop:
        env.reset()
        while env.agents:
            actions = {}

            # villager steps (villagers do not act at night)
            if env.world_state["phase"] != pare_Phase.NIGHT:
                living_villagers = set(env.agents) & set(env.world_state["villagers"])
                for villager in living_villagers:
                    actions[villager] = villager_agent(env, villager)

            # wolf steps; update() keeps the original `actions | wolf_policy(env)` precedence
            actions.update(wolf_policy(env))

            env.step(actions)

        if env.world_state['winners'] == pare_Role.VILLAGER:
            villager_wins += 1

        loop.set_description(f"Villagers won {villager_wins} out of a total of {num_times} games")

    return villager_wins

env = pare(num_agents=10, werewolves=2)
env.reset()

# Every wolf policy is evaluated against every villager policy, 1000 games each.
wolf_matchups = [
    ("Random Coordinated Wolves", random_coordinated_wolf),
    ("Aggresive Wolves", aggressive_wolf),
    ("Random Wolves", random_wolfs),
]
villager_matchups = [
    ("\t vs. Single Target Random Villagers", random_single_target_villager),
    ("\t vs. Random Villagers", random_agent_action),
]

for wolf_banner, wolf_policy in wolf_matchups:
    print(wolf_banner)
    for villager_banner, villager_policy in villager_matchups:
        print(villager_banner)
        play_static_wolf_game(env, wolf_policy, villager_policy, num_times=1000)
    print("------------------------------------\n")


Random Coordinated Wolves
	 vs. Single Target Random Villagers


Villagers won 136 out of a total of 1000 games: 100%|██████████| 1000/1000 [21:27<00:00,  1.29s/it] 


	 vs. Random Villagers


Villagers won 75 out of a total of 1000 games: 100%|██████████| 1000/1000 [00:11<00:00, 86.84it/s]


------------------------------------

Aggresive Wolves
	 vs. Single Target Random Villagers


Villagers won 20 out of a total of 1000 games: 100%|██████████| 1000/1000 [00:08<00:00, 121.51it/s]


	 vs. Random Villagers


Villagers won 7 out of a total of 1000 games: 100%|██████████| 1000/1000 [00:10<00:00, 94.28it/s]


------------------------------------

Random Wolves
	 vs. Single Target Random Villagers


Villagers won 655 out of a total of 1000 games: 100%|██████████| 1000/1000 [00:09<00:00, 102.46it/s]


	 vs. Random Villagers


Villagers won 591 out of a total of 1000 games: 100%|██████████| 1000/1000 [00:12<00:00, 82.59it/s]

------------------------------------






## Training Agents

The agent structure will be much the same as the agent structure in

In [4]:
from learning_agents.approval_agents import SequentialAgent
import mlflow

In [16]:
# ---- PPO hyperparameters ----
ent_coef = 0.1          # weight of the entropy bonus in the total loss
vf_coef = 0.1           # weight of the value loss in the total loss
clip_coef = 0.1         # clipping range for the probability ratio and the value update
gamma = 0.99            # discount factor
gae_lambda = 0.95       # GAE smoothing factor
batch_size = 32         # minibatch size for each optimisation step
max_cycles = 125        # NOTE(review): not used anywhere in this cell
total_episodes = 10000  # number of games to play / train on
update_epochs = 3       # passes over each episode's transitions

# stats to keep track of for custom metrics
self_voting = []
dead_voting = []

avg_game_length = 0
wolf_wins = 0
villager_wins = 0

num_agents = 10
env = pare(num_agents=num_agents, werewolves=2)
env.reset()
observation_size = env.convert_obs(env.observation_spaces['player_1'].sample()['observation']).shape[-1]

# Learner Setup
training_agent = SequentialAgent(num_players=num_agents, approval_states= 3, obs_size=observation_size)
optimizer = torch.optim.Adam(training_agent.parameters(), lr=0.001, eps=1e-5)


with mlflow.start_run(run_name='Training our agents'):

    for episode in tqdm(range(total_episodes)):
        observations, rewards, terminations, truncations, infos = env.reset()
        # One trajectory buffer per agent whose role is falsy
        # (presumably the villagers - TODO confirm against env.agent_roles).
        magent_obs = {agent: {'obs': [], 'rewards': [], 'actions': [], 'logprobs': [], 'values': [], 'terms': []} for agent in env.agents if not env.agent_roles[agent]}

        # ------------------------------------------------------------------
        # 1) Rollout. Only the rollout runs under no_grad. The original also
        #    nested the whole PPO update inside this block, so no graph was
        #    built and the optimizer never changed any parameter.
        # ------------------------------------------------------------------
        with torch.no_grad():
            while env.agents:
                actions = {}
                villagers = set(env.agents) & set(env.world_state["villagers"])
                wolves = set(env.agents) & set(env.world_state["werewolves"])

                if env.world_state["phase"] != pare_Phase.NIGHT:
                    # villagers actions
                    for villager in villagers:
                        # only cares about observations
                        obs = torch.Tensor(env.convert_obs(observations[villager]['observation']))
                        ml_action, logprobs, _, value = training_agent.get_action_and_value(obs)

                        actions[villager] = training_agent.convert_actions_to_approvals(ml_action)

                        # rewards / terms are appended after the env step below
                        magent_obs[villager]["obs"].append(obs)
                        magent_obs[villager]["actions"].append(ml_action)
                        magent_obs[villager]["logprobs"].append(logprobs)
                        magent_obs[villager]["values"].append(value)

                # TODO : pass in a wolf policy possibly
                actions = actions | random_coordinated_wolf(env)

                observations, rewards, terminations, truncations, infos = env.step(actions)

                # lets add the post step information now
                # TODO: Should we not care about villagers losing here? What if they die at night, we want them to get a death reward
                # TODO: What if the game ends on a night kill? We want the villagers to still get their rewards
                if env.history[-1]["phase"] != pare_Phase.NIGHT:
                    # `villagers` is the set computed before the step, i.e. the ones that acted
                    for villager in villagers:
                        magent_obs[villager]["rewards"].append(rewards[villager])
                        magent_obs[villager]["terms"].append(terminations[villager])

        # POST GAME STATS #
        winner = env.world_state['winners']

        if winner == pare_Role.VILLAGER:
            villager_wins += 1

        # END OF POST GAME STATS #
        mlflow.log_metric("villager wins", villager_wins, step=episode)

        # TODO: every 50 episodes, evaluate against a fixed wolf policy and log
        # wolf/villager win counts. The original placeholder here used
        # `continue`, which skipped the policy update on those episodes.

        # ------------------------------------------------------------------
        # 2) Advantages and returns per agent (GAE). No gradients needed.
        # ------------------------------------------------------------------
        with torch.no_grad():
            for player, records in magent_obs.items():
                # Each stored value is used as a scalar in the arithmetic below,
                # so flattening to shape (T,) is safe.
                values = torch.stack(records["values"]).reshape(-1).float()
                # Force float32: zeros_like() on integer rewards produced an integer
                # tensor, which truncated every advantage.
                reward_tensor = torch.tensor(records["rewards"], dtype=torch.float32)
                advantages = torch.zeros_like(reward_tensor)

                last_step = len(records['obs']) - 1
                for t in reversed(range(len(records['obs']))):
                    # not using terms, as these are episodic
                    if t == last_step:
                        # final transition of the episode: no bootstrap value
                        delta = reward_tensor[t] - values[t]
                        advantages[t] = delta
                    else:
                        delta = reward_tensor[t] + gamma * values[t + 1] - values[t]
                        # GAE recursion uses gamma * lambda (the original used gamma * gamma)
                        advantages[t] = delta + gamma * gae_lambda * advantages[t + 1]

                records["advantages"] = advantages
                records["returns"] = advantages + values

        # new logic, maybe we do this after a couple of games, so we get more data overall?

        # ------------------------------------------------------------------
        # 3) Flatten all agents' transitions into one batch.
        # TODO: could make this setting into a single loop, but maybe this is clearer. Also could make all these tensors earlier
        # ------------------------------------------------------------------
        b_observations = torch.cat([torch.stack(item['obs']) for item in magent_obs.values()])
        b_logprobs = torch.cat([torch.stack(item['logprobs']) for item in magent_obs.values()])
        b_actions = torch.cat([torch.stack(item['actions']) for item in magent_obs.values()])
        b_returns = torch.cat([item['returns'] for item in magent_obs.values()])
        # Flattened so it lines up with `value.flatten()` in the value loss; an
        # (N, 1) tensor would silently broadcast to (B, B) there.
        b_values = torch.cat([torch.stack(item['values']) for item in magent_obs.values()]).reshape(-1).float()
        b_advantages = torch.cat([item['advantages'] for item in magent_obs.values()])

        # ------------------------------------------------------------------
        # 4) PPO update, with gradients enabled.
        # ------------------------------------------------------------------
        # b_index stands for batch index
        b_index = np.arange(len(b_observations))
        clip_fracs = []
        for epoch in range(update_epochs):
            np.random.shuffle(b_index)
            for start in range(0, len(b_observations), batch_size):
                end = start + batch_size
                batch_index = b_index[start:end]

                # TODO: batched actions, How to handle batched observations and actions properly in the agent

                # newlogprob needs to return a list of logprobs
                _, newlogprob, entropy, value = training_agent.get_batched_action_and_value(
                    b_observations[batch_index], b_actions[batch_index])

                logratio = newlogprob - b_logprobs[batch_index]
                ratio = logratio.exp()

                with torch.no_grad():
                    # calculate approx_kl http://joschu.net/blog/kl-approx.html
                    old_approx_kl = (-logratio).mean()
                    approx_kl = ((ratio - 1) - logratio).mean()
                    clip_fracs += [
                        ((ratio - 1.0).abs() > clip_coef).float().mean().item()
                    ]

                # normalizing advantages; skipped for a single-sample minibatch,
                # where std() is NaN and would poison the update
                advantages = b_advantages[batch_index].float()
                if advantages.numel() > 1:
                    advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)

                # policy loss (clipped surrogate objective)
                pg_loss1 = -advantages * ratio
                pg_loss2 = -advantages * torch.clamp(ratio, 1 - clip_coef, 1 + clip_coef)
                pg_loss = torch.max(pg_loss1, pg_loss2).mean()

                # value loss (clipped)
                value = value.flatten()
                v_loss_unclipped = (value - b_returns[batch_index]) ** 2
                v_clipped = b_values[batch_index] + torch.clamp(
                    value - b_values[batch_index],
                    -clip_coef,
                    clip_coef,
                )
                v_loss_clipped = (v_clipped - b_returns[batch_index]) ** 2
                v_loss_max = torch.max(v_loss_unclipped, v_loss_clipped)
                v_loss = 0.5 * v_loss_max.mean()

                entropy_loss = entropy.mean()
                loss = pg_loss - ent_coef * entropy_loss + v_loss * vf_coef

                # No `loss.requires_grad = True` hack: outside no_grad the loss is
                # connected to the agent's parameters through the forward pass above.
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

        # could move them from GPU here
        y_pred, y_true = b_values.numpy(), b_returns.numpy()
        var_y = np.var(y_true)
        explained_var = np.nan if var_y == 0 else 1 - np.var(y_true - y_pred) / var_y

    # At the end, print some stuff here for overall stats

    # print(f'Average game length = {avg_game_length:.2f}')
    # print(f'Wolf wins : {wolf_wins}')
    # print(f'Villager wins: {villager_wins}')
    # print(f'Avg amount of self votes a game across villagers: {sum(self_voting)/len(self_voting)}')

torch.save(training_agent, "pare_training_agent")

  0%|          | 38/10000 [00:08<37:07,  4.47it/s]