In [2]:
import numpy as np
import torch
import sys
import copy
sys.path.append('../')
from voting_games.werewolf_env_v0 import pare, pare_Role, pare_Phase
import random
from learning_agents.approval_agents import ApprovalRecurrentAgent
from tqdm import tqdm
import mlflow

## Approval Voting Scenario

In approval voting, an agent has to present a positive opinion of agents they trust, a negative opinion (synonymous to a kill vote) of those they dont, and 0 for a neutral opinion.

### Setup for static wolf strategies

In [3]:
env = pare(num_agents=10, werewolves=2)
env.reset()

def random_wolf(env, agent, action=None):
    if action != None:
        return action

    villagers_remaining = set(env.world_state["villagers"]) & set(env.world_state['alive'])
    wolves_remaining = set(env.world_state["werewolves"]) & set(env.world_state['alive'])

    # pick a living target
    target = random.choice(list(villagers_remaining))

    action = [0] * len(env.possible_agents)
    action[int(target.split("_")[-1])] = -1
    for curr_wolf in wolves_remaining:
        action[int(curr_wolf.split("_")[-1])] = 1

    return action

def aggressive_wolf(env, agent, action=None):
    wolves_remaining = set(env.world_state["werewolves"]) & set(env.world_state['alive'])
    action = [-1] * len(env.possible_agents)
    for curr_wolf in wolves_remaining:
        action[int(curr_wolf.split("_")[-1])] = 1

    return action


def revenge_coordinated_wolf(env, actions = None):
    villagers_remaining = set(env.world_state["villagers"]) & set(env.world_state['alive'])
    wolves_remaining = set(env.world_state["werewolves"]) & set(env.world_state['alive'])

    # who tried to vote out a wolf last time?
    # TODO:
    return None
    # for wolf in env.werewolves_remaining:

def random_single_target_villager(env, agent):
    targets = set(env.world_state["alive"]) - set([agent])
    action = [0] * len(env.possible_agents)
    action[int(agent.split("_")[-1])] = 1
    action[int(random.choice(list(targets)).split("_")[-1])] = -1

    return action
    # for villager in env.villagers_remaining:

# random_coordinated_wolf(env)
def random_agent_action(env, agent, action=None):
   return env.action_space(agent).sample().tolist()

In [4]:
def play_static_wolf_game(env, wolf_policy, villager_agent, num_times=100) -> tuple(pare_Role):

    villager_wins = 0
    loop = tqdm(range(num_times))

    for _ in loop:
        observations, rewards, terminations, truncations, infos = env.reset()
        wolf_brain = {'day': 1, 'phase': 0, 'action': None}

        while env.agents:
            actions = {}

            villagers = set(env.agents) & set(env.world_state["villagers"])
            wolves = set(env.agents) & set(env.world_state["werewolves"])

            # villager steps
            for villager in villagers:
                actions[villager] = villager_agent(env, villager)

            day = observations[list(observations)[0]]['observation']['day']
            phase = observations[list(observations)[0]]['observation']['phase']

            if wolf_brain['day'] != day or wolf_brain['phase'] == pare_Phase.NIGHT:
                wolf_brain = {'day': day, 'phase': phase, 'action': None}

            # wolf steps
            for wolf in wolves:
                action = wolf_policy(env, wolf, action=wolf_brain['action'])
                actions[wolf] = action
                wolf_brain['action'] = action
        
            observations, rewards, terminations, truncations, infos = env.step(actions)

        winner = env.world_state['winners']
        if winner == pare_Role.VILLAGER:
            villager_wins += 1

        loop.set_description(f"Villagers won {villager_wins} out of a total of {num_times} games")

env = pare(num_agents=10, werewolves=2)
env.reset()

print("Random Coordinated Wolves")
print("\t vs. Single Target Random Villagers")
play_static_wolf_game(env, random_wolf, random_single_target_villager, num_times=1000)
print("\t vs. Random Villagers")
play_static_wolf_game(env, random_wolf, random_agent_action, num_times=1000)
print("------------------------------------\n")
print("Aggresive Wolves")
print("\t vs. Single Target Random Villagers")
play_static_wolf_game(env, aggressive_wolf, random_single_target_villager, num_times=1000)
print("\t vs. Random Villagers")
play_static_wolf_game(env, aggressive_wolf, random_agent_action, num_times=1000)
print("------------------------------------\n")
print("Random Wolves")
print("\t vs. Single Target Random Villagers")
play_static_wolf_game(env, random_agent_action, random_single_target_villager, num_times=1000)
print("\t vs. Random Villagers")
play_static_wolf_game(env, random_agent_action, random_agent_action, num_times=1000)
print("------------------------------------\n")

Random Coordinated Wolves
	 vs. Single Target Random Villagers


  0%|          | 0/1000 [00:00<?, ?it/s]


Exception: Villager should not have voted during the night

### Setup for training our agents

In [None]:
class RolloutBuffer():
    
    def __init__(self, buffer_size: int, gamma: float, gae_lambda: float):
        '''
            @bufffer_size: This is the number of trajectories
        '''
        
        self.rewards = None
        self.actions = None
        self.dones = None
        self.observations = None

        # do we want these for both actor and critic?
        self.hcxs = None 

        self.log_probs = None
        self.values = None
        self.advantages = None

        self.buffer_size = buffer_size
        self.gamma = gamma 
        self.gae_lambda = gae_lambda

        self.reset(gamma=gamma, gae_lambda=gae_lambda)

    def reset(self, gamma: float, gae_lambda: float):
        self.rewards = []
        self.actions = []
        self.dones = []
        self.observations = []

        # do we want these for both actor and critic?
        self.hcxs = []

        self.log_probs = []
        self.values = []
        self.advantages = []
        self.returns = []

        self.gamma = gamma 
        self.gae_lambda = gae_lambda

    def add_replay(self, game) -> bool:
         
         self.rewards.append(game['rewards'])
         self.actions.append(game['actions'])
         self.dones.append(game["terms"])
         self.observations.append(game["obs"])
         self.log_probs.append(game["logprobs"])
         self.values.append(game["values"])
         self.hcxs.append(game["hcxs"][:-1])
        
         advantages, returns = self._calculate_advantages(game)
             
         self.advantages.append(advantages)
         self.returns.append(returns)

         return True
    
    @torch.no_grad()
    def _calculate_advantages(self, game):
        """Generalized advantage estimation (GAE)
        """
        advantages = torch.zeros_like(torch.tensor(game['rewards']))

        for t in reversed(range(len(game['rewards']))):
             delta = game['rewards'][t] + self.gamma * game['values'][max((t+1)%len(game['rewards']),t)] - game['values'][t]
             advantages[t] = delta + self.gamma * self.gae_lambda * advantages[max((t+1)%len(game['rewards']),t)]

        # adv and returns
        return advantages, advantages + torch.tensor(game['values'])
    
    def get_minibatch_generator(self, batch_size):

        # fold and stack observations
        actions = torch.cat([item for sublist in self.actions for item in sublist])
        logprobs = torch.cat([item for sublist in self.log_probs for item in sublist])
        returns = torch.cat(self.returns)
        values = torch.cat([item for sublist in self.values for item in sublist])
        advantages = torch.cat(self.advantages).float()

        # TODO : Gotta update these to work with a single set of hxs, rxs
        hxs, cxs = zip(*[(hxs, cxs) for hxs, cxs in [item for sublist in self.hcxs for item in sublist]])
        observations = torch.cat([item for sublist in self.observations for item in sublist])

        index = np.arange(len(observations))

        np.random.shuffle(index)

        # We do not handle remaining stuff here
        for start in range(0,len(observations), batch_size):
            end = start + batch_size
            batch_index = index[start:end].astype(int)

            yield {
                "actions": actions[batch_index],
                "logprobs": logprobs[batch_index],
                "returns": returns[batch_index],
                "values": values[batch_index],
                "advantages": advantages[batch_index],
                # we are using sequence lengths of 1, because everything should be encoded in 
                "hxs": torch.swapaxes(torch.cat(hxs)[batch_index],0,1),
                "cxs": torch.swapaxes(torch.cat(cxs)[batch_index],0,1),
                "observations": observations[batch_index]
            } 

@torch.no_grad()
def fill_recurrent_buffer(buffer, env, config:dict, wolf_policy, villager_agent) -> RolloutBuffer:

    buffer.reset(gamma=config["training"]["gamma"], gae_lambda=config["training"]["gae_lambda"])
    
    for _ in range(config["training"]["buffer_games_per_update"]):
        ## Play the game 
        next_observations, rewards, terminations, truncations, infos = env.reset()
        # init recurrent stuff for actor and critic to 0 as well
        magent_obs = {agent: {'obs': [], 
                              'rewards': [], 
                              'actions': [], 
                              'logprobs': [], 
                              'values': [], 
                              'terms': [],

                              # obs size, and 1,1,64 as we pass batch first
                              'hcxs': [(torch.zeros((1,1,config["model"]["recurrent_hidden_size"]), dtype=torch.float32), 
                                        torch.zeros((1,1,config["model"]["recurrent_hidden_size"]), dtype=torch.float32))]
                    } for agent in env.agents if not env.agent_roles[agent]}
        
        wolf_brain = {'day': 1, 'phase': 0, 'action': None}
        while env.agents:
            observations = copy.deepcopy(next_observations)
            actions = {}

            villagers = set(env.agents) & set(env.world_state["villagers"])
            wolves = set(env.agents) & set(env.world_state["werewolves"])

            # villager steps
                # villagers actions
            for villager in villagers:
                #torch.tensor(env.convert_obs(observations['player_0']['observation']), dtype=torch.float)
                torch_obs = torch.tensor(env.convert_obs(observations[villager]['observation']), dtype=torch.float)
                obs = torch.unsqueeze(torch_obs, 0)

                # TODO: Testing this, we may need a better way to pass in villagers
                recurrent_cell = magent_obs[villager]["hcxs"][-1]
                
                # ensure that the obs is of size (batch,seq,inputs)
                # this needs to be updated
                policy, value, recurrent_cell = villager_agent(obs, recurrent_cell)
                action = policy.sample()
                
                actions[villager] = action.item()

                # can store some stuff 
                magent_obs[villager]["obs"].append(obs)
                magent_obs[villager]["actions"].append(action)

                # how do we get these
                magent_obs[villager]["logprobs"].append(policy.log_prob(action))
                magent_obs[villager]["values"].append(value)

                #store the next recurrent cells
                magent_obs[villager]["hcxs"].append(recurrent_cell)


            # wolf steps
            day = observations[list(observations)[0]]['observation']['day']
            phase = observations[list(observations)[0]]['observation']['phase']

            if wolf_brain['day'] != day or wolf_brain['phase'] == plurality_Phase.NIGHT:
                wolf_brain = {'day': day, 'phase': phase, 'action': None}

            for wolf in wolves:
                action = wolf_policy(env, wolf, action=wolf_brain['action'])
                wolf_brain['action'] = action
                actions[wolf] = action

            # actions = actions | wolf_policy(env)
        
            next_observations, rewards, terminations, truncations, infos = env.step(actions)

            for villager in villagers:
                magent_obs[villager]["rewards"].append(rewards[villager])
                magent_obs[villager]["terms"].append(terminations[villager])

        ## Fill bigger buffer, keeping in mind sequence
        for agent in magent_obs:
            buffer.add_replay(magent_obs[agent])
    
    return buffer

@torch.no_grad()
def play_recurrent_game(env, wolf_policy, villager_agent, num_times=10, hidden_state_size=None):
    
    wins = 0
    # loop = tqdm(range(num_times))
    for _ in range(num_times):
        ## Play the game 
        next_observations, rewards, terminations, truncations, infos = env.reset()
        # init recurrent stuff for actor and critic to 0 as well
        magent_obs = {agent: {'obs': [], 
                              # obs size, and 1,1,64 as we pass batch first
                              'hcxs': [(torch.zeros((1,1,hidden_state_size), dtype=torch.float32), torch.zeros((1,1,hidden_state_size), dtype=torch.float32))],
                    } for agent in env.agents if not env.agent_roles[agent]}
        

        wolf_brain = {'day': 1, 'phase': 0, 'action': None}

        while env.agents:
            observations = copy.deepcopy(next_observations)
            actions = {}

            villagers = set(env.agents) & set(env.world_state["villagers"])
            wolves = set(env.agents) & set(env.world_state["werewolves"])

            # villagers actions
            for villager in villagers:
                #torch.tensor(env.convert_obs(observations['player_0']['observation']), dtype=torch.float)
                torch_obs = torch.tensor(env.convert_obs(observations[villager]['observation']), dtype=torch.float)
                obs = torch.unsqueeze(torch_obs, 0)

                # TODO: Testing this, we may need a better way to pass in villagers
                recurrent_cell = magent_obs[villager]["hcxs"][-1]
                
                # ensure that the obs is of size (batch,seq,inputs)
                policy, value, recurrent_cell = villager_agent(obs, recurrent_cell)
                action = policy.sample()
                
                actions[villager] = action.item()

                #store the next recurrent cells
                magent_obs[villager]["hcxs"].append(recurrent_cell)

            # wolf steps
            day = observations[list(observations)[0]]['observation']['day']
            phase = observations[list(observations)[0]]['observation']['phase']
            
            if wolf_brain['day'] != day or wolf_brain['phase'] == pare_Phase.NIGHT:
                wolf_brain = {'day': day, 'phase': phase, 'action': None}
            
            for wolf in wolves:
                action = wolf_policy(env, wolf, action=wolf_brain['action'])
                wolf_brain['action'] = action
                actions[wolf] = action

            # actions = actions | wolf_policy(env)
        
            next_observations, rewards, terminations, truncations, infos = env.step(actions)

        ## Fill bigger buffer, keeping in mind sequence
        winner = env.world_state['winners']
        if winner == pare_Role.VILLAGER:
            wins += 1

        # loop.set_description(f"Villagers won {wins} out of a total of {num_times} games")
    
    return wins

def calc_minibatch_loss(agent: ApprovalRecurrentAgent, samples: dict, clip_range: float, beta: float, v_loss_coef: float, optimizer):

    # TODO:Consider checking for NAans anywhere. we cant have these. also do this in the model itself
    # if torch.isnan(tensor).any(): print(f"{label} contains NaN values")
    policies, values, _ = agent(samples['observations'], (samples['hxs'], samples['cxs']))
    
    # log_probs, entropies = [], []
    log_probs = policies.log_prob(samples['actions'])
    entropies = policies.entropy() # need to sum if we have more than 1 action
    
    ratio = torch.exp(log_probs - samples['logprobs'])

    # normalize advantages
    norm_advantage = (samples["advantages"] - samples["advantages"].mean()) / (samples["advantages"].std() + 1e-8)
    # normalized_advantage = normalized_advantage.unsqueeze(1).repeat(1, len(self.action_space_shape)) # Repeat is necessary for multi-discrete action spaces

    # policy loss w/ surrogates
    surr1 = norm_advantage * ratio
    surr2 = norm_advantage * torch.clamp(ratio, 1.0 - clip_range, 1.0 + clip_range)
    policy_loss = torch.min(surr1, surr2)
    policy_loss = policy_loss.mean()

    # Value  function loss
    clipped_values = samples["values"] + (values - samples["values"]).clamp(min=-clip_range, max=clip_range)
    vf_loss = torch.max((values - samples['returns']) ** 2, (clipped_values - samples["returns"]) ** 2)
    vf_loss = vf_loss.mean()

    # Entropy Bonus
    entropy_loss = entropies.mean()

    # Complete loss
    loss = -(policy_loss - v_loss_coef * vf_loss + beta * entropy_loss)


    # TODO : do i reset the LR here? do I want to?

    
    # Compute gradients
    optimizer.zero_grad()
    loss.backward()
    torch.nn.utils.clip_grad_norm_(agent.parameters(), max_norm=0.5)
    optimizer.step()

    
    return [policy_loss.cpu().data.numpy(),     # policy loss
            vf_loss.cpu().data.numpy(),         # value loss
            loss.cpu().data.numpy(),            # total loss
            entropy_loss.cpu().data.numpy()]    # entropy loss

In [None]:
config_training = {
    "model": {
        "recurrent_layers": 1,
        "recurrent_hidden_size": 128, # 256
        "mlp_size": 128, # 256
    },
    "training" : {
        "batch_size": 32, # 128
        "epochs": 3, # 6
        "updates": 10, # 1000
        "buffer_games_per_update": 10, # 200
        "clip_range": 0.2,
        "value_loss_coefficient": 0.1,
        "max_grad_norm": 0.5,
        "beta": 0.01, # entropy loss multiplier
        "learning_rate": 0.0001,
        "adam_eps": 1e-8,
        "gamma": 0.99,
        "gae_lambda": 0.95,
    }
}

config_game = {
    "rewards": {
        "day": -1,
        "player_death": -1,
        "player_win": 10,
        "player_loss": -5,
        "self_vote": -1,
        "dead_vote": -1,
        "dead_wolf": 5,
        "no_viable_vote": -1,
        "no_sleep": -1,
    },
    "gameplay": {
        "accusation_phases": 1,
        "num_agents": 10,
        "num_werewolves": 2,
    }
}

config = {
    "config_game": config_game,
    "config_training": config_training,
}

In [None]:
class PPOTrainer:
    def __init__(self, config:dict, run_id:str="run", device:torch.device=torch.device("cpu"), mlflow_uri:str=None) -> None:
        """Initializes all needed training components.
        Arguments:
            config {dict} -- Configuration and hyperparameters of the environment, trainer and model.
            run_id {str, optional} -- A tag used to save Tensorboard Summaries and the trained model. Defaults to "run".
            device {torch.device, optional} -- Determines the training device. Defaults to cpu.
        """
        # Set variables
        self.config = config
        self.device = device
        self.run_id = run_id
        self.mlflow_uri = mlflow_uri
        self.env = None

        # we are not using schedules yet
        # self.lr_schedule = config["learning_rate_schedule"]
        # self.beta_schedule = config["beta_schedule"]
        # self.cr_schedule = config["clip_range_schedule"]

        # Initialize Environment
        env = pare(num_agents=10, werewolves=2)
        self.env = env
        
        observations, rewards, terminations, truncations, infos = env.reset()
        obs_size= env.convert_obs(observations['player_0']['observation']).shape[-1]

        # Initialize Buffer
        self.buffer = RolloutBuffer(buffer_size=10, gamma=0.99, gae_lambda=0.95)

        # Initialize Model & Optimizer
        self.agent = ApprovalRecurrentAgent({"rec_hidden_size": self.config["config_training"]["model"]["recurrent_hidden_size"], 
                                                "rec_layers": self.config["config_training"]["model"]["recurrent_layers"], 
                                                "hidden_mlp_size": self.config["config_training"]["model"]["mlp_size"]},
                                                num_actions=self.env.action_space("player_0").n,
                                                obs_size=obs_size)
        self.optimizer = torch.optim.Adam(self.agent.parameters(), lr=0.0001, eps=1e-5)

        # setup mlflow run if we are using it

    def train(self, idx: int):
        if self.mlflow_uri:
            mlflow.set_tracking_uri(self.mlflow_uri)

        name = f'{self.run_id}_{idx}'
        with mlflow.start_run(run_name=name):
            
            mlflow.log_params(self.config["config_training"]["training"])
            mlflow.log_params(self.config["config_training"]["model"])

            loop = tqdm(range(self.config["config_training"]["training"]["updates"]))

            for tid, _ in enumerate(loop):
                # train 100 times
                if tid % 2 == 0:
                    # print(f'Playing games with our trained agent after {epid} epochs')
                    loop.set_description("Playing games and averaging score")
                    wins = []
                    for _ in range(10):
                        wins.append(play_recurrent_game(self.env, 
                                                        random_coordinated_single_wolf, 
                                                        self.agent, 
                                                        num_times=50,
                                                        hidden_state_size=self.config["config_training"]["model"]["recurrent_hidden_size"]))
                    
                    mlflow.log_metric("avg_wins/50", np.mean(wins))

                loop.set_description("Filling buffer")
                # fill buffer
                buff = fill_recurrent_buffer(self.buffer, 
                                             self.env,
                                             self.config["config_training"],
                                             random_coordinated_single_wolf, 
                                             self.agent)

                # train info will hold our metrics
                train_info = []
                loop.set_description("Epoch Training")
                for _ in range(self.config['config_training']["training"]['epochs']):
                    # run through batches and train network
                    for batch in buff.get_minibatch_generator(self.config['config_training']["training"]['batch_size']):
                        train_info.append(calc_minibatch_loss(self.agent, 
                                                              batch, 
                                                              clip_range=self.config['config_training']["training"]['clip_range'], 
                                                              beta=self.config['config_training']["training"]['beta'], 
                                                              v_loss_coef=self.config['config_training']["training"]['value_loss_coefficient'], 
                                                              optimizer=self.optimizer))

                train_stats = np.mean(train_info, axis=0)
                mlflow.log_metric("policy loss", train_stats[0])
                mlflow.log_metric("value loss", train_stats[1])
                mlflow.log_metric("total loss", train_stats[2])
                mlflow.log_metric("entropy loss", train_stats[3])
            # one more run

        # torch.save(self.agent, f"rnn_agent_{self.run_id}")


trainer = PPOTrainer(config=config,run_id="Approval agent training", mlflow_uri="http://mlflow:5000")
trainer.train(1)

## Old stuff below

In [2]:
env = pare(num_agents=10, werewolves=2)
env.reset()

def random_coordinated_wolf(env):
    actions = {}

    villagers_remaining = set(env.world_state["villagers"]) & set(env.world_state['alive'])
    wolves_remaining = set(env.world_state["werewolves"]) & set(env.world_state['alive'])

    target = random.choice(list(villagers_remaining))
    # pick 
    for wolf in wolves_remaining:
        actions[wolf] = [0] * len(env.possible_agents)
        actions[wolf][int(target.split("_")[-1])] = -1
        for curr_wolf in wolves_remaining:
            actions[wolf][int(curr_wolf.split("_")[-1])] = 1

    return actions

def aggressive_wolf(env):
    actions = {}

    wolves_remaining = set(env.world_state["werewolves"]) & set(env.world_state['alive'])

    for wolf in wolves_remaining:
        actions[wolf] = [-1] * len(env.possible_agents)
        for curr_wolf in wolves_remaining:
            actions[wolf][int(curr_wolf.split("_")[-1])] = 1

    return actions

def random_wolfs(env):
    return {wolf: env.action_space(wolf).sample().tolist() for
            wolf in set(env.world_state["werewolves"]) & set(env.world_state['alive'])}


def revenge_coordinated_wolf(env, actions = None):
    villagers_remaining = set(env.world_state["villagers"]) & set(env.world_state['alive'])
    wolves_remaining = set(env.world_state["werewolves"]) & set(env.world_state['alive'])

    # who tried to vote out a wolf last time?
    
    target = random.choice(list(villagers_remaining))
    # pick 
    for wolf in wolves_remaining:
        actions[wolf] = [0] * len(env.possible_agents)
        actions[wolf][int(target.split("_")[-1])] = -1
        for curr_wolf in wolves_remaining:
            actions[wolf][int(curr_wolf.split("_")[-1])] = 1
    # for wolf in env.werewolves_remaining:

def random_single_target_villager(env, agent):
    targets = set(env.world_state["alive"]) - set([agent])
    action = [0] * len(env.possible_agents)
    action[int(agent.split("_")[-1])] = 1
    action[int(random.choice(list(targets)).split("_")[-1])] = -1

    return action
    # for villager in env.villagers_remaining:

# random_coordinated_wolf(env)
def random_agent_action(env, agent):
   return env.action_space(agent).sample().tolist()

# def random_wolf(env, action=None):


In [3]:
def play_static_wolf_game(env, wolf_policy, villager_agent, num_times=100) -> tuple(pare_Role):

    villager_wins = 0
    loop = tqdm(range(num_times))

    for _ in loop:
        observations, rewards, terminations, truncations, infos = env.reset()
        while env.agents:
            actions = {}

            villagers = set(env.agents) & set(env.world_state["villagers"])
            wolves = set(env.agents) & set(env.world_state["werewolves"])

            # villager steps
            if env.world_state["phase"] != pare_Phase.NIGHT:
                # villagers actions
                for villager in villagers:
                    actions[villager] = villager_agent(env, villager)

            # wolf steps
            actions = actions | wolf_policy(env)
        
            observations, rewards, terminations, truncations, infos = env.step(actions)

        winner = env.world_state['winners']
        if winner == pare_Role.VILLAGER:
            villager_wins += 1

        loop.set_description(f"Villagers won {villager_wins} out of a total of {num_times} games")

env = pare(num_agents=10, werewolves=2)
env.reset()

print("Random Coordinated Wolves")
print("\t vs. Single Target Random Villagers")
play_static_wolf_game(env, random_coordinated_wolf, random_single_target_villager, num_times=1000)
print("\t vs. Random Villagers")
play_static_wolf_game(env, random_coordinated_wolf, random_agent_action, num_times=1000)
print("------------------------------------\n")
print("Aggresive Wolves")
print("\t vs. Single Target Random Villagers")
play_static_wolf_game(env, aggressive_wolf, random_single_target_villager, num_times=1000)
print("\t vs. Random Villagers")
play_static_wolf_game(env, aggressive_wolf, random_agent_action, num_times=1000)
print("------------------------------------\n")
print("Random Wolves")
print("\t vs. Single Target Random Villagers")
play_static_wolf_game(env, random_wolfs, random_single_target_villager, num_times=1000)
print("\t vs. Random Villagers")
play_static_wolf_game(env, random_wolfs, random_agent_action, num_times=1000)
print("------------------------------------\n")


Random Coordinated Wolves
	 vs. Single Target Random Villagers


Villagers won 117 out of a total of 1000 games: 100%|██████████| 1000/1000 [00:07<00:00, 136.09it/s]


	 vs. Random Villagers


Villagers won 69 out of a total of 1000 games: 100%|██████████| 1000/1000 [00:11<00:00, 89.20it/s]


------------------------------------

Aggresive Wolves
	 vs. Single Target Random Villagers


Villagers won 23 out of a total of 1000 games: 100%|██████████| 1000/1000 [00:06<00:00, 160.29it/s]


	 vs. Random Villagers


Villagers won 7 out of a total of 1000 games: 100%|██████████| 1000/1000 [00:10<00:00, 95.46it/s]


------------------------------------

Random Wolves
	 vs. Single Target Random Villagers


Villagers won 666 out of a total of 1000 games: 100%|██████████| 1000/1000 [00:08<00:00, 112.27it/s]


	 vs. Random Villagers


Villagers won 585 out of a total of 1000 games: 100%|██████████| 1000/1000 [00:12<00:00, 78.93it/s]


------------------------------------



## Training Agents

The agent structure will be much the same as the agent structure in

In [4]:
from learning_agents.approval_agents import SequentialAgent
import mlflow

In [5]:
ent_coef = 0.1 #
vf_coef = 0.1 #
clip_coef = 0.1 #
gamma = 0.99 #
gae_lambda = 0.95
batch_size = 32 #
max_cycles = 125 #
total_episodes = 10000 #
update_epochs = 3 #

# stats to keep track of for custom metrics
self_voting = []
dead_voting = []

avg_game_length = 0
wolf_wins = 0
villager_wins = 0

num_agents = 10
env = pare(num_agents=num_agents, werewolves=2)
env.reset()
observation_size = env.convert_obs(env.observation_spaces['player_1'].sample()['observation']).shape[-1]

# Learner Setup
training_agent = SequentialAgent(num_players=num_agents, approval_states= 3, obs_size=observation_size)
optimizer = torch.optim.Adam(training_agent.parameters(), lr=0.001, eps=1e-5)


with mlflow.start_run(run_name='Training our agents'):
    
    for episode in tqdm(range(total_episodes)):
        observations, rewards, terminations, truncations, infos = env.reset()
        magent_obs = {agent: {'obs': [], 'rewards': [], 'actions': [], 'logprobs': [], 'values': [], 'terms': []} for agent in env.agents if not env.agent_roles[agent]}

        with torch.no_grad():
            while env.agents:
                actions = {}
                villagers = set(env.agents) & set(env.world_state["villagers"])
                wolves = set(env.agents) & set(env.world_state["werewolves"])

                if env.world_state["phase"] != pare_Phase.NIGHT:
                    # villagers actions
                    for villager in villagers:
                        # only cares about observations
                        obs = torch.Tensor(env.convert_obs(observations[villager]['observation']))
                        ml_action, logprobs, _, value = training_agent.get_action_and_value(obs)
                        
                        actions[villager] = training_agent.convert_actions_to_approvals(ml_action)

                        # missing rewards, term
                        magent_obs[villager]["obs"].append(obs)
                        magent_obs[villager]["actions"].append(ml_action)
                        magent_obs[villager]["logprobs"].append(logprobs)
                        magent_obs[villager]["values"].append(value)

                # TODO : pass in a wolf policy possibly
                actions = actions | random_coordinated_wolf(env)
            
                observations, rewards, terminations, truncations, infos = env.step(actions)

                # lets add the post step information now
                # TODO: Should we not care about villagers losing here? What if they die at night, we want them to get a death reward
                # TODO: What if the game ends on a night kill? We want the villagers to still get their rewards
                #if env.history[-1]["phase"] != pare_Phase.NIGHT:
                for villager in villagers:
                    if env.history[-1]["phase"] == pare_Phase.NIGHT:
                        magent_obs[villager]["rewards"][-1] += rewards[villager]
                        magent_obs[villager]["terms"][-1] += terminations[villager]
                    else:
                        magent_obs[villager]["rewards"].append(rewards[villager])
                        magent_obs[villager]["terms"].append(terminations[villager])

            # POST GAME STATS #
            winner = env.world_state['winners']

            if winner == pare_Role.VILLAGER:
                villager_wins += 1
        
            # END OF POST GAME STATS #
            mlflow.log_metric("villager wins", villager_wins)
            if episode % 50 == 0:
                #wwins, vwins = play_and_return_stats(env, random_wolf_policy, ppo_agent, num_games=100)
                #mlflow.log_metric("wwins", wwins)
                #mlflow.log_metric("vwins", vwins)
                continue
        
        
        
        # We will do this for each agent in the episode
        # essentially we are calculating advantages and returns
            with torch.no_grad():
                for player, records in magent_obs.items():
                    # print(f'{records}')
                    advantages = torch.zeros_like(torch.tensor(records['rewards']))

                    for t in reversed(range(len(records['obs']))):
                        # print(f'T: {t+1} - Rewards : {torch.tensor(records["rewards"])[t+1]} ')
                        # not using terms, as these are episodic

                        ## this was the last one. We are not using any terminal states in a good way

                        if t == len(records['obs']) - 1:
                            #print(f'T: {t} - Rewards at end : {torch.tensor(records["rewards"])[t]} ')
                            #print(f'T: {t} - Actions at end : {torch.tensor(records["actions"])[t]} ')
                            delta = records["rewards"][t] - records["values"][t]
                            advantages[t]  = delta
                        else:
                            #print(f'T: {t} - Rewards : {torch.tensor(records["rewards"])[t]} ')
                            #print(f'T: {t} - Actions : {torch.tensor(records["actions"])[t]} ')                    
                            delta = records["rewards"][t] + gamma * records["values"][t+1] - records["values"][t]
                            advantages[t]  = delta + gamma * gamma * advantages[t+1]

                        #delta = records['rewards'][t] + gamma * records['values'][t+1] - records['values'][t]
                    magent_obs[player]["advantages"] = advantages
                    magent_obs[player]["returns"] = advantages + torch.tensor(records["values"])
                        #advantages[t] = delta + gamma * gamma * advantages[t+1]
        
            # new logic, maybe we do this after a couple of games, so we get more data overall?

            # optimize the policy and the value network now
            # we can take all our observations now and flatten them into one bigger list of individual transitions
            # TODO: could make this setting into a single loop, but maybe this is clearer. ALso could make all these tensors earlier
            b_observations = torch.cat([torch.stack(item['obs']) for item in magent_obs.values()])
            b_logprobs = torch.cat([torch.stack(item['logprobs']) for item in magent_obs.values()])
            b_actions = torch.cat([torch.stack(item['actions']) for item in magent_obs.values()])
            b_returns = torch.cat([item['returns'] for item in magent_obs.values()])
            b_values = torch.cat([torch.stack(item['values']) for item in magent_obs.values()])
            b_advantages =  torch.cat([item['advantages'] for item in magent_obs.values()])



            # b_index stands for batch index
            b_index = np.arange(len(b_observations))
            clip_fracs = []
            for epoch in range(update_epochs):
                np.random.shuffle(b_index)
                for start in range(0, len(b_observations), batch_size):
                    end = start + batch_size
                    batch_index = b_index[start:end]

                    # TODO: batched actions, How to handle batched observations and acctions properly in the agent
                    #       Maybe a different 

                    # newlogprob needs to return a list of logprobs
                    _, newlogprob, entropy, value = training_agent.get_batched_action_and_value(
                        b_observations[batch_index], b_actions[batch_index])
                    
                    logratio = newlogprob - b_logprobs[batch_index]
                    ratio = logratio.exp()

                    with torch.no_grad():
                        # calculate approx_kl http://joschu.net/blog/kl-approx.html
                        old_approx_kl = (-logratio).mean()
                        approx_kl = ((ratio - 1) - logratio).mean()
                        clip_fracs += [
                            ((ratio - 1.0).abs() > clip_coef).float().mean().item()
                        ]
                    
                    # normalizing advantages
                    advantages = b_advantages[batch_index]
                    advantages = advantages.float()
                    advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)

                    # policy loss
                    pg_loss1 = -advantages * ratio
                    pg_loss2 = -advantages * torch.clamp(ratio, 1 - clip_coef, 1 + clip_coef)
                    pg_loss = torch.max(pg_loss1, pg_loss2).mean()

                    # value loss
                    value = value.flatten()
                    v_loss_unclipped = (value - b_returns[batch_index]) ** 2
                    v_clipped = b_values[batch_index] + torch.clamp(
                        value - b_values[batch_index],
                        -clip_coef,
                        clip_coef,
                    )
                    v_loss_clipped = (v_clipped - b_returns[batch_index]) ** 2
                    v_loss_max = torch.max(v_loss_unclipped, v_loss_clipped)
                    v_loss = 0.5 * v_loss_max.mean()

                    entropy_loss = entropy.mean()
                    loss = pg_loss - ent_coef * entropy_loss + v_loss * vf_coef

                    optimizer.zero_grad()
                    # loss = torch.Variable(loss, requires_grad = True)
                    loss.requires_grad = True
                    loss.backward()
                    optimizer.step()
            
            # could move them from GPU here
            y_pred, y_true = b_values.numpy(), b_returns.numpy()
            var_y = np.var(y_true)
            explained_var = np.nan if var_y == 0 else 1 - np.var(y_true - y_pred) / var_y

        # At the end, print some stuff here for overall stats

        # print(f'Average game length = {avg_game_length:.2f}')
        # print(f'Wolf wins : {wolf_wins}')
        # print(f'Villager wins: {villager_wins}')
        # print(f'Avg amount of self votes a game across villagers: {sum(self_voting)/len(self_voting)}')

torch.save(training_agent, "pare_no_dead_wolf_reward")

100%|██████████| 10000/10000 [34:47<00:00,  4.79it/s]


In [None]:
class Agent(torch.nn.Module):
    def __init__(self, approval_states, num_players, obs_size):
        super().__init__()

        self.critic = torch.nn.Sequential(
            self._layer_init(torch.nn.Linear(obs_size+1, 256)),
            torch.nn.Tanh(),
            self._layer_init(torch.nn.Linear(256,256)),
            torch.nn.Tanh(),
            self._layer_init(torch.nn.Linear(256,1), std=1.0),
        )

        self.actor = torch.nn.Sequential(
            self._layer_init(torch.nn.Linear(obs_size+1, 256)),
            torch.nn.Tanh(),
            self._layer_init(torch.nn.Linear(256,256)),
            torch.nn.Tanh(),
            self._layer_init(torch.nn.Linear(256, approval_states), std=0.01),
        )

        self.num_players = num_players
    
    def get_value(self, x):
        # TODO: We need torch.mean because PPO will use value, and we have a bunch here. 
        #       Do we need to change PPO here?
        return torch.mean(self.critic(torch.stack([torch.cat((torch.tensor([i]), x)) for i in range(self.num_players)])))
    
    # only doing this for the PPO batched call so I don't need extra logic in the regular get action and value
    def get_batched_action_and_value(self, x, actions=None):

        if actions is None:
            raise ValueError("We need batched actions here")

        log_probs = []
        entropies = []
        critics = []
        for current_obs, action in zip(x, actions):
            updated_obs = torch.stack([torch.cat((torch.tensor([i]), current_obs)) for i in range(self.num_players)])

            logits = self.actor(updated_obs)
            probs = torch.distributions.categorical.Categorical(logits=logits)
            
            # update our return tensors
            log_probs.append(torch.sum(probs.log_prob(action)))
            entropies.append(torch.prod(probs.entropy()))
            critics.append(torch.mean(self.critic(updated_obs)))
            
        return actions, torch.stack(log_probs), torch.stack(entropies), torch.stack(critics)

    def convert_actions_to_approvals(self, actions):
        return [-1 if a == 2 else a.item() for a in actions]

    def get_action_and_value(self, x, action=None):
        # could call the network each time, with a different integer for each player?  get approvals that way
        # x is the flattened observation. we should go ahead and run each of the player_ids appended to full obs to get multiple classifications
        # how  to handle entropy here? maybe we multiply all the probs, and then calculate the overall entropy
        # self.critic needs to be changed too, to return an array

        # option to have critic/actors for every single player?

        # option to also delevt n-1 * n-2 for -1s on the wolf
        
        # get logits for every single player in the game.
        x = torch.stack([torch.cat((torch.tensor([i]), x)) for i in range(self.num_players)])
        logits = self.actor(x)
        probs = torch.distributions.categorical.Categorical(logits=logits)

        if action is None:
            action = probs.sample()
        
        # we multiply the entropy, and we add the log_probs together
        # TODO: multiple values for critic. should I average?
        return action, torch.sum(probs.log_prob(action)), torch.prod(probs.entropy()), torch.mean(self.critic(x))

    def _layer_init(self, layer, std=np.sqrt(2), bias_const=0.0):
        torch.nn.init.orthogonal_(layer.weight, std)
        torch.nn.init.constant_(layer.bias, bias_const)
        return layer

In [6]:
def collect_and_report_stats(env, information, ignore_wolf=True, mlflow_uri=None):
    total_self_votes = len([vals for player, vals in information.items() if vals["self_vote"] and (ignore_wolf and env.agent_roles[player] != pare_Role.WEREWOLF)])
    total_dead_votes = sum([vals["dead_vote"] for player, vals in information.items() if ignore_wolf and env.agent_roles[player] != pare_Role.WEREWOLF])
    total_viable_votes = sum([vals["viable_vote"] for player, vals in information.items() if ignore_wolf and env.agent_roles[player] != pare_Role.WEREWOLF])

    avg_self_votes = total_self_votes/len(information)
    avg_dead_votes = total_dead_votes/len(information)
    avg_viable_votes = total_viable_votes/len(information)

    return {
        "total_self_votes": total_self_votes,
        "total_dead_votes": total_dead_votes,
        "total_viable_votes": total_viable_votes,
        "avg_self_votes": avg_self_votes,
        "avg_dead_votes": avg_dead_votes,
        "avg_viable_votes": avg_viable_votes,
        "players_with_viable_votes": len([vals["viable_vote"] for player, vals in information.items() if ignore_wolf and env.agent_roles[player] != pare_Role.WEREWOLF])
    }

In [7]:
# mlflow.set_tracking_uri("http://mlflow:5000")

def play_static_wolf_game_w_agents(env, wolf_policy, trained_villager, num_times=2) -> tuple(pare_Role):

    villager_wins = 0
    loop = tqdm(range(num_times))

    with mlflow.start_run(run_name='Gameplay stats'):
        for _ in loop:
            observations, rewards, terminations, truncations, infos = env.reset()
            while env.agents:
                actions = {}

                villagers = set(env.agents) & set(env.world_state["villagers"])
                wolves = set(env.agents) & set(env.world_state["werewolves"])

                # villager steps
                if env.world_state["phase"] != pare_Phase.NIGHT:
                    # villagers actions
                    for villager in villagers:
                        obs = torch.Tensor(env.convert_obs(observations[villager]['observation']))
                        action, logprobs, _, value = trained_villager.get_action_and_value(obs)
                        actions[villager] = trained_villager.convert_actions_to_approvals(action)
                # wolf steps
                actions = actions | wolf_policy(env)
            
                observations, rewards, terminations, truncations, infos = env.step(actions)
                mlflow.log_metrics(collect_and_report_stats(env, infos))
            winner = env.world_state['winners']
            if winner == pare_Role.VILLAGER:
                villager_wins += 1

            loop.set_description(f"Villagers won {villager_wins} out of a total of {num_times} games")

env = pare(num_agents=10, werewolves=2)
env.reset()

# trained_agent = torch.load("long_approval_agent")
test_agent = torch.load("pare_no_dead_wolf_reward")
play_static_wolf_game_w_agents(env, random_coordinated_wolf, test_agent, num_times=1000)

Villagers won 2 out of a total of 10 games: 100%|██████████| 10/10 [00:01<00:00,  6.92it/s]
