In [1]:
import pandas as pd
# Read the model table, then keep only the rows whose architecture is 'rnn'.
df = pd.read_csv('./ELO_tournament/nocom_models.csv')
df2 = df.loc[df['arch'].eq('rnn')]


In [3]:
import argparse
import os
import random
import time
from distutils.util import strtobool
import copy

import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions.categorical import Categorical
from torch.utils.tensorboard import SummaryWriter
from environment.briscola_communication.actions import BriscolaCommsAction


In [4]:
def layer_init(layer, std=np.sqrt(2), bias_const=0.0):
    """Initialise ``layer`` in place and hand the same object back.

    Args:
        layer: module exposing ``weight`` and ``bias`` tensors.
        std: gain applied to the orthogonal weight initialisation.
        bias_const: value written to every bias entry.

    Returns:
        The ``layer`` that was passed in, now initialised.
    """
    # Filling the bias draws no random numbers, so doing it first leaves the
    # RNG stream seen by the orthogonal initialisation untouched.
    nn.init.constant_(layer.bias, val=bias_const)
    nn.init.orthogonal_(layer.weight, gain=std)
    return layer


class CategoricalMasked(Categorical):
    """Categorical distribution with optional invalid-action masking.

    When ``masks`` is non-empty, logits at positions where the mask is False
    are replaced by ``-1e8`` before the distribution is built, and those
    positions are left out of the entropy sum. With an empty ``masks`` the
    class behaves exactly like ``Categorical``.
    """

    def __init__(self, probs=None, logits=None, validate_args=None, masks=[]):
        self.masks = masks
        if len(self.masks) != 0:
            # Boolean copy of the mask, placed on the same device as the logits.
            self.masks = masks.type(torch.BoolTensor).to(logits.device)
            masked_out = torch.tensor(-1e8).to(logits.device)
            logits = torch.where(self.masks, logits, masked_out)
        super().__init__(probs, logits, validate_args)

    def entropy(self):
        """Entropy of the distribution, summed over unmasked entries only."""
        if len(self.masks) == 0:
            return super().entropy()
        zero = torch.tensor(0.0).to(self.masks.device)
        weighted = torch.where(self.masks, self.logits * self.probs, zero)
        return -weighted.sum(-1)


class Agent(nn.Module):
    """Recurrent actor-critic network.

    Each observation is split along its last axis: the first
    ``env.current_round_shape[-1]`` entries are used as-is, the remainder is
    fed through an LSTM. The concatenation of both parts feeds two separate
    MLP heads: ``actor`` (one logit per action) and ``critic`` (one scalar).

    Args:
        env: object exposing ``previous_round_shape``, ``current_round_shape``
            and ``num_actions``.
        rnn_out_size: LSTM hidden size (any integer-like value).
        hidden_dim: width of the hidden layers in both heads (integer-like).
    """

    def __init__(self, env, rnn_out_size, hidden_dim):
        super().__init__()
        # Sizes often arrive as NumPy integers (``np.prod`` returns one, and
        # values read from a DataFrame are ``np.int64``). Newer PyTorch
        # releases reject a non-``int`` ``hidden_size`` in ``nn.LSTM``, so
        # normalise everything to plain Python ints up front.
        rnn_out_size = int(rnn_out_size)
        hidden_dim = int(hidden_dim)
        previous_round_size = int(np.prod(env.previous_round_shape))
        current_round_size = int(np.prod(env.current_round_shape))

        self.lstm = nn.LSTM(previous_round_size, rnn_out_size)
        for name, param in self.lstm.named_parameters():
            if "bias" in name:
                nn.init.constant_(param, 0)
            elif "weight" in name:
                nn.init.orthogonal_(param, 1.0)

        self.actor = nn.Sequential(
            layer_init(nn.Linear(current_round_size + rnn_out_size, hidden_dim)),
            nn.Tanh(),
            layer_init(nn.Linear(hidden_dim, hidden_dim)),
            nn.Tanh(),
            # Small gain on the last layer keeps the initial logits close to zero.
            layer_init(
                nn.Linear(hidden_dim, env.num_actions), std=0.01)
        )

        self.critic = nn.Sequential(
            layer_init(nn.Linear(current_round_size + rnn_out_size, hidden_dim)),
            nn.Tanh(),
            layer_init(nn.Linear(hidden_dim, hidden_dim)),
            nn.Tanh(),
            layer_init(nn.Linear(hidden_dim, 1), std=1)
        )

        # Index on the last observation axis where the LSTM input starts.
        self.offset_round = env.current_round_shape[-1]

    def get_states(self, x, lstm_state, done):
        """Run the LSTM over the history part of ``x`` and build the head input.

        Args:
            x: 3-D observation tensor. Presumably (batch, 1, features): the
                middle axis is squeezed away below -- TODO confirm with callers.
            lstm_state: ``(h, c)`` tuple; axis 1 of ``h`` gives the number of
                parallel sequences.
            done: termination flags, reshaped to (steps, sequences). Where a
                flag is 1 the LSTM state is zeroed before that step.

        Returns:
            ``(features, lstm_state)``: the concatenated current-round features
            and LSTM outputs, and the LSTM state after the last step.
        """
        x_features_round = x[:, :, :self.offset_round]  # B, S, F
        x_previous_round = x[:, :,  self.offset_round:]    # B, S, P

        # LSTM logic
        batch_size = lstm_state[0].shape[1]
        x_previous_round = x_previous_round.reshape(
            (-1, batch_size, self.lstm.input_size))
        done = done.reshape((-1, batch_size))

        out_lstm = []
        # Step through the leading axis one slice at a time so the state can be
        # reset wherever the matching ``done`` flag is set.
        for xr, d in zip(x_previous_round, done):
            o, lstm_state = self.lstm(
                xr.unsqueeze(0),  ((1.0 - d).view(1, -1, 1) * lstm_state[0], (1.0 - d).view(1, -1, 1) * lstm_state[1]))
            out_lstm += [o]

        out_lstm = torch.flatten(torch.cat(out_lstm), 0, 1)
        new_hidden = torch.cat([x_features_round.squeeze(1),
                                out_lstm], dim=1)
        return new_hidden, lstm_state

    def get_value(self, x, lstm_state, done):
        """Return the critic output for ``x``; the updated LSTM state is discarded."""
        hidden, _ = self.get_states(x, lstm_state, done)
        return self.critic(hidden)

    def get_action_and_value(self, x, action_mask, lstm_state, done, action=None,  deterministic=False):
        """Choose (or score) an action and evaluate the state.

        Args:
            x, lstm_state, done: forwarded to ``get_states``.
            action_mask: boolean tensor, True where an action is allowed;
                singleton dimensions are squeezed out before use.
            action: if given, it is scored instead of a new action being chosen.
            deterministic: when no ``action`` is given, take the arg-max over
                the allowed logits instead of sampling.

        Returns:
            ``(action, log_prob, entropy, value, lstm_state)``.
        """
        hidden, lstm_state = self.get_states(x, lstm_state, done)
        action_mask = action_mask.squeeze()
        logits = self.actor(hidden)
        probs = CategoricalMasked(logits=logits, masks=action_mask)
        if action is None and not deterministic:
            action = probs.sample()
        if action is None and deterministic:
            # With a single row, squeeze() also removed the batch axis; restore
            # it so the mask lines up with the 2-D logits.
            if len(action_mask.shape) == 1:
                action_mask = action_mask.unsqueeze(0)
            # Out-of-place masking: ``logits`` itself is left untouched.
            action = logits.masked_fill(~action_mask, -torch.inf).argmax(dim=-1)
        return action, probs.log_prob(action), probs.entropy(), self.critic(hidden), lstm_state


In [5]:
# --- Evaluation settings read as globals by the cells below ---

# Seeding: evaluate_elo seeds sub-environment i with seed + num_envs + i.
seed = 42
num_envs = 16

# Size of each evaluation: number of parallel games and steps taken in each.
num_test_games = 1_000
num_steps = 8

# The communication variant of the game is switched off.
briscola_communicate = False


In [7]:
from environment.briscola_base.briscola_rnn import BriscolaEnv


def make_env(seed, rnn_out_size, role_training, briscola_agents, verbose=False, deterministic_eval=False):
    """Return a zero-argument factory that builds one seeded Briscola environment.

    The factory form is what ``gym.vector.SyncVectorEnv`` expects.

    Args:
        seed: value passed to the environment's ``seed`` method.
        rnn_out_size: forwarded to ``BriscolaEnv``.
        role_training: forwarded to ``BriscolaEnv`` as ``role``.
        briscola_agents: forwarded to ``BriscolaEnv`` as ``agents``.
        verbose: when True the environment uses ``render_mode='terminal_env'``.
        deterministic_eval: forwarded to ``BriscolaEnv``.

    Returns:
        A callable that creates the environment wrapped in
        ``RecordEpisodeStatistics``.
    """
    def thunk():
        # Only the non-communication environment is built here. The
        # communication variant (which additionally takes
        # ``communication_say_truth``) is not wired up in this notebook.
        env = BriscolaEnv(1, rnn_out_size, normalize_reward=False, render_mode='terminal_env' if verbose else None,
                          role=role_training,  agents=briscola_agents, deterministic_eval=deterministic_eval, device='cpu')
        # Seed the base environment directly, before wrapping it. Calling
        # ``seed`` through the wrapper relies on Gymnasium's deprecated
        # attribute forwarding (it emits a warning and is removed in v1.0).
        env.seed(seed)
        env = gym.wrappers.RecordEpisodeStatistics(env)
        return env

    return thunk


# Throwaway environment instance; evaluate_elo hands it to Agent(...) when rebuilding networks.
dummy_env = make_env(
    seed=42,
    rnn_out_size=0,
    role_training='caller',
    briscola_agents={},
)()


In [24]:
def _load_eval_agent(path, rnn_out_size, hidden_dim):
    """Rebuild an ``Agent`` from a checkpoint file and switch it to eval mode."""
    agent = Agent(dummy_env, rnn_out_size, hidden_dim)
    # The whole evaluation runs on CPU, so map tensors there explicitly;
    # otherwise checkpoints saved on a GPU cannot be loaded on a CPU-only host.
    # NOTE: torch.load unpickles the file -- only load trusted checkpoints.
    checkpoint = torch.load(path, map_location='cpu')
    agent.load_state_dict(checkpoint['model_state_dict'])
    agent.eval()
    return agent


def _play_games(player, role, agents, env_rnn_out_size):
    """Play ``num_test_games`` parallel games and return the mean last-step reward.

    ``player`` acts for ``role`` from outside the environment, while ``agents``
    (a role -> model mapping) is handed to the environment itself. Every game
    is advanced ``num_steps`` steps with deterministic action selection.
    """
    env = gym.vector.SyncVectorEnv(
        [make_env(seed + num_envs + i, env_rnn_out_size, role, agents, deterministic_eval=True)
         for i in range(num_test_games)]
    )
    data, _ = env.reset()
    next_obs = torch.tensor(data['observation'], dtype=torch.float)
    next_mask = torch.tensor(data['action_mask'], dtype=torch.bool)
    # Zero-initialised hidden and cell states, one column per game.
    state_shape = (player.lstm.num_layers, num_test_games, player.lstm.hidden_size)
    next_lstm_state = (torch.zeros(state_shape), torch.zeros(state_shape))
    next_done = torch.zeros(num_test_games)

    for _ in range(num_steps):
        with torch.no_grad():
            action, _, _, _, next_lstm_state = player.get_action_and_value(
                next_obs, next_mask, next_lstm_state, next_done, deterministic=True)

        data, reward, done, _, _ = env.step(action.cpu().numpy())
        next_obs = torch.tensor(data['observation'], dtype=torch.float)
        next_mask = torch.tensor(data['action_mask'], dtype=torch.bool)
        next_done = torch.tensor(done, dtype=torch.float)

    # Only the reward returned by the final step is averaged.
    return reward.mean()


def evaluate_elo(modelA_path, modelA_rnn_out_size, modelA_hidden_dim, modelB_path, modelB_rnn_out_size, modelB_hidden_dim):
    """Pit two checkpoints against each other in both team assignments.

    Game 1: model B plays 'caller' and 'callee'; model A plays 'good_1',
    'good_2' and 'good_3'.
    Game 2: model A plays 'caller' and 'callee'; model B plays 'good_1',
    'good_2' and 'good_3'.
    In both games model B is the externally driven player and the environments
    are built with ``modelB_rnn_out_size``.

    Args:
        modelA_path, modelB_path: checkpoint files containing a
            ``'model_state_dict'`` entry.
        modelA_rnn_out_size, modelB_rnn_out_size: LSTM sizes of the two models.
        modelA_hidden_dim, modelB_hidden_dim: hidden widths of the two models.

    Returns:
        ``(reward_bad_A, reward_good_A, reward_bad_B, reward_good_B)``, where
        the B values are measured mean rewards and the A values are their
        complements to ``total_points``.
    """
    # Presumably the total number of points available in one game, so the
    # opposing side's score is the complement -- confirm against the environment.
    total_points = 120.0

    modelA = _load_eval_agent(modelA_path, modelA_rnn_out_size, modelA_hidden_dim)
    modelB = _load_eval_agent(modelB_path, modelB_rnn_out_size, modelB_hidden_dim)

    # Game 1: model B as caller (driven here) and callee (inside the env).
    reward_bad_B = _play_games(
        modelB, 'caller',
        {'callee': modelB,  'good_1': modelA,
         'good_2': modelA, 'good_3': modelA},
        modelB_rnn_out_size)
    reward_good_A = total_points - reward_bad_B

    # Game 2: model B as good_1 (driven here), good_2 and good_3 (inside the env).
    reward_good_B = _play_games(
        modelB, 'good_1',
        {'callee': modelA,  'caller': modelA,
         'good_2': modelB, 'good_3': modelB},
        modelB_rnn_out_size)
    reward_bad_A = total_points - reward_good_B

    return reward_bad_A, reward_good_A, reward_bad_B, reward_good_B


In [35]:
# Pick two checkpoints from the RNN table by position and evaluate them against each other.
modelA_row = df2.iloc[20]
modelB_row = df2.iloc[21]

modelA_path = f"./ELO_tournament/nocom/{modelA_row['file_name']}"
modelA_rnn_out_size = modelA_row['rnn_out_size']
modelA_hidden_dim = modelA_row['hidden_dim']

modelB_path = f"./ELO_tournament/nocom/{modelB_row['file_name']}"
modelB_rnn_out_size = modelB_row['rnn_out_size']
modelB_hidden_dim = modelB_row['hidden_dim']

evaluate_elo(
    modelA_path, modelA_rnn_out_size, modelA_hidden_dim,
    modelB_path, modelB_rnn_out_size, modelB_hidden_dim,
)

  gym.logger.warn(


(84.402, 31.189999999999998, 88.81, 35.598)

In [36]:
# Show the filtered model table (rows with arch == 'rnn') for reference.
df2

Unnamed: 0.1,Unnamed: 0,name,id,arch,comm,hidden_dim,rnn_out_size,step,last,file_name
23,0,P2_PPO_universal_RNN_h=128_rnn=128__1__1685970630,95o6x1dd,rnn,nocom,128,128,12800000,False,95o6x1dd_12800000.pt
24,0,P2_PPO_universal_RNN_h=128_rnn=128__1__1685970630,95o6x1dd,rnn,nocom,128,128,25600000,False,95o6x1dd_25600000.pt
25,0,P2_PPO_universal_RNN_h=128_rnn=128__1__1685970630,95o6x1dd,rnn,nocom,128,128,38400000,False,95o6x1dd_38400000.pt
26,0,P2_PPO_universal_RNN_h=128_rnn=128__1__1685970630,95o6x1dd,rnn,nocom,128,128,51200000,False,95o6x1dd_51200000.pt
27,0,P2_PPO_universal_RNN_h=128_rnn=64__1__1685970621,jgfgmbkg,rnn,nocom,128,64,12800000,False,jgfgmbkg_12800000.pt
...,...,...,...,...,...,...,...,...,...,...
92,0,T_PPO_universal_RNN_h=64_rnn=64__1__1685745066,p7d4raf4,rnn,nocom,64,64,38400000,False,p7d4raf4_38400000.pt
93,0,T_PPO_universal_RNN_h=64_rnn=64__1__1685745066,p7d4raf4,rnn,nocom,64,64,51200000,False,p7d4raf4_51200000.pt
94,0,T_PPO_universal_RNN_h=64_rnn=64__1__1685745066,p7d4raf4,rnn,nocom,64,64,64000000,False,p7d4raf4_64000000.pt
95,0,T_PPO_universal_RNN_h=64_rnn=64__1__1685745066,p7d4raf4,rnn,nocom,64,64,76800000,False,p7d4raf4_76800000.pt
