In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from multi_env import make_reversi_vec_env, SelfPlayEnv
import torch as th
from players import RandomPlayer
from stable_baselines3 import PPO
from stable_baselines3.common.policies import ActorCriticPolicy
import numpy as np

In [29]:
board_shape = 8
n_envs = 10
env = make_reversi_vec_env(
    SelfPlayEnv, n_envs=n_envs,
    env_kwargs={
        'board_shape': board_shape,
        'LocalPlayer': RandomPlayer
    }
)

# Modificación de librería para que haga argmax solo sobre las válidas

In [4]:
model = PPO(
    ActorCriticPolicy,
    env,
    verbose=0,
)

In [5]:
model.predict(env.reset())

(array([45,  9, 32, 52, 38,  6, 57, 12, 12, 57], dtype=int64), None)

# Custom ActorCriticPolicy 

https://github.com/DLR-RM/stable-baselines3/blob/master/stable_baselines3/common/policies.py

In [6]:
from boardgame2 import ReversiEnv

In [7]:
env_not_vect = ReversiEnv(board_shape)

In [8]:
def get_actions_mask(state):
    player = 1
    valid_actions = env_not_vect.get_valid((state, player))
    return valid_actions.reshape(-1)  


In [9]:
get_actions_mask(env.reset()[0][0])

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
      dtype=int8)

In [43]:
class CustomActorCriticPolicy(ActorCriticPolicy):
    def __init__(
        self,
        *args, # Todos los argumentos posicionales de ActorCriticPolicy
        actions_mask_func=None, # El nuevo argumento
        **kwargs # Todos los argumentos opcionales de ActorCriticPolicy
    ):
        super(CustomActorCriticPolicy, self).__init__(
            *args,
            **kwargs
        )
        if actions_mask_func:
            self.get_actions_mask = actions_mask_func
    
    
    
    def sample_masked_actions(self, obs, distribution, deterministic=False, return_distribution=False):
        # Dada las obs y distribuciones luego de evaluar la red neuronal, samplear solo las acciones válidas
        # Las obs se usan para que con self.get_actions_mask se obtengan las acciones válidas
        # las distribuciones son el resultado de evaluar la red neuronal y van a dar acciones no validas
        # Generar una nueva distribución (del lado de los logits preferentemente) donde las acciones no válidas
        # tengan probabildad nula de ser muestreadas
        # Luego se modifican abajo los métodos
        # _predict, forward y evaluate_actions
        # Si tiene el flag de return_distribution en true devuelve la distribución nueva
        # Caso contrario devuelve las acciones
        # Para tener en cuenta, obs tiene dimensión [batch_size, channels, H, W]
        # Recomendamos poner un print(obs.shape)
        # y correr:
        # obs = env.reset()
        # actions, _ = model.predict(obs)
        # Para sacarse las dudas
        def get_mask(obs):
            masks = np.zeros((len(obs), obs.shape[-1] * obs.shape[-2]))
            for i, board in enumerate(obs):
                board = board[0].cpu().numpy()
                masks[i] = 1 - self.get_actions_mask(board)
            return th.from_numpy(masks).to(self.device)
        masks = get_mask(obs)
        masks[masks == 1] = -1e6
        masked_logits = distribution.logits + masks
        if return_distribution:
            return th.distributions.Categorical(logits=masked_logits)
        if deterministic:
            return th.argmax(masked_logits, axis=1)
        return th.distributions.Categorical(logits=masked_logits).sample()
    
    def _predict(self, observation, deterministic=False):
        """
        Get the action according to the policy for a given observation.
        :param observation:
        :param deterministic: Whether to use stochastic or deterministic actions
        :return: Taken action according to the policy
        """
        latent_pi, _, latent_sde = self._get_latent(observation)
        distribution = self._get_action_dist_from_latent(latent_pi, latent_sde)
        
        if self.get_actions_mask:
            actions = self.sample_masked_actions(observation, distribution.distribution, deterministic=deterministic)
        else:
            actions = distribution.get_actions(deterministic=deterministic)
        
        return actions
    
    def forward(self, obs: th.Tensor, deterministic: bool = False):
        """
        Forward pass in all the networks (actor and critic)
        :param obs: Observation
        :param deterministic: Whether to sample or use deterministic actions
        :return: action, value and log probability of the action
        """
        latent_pi, latent_vf, latent_sde = self._get_latent(obs)
        # Evaluate the values for the given observations
        values = self.value_net(latent_vf)
        distribution = self._get_action_dist_from_latent(latent_pi, latent_sde=latent_sde)
        
        
        if self.get_actions_mask:
            actions = self.sample_masked_actions(obs, distribution.distribution, deterministic=deterministic)
        else:
            actions = distribution.get_actions(deterministic=deterministic)

        log_prob = distribution.log_prob(actions)
        return actions, values, log_prob
    
    def evaluate_actions(self, obs: th.Tensor, actions: th.Tensor):
        """
        Evaluate actions according to the current policy,
        given the observations.
        :param obs:
        :param actions:
        :return: estimated value, log likelihood of taking those actions
            and entropy of the action distribution.
        """
        latent_pi, latent_vf, latent_sde = self._get_latent(obs)
        distribution = self._get_action_dist_from_latent(latent_pi, latent_sde)
        distrib = self.sample_masked_actions(obs, distribution.distribution, return_distribution=True)

        log_prob = distrib.log_prob(actions)
        values = self.value_net(latent_vf)
        return values, log_prob, distrib.entropy()

In [44]:
model = PPO(
    CustomActorCriticPolicy,
    env,
    verbose=0,
    policy_kwargs = {'actions_mask_func': get_actions_mask}
)


In [45]:
# Testeo de predict
model.policy.get_actions_mask(env.reset()[0][0])

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
      dtype=int8)

In [30]:
obs = env.reset()
actions, _ = model.predict(obs)

In [47]:
# Verificar que las acciones son válidas
actions

array([19, 29, 26, 29, 20, 42, 42, 19, 26, 20], dtype=int64)

In [48]:
# Testeo de forward
model.policy(th.from_numpy(obs).to(model.device))

(tensor([21, 29, 26, 34, 20, 26, 42, 37, 44, 43]),
 tensor([[ 0.2870],
         [ 0.1920],
         [-0.8251],
         [ 0.1920],
         [ 0.1920],
         [-0.5964],
         [-0.8251],
         [ 0.2870],
         [-0.8251],
         [ 0.1920]], grad_fn=<AddmmBackward>),
 tensor([-4.1566, -4.1581, -4.1583, -4.1579, -4.1617, -4.1621, -4.1679, -4.1609,
         -4.1599, -4.1637], grad_fn=<SqueezeBackward1>))

# Corremos PPO

In [49]:
board_shape = 8
n_envs = 6
gamma = 0.99
ent_coef = 0.0
gae_lambda = 0.95
n_epochs = 10

In [50]:
prefix = 'Reversi_PPO'
suffix = 'masked_actions'
model_name = f'{prefix}_{board_shape}by{board_shape}_{gamma}_{gae_lambda}_{ent_coef}_{n_epochs}_{n_envs}_{suffix}'
best_model_save_path = f'./models/{model_name}'
print(model_name)
print(best_model_save_path)

Reversi_PPO_8by8_0.99_0.95_0.0_10_6_masked_actions
./models/Reversi_PPO_8by8_0.99_0.95_0.0_10_6_masked_actions


In [51]:
model = PPO(
    CustomActorCriticPolicy,
    env,
    verbose=0,
    tensorboard_log='tensorboard_log',
    gamma=gamma,
    gae_lambda=gae_lambda,
    ent_coef=ent_coef,
    n_epochs=n_epochs,
    policy_kwargs = {'actions_mask_func': get_actions_mask}
)

In [52]:
from stable_baselines3.common.callbacks import EvalCallback

In [53]:
# El entorno de evaluación no corre en paralelo por eso uno solo
eval_env = make_reversi_vec_env(
    SelfPlayEnv, n_envs=1,
    env_kwargs={
        'board_shape': board_shape,
        'LocalPlayer': RandomPlayer
    }
)

In [54]:
eval_callback = EvalCallback(
    eval_env = eval_env,
    eval_freq=1_000,
    n_eval_episodes=500,
    deterministic=True,
    verbose=1,
    best_model_save_path=best_model_save_path,
) 

In [55]:
model.learn(total_timesteps=int(1e10), callback=[eval_callback])

Eval num_timesteps=10000, episode_reward=0.09 +/- 0.97
Episode length: 30.01 +/- 0.63
New best mean reward!
Eval num_timesteps=20000, episode_reward=0.03 +/- 0.98
Episode length: 29.93 +/- 1.06
Eval num_timesteps=30000, episode_reward=0.19 +/- 0.96
Episode length: 29.97 +/- 0.58
New best mean reward!
Eval num_timesteps=40000, episode_reward=0.20 +/- 0.96
Episode length: 30.04 +/- 0.56
New best mean reward!
Eval num_timesteps=50000, episode_reward=0.43 +/- 0.88
Episode length: 30.07 +/- 1.08
New best mean reward!
Eval num_timesteps=60000, episode_reward=0.39 +/- 0.90
Episode length: 30.09 +/- 0.56
Eval num_timesteps=70000, episode_reward=0.50 +/- 0.85
Episode length: 29.94 +/- 1.27
New best mean reward!
Eval num_timesteps=80000, episode_reward=0.48 +/- 0.86
Episode length: 29.99 +/- 0.60
Eval num_timesteps=90000, episode_reward=0.58 +/- 0.80
Episode length: 30.07 +/- 0.58
New best mean reward!
Eval num_timesteps=100000, episode_reward=0.54 +/- 0.81
Episode length: 30.00 +/- 0.85
Eval nu

Eval num_timesteps=940000, episode_reward=-0.26 +/- 0.95
Episode length: 29.91 +/- 1.33
Eval num_timesteps=950000, episode_reward=-0.08 +/- 0.98
Episode length: 29.91 +/- 1.23
Eval num_timesteps=960000, episode_reward=-0.13 +/- 0.97
Episode length: 30.01 +/- 0.97
Eval num_timesteps=970000, episode_reward=0.02 +/- 0.98
Episode length: 29.99 +/- 0.59
Eval num_timesteps=980000, episode_reward=-0.04 +/- 0.97
Episode length: 29.96 +/- 0.58
Eval num_timesteps=990000, episode_reward=-0.32 +/- 0.93
Episode length: 29.83 +/- 0.86
Eval num_timesteps=1000000, episode_reward=-0.27 +/- 0.94
Episode length: 29.78 +/- 1.21
Eval num_timesteps=1010000, episode_reward=0.17 +/- 0.96
Episode length: 30.02 +/- 0.62
Eval num_timesteps=1020000, episode_reward=0.12 +/- 0.97
Episode length: 29.91 +/- 1.30
Eval num_timesteps=1030000, episode_reward=-0.08 +/- 0.97
Episode length: 29.93 +/- 0.59
Eval num_timesteps=1040000, episode_reward=-0.13 +/- 0.98
Episode length: 29.97 +/- 0.60
Eval num_timesteps=1050000, ep

KeyboardInterrupt: 

# Se hacen unas pequeñas modificaciones en el Forward del CustomActorCritic y se vuelve a correr

In [10]:
class CustomActorCriticPolicy(ActorCriticPolicy):
    def __init__(
        self,
        *args, # Todos los argumentos posicionales de ActorCriticPolicy
        actions_mask_func=None, # El nuevo argumento
        **kwargs # Todos los argumentos opcionales de ActorCriticPolicy
    ):
        super(CustomActorCriticPolicy, self).__init__(
            *args,
            **kwargs
        )
        if actions_mask_func:
            self.get_actions_mask = actions_mask_func
    
    
    
    def sample_masked_actions(self, obs, distribution, deterministic=False, return_distribution=False):
        # Dada las obs y distribuciones luego de evaluar la red neuronal, samplear solo las acciones válidas
        # Las obs se usan para que con self.get_actions_mask se obtengan las acciones válidas
        # las distribuciones son el resultado de evaluar la red neuronal y van a dar acciones no validas
        # Generar una nueva distribución (del lado de los logits preferentemente) donde las acciones no válidas
        # tengan probabildad nula de ser muestreadas
        # Luego se modifican abajo los métodos
        # _predict, forward y evaluate_actions
        # Si tiene el flag de return_distribution en true devuelve la distribución nueva
        # Caso contrario devuelve las acciones
        # Para tener en cuenta, obs tiene dimensión [batch_size, channels, H, W]
        # Recomendamos poner un print(obs.shape)
        # y correr:
        # obs = env.reset()
        # actions, _ = model.predict(obs)
        # Para sacarse las dudas
        def get_mask(obs):
            masks = np.zeros((len(obs), obs.shape[-1] * obs.shape[-2]))
            for i, board in enumerate(obs):
                board = board[0].cpu().numpy()
                masks[i] = 1 - self.get_actions_mask(board)
            return th.from_numpy(masks).to(self.device)
        masks = get_mask(obs)
        masks[masks == 1] = -1e6
        masked_logits = distribution.logits + masks
        if return_distribution:
            return th.distributions.Categorical(logits=masked_logits)
        if deterministic:
            return th.argmax(masked_logits, axis=1)
        return th.distributions.Categorical(logits=masked_logits).sample()
    
    def _predict(self, observation, deterministic=False):
        """
        Get the action according to the policy for a given observation.
        :param observation:
        :param deterministic: Whether to use stochastic or deterministic actions
        :return: Taken action according to the policy
        """
        latent_pi, _, latent_sde = self._get_latent(observation)
        distribution = self._get_action_dist_from_latent(latent_pi, latent_sde)
        
        if self.get_actions_mask:
            actions = self.sample_masked_actions(observation, distribution.distribution, deterministic=deterministic)
        else:
            actions = distribution.get_actions(deterministic=deterministic)
        
        return actions
    
    def forward(self, obs: th.Tensor, deterministic: bool = False):
        """
        Forward pass in all the networks (actor and critic)
        :param obs: Observation
        :param deterministic: Whether to sample or use deterministic actions
        :return: action, value and log probability of the action
        """
        latent_pi, latent_vf, latent_sde = self._get_latent(obs)
        # Evaluate the values for the given observations
        values = self.value_net(latent_vf)
        distribution = self._get_action_dist_from_latent(latent_pi, latent_sde=latent_sde)
        #agrego el distrib
        distrib = self.sample_masked_actions(obs, distribution.distribution, return_distribution=True)
        
        if self.get_actions_mask:
            actions = self.sample_masked_actions(obs, distribution.distribution, deterministic=deterministic)
        else:
            actions = distribution.get_actions(deterministic=deterministic)
        
        #reemplazo el log_pro con el distrib
        #log_prob = distribution.log_prob(actions)
        log_prob = distrib.log_prob(actions)
        return actions, values, log_prob
    
    def evaluate_actions(self, obs: th.Tensor, actions: th.Tensor):
        """
        Evaluate actions according to the current policy,
        given the observations.
        :param obs:
        :param actions:
        :return: estimated value, log likelihood of taking those actions
            and entropy of the action distribution.
        """
        latent_pi, latent_vf, latent_sde = self._get_latent(obs)
        distribution = self._get_action_dist_from_latent(latent_pi, latent_sde)
        distrib = self.sample_masked_actions(obs, distribution.distribution, return_distribution=True)

        log_prob = distrib.log_prob(actions)
        values = self.value_net(latent_vf)
        return values, log_prob, distrib.entropy()

In [11]:
model = PPO(
    CustomActorCriticPolicy,
    env,
    verbose=0,
    policy_kwargs = {'actions_mask_func': get_actions_mask}
)


In [12]:
# Testeo de predict
model.policy.get_actions_mask(env.reset()[0][0])

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
      dtype=int8)

In [13]:
obs = env.reset()
actions, _ = model.predict(obs)

In [14]:
# Verificar que las acciones son válidas
actions

array([19, 20, 29, 20, 19, 42, 26, 19, 19, 29], dtype=int64)

In [15]:
# Testeo de forward
model.policy(th.from_numpy(obs).to(model.device))

(tensor([37, 20, 43, 34, 19, 42, 44, 21, 21, 29]),
 tensor([[ 0.8869],
         [-0.2728],
         [-0.2728],
         [-0.2728],
         [ 0.8869],
         [ 0.5980],
         [ 0.5980],
         [ 0.7883],
         [ 0.8869],
         [-0.2728]], grad_fn=<AddmmBackward>),
 tensor([-1.0977, -1.3857, -1.3854, -1.3888, -1.0981, -1.0995, -1.0994, -1.0982,
         -1.1001, -1.3852], dtype=torch.float64, grad_fn=<SqueezeBackward1>))

Corremos PPO

In [16]:
board_shape = 8
n_envs = 6
gamma = 0.99
ent_coef = 0.0
gae_lambda = 0.95
n_epochs = 10

In [17]:
prefix = 'Reversi_PPO'
suffix = 'masked_actions_distrib'
model_name = f'{prefix}_{board_shape}by{board_shape}_{gamma}_{gae_lambda}_{ent_coef}_{n_epochs}_{n_envs}_{suffix}'
best_model_save_path = f'./models/{model_name}'
print(model_name)
print(best_model_save_path)

Reversi_PPO_8by8_0.99_0.95_0.0_10_6_masked_actions_distrib
./models/Reversi_PPO_8by8_0.99_0.95_0.0_10_6_masked_actions_distrib


In [18]:
model = PPO(
    CustomActorCriticPolicy,
    env,
    verbose=0,
    tensorboard_log='tensorboard_log',
    gamma=gamma,
    gae_lambda=gae_lambda,
    ent_coef=ent_coef,
    n_epochs=n_epochs,
    policy_kwargs = {'actions_mask_func': get_actions_mask}
)

In [19]:
from stable_baselines3.common.callbacks import EvalCallback

In [20]:
# El entorno de evaluación no corre en paralelo por eso uno solo
eval_env = make_reversi_vec_env(
    SelfPlayEnv, n_envs=1,
    env_kwargs={
        'board_shape': board_shape,
        'LocalPlayer': RandomPlayer
    }
)

In [21]:
eval_callback = EvalCallback(
    eval_env = eval_env,
    eval_freq=1_000,
    n_eval_episodes=500,
    deterministic=True,
    verbose=1,
    best_model_save_path=best_model_save_path,
) 

In [22]:
model.learn(total_timesteps=int(1e10), callback=[eval_callback])

Eval num_timesteps=10000, episode_reward=-0.07 +/- 0.96
Episode length: 29.83 +/- 2.01
New best mean reward!
Eval num_timesteps=20000, episode_reward=-0.15 +/- 0.96
Episode length: 29.89 +/- 1.67
Eval num_timesteps=30000, episode_reward=0.34 +/- 0.91
Episode length: 30.15 +/- 0.62
New best mean reward!
Eval num_timesteps=40000, episode_reward=0.33 +/- 0.92
Episode length: 30.09 +/- 0.64
Eval num_timesteps=50000, episode_reward=0.39 +/- 0.90
Episode length: 30.06 +/- 0.55
New best mean reward!
Eval num_timesteps=60000, episode_reward=0.40 +/- 0.89
Episode length: 30.08 +/- 0.56
New best mean reward!
Eval num_timesteps=70000, episode_reward=0.51 +/- 0.84
Episode length: 30.06 +/- 1.13
New best mean reward!
Eval num_timesteps=80000, episode_reward=0.54 +/- 0.83
Episode length: 30.11 +/- 0.54
New best mean reward!
Eval num_timesteps=90000, episode_reward=0.57 +/- 0.80
Episode length: 30.10 +/- 0.55
New best mean reward!
Eval num_timesteps=100000, episode_reward=0.55 +/- 0.82
Episode length

Eval num_timesteps=900000, episode_reward=0.89 +/- 0.44
Episode length: 30.12 +/- 0.56
Eval num_timesteps=910000, episode_reward=0.84 +/- 0.52
Episode length: 30.12 +/- 1.19
Eval num_timesteps=920000, episode_reward=0.84 +/- 0.52
Episode length: 30.13 +/- 0.65
Eval num_timesteps=930000, episode_reward=0.90 +/- 0.43
Episode length: 30.16 +/- 0.60
New best mean reward!
Eval num_timesteps=940000, episode_reward=0.85 +/- 0.51
Episode length: 30.15 +/- 0.63
Eval num_timesteps=950000, episode_reward=0.91 +/- 0.41
Episode length: 30.09 +/- 1.21
New best mean reward!
Eval num_timesteps=960000, episode_reward=0.85 +/- 0.51
Episode length: 30.09 +/- 1.21
Eval num_timesteps=970000, episode_reward=0.88 +/- 0.44
Episode length: 30.12 +/- 1.21
Eval num_timesteps=980000, episode_reward=0.87 +/- 0.47
Episode length: 30.14 +/- 0.56
Eval num_timesteps=990000, episode_reward=0.88 +/- 0.47
Episode length: 30.13 +/- 0.58
Eval num_timesteps=1000000, episode_reward=0.90 +/- 0.42
Episode length: 30.03 +/- 1.8

KeyboardInterrupt: 

In [23]:
# Load the TensorBoard notebook extension
%reload_ext tensorboard

In [28]:
%tensorboard --logdir 'tensorboard_log' --host localhost --port 8088
