In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
%load_ext tensorboard

In [3]:
from multi_env import make_reversi_vec_env, SelfPlayEnv
import torch as th
from players import RandomPlayer, DictPolicyPlayer, GreedyPlayer
from stable_baselines3 import PPO
from stable_baselines3.common.policies import ActorCriticPolicy
import numpy as np

In [4]:
board_shape = 8
n_envs = 10
env = make_reversi_vec_env(
    SelfPlayEnv, n_envs=n_envs,
    env_kwargs={
        'board_shape': board_shape,
        'LocalPlayer': RandomPlayer
    }
)

# Modificación de librería para que haga argmax solo sobre las válidas

In [5]:
model = PPO(
    ActorCriticPolicy,
    env,
    verbose=0,
)

In [27]:
model.predict(env.reset())

(array([19, 20, 44, 34, 29, 20, 37, 20, 21, 21]), None)

# Custom ActorCriticPolicy 

https://github.com/DLR-RM/stable-baselines3/blob/master/stable_baselines3/common/policies.py

In [7]:
from boardgame2 import ReversiEnv

In [8]:
env_not_vect = ReversiEnv(board_shape)

In [9]:
(state, player) = env_not_vect.reset()
print(state)
print(player)

[[ 0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0]
 [ 0  0  0  1 -1  0  0  0]
 [ 0  0  0 -1  1  0  0  0]
 [ 0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0]]
1


In [10]:
env_not_vect.get_valid((state, player))

array([[0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 1, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0]], dtype=int8)

In [11]:
def get_actions_mask(state):
    player = 1
    valid_actions = env_not_vect.get_valid((state, player))
    return valid_actions.reshape(-1)  


In [12]:
m = get_actions_mask(env.reset()[0][0])
print(m.reshape(board_shape, board_shape))

[[0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0]
 [0 0 0 0 1 0 0 0]
 [0 0 0 0 0 1 0 0]
 [0 0 1 0 0 0 0 0]
 [0 0 0 1 0 0 0 0]
 [0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0]]


In [13]:
class CustomActorCriticPolicy(ActorCriticPolicy):
    def __init__(
        self,
        *args, # Todos los argumentos posicionales de ActorCriticPolicy
        actions_mask_func=None, # El nuevo argumento
        **kwargs # Todos los argumentos opcionales de ActorCriticPolicy
    ):
        super(CustomActorCriticPolicy, self).__init__(
            *args,
            **kwargs
        )
        if actions_mask_func:
            self.get_actions_mask = actions_mask_func
    
    
    
    def sample_masked_actions(self, obs, distribution, deterministic=False, return_distribution=False):
        # Dada las obs y distribuciones luego de evaluar la red neuronal, samplear solo las acciones válidas
        # Las obs se usan para que con self.get_actions_mask se obtengan las acciones válidas
        # las distribuciones son el resultado de evaluar la red neuronal y van a dar acciones no validas
        # Generar una nueva distribución (del lado de los logits preferentemente) donde las acciones no válidas
        # tengan probabildad nula de ser muestreadas
        # Luego se modifican abajo los métodos
        # _predict, forward y evaluate_actions
        # Si tiene el flag de return_distribution en true devuelve la distribución nueva
        # Caso contrario devuelve las acciones
        # Para tener en cuenta, obs tiene dimensión [batch_size, channels, H, W]
        # Recomendamos poner un print(obs.shape)
        # y correr:
        # obs = env.reset()
        # actions, _ = model.predict(obs)
        # Para sacarse las dudas
        
        
        def get_mask(obs):
            masks = np.zeros((len(obs), obs.shape[-1] * obs.shape[-2]))
            for i, board in enumerate(obs):
                board =  board[0].cpu().numpy()
                masks[i] = 1 - self.get_actions_mask(board)
            return masks
 
        masks = th.as_tensor(get_mask(obs))
        masks[masks == 1] = -np.inf

        #print(distribution.logits.dtype, distribution.logits.shape)
        #print(masks.dtype, masks.shape)

        masked_logits = distribution.logits + masks
        #print(masked_logits)

        if return_distribution:
            return th.distributions.Categorical(logits=masked_logits, validate_args = True)
        if deterministic:
            return th.argmax(masked_logits, axis=1)
        return th.distributions.Categorical(logits=masked_logits).sample()
                
    
    def _predict(self, observation, deterministic=False):
        """
        Get the action according to the policy for a given observation.
        :param observation:
        :param deterministic: Whether to use stochastic or deterministic actions
        :return: Taken action according to the policy
        """
        latent_pi, _, latent_sde = self._get_latent(observation)
        distribution = self._get_action_dist_from_latent(latent_pi, latent_sde)
        
        if self.get_actions_mask:
            actions = self.sample_masked_actions(observation, distribution.distribution, deterministic=deterministic)
        else:
            actions = distribution.get_actions(deterministic=deterministic)
        
        return actions
    
    def forward(self, obs: th.Tensor, deterministic: bool = False):
        """
        Forward pass in all the networks (actor and critic)
        :param obs: Observation
        :param deterministic: Whether to sample or use deterministic actions
        :return: action, value and log probability of the action
        """
        latent_pi, latent_vf, latent_sde = self._get_latent(obs)
        # Evaluate the values for the given observations
        values = self.value_net(latent_vf)
        distribution = self._get_action_dist_from_latent(latent_pi, latent_sde=latent_sde)
        distrib = self.sample_masked_actions(obs, distribution.distribution, return_distribution=True)
        
        actions = self.sample_masked_actions(obs, distribution.distribution, deterministic=deterministic)

        log_prob = distrib.log_prob(actions)        
        
        #if self.get_actions_mask:
        #    actions = self.sample_masked_actions(obs, distribution.distribution, deterministic=deterministic)
        #else:
        #    actions = distribution.get_actions(deterministic=deterministic)
        #
        #log_prob = distribution.log_prob(actions)
        
        
        return actions, values, log_prob
    
    def evaluate_actions(self, obs: th.Tensor, actions: th.Tensor) -> [th.Tensor, th.Tensor, th.Tensor]:
        """
        Evaluate actions according to the current policy,
        given the observations.
        :param obs:
        :param actions:
        :return: estimated value, log likelihood of taking those actions
            and entropy of the action distribution.
        """
        latent_pi, latent_vf, latent_sde = self._get_latent(obs)
        distribution = self._get_action_dist_from_latent(latent_pi, latent_sde)
        distrib = self.sample_masked_actions(obs, distribution.distribution, return_distribution=True)

        log_prob = distrib.log_prob(actions)
        values = self.value_net(latent_vf)
        return values, log_prob, distrib.entropy()

In [14]:
model = PPO(
    CustomActorCriticPolicy,
    env,
    verbose=0,
    policy_kwargs = {'actions_mask_func': get_actions_mask}
)


In [15]:
model.policy

CustomActorCriticPolicy(
  (features_extractor): FlattenExtractor(
    (flatten): Flatten(start_dim=1, end_dim=-1)
  )
  (mlp_extractor): MlpExtractor(
    (shared_net): Sequential()
    (policy_net): Sequential(
      (0): Linear(in_features=64, out_features=64, bias=True)
      (1): Tanh()
      (2): Linear(in_features=64, out_features=64, bias=True)
      (3): Tanh()
    )
    (value_net): Sequential(
      (0): Linear(in_features=64, out_features=64, bias=True)
      (1): Tanh()
      (2): Linear(in_features=64, out_features=64, bias=True)
      (3): Tanh()
    )
  )
  (action_net): Linear(in_features=64, out_features=64, bias=True)
  (value_net): Linear(in_features=64, out_features=1, bias=True)
)

In [16]:
# Testeo de predict
mask = model.policy.get_actions_mask(env.reset()[0][0])
print(mask.reshape(board_shape, board_shape))

[[0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0]
 [0 0 0 0 1 0 0 0]
 [0 0 0 0 0 1 0 0]
 [0 0 1 0 0 0 0 0]
 [0 0 0 1 0 0 0 0]
 [0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0]]


In [17]:
obs = env.reset()
actions, _ = model.predict(obs)

In [18]:
# Verificar que las acciones son válidas
actions

array([29, 44, 34, 37, 43, 20, 29, 20, 26, 26])

In [19]:
# Testeo de forward
model.policy(th.from_numpy(obs).to(model.device))

(tensor([34, 42, 34, 37, 29, 29, 43, 20, 26, 42]),
 tensor([[ 0.0187],
         [ 0.0781],
         [ 0.0187],
         [-0.0073],
         [ 0.0187],
         [ 0.0187],
         [ 0.0187],
         [ 0.0187],
         [-0.1200],
         [-0.1200]], grad_fn=<AddmmBackward>),
 tensor([-1.3834, -1.0962, -1.3834, -1.1027, -1.3880, -1.3880, -1.3857, -1.3881,
         -1.0998, -1.0984], dtype=torch.float64, grad_fn=<SqueezeBackward1>))

# Corremos PPO

In [20]:
board_shape = 8
n_envs = 6
gamma = 0.99
ent_coef = 0.0
gae_lambda = 0.95
n_epochs = 10

In [21]:
prefix = 'Reversi_PPO'
suffix = 'masked_actions_2'
model_name = f'{prefix}_{board_shape}by{board_shape}_{gamma}_{gae_lambda}_{ent_coef}_{n_epochs}_{n_envs}_{suffix}'
best_model_save_path = f'./models/{model_name}'
print(model_name)
print(best_model_save_path)

Reversi_PPO_8by8_0.99_0.95_0.0_10_6_masked_actions_2
./models/Reversi_PPO_8by8_0.99_0.95_0.0_10_6_masked_actions_2


In [22]:
model = PPO(
    CustomActorCriticPolicy,
    env,
    verbose=0,
    tensorboard_log='tensorboard_log',
    gamma=gamma,
    gae_lambda=gae_lambda,
    ent_coef=ent_coef,
    n_epochs=n_epochs,
    policy_kwargs = {'actions_mask_func': get_actions_mask}
)

In [23]:
from stable_baselines3.common.callbacks import EvalCallback

In [24]:
# El entorno de evaluación no corre en paralelo por eso uno solo
eval_env = make_reversi_vec_env(
    SelfPlayEnv, n_envs=1,
    env_kwargs={
        'board_shape': board_shape,
        'LocalPlayer': RandomPlayer
    }
)

In [25]:
eval_callback = EvalCallback(
    eval_env = eval_env,
    eval_freq=1_000,
    n_eval_episodes=500,
    deterministic=True,
    verbose=1,
    best_model_save_path=best_model_save_path,
) 

In [None]:
model.learn(total_timesteps=int(1e10), callback=[eval_callback])

Eval num_timesteps=6640, episode_reward=0.71 +/- 0.70
Episode length: 30.06 +/- 0.60
Eval num_timesteps=16640, episode_reward=0.74 +/- 0.65
Episode length: 30.12 +/- 0.57
New best mean reward!
Eval num_timesteps=26640, episode_reward=0.75 +/- 0.64
Episode length: 30.11 +/- 1.16
New best mean reward!
Eval num_timesteps=36640, episode_reward=0.72 +/- 0.68
Episode length: 30.08 +/- 0.57
Eval num_timesteps=46640, episode_reward=0.71 +/- 0.68
Episode length: 30.06 +/- 0.59
Eval num_timesteps=56640, episode_reward=0.72 +/- 0.67
Episode length: 30.11 +/- 0.54
Eval num_timesteps=66640, episode_reward=0.76 +/- 0.63
Episode length: 29.96 +/- 2.07
New best mean reward!
Eval num_timesteps=76640, episode_reward=0.73 +/- 0.67
Episode length: 30.04 +/- 1.65
Eval num_timesteps=86640, episode_reward=0.81 +/- 0.57
Episode length: 30.10 +/- 0.58
New best mean reward!
Eval num_timesteps=96640, episode_reward=0.78 +/- 0.62
Episode length: 30.11 +/- 0.55
Eval num_timesteps=106640, episode_reward=0.73 +/- 0.