In [51]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [52]:
from multi_env import make_reversi_vec_env, SelfPlayEnv
import torch as th
from players import RandomPlayer
from stable_baselines3 import PPO
from stable_baselines3.common.policies import ActorCriticPolicy
import numpy as np
import gym

In [53]:
board_shape = 8
n_envs = 1
env = make_reversi_vec_env(
    SelfPlayEnv, n_envs=n_envs,
    env_kwargs={
        'board_shape': board_shape,
        'LocalPlayer': RandomPlayer
    }
)

# Modificación de librería para que haga argmax solo sobre las válidas

In [54]:
model = PPO(
    ActorCriticPolicy,
    env,
    verbose=0,
)

In [55]:
model.predict(env.reset())

(array([22]), None)

# Custom ActorCriticPolicy 

https://github.com/DLR-RM/stable-baselines3/blob/master/stable_baselines3/common/policies.py

In [56]:
from boardgame2 import ReversiEnv

In [57]:
env_not_vect = ReversiEnv(board_shape)

In [58]:
state = env.reset()[0][0]
player = 1

In [59]:
env_not_vect.get_valid((state, player))

array([[0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0]], dtype=int8)

In [60]:
def get_actions_mask(state):
    player = 1
    valid_actions = env_not_vect.get_valid((state, player))
    #print(state)
    return valid_actions.reshape(-1)

In [61]:
get_actions_mask(env.reset()[0][0]).reshape(4,4)

ValueError: cannot reshape array of size 64 into shape (4,4)

In [78]:
class CustomActorCriticPolicy(ActorCriticPolicy):
    def __init__(
        self,
        #observation_space: gym.spaces.Space,
        #action_space: gym.spaces.Space,
        #lr_schedule,
        #net_arch=None,
        #activation_fn=nn.Tanh,
        #features_extractor_kwargs=dict(feature_dim=128),
        #feature_extractor_class=NewCustomCNN,
        *args, # Todos los argumentos posicionales de ActorCriticPolicy
        actions_mask_func=None, # El nuevo argumento
        **kwargs # Todos los argumentos opcionales de ActorCriticPolicy
    ):
        super(CustomActorCriticPolicy, self).__init__(
            *args,
            **kwargs
        )
        if actions_mask_func:
            self.get_actions_mask = actions_mask_func
    
    
    
    def sample_masked_actions(self, obs, distribution, deterministic=False, return_distribution=False):
        # Dada las obs y distribuciones luego de evaluar la red neuronal, samplear solo las acciones válidas
        # Las obs se usan para que con self.get_actions_mask se obtengan las acciones válidas
        # las distribuciones son el resultado de evaluar la red neuronal y van a dar acciones no validas
        # Generar una nueva distribución (del lado de los logits preferentemente) donde las acciones no válidas
        # tengan probabildad nula de ser muestreadas
        # Luego se modifican abajo los métodos
        # _predict, forward y evaluate_actions
        # Si tiene el flag de return_distribution en true devuelve la distribución nueva
        # Caso contrario devuelve las acciones
        # Para tener en cuenta, obs tiene dimensión [batch_size, channels, H, W]
        # Recomendamos poner un print(obs.shape)
        # y correr:
        # obs = env.reset()
        # actions, _ = model.predict(obs)
        # Para sacarse las dudas
        
        def get_mask(obs):
            masks = np.zeros((len(obs), obs.shape[-1] * obs.shape[-2]))
            for i, board in enumerate(obs):
                board = board[0].cpu().numpy()
                masks[i] = 1 - self.get_actions_mask(board)
            return th.from_numpy(masks).to(self.device)
        #print(f'Obs: {obs}')
        #print(f'Mask: {masks}')
        masks = get_mask(obs)
        masks[masks == 1] = -1e6#-np.inf
        masked_logits = distribution.logits + masks
        
        if return_distribution:
            return th.distributions.Categorical(logits=masked_logits)
        if deterministic:
            return th.argmax(masked_logits, axis=1)
        return th.distributions.Categorical(logits=masked_logits).sample()
    
    def _predict(self, observation, deterministic=False):
        """
        Get the action according to the policy for a given observation.
        :param observation:
        :param deterministic: Whether to use stochastic or deterministic actions
        :return: Taken action according to the policy
        """
        latent_pi, _, latent_sde = self._get_latent(observation)
        distribution = self._get_action_dist_from_latent(latent_pi, latent_sde)
        
        #print(f'predict - latent_pi: {latent_pi}')
        #print(f'predict - latent_sde: {latent_sde}')
        #print(f'predict - distribution: {distribution.get_actions()}')
        
        if self.get_actions_mask:
            #print(f'Entró por get_actions_mask')
            actions = self.sample_masked_actions(observation, distribution.distribution, deterministic=deterministic)
        else:
            actions = distribution.get_actions(deterministic=deterministic)
        
        #print(f'predict - actions: {actions}')
        return actions
    
    def forward(self, obs: th.Tensor, deterministic: bool = False):
        """
        Forward pass in all the networks (actor and critic)
        :param obs: Observation
        :param deterministic: Whether to sample or use deterministic actions
        :return: action, value and log probability of the action
        """
        latent_pi, latent_vf, latent_sde = self._get_latent(obs)
        # Evaluate the values for the given observations
        values = self.value_net(latent_vf)
        distribution = self._get_action_dist_from_latent(latent_pi, latent_sde=latent_sde)
        distrib = self.sample_masked_actions(obs, distribution.distribution, return_distribution=True)

        actions = self.sample_masked_actions(obs, distribution.distribution, deterministic=deterministic)
        
        #if self.get_actions_mask:
        #    actions = self.sample_masked_actions(obs, distribution.distribution, deterministic=deterministic)
        #else:
        #    actions = distribution.get_actions(deterministic=deterministic)

        log_prob = distribution.log_prob(actions)
        return actions, values, log_prob
    
    def evaluate_actions(self, obs: th.Tensor, actions: th.Tensor) :#-> tuple[th.Tensor, th.Tensor, th.Tensor]:
        """
        Evaluate actions according to the current policy,
        given the observations.
        :param obs:
        :param actions:
        :return: estimated value, log likelihood of taking those actions
            and entropy of the action distribution.
        """
        latent_pi, latent_vf, latent_sde = self._get_latent(obs)
        distribution = self._get_action_dist_from_latent(latent_pi, latent_sde)
        distrib = self.sample_masked_actions(obs, distribution.distribution, return_distribution=True)

        log_prob = distrib.log_prob(actions)
        values = self.value_net(latent_vf)
        return values, log_prob, distrib.entropy()

In [63]:
obs = env.reset()
actions, _ = model.predict(obs)

In [64]:
actions

array([41])

In [42]:
policy_kwargs = dict(activation_fn=th.nn.ReLU,
                     net_arch=[dict(pi=[32, 32], vf=[32, 32])],
                    actions_mask_func=get_actions_mask)

In [65]:
model = PPO(
    CustomActorCriticPolicy,
    env,
    verbose=0,
    policy_kwargs = {'actions_mask_func': get_actions_mask}
    #policy_kwargs = policy_kwargs
)


In [66]:
# Testeo de predict
model.policy.get_actions_mask(env.reset()[0][0])

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
      dtype=int8)

In [67]:
obs

array([[[[ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
         [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
         [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
         [ 0.,  0.,  0.,  1., -1.,  0.,  0.,  0.],
         [ 0.,  0.,  0., -1.,  1.,  0.,  0.,  0.],
         [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
         [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
         [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.]]]], dtype=float32)

In [18]:
obs = env.reset()
actions, _ = model.predict(obs)

In [19]:
# Verificar que las acciones son válidas
actions

array([20])

In [20]:
# Testeo de forward
model.policy(th.from_numpy(obs).to(model.device))

(tensor([20]),
 tensor([[-0.1257]], grad_fn=<AddmmBackward>),
 tensor([-4.1631], grad_fn=<SqueezeBackward1>))

# Corremos PPO

In [79]:
board_shape = 8
n_envs = 5
gamma = 0.99
ent_coef = 0.0
gae_lambda = 0.95
n_epochs = 10

In [80]:
prefix = 'Reversi_PPO'
suffix = 'masked_actions_25_08'
model_name = f'{prefix}_{board_shape}by{board_shape}_{gamma}_{gae_lambda}_{ent_coef}_{n_epochs}_{n_envs}_{suffix}'
best_model_save_path = f'./models/{model_name}'
print(model_name)
print(best_model_save_path)

Reversi_PPO_8by8_0.99_0.95_0.0_10_5_masked_actions_25_08
./models/Reversi_PPO_8by8_0.99_0.95_0.0_10_5_masked_actions_25_08


In [81]:
model = PPO(
    CustomActorCriticPolicy, 
    env,
    verbose=0,
    tensorboard_log='tensorboard_log',
    gamma=gamma,
    gae_lambda=gae_lambda,
    ent_coef=ent_coef,
    n_epochs=n_epochs,
    policy_kwargs = {'actions_mask_func': get_actions_mask}
    #policy_kwargs = policy_kwargs
)

In [82]:
from stable_baselines3.common.callbacks import EvalCallback

In [83]:
# El entorno de evaluación no corre en paralelo por eso uno solo
eval_env = make_reversi_vec_env(
    SelfPlayEnv, n_envs=1,
    env_kwargs={
        'board_shape': board_shape,
        'LocalPlayer': RandomPlayer
    }
)

In [84]:
eval_callback = EvalCallback(
    eval_env = eval_env,
    eval_freq=1_000,
    n_eval_episodes=500,
    deterministic=True,
    verbose=1,
    best_model_save_path=best_model_save_path,
) 

In [85]:
model.learn(total_timesteps=int(1e10), callback=[eval_callback])

Eval num_timesteps=1000, episode_reward=0.07 +/- 0.97
Episode length: 30.04 +/- 0.60
New best mean reward!
Eval num_timesteps=2000, episode_reward=0.10 +/- 0.98
Episode length: 30.06 +/- 0.55
New best mean reward!
Eval num_timesteps=3000, episode_reward=-0.15 +/- 0.96
Episode length: 29.91 +/- 0.59
Eval num_timesteps=4000, episode_reward=-0.06 +/- 0.98
Episode length: 29.96 +/- 0.60
Eval num_timesteps=5000, episode_reward=-0.01 +/- 0.98
Episode length: 29.85 +/- 1.68
Eval num_timesteps=6000, episode_reward=0.06 +/- 0.98
Episode length: 29.99 +/- 0.59
Eval num_timesteps=7000, episode_reward=0.10 +/- 0.98
Episode length: 29.97 +/- 0.60
New best mean reward!
Eval num_timesteps=8000, episode_reward=0.05 +/- 0.98
Episode length: 29.99 +/- 0.56
Eval num_timesteps=9000, episode_reward=0.14 +/- 0.98
Episode length: 30.05 +/- 0.57
New best mean reward!
Eval num_timesteps=10000, episode_reward=0.21 +/- 0.96
Episode length: 29.99 +/- 1.17
New best mean reward!
Eval num_timesteps=11000, episode_re

KeyboardInterrupt: 

### Lo testeamos

In [96]:
env_test = ReversiEnv(board_shape)

In [97]:
state = env_test.reset()[0]
player = 1

In [98]:
state

array([[ 0,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  1, -1,  0,  0,  0],
       [ 0,  0,  0, -1,  1,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  0]], dtype=int8)

In [None]:
tp.predict(board[np.newaxis,:,:])

In [105]:
env_test.reset()[0][np.newaxis,:,:]

array([[[ 0,  0,  0,  0,  0,  0,  0,  0],
        [ 0,  0,  0,  0,  0,  0,  0,  0],
        [ 0,  0,  0,  0,  0,  0,  0,  0],
        [ 0,  0,  0,  1, -1,  0,  0,  0],
        [ 0,  0,  0, -1,  1,  0,  0,  0],
        [ 0,  0,  0,  0,  0,  0,  0,  0],
        [ 0,  0,  0,  0,  0,  0,  0,  0],
        [ 0,  0,  0,  0,  0,  0,  0,  0]]], dtype=int8)

In [107]:
model.predict(env_test.reset()[0][np.newaxis,:,:])

(29, None)

In [108]:
board_shape = 8
n_envs = 1
env = make_reversi_vec_env(
    SelfPlayEnv, n_envs=n_envs,
    env_kwargs={
        'board_shape': board_shape,
        'LocalPlayer': RandomPlayer
    }
)

In [109]:
obs = env.reset()

In [110]:
obs

array([[[[ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
         [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
         [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
         [ 0.,  0.,  0.,  1., -1.,  0.,  0.,  0.],
         [ 0.,  0.,  0., -1.,  1.,  0.,  0.,  0.],
         [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
         [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
         [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.]]]], dtype=float32)

In [111]:
model.predict(obs)[0]

array([29])

In [112]:
obs = (env.step(model.predict(obs)[0]))[0]

In [113]:
obs

array([[[[ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
         [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
         [ 0.,  0.,  0.,  0.,  0., -1.,  0.,  0.],
         [ 0.,  0.,  0.,  1., -1.,  1.,  0.,  0.],
         [ 0.,  0.,  0., -1.,  1.,  0.,  0.,  0.],
         [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
         [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
         [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.]]]], dtype=float32)

In [None]:
#model.summa

In [114]:
model.policy

CustomActorCriticPolicy(
  (features_extractor): FlattenExtractor(
    (flatten): Flatten()
  )
  (mlp_extractor): MlpExtractor(
    (shared_net): Sequential()
    (policy_net): Sequential(
      (0): Linear(in_features=64, out_features=64, bias=True)
      (1): Tanh()
      (2): Linear(in_features=64, out_features=64, bias=True)
      (3): Tanh()
    )
    (value_net): Sequential(
      (0): Linear(in_features=64, out_features=64, bias=True)
      (1): Tanh()
      (2): Linear(in_features=64, out_features=64, bias=True)
      (3): Tanh()
    )
  )
  (action_net): Linear(in_features=64, out_features=64, bias=True)
  (value_net): Linear(in_features=64, out_features=1, bias=True)
)

# CUSTOMCNNfrom stable_baselines3.common.torch_layers import BaseFeaturesExtractor

In [4]:
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor

In [5]:
class CustomCNN(BaseFeaturesExtractor):
    """
    :param observation_space: (gym.Space)
    :param features_dim: (int) Number of features extracted.
        This corresponds to the number of unit for the last layer.
    """

    def __init__(self, observation_space: gym.spaces.Box, features_dim: int = 256):
        super(CustomCNN, self).__init__(observation_space, features_dim)
        # We assume CxHxW images (channels first)
        # Re-ordering will be done by pre-preprocessing or wrapper
        n_input_channels = observation_space.shape[0]
        self.cnn = nn.Sequential(
            nn.Conv2d(n_input_channels, 32, kernel_size=8, stride=4, padding=0),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2, padding=0),
            nn.ReLU(),
            nn.Flatten(),
        )

        # Compute shape by doing one forward pass
        with th.no_grad():
            n_flatten = self.cnn(
                th.as_tensor(observation_space.sample()[None]).float()
            ).shape[1]

        self.linear = nn.Sequential(nn.Linear(n_flatten, features_dim), nn.ReLU())

    def forward(self, observations: th.Tensor) -> th.Tensor:
        return self.linear(self.cnn(observations))

In [6]:
policy_kwargs = dict(
    features_extractor_class=CustomCNN,
    features_extractor_kwargs=dict(features_dim=128),
)