# Crear un TorchPlayer


Recibe el modelo a instanciar como path y juega con el mismo

- Pensar como resolver el problema de que solo samplee las válidas
- Agregarle la opción de monte carlo tree search (opcional) con las opciones de iterationLimit, timeLimit

Si va a agregar MCTS mirar la notebook 007_MCTS.ipnb

In [1]:
from boardgame2 import ReversiEnv
import numpy as np
from players import RandomPlayer, DictPolicyPlayer, GreedyPlayer
from multi_env import make_reversi_vec_env, SelfPlayEnv
import torch as th
from stable_baselines3 import PPO
from stable_baselines3.common.policies import ActorCriticPolicy

In [275]:
class TorchPlayer():
    def __init__(self, model, player=1, board_shape=None, env=None, flatten_action=False, deterministic = True):
        if (env is None) and (board_shape is None):
            print("board_shape and env can't be both None")
        if env is None:
            env = ReversiEnv(board_shape=board_shape)
        self.env = env
        self.player = player # player number. 1 o -1
        self.flatten_action = flatten_action
        self.board_shape = self.env.board.shape[0]
        self.deterministic = deterministic
        self.model = model

    def predict(self, board):
        obs = np.array([board*self.player])
        action, _ = self.model.predict(obs)
        if self.flatten_action:
            return action
        else:
            return [action // self.board_shape, action % self.board_shape]
        
#    def predict(self, board):
#        valid_actions = np.argwhere(self.env.get_valid((board, self.player)) == 1)
#        if len(valid_actions) == 0:
#            print('pass')
#            action = self.env.PASS
#        else:
#            board_tuple = tuple((board * self.player).reshape(-1))
#            action, _ = self.model.predict(board_tuple)
#            #action = self.pi_dict[board_tuple]
#        if self.flatten_action:
#            return action[0] * self.board_shape + action[1]
#        else:
#            return action
##        #if self.flatten_action:
# #       #    return action
# #       #else:
#        #    return [action // self.board_shape, action % self.board_shape]


# Arena

Testear el jugador contra los distintos jugadores

In [266]:
def evaluate_player(player_1, player_2, N=500):
    
    env = ReversiEnv(board_shape=board_shape)
    wins_as_first = 0
    wins_as_second = 0
    plays_as_first = 0
    plays_as_second = 0
    total_steps = 0
#    player_1 = Player_1(player=1, board_shape=board_shape, flatten_action=False)
#    player_2 = Player_2(player=-1, board_shape=board_shape, flatten_action=False)

    for i in range(N):
        # Aveces empieza un jugador, a veces el otro
        first_player = np.random.choice([-1, 1])
        #print(first_player)
        player_1.player = first_player
        player_2.player = -first_player
        
        plays_as_first = plays_as_first + (first_player == 1)
        plays_as_second = plays_as_second + (first_player == -1)
        
        done = False
        n_steps = 0
        (board, player) = env.reset()
        
#        print(board.shape)
#        print(board)
        
        while not done:
            if first_player == player:
                action = player_1.predict(board)
            else:
                action = player_2.predict(board)
#            print(action)
            (board, player), reward, done, info = env.step(action)
#            print(board)
#            print('-------------')
            n_steps = n_steps + 1
        total_steps = total_steps + n_steps
        wins_as_first = wins_as_first + (reward == first_player) * (first_player == 1)
        wins_as_second = wins_as_second + (reward == first_player) * (first_player == -1)
    print(f'Wins as first: {wins_as_first/plays_as_first}')
    print(f'Wins as second: {wins_as_second/plays_as_second}')
    print(f'Plays as first: {plays_as_first}')
    print(f'Plays as second: {plays_as_second}')
    print(f'Avg game duration: {total_steps/N}')
        
    

In [267]:
N = 1000
board_shape = 8
random_player_1 = RandomPlayer(board_shape = board_shape)
random_player_2 = RandomPlayer(board_shape = board_shape)
greedy_player_2 = GreedyPlayer(board_shape = board_shape)

In [276]:
model_8_mlp = PPO.load('models/Reversi_PPO_8by8_0.99_0.95_0.0_10_6_masked_actions/best_model.zip')
ppo_model_mlp_player_1 = TorchPlayer(model = model_8_mlp, board_shape = board_shape, deterministic = True)

In [284]:
model_8_mlp_2 = PPO.load('models/Reversi_PPO_8by8_0.99_0.95_0.0_10_6_masked_actions_2/best_model.zip')
ppo_model_mlp_player_2 = TorchPlayer(model = model_8_mlp_2, board_shape = board_shape, deterministic = True)

In [277]:
board_shape = 8
n_envs = 1
env = make_reversi_vec_env(
    SelfPlayEnv, n_envs=n_envs,
    env_kwargs={
        'board_shape': board_shape,
        'LocalPlayer': RandomPlayer
    }
)
obs = env.reset()
#obs = np.array((board.shape))
board=obs[0,0]
print(obs.shape)
print(obs)
print(model_8_mlp.predict(obs))
obs_2 = np.array([board])
print(obs_2.shape)
print(obs_2)
print(model_8_mlp.predict(obs_2))
print(board.shape)
print(board)
print(ppo_model_mlp_player_1.predict(board))

(1, 1, 8, 8)
[[[[ 0.  0.  0.  0.  0.  0.  0.  0.]
   [ 0.  0.  0.  0.  0.  0.  0.  0.]
   [ 0.  0.  0.  0.  0.  0.  0.  0.]
   [ 0.  0.  0. -1.  1.  0.  0.  0.]
   [ 0.  0.  0. -1. -1.  0.  0.  0.]
   [ 0.  0.  0. -1.  0.  0.  0.  0.]
   [ 0.  0.  0.  0.  0.  0.  0.  0.]
   [ 0.  0.  0.  0.  0.  0.  0.  0.]]]]
(array([42]), None)
(1, 8, 8)
[[[ 0.  0.  0.  0.  0.  0.  0.  0.]
  [ 0.  0.  0.  0.  0.  0.  0.  0.]
  [ 0.  0.  0.  0.  0.  0.  0.  0.]
  [ 0.  0.  0. -1.  1.  0.  0.  0.]
  [ 0.  0.  0. -1. -1.  0.  0.  0.]
  [ 0.  0.  0. -1.  0.  0.  0.  0.]
  [ 0.  0.  0.  0.  0.  0.  0.  0.]
  [ 0.  0.  0.  0.  0.  0.  0.  0.]]]
(42, None)
(8, 8)
[[ 0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0. -1.  1.  0.  0.  0.]
 [ 0.  0.  0. -1. -1.  0.  0.  0.]
 [ 0.  0.  0. -1.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.]]
[5, 2]


In [286]:
_ = evaluate_player(ppo_model_mlp_player_2, random_player_2, N=500)

Wins as first: 0.9692307692307692
Wins as second: 0.9416666666666667
Plays as first: 260
Plays as second: 240
Avg game duration: 59.936


In [285]:
_ = evaluate_player(ppo_model_mlp_player_1, random_player_2, N=500)

Wins as first: 0.9090909090909091
Wins as second: 0.9479553903345725
Plays as first: 231
Plays as second: 269
Avg game duration: 59.986


In [280]:
_ = evaluate_player(ppo_model_mlp_player_1, greedy_player_2, N=500)

Wins as first: 0.898876404494382
Wins as second: 0.8841201716738197
Plays as first: 267
Plays as second: 233
Avg game duration: 58.754


In [287]:
_ = evaluate_player(ppo_model_mlp_player_1, ppo_model_mlp_player_2, N=500)

Wins as first: 0.32644628099173556
Wins as second: 0.3875968992248062
Plays as first: 242
Plays as second: 258
Avg game duration: 59.968


In [283]:
_ = evaluate_player(ppo_model_mlp_player_1, ppo_model_mlp_player_2, N=10000)

Wins as first: 0.46733360555328707
Wins as second: 0.5076440611524892
Plays as first: 4898
Plays as second: 5102
Avg game duration: 59.9322


In [74]:
_ = evaluate_player(random_player_1, random_player_2, N=100)

Wins as first: 0.5510204081632653
Wins as second: 0.5882352941176471
Plays as first: 49
Plays as second: 51
Avg game duration: 59.99


In [77]:
_ = evaluate_player(random_player_1, greedy_player_2, N=1000)

Wins as first: 0.32745098039215687
Wins as second: 0.38979591836734695
Plays as first: 510
Plays as second: 490
Avg game duration: 58.194


In [78]:
_ = evaluate_player(greedy_player_2, random_player_2, N=1000)

Wins as first: 0.6050583657587548
Wins as second: 0.5432098765432098
Plays as first: 514
Plays as second: 486
Avg game duration: 57.77
