# Crear un TorchPlayer


Recibe el modelo a instanciar como path y juega con el mismo

- Pensar cómo resolver el problema de que solo samplee las acciones válidas
- Agregarle la opción de monte carlo tree search (opcional) con las opciones de iterationLimit, timeLimit

Si va a agregar MCTS, mirar la notebook 007_MCTS.ipynb

In [1]:
from stable_baselines3 import PPO
from players import DictPolicyPlayer, RandomPlayer, GreedyPlayer
from boardgame2 import ReversiEnv
import numpy as np

In [2]:
class TorchPlayer():
    """Board-game player that chooses moves with a saved PPO model.

    The model is loaded with ``PPO.load`` and queried once per call to
    ``predict``.

    Args:
        model_path: Path of the saved model. When ``None`` a default 8x8
            Reversi checkpoint path is used.
        player: Value the board is multiplied by before it is handed to the
            model (the callers in this notebook use ``1`` or ``-1``).
        board_shape: Board side length used to turn a flat action into
            ``[row, col]``. When ``None`` it is read from ``env.board_shape``.
        env: Optional environment. When ``None``, ``self.env`` falls back to
            the environment attached to the loaded model.
        deterministic, only_valid, mcts, iterationLimit, timeLimit: Accepted
            for interface compatibility; not used by this implementation yet.
        flatten_action: If true, ``predict`` returns the model's action
            unchanged; otherwise it returns ``[row, col]``.

    Raises:
        ValueError: If both ``board_shape`` and ``env`` are ``None``.
    """

    def __init__(self, model_path=None, player=1, board_shape=None, env=None, deterministic=True, only_valid=True, mcts=False, iterationLimit=None, timeLimit=None, flatten_action=False):
        # Validate first so a bad call fails before the model is loaded.
        self.board_shape = self._resolve_board_shape(env, board_shape)
        if model_path is None:
            model_path = './models/Reversi_PPO_8by8_0.99_0.95_0.0_10_6_masked_actions/best_model.zip'
        self.model = PPO.load(model_path)
        self.player = player
        self.flatten_action = flatten_action
        # Always define self.env: prefer the caller's environment and fall
        # back to whatever environment the loaded model carries.
        self.env = env if env is not None else self.model.env

    @staticmethod
    def _resolve_board_shape(env, board_shape):
        """Return the explicit board_shape, or env.board_shape when it is None."""
        if board_shape is not None:
            return board_shape
        if env is None:
            raise ValueError("board_shape and env can't be both None")
        return env.board_shape

    def predict(self, board):
        """Return the model's move for ``board``.

        Args:
            board: 2-D board array (the arena passes the array returned by
                the environment).

        Returns:
            The model's action as-is when ``flatten_action`` is true,
            otherwise ``[action // board_shape, action % board_shape]``.
        """
        # Multiplying by the player's value flips the piece signs for player
        # -1; presumably the model was trained from player 1's point of view.
        oriented_board = board * self.player
        # Prepend one axis before the observation is passed to the model.
        observation = oriented_board[np.newaxis, :, :]
        # model.predict returns a tuple; its first element is the action.
        action = self.model.predict(observation)[0]
        if self.flatten_action:
            return action
        # NOTE(review): assumes board_shape is an integer side length - confirm
        # for the case where it is taken from env.board_shape.
        return [action // self.board_shape, action % self.board_shape]

# Arena

Testear el jugador contra los distintos jugadores

In [3]:
# Create a Reversi environment with board_shape=8, reset it, and print the
# initial board together with the player returned by reset().
board_shape = 8
env = ReversiEnv(board_shape=board_shape)
(board, player) = env.reset()
print(board)
print(player)

[[ 0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0]
 [ 0  0  0  1 -1  0  0  0]
 [ 0  0  0 -1  1  0  0  0]
 [ 0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0]]
1


In [4]:
# Baseline players imported from `players`, all created as player 1 with
# board_shape=8. Only the dict-policy player is asked for flattened actions.
gp = GreedyPlayer(player=1, board_shape=8)
rp = RandomPlayer(player=1, board_shape=8)
op = DictPolicyPlayer(player=1, board_shape=8, flatten_action=True)

In [5]:
# Ask the greedy baseline for its move on the freshly reset board.
gp.predict(board)

array([4, 2])

In [6]:
# Alternative checkpoint, kept for reference (same path TorchPlayer uses when
# model_path is None):
#model_path = './models/Reversi_PPO_8by8_0.99_0.95_0.0_10_6_masked_actions/best_model.zip'
# Checkpoint selected for the TorchPlayer instance built in the next cell.
model_path = './models/Reversi_PPO_8by8_0.99_0.95_0.0_10_5_masked_actions_25_08/best_model.zip'

In [7]:
# Load the selected checkpoint as player 1. With flatten_action=True,
# predict() returns the model's action unchanged instead of [row, col].
tp = TorchPlayer(model_path=model_path,player=1,board_shape=8, flatten_action=True)

In [8]:
# NOTE(review): TorchPlayer.predict() already prepends an axis to the board,
# so this call hands the model an observation with one extra leading axis,
# while arena_stats passes the plain board - confirm which shape is intended.
tp.predict(board[np.newaxis,:,:])

array([20])

### Arena

In [9]:
def arena_stats(Player_1, Player_2, board_shape, N=500):
    """Play N games between two player classes and print Player_1's results.

    Each game randomly assigns Player_1 the value 1 or -1 and gives Player_2
    the opposite value. Games in which Player_1 holds 1 are reported as
    "first", the others as "second" (the environment's reset() output in this
    notebook shows player 1 to move first - confirm for other environments).

    Args:
        Player_1: Player class under evaluation; instantiated with
            ``player``, ``board_shape`` and ``flatten_action=False``.
        Player_2: Opponent class, instantiated the same way.
        board_shape: Board size passed to ``ReversiEnv`` and to both players.
        N: Number of games to play.

    Returns:
        None. The statistics are printed. A rate whose denominator is zero
        (side never played, or N == 0) is printed as ``nan``.
    """

    def _rate(numerator, denominator):
        # Explicit guard: avoids ZeroDivisionError for N == 0 and the NumPy
        # divide warning when one side was never drawn.
        return numerator / denominator if denominator else float('nan')

    env = ReversiEnv(board_shape=board_shape)
    wins_as_first = 0
    wins_as_second = 0
    plays_as_first = 0
    plays_as_second = 0
    total_steps = 0
    player_1 = Player_1(player=1, board_shape=board_shape, flatten_action=False)
    player_2 = Player_2(player=-1, board_shape=board_shape, flatten_action=False)
    for _ in range(N):
        # Randomise which value Player_1 gets, so it sometimes holds 1 and
        # sometimes -1.
        first_player = np.random.choice([-1, 1])
        player_1.player = first_player
        player_2.player = -first_player
        player_1_is_first = bool(first_player == 1)

        done = False
        n_steps = 0
        (board, player) = env.reset()

        while not done:
            if first_player == player:
                action = player_1.predict(board=board)  # Player_1's turn
            else:
                action = player_2.predict(board=board)  # Player_2's turn
            (board, player), reward, done, info = env.step(action)
            n_steps += 1
        total_steps += n_steps

        # The final reward is compared with Player_1's value to decide
        # whether Player_1 won this game.
        player_1_won = int(reward == first_player)
        if player_1_is_first:
            plays_as_first += 1
            wins_as_first += player_1_won
        else:
            plays_as_second += 1
            wins_as_second += player_1_won
    print(f'Wins as first: {_rate(wins_as_first, plays_as_first)}')
    print(f'Wins as second: {_rate(wins_as_second, plays_as_second)}')
    print(f'Plays as first: {plays_as_first}')
    print(f'Plays as second: {plays_as_second}')
    print(f'Avg game duration: {_rate(total_steps, N)}')

In [10]:
# TorchPlayer (default checkpoint) against GreedyPlayer: 2000 games, board_shape=8.
arena_stats(TorchPlayer, GreedyPlayer, 8, N=2000)

Wins as first: 0.7073921971252567
Wins as second: 0.6617933723196882
Plays as first: 974
Plays as second: 1026
Avg game duration: 59.6305


In [11]:
# TorchPlayer (default checkpoint) against RandomPlayer: 2000 games, board_shape=8.
arena_stats(TorchPlayer, RandomPlayer, 8, N=2000)

Wins as first: 0.7441204139228599
Wins as second: 0.7033084311632871
Plays as first: 1063
Plays as second: 937
Avg game duration: 59.8785


In [10]:
# Second run of the TorchPlayer vs GreedyPlayer evaluation (2000 games).
arena_stats(TorchPlayer, GreedyPlayer, 8, N=2000)

Wins as first: 0.6958661417322834
Wins as second: 0.6453252032520326
Plays as first: 1016
Plays as second: 984
Avg game duration: 59.7705


In [11]:
# Second run of the TorchPlayer vs RandomPlayer evaluation (2000 games).
arena_stats(TorchPlayer, RandomPlayer, 8, N=2000)

Wins as first: 0.7400204708290685
Wins as second: 0.7253176930596286
Plays as first: 977
Plays as second: 1023
Avg game duration: 59.9015
