# Crear un TorchPlayer


Recibe el modelo a instanciar como path y juega con el mismo

- Pensar como resolver el problema de que solo samplee las válidas
- Agregarle la opción de monte carlo tree search (opcional) con las opciones de iterationLimit, timeLimit

Si va a agregar MCTS mirar la notebook 007_MCTS.ipnb

In [97]:
from stable_baselines3 import PPO
from players import DictPolicyPlayer, RandomPlayer, GreedyPlayer
from boardgame2 import ReversiEnv
import numpy as np
from multi_env import make_reversi_vec_env, SelfPlayEnv

In [112]:
class TorchPlayer():
    #     
    def __init__(self, model_path=None, player=1, board_shape=None, env=None, deterministic=True, only_valid=True, 
                 flatten_action=False, mcts=False, iterationLimit=None, timeLimit=None):
        if (env is None) and (board_shape is None):
            print("board_shape and env can't be both None")
        if env is None:
            env = ReversiEnv(board_shape=board_shape)
        self.env = env
        self.model = model_path
        self.player = player # player number. 1 o -1
        if board_shape is None:
            self.board_shape = self.env.board.shape[0]
        else:
            self.board_shape = board_shape        
        self.flatten_action = flatten_action
        self.deterministic = deterministic
        self.only_valid = only_valid
        
    def predict(self, board):
        board_player = [board*self.player]
        action, _ = self.model.predict(board_player)
        if self.flatten_action:
            return action
        else:
            return [action // self.board_shape, action % self.board_shape]

In [103]:
board_shape = 8
env = ReversiEnv(board_shape=board_shape)
(board, player) = env.reset()
print(board.shape)
print(board)
print(player)

(8, 8)
[[ 0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0]
 [ 0  0  0  1 -1  0  0  0]
 [ 0  0  0 -1  1  0  0  0]
 [ 0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0]]
1


In [9]:
board_shape = 8
n_envs = 1
env = make_reversi_vec_env(
    SelfPlayEnv, n_envs=n_envs,
    env_kwargs={
        'board_shape': board_shape,
        'LocalPlayer': RandomPlayer
    }
)
obs = env.reset()
print(obs)

[[[[ 0.  0.  0.  0.  0.  0.  0.  0.]
   [ 0.  0.  0.  0.  0.  0.  0.  0.]
   [ 0.  0.  0.  0.  0.  0.  0.  0.]
   [ 0.  0.  0. -1.  1.  0.  0.  0.]
   [ 0.  0. -1. -1. -1.  0.  0.  0.]
   [ 0.  0.  0.  0.  0.  0.  0.  0.]
   [ 0.  0.  0.  0.  0.  0.  0.  0.]
   [ 0.  0.  0.  0.  0.  0.  0.  0.]]]]


In [113]:
model_8_ma = PPO.load('models/Reversi_PPO_8by8_0.99_0.95_0.0_10_6_masked_actions/best_model.zip')
ppo_model_ma = TorchPlayer(model_path = model_8_ma, board_shape = board_shape, deterministic = True)

In [114]:
ppo_model_ma.predict(board)

[3, 5]

# Arena

Testear el jugador contra los distintos jugadore

In [17]:
gp = GreedyPlayer(player=1, board_shape=8)
rp = RandomPlayer(player=1, board_shape=8)
#op = DictPolicyPlayer(player=1, board_shape=8, flatten_action=True)

In [115]:
model_8_ma = PPO.load('models/Reversi_PPO_8by8_0.99_0.95_0.0_10_6_masked_actions/best_model.zip')
ppo_model_ma = TorchPlayer(model_path = model_8_ma, board_shape = board_shape, deterministic = True)

In [131]:
ppo_model_ma.predict(board)

[5, 3]

In [116]:
model_8_ma_distrib = PPO.load('models/Reversi_PPO_8by8_0.99_0.95_0.0_10_6_masked_actions_distrib/best_model.zip')
ppo_model_ma_distrib = TorchPlayer(model_path = model_8_ma_distrib, board_shape = board_shape, deterministic = True)

In [136]:
ppo_model_ma_distrib.predict(board)

[2, 4]

In [78]:
def arena_stats(player_1, player_2, board_shape, N=500):
    
    env = ReversiEnv(board_shape=board_shape)
    wins_as_first = 0
    wins_as_second = 0
    plays_as_first = 0
    plays_as_second = 0
    total_steps = 0
    for i in range(N):
        # Aveces empieza un jugador, a veces el otro
        first_player = np.random.choice([-1, 1])
        player_1.player = first_player
        player_2.player = -first_player
        
        plays_as_first = plays_as_first + (first_player == 1)
        plays_as_second = plays_as_second + (first_player == -1)
        
        done = False
        n_steps = 0
        (board, player) = env.reset()
        
        while not done:
            if first_player == player:
                action = player_1.predict(board)# Juega el jugador 1
            else:
                action = player_2.predict(board)# Juega el jugador 2
            (board, player), reward, done, info = env.step(action)
            n_steps = n_steps + 1
        total_steps = total_steps + n_steps
        wins_as_first = wins_as_first + (reward == first_player) * (first_player == 1)
        wins_as_second = wins_as_second + (reward == first_player) * (first_player == -1)
    print(f'Wins as first: {wins_as_first/plays_as_first}')
    print(f'Wins as second: {wins_as_second/plays_as_second}')
    print(f'Plays as first: {plays_as_first}')
    print(f'Plays as second: {plays_as_second}')
    print(f'Avg game duration: {total_steps/N}')

## Random contra Greedy

In [92]:
arena_stats(rp, gp, 8, N=1000)

Wins as first: 0.36809815950920244
Wins as second: 0.3816046966731898
Plays as first: 489
Plays as second: 511
Avg game duration: 57.697


## Modelo Base contra Greedy y Random

In [93]:
arena_stats(ppo_model_ma, gp,8, N=1000)

Wins as first: 0.6168582375478927
Wins as second: 0.6234309623430963
Plays as first: 522
Plays as second: 478
Avg game duration: 59.176


In [94]:
arena_stats(ppo_model_ma, rp, 8, N=1000)

Wins as first: 0.6742738589211619
Wins as second: 0.7316602316602316
Plays as first: 482
Plays as second: 518
Avg game duration: 59.987


## Modelo con modificación del Forward contra Greedy y Random

In [101]:
arena_stats(ppo_model_ma_distrib, gp, 8, N=1000)

Wins as first: 0.7723735408560312
Wins as second: 0.7551440329218106
Plays as first: 514
Plays as second: 486
Avg game duration: 59.473


In [137]:
arena_stats(ppo_model_ma_distrib, rp, 8, N=1000)

Wins as first: 0.9081196581196581
Wins as second: 0.8890977443609023
Plays as first: 468
Plays as second: 532
Avg game duration: 59.991


## Modelo Base contra modificado

In [138]:
arena_stats(ppo_model_ma, ppo_model_ma_distrib, 8, N=1000)

Wins as first: 0.14919354838709678
Wins as second: 0.1865079365079365
Plays as first: 496
Plays as second: 504
Avg game duration: 59.991


In [141]:
arena_stats(ppo_model_ma_distrib, ppo_model_ma, 8, N=1000)

Wins as first: 0.7933070866141733
Wins as second: 0.8048780487804879
Plays as first: 508
Plays as second: 492
Avg game duration: 59.988


Se observa que el modelo modificado es mucho mejor que el modelo base

### Se prueba el mejor modelo, una version con Deterministic en True y otra en False

In [139]:
model_8_ma_distrib_detFalse = PPO.load('models/Reversi_PPO_8by8_0.99_0.95_0.0_10_6_masked_actions_distrib/best_model.zip')
ppo_model_ma_distrib_detFalse = TorchPlayer(model_path = model_8_ma_distrib_detFalse, board_shape = board_shape, deterministic = False)

In [140]:
arena_stats(ppo_model_ma_distrib, ppo_model_ma_distrib_detFalse, 8, N=1000)

Wins as first: 0.5159362549800797
Wins as second: 0.46987951807228917
Plays as first: 502
Plays as second: 498
Avg game duration: 59.992


No se observa una diferencia sustancial entre utilizar el parámetro Deterministic en True o False