In [4]:
%load_ext autoreload
%autoreload 2

# Importar entorno y familiarizarse

In [5]:
from boardgame2 import ReversiEnv
import numpy as np

# Crear 3 tipos de jugador
- Random: Selecciona uniformemente una de las acciones válidas
- Greedy: Selecciona la acción que le da más ganancia inmediata (cantidad de piezas que come). Si hay más de una acción que da máxima ganancia samplear uniformemente entre ellas
- Optimum (solo para 4x4): Usando resultados de la PI optima obtenida por policy iteration

Tener en cuenta que:
- ReversiEnv tiene los métodos get_valid y next_step y no es necesario mantener el estado del entorno
- env.PASS ([-1,  0]) es una acción válida posible y debería usarse cuando get_valid devuelve una matriz de ceros

Para el óptimo en 4x4, usar la PI obtenida en la notebook anterior, guardada en /mdp

In [6]:
import random

In [7]:
def sample_valid_actions(state):
    """Uniformly sample one valid action for ``state``.

    Relies on the notebook-level ``env`` object, which must already exist when
    this function is called (it is created in a later cell of this notebook).

    Args:
        state: ``(board, player)`` pair, passed unchanged to ``env.get_valid``.

    Returns:
        One ``[row, col]`` row of ``np.argwhere(env.get_valid(state))`` chosen
        uniformly at random, or ``env.PASS`` when no move is valid.
    """
    valid_actions = np.argwhere(env.get_valid(state))
    if len(valid_actions) == 0:
        # random.randint(0, -1) would raise ValueError; passing is the only option left.
        return env.PASS
    return valid_actions[random.randint(0, len(valid_actions) - 1)]

In [44]:
class GreedyPlayer():
    """Player that picks the move with the best immediate board balance.

    Every valid move is simulated with ``env.next_step`` and scored as
    ``next_board.sum() * player``; with cells valued 1 / -1 / 0 this is the
    mover's piece count minus the opponent's. Ties are broken uniformly at
    random.

    Args:
        player: Colour this player moves as, ``1`` or ``-1``.
        board_shape: Forwarded to ``ReversiEnv`` when ``env`` is not given.
        env: Environment providing ``get_valid``, ``next_step``, ``PASS`` and
            ``board``. Built from ``board_shape`` when omitted.
        flatten_action: If true, ``predict`` returns ``row * side + col``
            instead of the ``[row, col]`` array.

    Raises:
        ValueError: If both ``env`` and ``board_shape`` are ``None``.
    """

    def __init__(self, player=1, board_shape=None, env=None, flatten_action=False):
        if env is None:
            if board_shape is None:
                # Fail loudly instead of printing and then building a broken environment.
                raise ValueError("board_shape and env can't be both None")
            env = ReversiEnv(board_shape=board_shape)
        self.env = env
        self.player = player  # player number: 1 or -1
        self.flatten_action = flatten_action
        # Side length of the board, used only to flatten actions.
        self.board_shape = self.env.board.shape[0]

    def predict(self, board):
        """Return the greedy action for ``self.player`` on ``board``.

        Returns ``env.PASS`` (after printing ``'PASS'``) when there is no valid
        move. With ``flatten_action`` set, the action is returned as
        ``action[0] * side + action[1]``, so a pass flattens to ``-side``.
        """
        state = (board, self.player)
        valid_actions = np.argwhere(self.env.get_valid(state))
        if len(valid_actions) == 0:
            print('PASS')
            action = self.env.PASS
        else:
            # Score each candidate by the resulting board balance from our point of view.
            scores = np.array([
                self.env.next_step(state, candidate)[0][0].sum() * self.player
                for candidate in valid_actions
            ])
            best_actions = valid_actions[scores == scores.max()]
            # Uniform tie-break among the equally good moves.
            action = best_actions[random.randint(0, len(best_actions) - 1)]
        if self.flatten_action:
            return action[0] * self.board_shape + action[1]
        return action
        
class RandomPlayer():
    """Player that picks uniformly at random among the valid moves.

    Args:
        player: Colour this player moves as, ``1`` or ``-1``.
        board_shape: Forwarded to ``ReversiEnv`` when ``env`` is not given.
        env: Environment providing ``get_valid``, ``PASS`` and ``board``.
            Built from ``board_shape`` when omitted.
        flatten_action: If true, ``predict`` returns ``row * side + col``
            instead of the ``[row, col]`` array.

    Raises:
        ValueError: If both ``env`` and ``board_shape`` are ``None``.
    """

    def __init__(self, player=1, board_shape=None, env=None, flatten_action=False):
        if env is None:
            if board_shape is None:
                # Fail loudly instead of printing and then building a broken environment.
                raise ValueError("board_shape and env can't be both None")
            env = ReversiEnv(board_shape=board_shape)
        self.env = env
        self.player = player  # player number: 1 or -1
        self.flatten_action = flatten_action
        # Side length of the board, used only to flatten actions.
        self.board_shape = self.env.board.shape[0]

    def predict(self, board):
        """Return a uniformly sampled valid action for ``self.player``.

        Returns ``env.PASS`` (after printing ``'PASS'``) when the validity mask
        from ``env.get_valid`` sums to zero. With ``flatten_action`` set, the
        action is returned as ``action[0] * side + action[1]``.
        """
        valid_mask = self.env.get_valid((board, self.player))
        if valid_mask.sum() == 0:
            print('PASS')
            action = self.env.PASS
        else:
            valid_actions = np.argwhere(valid_mask)
            action = valid_actions[random.randint(0, len(valid_actions) - 1)]
        if self.flatten_action:
            return action[0] * self.board_shape + action[1]
        return action
        

class DictPolicyPlayer():
    """Player that looks its move up in a precomputed policy dictionary.

    The policy maps a flattened board tuple to an action. Boards are
    multiplied by ``player`` before the lookup, so one policy serves both
    colours.

    Args:
        player: Colour this player moves as, ``1`` or ``-1``.
        board_shape: Board side length; used to build the default environment
            and to flatten actions.
        env: Environment to keep on ``self.env``. Built from ``board_shape``
            when omitted.
        flatten_action: If true, ``predict`` returns ``row * board_shape + col``
            instead of the ``[row, col]`` array.
        dict_folder: Path of the ``.npy`` file holding the pickled policy
            dict (despite the name, this is a file path, not a folder).
    """

    def __init__(self, player=1, board_shape=4, env=None, flatten_action=False, dict_folder='mdp/pi_mdp.npy'):
        # NOTE(security): allow_pickle=True unpickles arbitrary objects; only load
        # policy files produced by a trusted source.
        self.pi_dict = np.load(dict_folder, allow_pickle=True).item()
        if env is None:
            env = ReversiEnv(board_shape=board_shape)
        # Keep the environment, as the other player classes do, instead of discarding it.
        self.env = env
        self.player = player
        self.flatten_action = flatten_action
        self.board_shape = board_shape

    def predict(self, board):
        """Return the policy's action for ``board``, or a pass if it is unknown.

        When the (player-normalised) board is not a key of the policy, a
        diagnostic is printed and ``[-1, 0]`` is returned as the action.
        """
        canonical_board = board * self.player
        board_tuple = tuple(canonical_board.reshape(-1))
        if board_tuple in self.pi_dict:
            action = np.array(self.pi_dict[board_tuple])
        else:
            print(f'PASS - DICT - Player: {self.player}')
            print(canonical_board)
            action = np.array([-1, 0])
        if self.flatten_action:
            return action[0] * self.board_shape + action[1]
        return action

In [9]:
# Create a 4x4 Reversi environment and show the initial board and the player to move.
# `env`, `board` and `board_shape` are notebook globals used by other cells.
board_shape = 4
env = ReversiEnv(board_shape=board_shape)
(board, player) = env.reset()
print(board)
print(player)

[[ 0  0  0  0]
 [ 0  1 -1  0]
 [ 0 -1  1  0]
 [ 0  0  0  0]]
1


In [37]:
# Validity mask for player 1 on the current `board` (nonzero cells are playable).
env.get_valid((board,1))

array([[1, 0, 0, 0],
       [0, 0, 0, 0],
       [1, 0, 0, 0],
       [1, 1, 0, 0]], dtype=int8)

In [38]:
# One player of each kind, all moving as player 1 on a 4x4 board.
# Only the policy player is configured to return a flattened action.
gp = GreedyPlayer(player=1, board_shape=4)
rp = RandomPlayer(player=1, board_shape=4)
op = DictPolicyPlayer(player=1, board_shape=4, flatten_action=True)

In [39]:
# Hand-built 4x4 position used to compare the three players' predictions.
board = np.array([
         [0,-1,1,-1],
         [0,1,1,0],
         [0,-1,1,0],
         [0,0,0,0]]
)

In [40]:
# Greedy player's move for the hand-built position.
gp.predict(board)

array([3, 0])

In [41]:
# Random player's move for the same position (varies between runs unless seeded).
rp.predict(board)

array([3, 0])

In [42]:
# Policy player's move; a board missing from the policy dict yields a pass,
# which flattens to -1 * 4 + 0 = -4 because `op` uses flatten_action=True.
op.predict(board)

PASS - DICT
[[ 0 -1  1 -1]
 [ 0  1  1  0]
 [ 0 -1  1  0]
 [ 0  0  0  0]]


-4

# Verificar que el pass funciona OK

In [45]:
# Players for the pass check below. The checked position is 4x4 and the stored
# policy is 4x4-only, so the players are built for a 4x4 board to match it.
gp = GreedyPlayer(player=1, board_shape=4)
rp = RandomPlayer(player=1, board_shape=4)
op = DictPolicyPlayer(player=1, board_shape=4)

In [27]:
# 4x4 position used to check passing: in the saved run all three players passed.
board = np.array([
    [-1, 0, 0, 0],
    [0, 1, 1, 0],
    [0, 1, 1, 0],
    [0, 0, 0, 0]]
)
# Only the last call's return value is displayed; the first two are visible
# through the 'PASS' line each of them prints.
rp.predict(board)
gp.predict(board)
op.predict(board)

PASS
PASS
PASS - DICT


array([-1,  0])

# Completar la función que dado dos jugadores imprima estadísticas de las partidas

In [46]:
def arena_stats(Player_1, Player_2, board_shape, N=500):
    """Play ``N`` games between two player classes and print Player_1's statistics.

    Before each game Player_1 is randomly assigned colour ``1`` or ``-1`` and
    Player_2 gets the opposite colour. Player_1 "plays as first" when it holds
    colour ``1`` (the colour ``env.reset()`` reported as first to move in this
    notebook) and "as second" otherwise. A game counts as a win for Player_1
    when the final ``reward`` returned by ``env.step`` equals Player_1's colour.

    Args:
        Player_1: Player class, instantiated as
            ``Player_1(player=..., board_shape=..., flatten_action=False)``.
        Player_2: Opponent player class, instantiated the same way.
        board_shape: Forwarded to ``ReversiEnv`` and to both player constructors.
        N: Number of games to play; must be positive.

    Raises:
        ValueError: If ``N`` is not positive.

    Prints Player_1's win rate as first and as second mover, how many games it
    played in each role, and the average number of ``env.step`` calls per game.
    A win rate is printed as ``nan`` when Player_1 never played in that role.
    """
    if N <= 0:
        # Without games every average below would divide by zero.
        raise ValueError("N must be a positive number of games")

    env = ReversiEnv(board_shape=board_shape)
    wins_as_first = 0
    wins_as_second = 0
    plays_as_first = 0
    plays_as_second = 0
    total_steps = 0
    player_1 = Player_1(player=1, board_shape=board_shape, flatten_action=False)
    player_2 = Player_2(player=-1, board_shape=board_shape, flatten_action=False)

    for _ in range(N):
        # Randomise which colour Player_1 gets, i.e. whether it moves first.
        player_1_colour = np.random.choice([-1, 1])
        player_1.player = player_1_colour
        player_2.player = -player_1_colour

        moves_first = player_1_colour == 1
        if moves_first:
            plays_as_first += 1
        else:
            plays_as_second += 1

        done = False
        n_steps = 0
        (board, player) = env.reset()

        while not done:
            # `player` is the colour to move; dispatch to whoever holds it.
            mover = player_1 if player == player_1_colour else player_2
            action = mover.predict(board=board)
            (board, player), reward, done, _ = env.step(action)
            n_steps += 1
        total_steps += n_steps

        if reward == player_1_colour:
            if moves_first:
                wins_as_first += 1
            else:
                wins_as_second += 1

    # With a small N one role may never be drawn; report nan instead of dividing by zero.
    first_rate = wins_as_first / plays_as_first if plays_as_first else float('nan')
    second_rate = wins_as_second / plays_as_second if plays_as_second else float('nan')
    print(f'Wins as first: {first_rate}')
    print(f'Wins as second: {second_rate}')
    print(f'Plays as first: {plays_as_first}')
    print(f'Plays as second: {plays_as_second}')
    print(f'Avg game duration: {total_steps/N}')
        
    

In [48]:
# Player_1 = GreedyPlayer vs Player_2 = DictPolicyPlayer on 4x4, 2000 games; the win rates are Player_1's.
arena_stats( GreedyPlayer,DictPolicyPlayer, 4, N=2000)

Wins as first: 0.0
Wins as second: 0.1694255111976631
Plays as first: 973
Plays as second: 1027
Avg game duration: 11.7385


In [376]:
# Player_1 = DictPolicyPlayer vs Player_2 = RandomPlayer on 4x4, 1000 games; the win rates are Player_1's.
arena_stats(DictPolicyPlayer, RandomPlayer, 4, N=1000)

Wins as first: 0.818
Wins as second: 1.0
Plays as first: 500
Plays as second: 500
Avg game duration: 11.634


In [377]:
# Same pairing with roles swapped: Player_1 = RandomPlayer, so the win rates are the random player's.
arena_stats(RandomPlayer, DictPolicyPlayer, 4, N=1000)

Wins as first: 0.0
Wins as second: 0.125
Plays as first: 488
Plays as second: 512
Avg game duration: 11.684


In [382]:
# Player_1 = RandomPlayer vs Player_2 = GreedyPlayer on 8x8, 1000 games; the win rates are Player_1's.
arena_stats(RandomPlayer, GreedyPlayer, 8, N=1000)

Wins as first: 0.376
Wins as second: 0.394
Plays as first: 500
Plays as second: 500
Avg game duration: 58.115


In [383]:
# Player_1 = GreedyPlayer vs Player_2 = RandomPlayer on 8x8, 1000 games.
# NOTE(review): the saved run of this cell ended in KeyboardInterrupt, so no result is recorded — re-run to completion.
arena_stats(GreedyPlayer,RandomPlayer, 8, N=1000)

KeyboardInterrupt: 

In [379]:
# Random vs random on 4x4 with the default N=500 games; the win rates are Player_1's.
arena_stats(RandomPlayer, RandomPlayer, 4)

Wins as first: 0.35019455252918286
Wins as second: 0.588477366255144
Plays as first: 257
Plays as second: 243
Avg game duration: 11.752


In [380]:
# Greedy vs greedy on 4x4 with the default N=500 games; the win rates are Player_1's.
arena_stats(GreedyPlayer, GreedyPlayer, 4)

Wins as first: 0.4291497975708502
Wins as second: 0.4980237154150198
Plays as first: 247
Plays as second: 253
Avg game duration: 11.586


In [381]:
# Repeat of the earlier RandomPlayer vs GreedyPlayer 8x8 experiment (same arguments, independent random run).
arena_stats(RandomPlayer, GreedyPlayer, 8, N=1000)

Wins as first: 0.3804780876494024
Wins as second: 0.3755020080321285
Plays as first: 502
Plays as second: 498
Avg game duration: 57.943


# Guardar todas las clases de jugadores en un players.py para que luego se puedan importar de la siguiente forma:

from players import RandomPlayer

from players import GreedyPlayer