In [1]:
%load_ext autoreload
%autoreload 2

# Importar entorno y familiarizarse

In [2]:
from boardgame2 import ReversiEnv
import numpy as np
from random import randint

# Crear 3 tipos de jugador
- Random: Selecciona uniformemente una de las acciones válidas
- Greedy: Selecciona la acción que le da más ganancia inmediata (cantidad de piezas que come). Si hay más de una acción que da máxima ganancia samplear uniformemente entre ellas
- Optimum (solo para 4x4): Usa la política óptima (π) obtenida por policy iteration

Tener en cuenta que:
- ReversiEnv tiene los métodos get_valid y next_step y no es necesario mantener el estado del entorno
- env.PASS ([-1,  0]) es una acción válida posible y debería usarse cuando get_valid devuelve una matriz de ceros

Para el óptimo en 4x4, usar la política (PI) obtenida en la notebook anterior, guardada en la carpeta mdp/ (mdp/pi_mdp.npy)

In [3]:
from players import RandomPlayer as RandomPlayer2
from players import GreedyPlayer as GreedyPlayer2
from players import DictPolicyPlayer as DictPolicyPlayer2

class GreedyPlayer3():
    """Reversi player that picks the move leaving it with the most pieces.

    Ties between equally good moves are broken uniformly at random.

    Args:
        player: Player number, 1 or -1.
        board_shape: Board size used to build a ``ReversiEnv`` when ``env``
            is not given.
        env: Environment to query. It must expose ``board``, ``get_valid``,
            ``next_step`` and ``PASS``.
        flatten_action: When True, ``predict`` returns ``row * size + col``
            instead of a ``[row, col]`` array (same convention as
            ``RandomPlayer3``).

    Raises:
        ValueError: If both ``env`` and ``board_shape`` are None.
    """

    def __init__(self, player=1, board_shape=None, env=None, flatten_action=False):
        if (env is None) and (board_shape is None):
            # Fail fast instead of printing and then building a broken env.
            raise ValueError("board_shape and env can't be both None")
        if env is None:
            env = ReversiEnv(board_shape=board_shape)
        self.env = env
        self.player = player  # player number: 1 or -1
        self.flatten_action = flatten_action
        self.board_shape = self.env.board.shape[0]

    def predict(self, boardToMove):
        """Return the greedy action for ``self.player`` on ``boardToMove``.

        Returns ``self.env.PASS`` when ``get_valid`` reports no legal cell.
        """
        valid_actions = self.env.get_valid((boardToMove, self.player))
        # Treat any non-zero entry as a legal cell. Comparing against
        # self.player (the previous behaviour) finds no move at all for
        # player -1 when the mask is a 0/1 matrix, forcing a spurious PASS.
        candidates = np.argwhere(valid_actions)
        if len(candidates) == 0:
            return self.env.PASS

        best_gain = None
        best_actions = []
        for action in candidates:
            (next_board, _), _, _, _ = self.env.next_step((boardToMove, self.player), action)
            # Rank by own piece count after the move. The starting count is
            # the same for every candidate, so this orders moves by pieces
            # captured. The env reward is not used for ranking: it is assumed
            # to describe only the final winner -- TODO confirm against the
            # environment in use.
            gain = int(np.count_nonzero(next_board == self.player))
            if best_gain is None or gain > best_gain:
                best_gain = gain
                best_actions = [action]
            elif gain == best_gain:
                best_actions.append(action)

        action = best_actions[randint(0, len(best_actions) - 1)]
        if self.flatten_action:
            return action[0] * self.board_shape + action[1]
        return action
        
class RandomPlayer3():
    """Reversi player that samples uniformly among the legal moves.

    Args:
        player: Player number, 1 or -1.
        board_shape: Board size used to build a ``ReversiEnv`` when ``env``
            is not given.
        env: Environment to query. It must expose ``board``, ``get_valid``
            and ``PASS``.
        flatten_action: When True, ``predict`` returns ``row * size + col``
            instead of a ``[row, col]`` array.

    Raises:
        ValueError: If both ``env`` and ``board_shape`` are None.
    """

    def __init__(self, player=1, board_shape=None, env=None, flatten_action=False):
        if (env is None) and (board_shape is None):
            # Fail fast instead of printing and then building a broken env.
            raise ValueError("board_shape and env can't be both None")
        if env is None:
            env = ReversiEnv(board_shape=board_shape)
        self.env = env
        self.player = player
        self.flatten_action = flatten_action
        self.board_shape = self.env.board.shape[0]

    def predict(self, board):
        """Return a uniformly sampled legal action, or ``env.PASS`` if none."""
        valid_actions = self.env.get_valid((board, self.player))
        # Treat any non-zero entry as a legal cell. Comparing against
        # self.player (the previous behaviour) finds no move at all for
        # player -1 when the mask is a 0/1 matrix, forcing a spurious PASS.
        candidates = np.argwhere(valid_actions)
        if len(candidates) == 0:
            print ("No mas movimientos para jugador: " + str(self.player))
            return self.env.PASS
        action = candidates[randint(0, len(candidates) - 1)]
        if self.flatten_action:
            return action[0] * self.board_shape + action[1]
        return action


class DictPolicyPlayer3():
    """Reversi player that looks its move up in a precomputed policy dict.

    The policy is keyed by the flattened board seen from the mover's side
    (``board * player``), so one table serves both colours.

    Args:
        player: Player number, 1 or -1.
        board_shape: Board side length; also used to (un)flatten actions.
        env: Environment providing ``PASS``; a ``ReversiEnv`` is built from
            ``board_shape`` when omitted.
        flatten_action: When True, ``predict`` returns ``row * size + col``;
            otherwise it returns a ``[row, col]`` pair.
        dict_folder: Path to the ``.npy`` file holding the pickled policy dict.
    """

    def __init__(self, player=1, board_shape=4, env=None, flatten_action=False, dict_folder='mdp/pi_mdp.npy'):
        # NOTE(security): allow_pickle=True unpickles arbitrary objects, so
        # only load policy files you produced yourself or otherwise trust.
        self.pi_dict = np.load(dict_folder, allow_pickle=True).item()
        self.env = env
        if env is None:
            self.env = ReversiEnv(board_shape=board_shape)
        self.player = player
        self.flatten_action = flatten_action
        self.board_shape = board_shape

    def predict(self, board):
        """Return the stored action for ``board``, or ``env.PASS`` if unknown."""
        board_tuple = tuple((board * self.player).reshape(-1))
        action = self.pi_dict.get(board_tuple, None)
        if action is None:
            # PASS is returned untouched: running it through the flatten
            # arithmetic would turn it into a meaningless nested value.
            return self.env.PASS

        # The stored format is not visible from here, so accept both a flat
        # index (scalar) and a [row, col] pair and convert only when needed.
        stored_flat = np.ndim(action) == 0
        if self.flatten_action:
            if stored_flat:
                return action
            return action[0] * self.board_shape + action[1]
        if stored_flat:
            return [action // self.board_shape, action % self.board_shape]
        return action

In [4]:
# Option A (disabled): use the player classes defined in this notebook.
#DictPolicyPlayer = DictPolicyPlayer3
#RandomPlayer = RandomPlayer3
#GreedyPlayer = GreedyPlayer3

# Option B (active): use the implementations imported from players.py.
# Every later cell refers to the players through these three aliases.
DictPolicyPlayer = DictPolicyPlayer2
RandomPlayer = RandomPlayer2
GreedyPlayer = GreedyPlayer2

# Verificar que el pass funciona OK

In [5]:
# Greedy and random players for player 1 on a 4x4 board, used by the
# probe cells below.
gp = GreedyPlayer(player=1, board_shape=4)
rp = RandomPlayer(player=1, board_shape=4)
# Hand-built 4x4 position: 1 and -1 are the two players' pieces, 0 is empty.
board = np.array([
    [1, 0, 0, 0],
    [0, -1, 1, 0],
    [0, 1, 1, 0],
    [0, 0, 0, 0]]
)

# Four centre pieces in a diagonal layout (name suggests the opening
# position -- TODO confirm against env.reset()).
boardInicial = np.array([
    [0, 0, 0, 0],
    [0, -1, 1, 0],
    [0, 1, -1, 0],
    [0, 0, 0, 0]]
)

# Position with three -1 pieces and two 1 pieces.
boardToMove = np.array([
    [0, 0, 0, 0],
    [0, -1, -1, 0],
    [0, -1, 1, 0],
    [1, 0, 0, 0]]
)

# Position with four -1 pieces and a single 1 piece.
board3 = np.array([
    [0, 0, 0, 0],
    [0, -1, -1, -1],
    [0, 1, -1, 0],
    [0, 0, 0, 0]]
)

In [6]:
#(board, player), reward, done, info = gp.env.next_step((boardToMove, 1), [0,2])
# Ask gp's environment for the state that follows move [0,3] by gp's player
# on `board`, then display the board part of that state.
next_state, _, _, _ = gp.env.next_step((board, gp.player), [0,3])
next_state[0]

array([[ 1,  0,  0,  0],
       [ 0, -1,  1,  0],
       [ 0,  1,  1,  0],
       [ 0,  0,  0,  0]])

In [7]:
# Policy-table player for player 1, loading its table from mdp/pi_mdp.npy.
dp = DictPolicyPlayer(player=1, board_shape=4, dict_folder='mdp/pi_mdp.npy')


In [8]:
# Show the action each player type proposes for the same hand-built board.
print("RANDOM", rp.predict(board))
print("Greedy", gp.predict(board))
print("Direct", dp.predict(board))


RANDOM [0 1]
Greedy [0 1]
Direct [-1  0]


# Completar la función que dado dos jugadores imprima estadísticas de las partidas

In [9]:
def arena_stats(Player_1, Player_2, board_shape, N=500):
    """Play N games between two player classes and print win statistics.

    Before each game a colour (1 or -1) is drawn at random for ``Player_1``;
    ``Player_2`` gets the opposite one. Games in which ``Player_1`` holds
    colour 1 are reported as "first", the others as "second". All win counts
    refer to ``Player_1``.

    Args:
        Player_1: Player class, instantiated with ``player``, ``board_shape``
            and ``flatten_action`` keyword arguments.
        Player_2: Opponent player class, instantiated the same way.
        board_shape: Board size passed to ``ReversiEnv`` and to both players.
        N: Number of games to play.

    Returns:
        ``[player_1, player_2, total_wins_of_player_1, total_steps, N]``.
    """

    def _rate(wins, plays):
        # A side that was never played (or N == 0) has no defined rate;
        # report nan rather than dividing by zero.
        return wins / plays if plays else float('nan')

    env = ReversiEnv(board_shape=board_shape)
    wins_as_first = 0
    wins_as_second = 0
    plays_as_first = 0
    plays_as_second = 0
    total_steps = 0
    player_1 = Player_1(player=1, board_shape=board_shape, flatten_action=False)
    player_2 = Player_2(player=-1, board_shape=board_shape, flatten_action=False)

    for _ in range(N):
        # Sometimes one player starts, sometimes the other.
        first_player = np.random.choice([-1, 1])
        player_1.player = first_player
        player_2.player = -first_player

        if first_player == 1:
            plays_as_first += 1
        else:
            plays_as_second += 1

        done = False
        n_steps = 0
        (board, player) = env.reset()

        while not done:
            if first_player == player:
                action = player_1.predict(board)  # Player_1 moves
            else:
                action = player_2.predict(board)  # Player_2 moves
            (board, player), reward, done, info = env.step(action)
            n_steps += 1
        total_steps += n_steps

        # The final reward is compared with Player_1's colour to decide
        # whether Player_1 won; anything else is not counted as a win.
        if reward == first_player:
            if first_player == 1:
                wins_as_first += 1
            else:
                wins_as_second += 1

    print(f'Wins as first: {_rate(wins_as_first, plays_as_first)}')
    print(f'Wins as second: {_rate(wins_as_second, plays_as_second)}')
    print(f'Plays as first: {plays_as_first}')
    print(f'Plays as second: {plays_as_second}')
    print(f'Avg game duration: {_rate(total_steps, N)}')

    return [player_1, player_2, wins_as_first + wins_as_second, total_steps, N]

In [10]:
# Result rows from arena_stats, one per match-up, collected for the summary.
stats = []
stats.append(arena_stats(DictPolicyPlayer, GreedyPlayer, 4, N=2000))

Wins as first: 0.8399592252803262
Wins as second: 1.0
Plays as first: 981
Plays as second: 1019
Avg game duration: 11.7355


In [11]:
# Policy-table player against the random player on 4x4.
stats.append(arena_stats(DictPolicyPlayer, RandomPlayer, 4, N=1000))

Wins as first: 0.8153846153846154
Wins as second: 1.0
Plays as first: 520
Plays as second: 480
Avg game duration: 11.664


In [12]:
# Same match-up with the roles swapped, so the stats refer to the random player.
stats.append(arena_stats(RandomPlayer, DictPolicyPlayer, 4, N=1000))

Wins as first: 0.0
Wins as second: 0.12909441233140656
Plays as first: 481
Plays as second: 519
Avg game duration: 11.646


In [13]:
# Random player against the greedy player on 4x4.
stats.append(arena_stats(RandomPlayer, GreedyPlayer, 4, N=1000))

Wins as first: 0.39644970414201186
Wins as second: 0.49898580121703856
Plays as first: 507
Plays as second: 493
Avg game duration: 11.675


In [14]:
# Random self-play on 4x4 (default number of games).
stats.append(arena_stats(RandomPlayer, RandomPlayer, 4))

Wins as first: 0.36531365313653136
Wins as second: 0.5021834061135371
Plays as first: 271
Plays as second: 229
Avg game duration: 11.782


In [15]:
# Greedy self-play on 4x4 (default number of games).
stats.append(arena_stats(GreedyPlayer, GreedyPlayer, 4))

Wins as first: 0.3953488372093023
Wins as second: 0.45867768595041325
Plays as first: 258
Plays as second: 242
Avg game duration: 11.416


In [16]:
# Random player against the greedy player on the full 8x8 board.
stats.append(arena_stats(RandomPlayer, GreedyPlayer, 8, N=1000))

Wins as first: 0.38181818181818183
Wins as second: 0.32475247524752476
Plays as first: 495
Plays as second: 505
Avg game duration: 57.991


# Guardar todas las clases de jugadores en un players.py para que luego se puedan importar de la siguiente forma:

from players import RandomPlayer

from players import GreedyPlayer

In [17]:
# One summary line per match-up: first player's wins and win percentage,
# the remaining games attributed to the opponent, and mean game length.
# Each row is [player_1, player_2, wins, total_steps, games].
for stat in stats:
    name_1 = type(stat[0]).__name__
    name_2 = type(stat[1]).__name__
    wins = stat[2]
    games = stat[4]
    win_pct = round(100 * wins / games)
    remaining = games - wins
    avg_moves = round(stat[3] / games, 1)
    print(f"{name_1} ({wins} - {win_pct}%) vs {name_2} ({remaining})  (moves {avg_moves})")

DictPolicyPlayer (1843 - 92%) vs GreedyPlayer (157)  (moves 11.7)
DictPolicyPlayer (904 - 90%) vs RandomPlayer (96)  (moves 11.7)
RandomPlayer (67 - 7%) vs DictPolicyPlayer (933)  (moves 11.6)
RandomPlayer (447 - 45%) vs GreedyPlayer (553)  (moves 11.7)
RandomPlayer (214 - 43%) vs RandomPlayer (286)  (moves 11.8)
GreedyPlayer (213 - 43%) vs GreedyPlayer (287)  (moves 11.4)
RandomPlayer (353 - 35%) vs GreedyPlayer (647)  (moves 58.0)


DictPolicyPlayer3 (2000 - 100%) vs GreedyPlayer3 (0)  (moves 6.8)
DictPolicyPlayer3 (1000 - 100%) vs RandomPlayer3 (0)  (moves 6.9)
RandomPlayer3 (0 - 0%) vs DictPolicyPlayer3 (1000)  (moves 6.8)
RandomPlayer3 (474 - 47%) vs GreedyPlayer3 (526)  (moves 2.0)
RandomPlayer3 (275 - 55%) vs RandomPlayer3 (225)  (moves 2.0)
GreedyPlayer3 (247 - 49%) vs GreedyPlayer3 (253)  (moves 2.0)
RandomPlayer3 (489 - 49%) vs GreedyPlayer3 (511)  (moves 2.0)

DictPolicyPlayer (1829 - 91%) vs GreedyPlayer (171)  (moves 11.7)
DictPolicyPlayer (907 - 91%) vs RandomPlayer (93)  (moves 11.7)
RandomPlayer (69 - 7%) vs DictPolicyPlayer (931)  (moves 11.6)
RandomPlayer (455 - 46%) vs GreedyPlayer (545)  (moves 11.6)
RandomPlayer (229 - 46%) vs RandomPlayer (271)  (moves 11.8)
GreedyPlayer (236 - 47%) vs GreedyPlayer (264)  (moves 11.6)
RandomPlayer (380 - 38%) vs GreedyPlayer (620)  (moves 58.0)

In [18]:
from datetime import datetime

# Record when the notebook was executed, formatted as dd/mm/YYYY HH:MM:SS.
now = datetime.now()
dt_string = f"{now:%d/%m/%Y %H:%M:%S}"
print("date and time =", dt_string)

date and time = 29/01/2023 00:26:27
