In [38]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Enhanced policy (Minimize pieces in board)

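Instead of the reward being 1 for winning and -1 for losing, a terminal position is rewarded with winner * (16 - steps), where steps = number_of_pieces_in_board - 4 (the pieces added since the 4-piece starting position). The fewer pieces there are on the board when the game ends, the larger the magnitude of the reward.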

In [1]:
import numpy as np
from dynamic_programming import policy_improve, policy_iteration, generate_deterministic_policy, deterministic_policy_eval_step, policy_improve

In [2]:
# The .npy file holds a dict wrapped in a 0-d object array (hence `.item()`), which
# NumPy stores via pickle. Since NumPy 1.16.3 `np.load` refuses pickled payloads
# unless `allow_pickle=True` is passed explicitly.
# NOTE(review): unpickling can execute arbitrary code - only load a
# states_actions.npy file that was generated by this project.
states_actions = np.load('states_actions.npy', allow_pickle=True).item()

In [3]:
# Scratch check: this is the absolute value of the SUM (|1 - 1 + 1 + 1| = 2),
# not the sum of absolute values (which would be 4) used by the reward code
# in the functions defined further down.
np.abs(np.array((1, -1, 1, 1)).sum())

2

In [47]:
def policy_improve(V, states_actions):
    """Return the greedy deterministic policy with respect to the value table ``V``.

    Args:
        V: mapping from a state to its current value estimate. It is indexed
            with the ``'next_node'`` of every non-terminal action.
        states_actions: mapping ``state -> {action: data}`` where ``data`` is a
            dict with the keys ``'next_node'`` (successor state) and
            ``'winner'`` (0 when the move does not end the game).

    Returns:
        dict mapping every state to the action with the highest score. On ties
        the first action (in the iteration order of the state's actions) wins.
    """
    greedy_policy = {}
    for state, actions in states_actions.items():
        candidate_actions = list(actions)
        scores = np.zeros(len(candidate_actions))
        for idx, outcome in enumerate(actions.values()):
            successor = outcome['next_node']
            winner = outcome['winner']
            # This is the modification: the terminal reward is scaled by
            # 16 - (sum of absolute board entries - 4) instead of being +/-1.
            placed_pieces = np.abs(np.array(successor)).sum() - 4
            terminal_reward = winner * (16 - placed_pieces)
            # winner == 0: non-terminal move, score it by the negated successor
            # value (presumably because the successor is valued from the
            # opponent's side - confirm against the state encoding).
            # Otherwise this is a terminal node and the negated reward is used.
            scores[idx] = -V[successor] if winner == 0 else -terminal_reward
        greedy_policy[state] = candidate_actions[np.argmax(scores)]
    return greedy_policy

In [48]:
def deterministic_policy_eval_step(states_actions, V, pi):
    """Run one in-place evaluation sweep of the deterministic policy ``pi``.

    The sweep updates ``V`` in place (a single table rather than two), which
    needs less memory and still converges. Because of that, a state visited
    later in the sweep already sees the refreshed values of earlier states.

    Args:
        states_actions: mapping ``state -> {action: data}`` where ``data`` has
            the keys ``'next_node'`` and ``'winner'`` (0 for a non-terminal move).
        V: mutable mapping ``state -> value``; modified in place.
        pi: mapping ``state -> action``, the action the policy takes in
            that state.

    Returns:
        Tuple ``(V, delta)``: the same ``V`` object that was passed in and the
        largest absolute change applied to any state value during the sweep.
    """
    largest_change = 0
    for state, actions in states_actions.items():
        chosen = actions[pi[state]]
        successor = chosen['next_node']
        winner = chosen['winner']
        # This is the modification: the terminal reward is scaled by
        # 16 - (sum of absolute board entries - 4) instead of being +/-1.
        placed_pieces = np.abs(np.array(successor)).sum() - 4
        terminal_reward = winner * (16 - placed_pieces)
        # Non-terminal move: back up the successor's value.
        # Terminal node: back up the reward. Either way the sign is flipped.
        backed_up = V[successor] if winner == 0 else terminal_reward
        new_value = 0 - backed_up
        change = np.abs(new_value - V[state])
        if change > largest_change:
            largest_change = change
        V[state] = new_value
    return V, largest_change

In [49]:
# Start from a deterministic policy generated by the project helper, then run
# policy iteration, passing in the evaluation and improvement steps that use
# the piece-count-based reward (the notebook-local definitions shadow the
# imported functions of the same name).
initial_policy = generate_deterministic_policy(states_actions)
optimum_policy, optimum_V = policy_iteration(states_actions, 
                                             initial_policy, 
                                             deterministic_policy_eval_step, 
                                             policy_improve,
                                             verbose = 1)

Iteration number:  1 2 3 4 5 6 7 8 9 10 11 12 13 
Number of differences of new policy vs old policy: 23034
---------------------------
Iteration number:  1 2 3 4 5 6 7 8 9 10 11 12 13 14 
Number of differences of new policy vs old policy: 5383
---------------------------
Iteration number:  1 2 3 4 5 6 7 8 9 10 11 12 13 
Number of differences of new policy vs old policy: 1558
---------------------------
Iteration number:  1 2 3 4 5 6 7 8 9 10 11 12 
Number of differences of new policy vs old policy: 352
---------------------------
Iteration number:  1 2 3 4 5 6 7 8 9 10 11 12 13 
Number of differences of new policy vs old policy: 70
---------------------------
Iteration number:  1 2 3 4 5 6 7 8 9 10 11 12 13 
Number of differences of new policy vs old policy: 10
---------------------------
Iteration number:  1 2 3 4 5 6 7 8 9 10 11 12 13 
Number of differences of new policy vs old policy: 0
---------------------------


In [50]:
from othello.OthelloGame import OthelloGame as Game
from othello.OthelloGame import display as displayGame

In [51]:
# Create an Othello game with size parameter n = 4 and fetch its initial board
# (printed below as a 4x4 array with the four starting pieces in the centre).
n = 4
game = Game(n)
board = game.getInitBoard()
player = 1
print(board)

[[ 0  0  0  0]
 [ 0 -1  1  0]
 [ 0  1 -1  0]
 [ 0  0  0  0]]


In [52]:
# player_1 starts
first_player = 1
# The state key is the flattened board multiplied by the player's sign,
# converted to a tuple so it can index the value table.
print(optimum_V[tuple(first_player * board.reshape(-1))])

-5


In [53]:
#np.save('Value_func_diff_reward', optimum_V)
#np.save('pi_func_diff_reward', optimum_policy)

# Let's play the game

In [54]:
from othello.OthelloGame import OthelloGame as Game
from othello.OthelloGame import display as displayGame
import numpy as np

from playing_stats import EvaluatePolicy

In [55]:
# Optional: evaluate a previously saved policy instead of the one computed above.
# optimum_policy = np.load('pi_func_only_winner.npy').item()
# Wrap the policy in the project's evaluation helper and reset the game state
# to a fresh 4x4 initial board.
evalPolicy = EvaluatePolicy(optimum_policy)
n = 4
game = Game(n)
board = game.getInitBoard()
player = 1

## Policy plays second against random

In [56]:
# Play `episodes` games with the random player as player 1 and the learned
# policy as player -1, then print a summary of the results.
# NOTE(review): no random seed is set in the visible cells, so the figures
# presumably vary between runs - confirm how random_player draws its moves.
episodes = 1000
player_1_wins, player_2_wins, ties, margins, steps_array, pieces = evalPolicy.get_stats(game, 
                                                board, 
                                                {1: evalPolicy.random_player, -1: evalPolicy.policy_player}, 
                                                episodes)
# Adding 0.5 before int() rounds each percentage to the nearest integer.
print('player_1 wins:', str(int(100*player_1_wins/episodes + 0.5)) + '%')
print('player_2 wins:', str(int(100*player_2_wins/episodes + 0.5)) +'%')
print('ties:', str(int(100*ties/episodes + 0.5))+ '%')
# Each summary below is printed as "max mean min" on the same line as its label.
print('Max, Mean, Min margins: ', end ='')
print(np.max(margins), np.mean(margins), np.min(margins))
print('Max, Mean, Min steps: ', end ='')
print(np.max(steps_array), np.mean(steps_array), np.min(steps_array))
print('Max, Mean, Min pieces: ', end ='')
print(np.max(pieces), np.mean(pieces), np.min(pieces))

player_1 wins: 0%
player_2 wins: 100%
ties: 0%
Max, Mean, Min margins: -5 -9.326 -12
Max, Mean, Min steps: 12 9.038 8
Max, Mean, Min pieces: 15 12.034 11


## Policy plays first against random

In [57]:
# Play `episodes` games with the learned policy as player 1 and the random
# player as player -1, then print a summary of the results.
# NOTE(review): no random seed is set in the visible cells, so the figures
# presumably vary between runs - confirm how random_player draws its moves.
episodes = 1000
player_1_wins, player_2_wins, ties, margins, steps_array, pieces = evalPolicy.get_stats(game, 
                                                board, 
                                                {1: evalPolicy.policy_player, -1: evalPolicy.random_player}, 
                                                episodes)
# Adding 0.5 before int() rounds each percentage to the nearest integer.
print('player_1 wins:', str(int(100*player_1_wins/episodes + 0.5)) + '%')
print('player_2 wins:', str(int(100*player_2_wins/episodes + 0.5)) +'%')
print('ties:', str(int(100*ties/episodes + 0.5))+ '%')
# Each summary below is printed as "max mean min" on the same line as its label.
print('Max, Mean, Min margins: ', end ='')
print(np.max(margins), np.mean(margins), np.min(margins))
print('Max, Mean, Min steps: ', end ='')
print(np.max(steps_array), np.mean(steps_array), np.min(steps_array))
print('Max, Mean, Min pieces: ', end ='')
print(np.max(pieces), np.mean(pieces), np.min(pieces))

player_1 wins: 80%
player_2 wins: 20%
ties: 0%
Max, Mean, Min margins: 12 5.039 -12
Max, Mean, Min steps: 14 10.667 7
Max, Mean, Min pieces: 16 13.507 10
