In [7]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Enhanced policy (Maximize Margin)

Instead of the reward being 1 for winning and -1 for losing, the reward is the number of pieces by which the winner exceeds the loser (the winning margin).

In [8]:
import numpy as np
from dynamic_programming import policy_improve, policy_iteration, generate_deterministic_policy, deterministic_policy_eval_step, policy_improve

In [9]:
# The file holds a dict stored as a pickled 0-d object array, hence the .item().
# NumPy >= 1.16.3 refuses to unpickle unless allow_pickle=True is passed explicitly.
# NOTE: unpickling can execute arbitrary code -- only load files produced locally.
states_actions = np.load('states_actions.npy', allow_pickle=True).item()

In [10]:
def policy_improve(V, states_actions):
    """Greedy policy improvement using the piece margin as terminal reward.

    For every state, each action is scored with a one-step look-ahead and the
    best-scoring action is kept. Scores are negated values of the successor
    (presumably because the successor is expressed from the opponent's point
    of view -- TODO confirm against the code that builds ``states_actions``).

    Args:
        V: mapping from state to its current value estimate.
        states_actions: mapping ``state -> {action: data}`` where ``data``
            provides ``'next_node'`` (the successor state) and ``'winner'``
            (``0`` is treated as "game not finished", anything else as a
            terminal transition).

    Returns:
        dict mapping every state to its greedy action. Ties go to the first
        action in the iteration order of the state's action dict.

    Raises:
        ValueError: raised by ``np.argmax`` if a state has no actions.
    """
    pi = {}
    for state, actions in states_actions.items():
        actions_list = []
        scores = []
        # Single pass: collect each action together with its score.
        for action, data in actions.items():
            next_state = data['next_node']
            if data['winner'] == 0:
                score = -V[next_state]
            else:
                # Terminal node. This is the modification: the reward is the sum
                # of the successor board (piece margin), computed only here
                # because non-terminal transitions never use it.
                score = -np.array(next_state).sum()
            actions_list.append(action)
            scores.append(score)
        pi[state] = actions_list[np.argmax(np.array(scores, dtype=float))]
    return pi

In [11]:
def deterministic_policy_eval_step(states_actions, V, pi):
    """Run one in-place evaluation sweep for a deterministic policy.

    Values are overwritten as the sweep advances (evaluation in place, in
    contrast with keeping two value tables); this needs less memory and
    converges too.

    Args:
        states_actions: mapping ``state -> {action: data}`` where ``data``
            provides ``'next_node'`` and ``'winner'`` (``0`` is treated as
            "game not finished", anything else as a terminal transition).
        V: mapping from state to value; it is mutated by this function.
        pi: mapping from state to the single action the policy plays there.

    Returns:
        tuple ``(V, delta)``: the same ``V`` object after the sweep and the
        largest absolute change applied to any state value.
    """
    delta = 0
    for state, actions in states_actions.items():
        transition = actions[pi[state]]
        next_node = transition['next_node']
        if transition['winner'] == 0:
            new_value = -V[next_node]
        else:
            # Terminal node. This is the modification: the reward is the sum of
            # the successor board (piece margin), negated like the values above.
            new_value = -np.array(next_node).sum()
        delta = max(delta, np.abs(new_value - V[state]))
        V[state] = new_value
    return V, delta

In [12]:
# Run the project's policy_iteration with the margin-based evaluation and
# improvement functions defined in this notebook (they shadow the imported ones).
initial_policy = generate_deterministic_policy(states_actions)
optimum_policy, optimum_V = policy_iteration(
    states_actions,
    initial_policy,
    deterministic_policy_eval_step,
    policy_improve,
    verbose=1,
)

Iteration number:  1 2 3 4 5 6 7 8 9 10 11 12 13 14 
Number of differences of new policy vs old policy: 23012
---------------------------
Iteration number:  1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 
Number of differences of new policy vs old policy: 7131
---------------------------
Iteration number:  1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 
Number of differences of new policy vs old policy: 1868
---------------------------
Iteration number:  1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 
Number of differences of new policy vs old policy: 440
---------------------------
Iteration number:  1 2 3 4 5 6 7 8 9 10 11 12 13 
Number of differences of new policy vs old policy: 85
---------------------------
Iteration number:  1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 
Number of differences of new policy vs old policy: 18
---------------------------
Iteration number:  1 2 3 4 5 6 7 8 9 10 11 12 13 
Number of differences of new policy vs old policy: 2
---------------------------
Iteration number:  1 2 3 4 5 6 7 8 9 10

In [13]:
from othello.OthelloGame import OthelloGame as Game
from othello.OthelloGame import display as displayGame

In [14]:
# Build a 4x4 Othello game and print its starting board.
n = 4
game = Game(n)
board = game.getInitBoard()
player = 1
print(board)

[[ 0  0  0  0]
 [ 0 -1  1  0]
 [ 0  1 -1  0]
 [ 0  0  0  0]]


In [15]:
# Empieza player_1
first_player = 1
print(optimum_V[tuple(first_player * board.reshape(-1))])

-8


In [16]:
# Persist the value table first, then the policy (np.save appends ".npy").
for target_name, payload in (
    ('Value_func_margin_reward', optimum_V),
    ('pi_func_margin_reward', optimum_policy),
):
    np.save(target_name, payload)

In [17]:
# List the .npy files in the working directory with sizes and modification dates.
!ls -lah *.npy

-rw-rw-r-- 1 usuario usuario  80M mar 16 12:46 pi_func_margin_reward.npy
-rw-rw-r-- 1 usuario usuario  80M feb 23 02:50 pi_func.npy
-rw-rw-r-- 1 usuario usuario  80M mar 16 12:40 pi_func_only_winner.npy
-rw-r--r-- 1 usuario usuario 272M mar 16 10:17 rook_final.npy
-rw-rw-r-- 1 usuario usuario 144M mar 16 12:38 states_actions.npy
-rw-rw-r-- 1 usuario usuario  81M mar 16 12:45 Value_func_margin_reward.npy
-rw-rw-r-- 1 usuario usuario  76M feb 23 02:50 Value_func.npy
-rw-rw-r-- 1 usuario usuario  76M mar 16 12:39 Value_func_only_winner.npy
-rw-rw-r-- 1 usuario usuario  133 mar 16 10:16 Value_func_steps_reward.npy


# Let's play the game

In [18]:
from othello.OthelloGame import OthelloGame as Game
from othello.OthelloGame import display as displayGame
import numpy as np

from playing_stats import EvaluatePolicy

In [29]:
# Reload the saved margin policy. It is a dict stored as a pickled 0-d object
# array, so NumPy >= 1.16.3 needs allow_pickle=True to read it.
# NOTE: unpickling can execute arbitrary code -- only load files produced locally.
optimum_policy = np.load('pi_func_margin_reward.npy', allow_pickle=True).item()
evalPolicy = EvaluatePolicy(optimum_policy)
n = 4
game = Game(n)
board = game.getInitBoard()
player = 1

In [30]:
def display_results(player_1_wins, player_2_wins, ties, margins, steps_array, pieces,
                    n_episodes=None):
    """Print win/tie percentages and max/mean/min summaries of a batch of games.

    Args:
        player_1_wins: number of games won by player 1.
        player_2_wins: number of games won by player 2.
        ties: number of tied games.
        margins: per-game margins; max, mean and min are printed.
        steps_array: per-game step counts; max, mean and min are printed.
        pieces: per-game piece counts; max, mean and min are printed.
        n_episodes: number of games the counts refer to. When omitted, the
            notebook-level ``episodes`` variable is used, as before.

    Returns:
        None. Everything is written to standard output.
    """
    # Prefer the explicit argument; fall back to the global for older call sites.
    total = episodes if n_episodes is None else n_episodes

    def percent(count):
        # Whole percentage, rounded half up by adding 0.5 before truncating.
        return str(int(100 * count / total + 0.5)) + '%'

    print('player_1 wins:', percent(player_1_wins))
    print('player_2 wins:', percent(player_2_wins))
    print('ties:', percent(ties))
    for label, values in (('margins', margins), ('steps', steps_array), ('pieces', pieces)):
        print('Max, Mean, Min ' + label + ': ', end='')
        print(np.max(values), np.mean(values), np.min(values))

## Policy plays second against random

In [31]:
# The random player takes seat 1 and the loaded policy takes seat -1.
episodes = 1000
players = {1: evalPolicy.random_player, -1: evalPolicy.policy_player}
(player_1_wins, player_2_wins, ties,
 margins, steps_array, pieces) = evalPolicy.get_stats(game, board, players, episodes)
display_results(player_1_wins, player_2_wins, ties, margins, steps_array, pieces)

player_1 wins: 0%
player_2 wins: 100%
ties: 0%
Max, Mean, Min margins: -8 -13.318 -16
Max, Mean, Min steps: 16 13.758 10
Max, Mean, Min pieces: 16 15.636 13


## Policy plays first against random

In [32]:
# The loaded policy takes seat 1 and the random player takes seat -1.
episodes = 1000
players = {1: evalPolicy.policy_player, -1: evalPolicy.random_player}
(player_1_wins, player_2_wins, ties,
 margins, steps_array, pieces) = evalPolicy.get_stats(game, board, players, episodes)
display_results(player_1_wins, player_2_wins, ties, margins, steps_array, pieces)

player_1 wins: 95%
player_2 wins: 3%
ties: 2%
Max, Mean, Min margins: 15 8.337 -8
Max, Mean, Min steps: 15 13.06 10
Max, Mean, Min pieces: 16 15.339 13


## Player 1: Optimum Policy (margin), Player 2: Optimum Policy (only win)

In [33]:
# List the .npy files available in the working directory.
!ls *.npy

pi_func_margin_reward.npy  states_actions.npy
pi_func.npy		   Value_func_margin_reward.npy
pi_func_only_winner.npy    Value_func.npy
pi_func_steps_reward.npy   Value_func_only_winner.npy
rook_final.npy		   Value_func_steps_reward.npy


In [34]:
# Both files hold a pickled object in a one-element array; item(0) extracts it.
# NumPy >= 1.16.3 needs allow_pickle=True to read such files.
# NOTE: unpickling can execute arbitrary code -- only load files produced locally.
pi_margin = np.load('pi_func_margin_reward.npy', allow_pickle=True).item(0)
pi_only_wins = np.load('pi_func_only_winner.npy', allow_pickle=True).item(0)

In [35]:
# Evaluator holding both saved policies: pi_margin is passed first and
# pi_only_wins second. The game and board are rebuilt for a 4x4 match.
evalPolicy = EvaluatePolicy(pi_margin, pi_only_wins)
n = 4
game = Game(n)
board = game.getInitBoard()
player = 1

In [36]:
# Seat 1 uses evalPolicy.policy_player, seat -1 uses evalPolicy.policy_player_pi2.
episodes = 1
players = {1: evalPolicy.policy_player, -1: evalPolicy.policy_player_pi2}
(player_1_wins, player_2_wins, ties,
 margins, steps_array, pieces) = evalPolicy.get_stats(game, board, players, episodes)
display_results(player_1_wins, player_2_wins, ties, margins, steps_array, pieces)

player_1 wins: 0%
player_2 wins: 100%
ties: 0%
Max, Mean, Min margins: -2 -2.0 -2
Max, Mean, Min steps: 13 13.0 13
Max, Mean, Min pieces: 16 16.0 16


- It makes no sense to play more than once because both policies are deterministic.
- It is interesting to note that player 2 wins, but with a margin of only -2, the minimum margin.

## Player 1: Optimum Policy (only win), Player 2: Optimum Policy (margin)

In [37]:
# Seat 1 uses evalPolicy.policy_player_pi2, seat -1 uses evalPolicy.policy_player.
episodes = 1
players = {1: evalPolicy.policy_player_pi2, -1: evalPolicy.policy_player}
(player_1_wins, player_2_wins, ties,
 margins, steps_array, pieces) = evalPolicy.get_stats(game, board, players, episodes)
display_results(player_1_wins, player_2_wins, ties, margins, steps_array, pieces)

player_1 wins: 0%
player_2 wins: 100%
ties: 0%
Max, Mean, Min margins: -9 -9.0 -9
Max, Mean, Min steps: 10 10.0 10
Max, Mean, Min pieces: 13 13.0 13


- It makes no sense to play more than once because both policies are deterministic.
- The margin policy, playing as second player, wins by a margin of 9. A margin greater than or equal to 8 was expected.

In [28]:
# Scratch check: rebinding a name or mutating a list slice leaves the source untouched.
arr_1 = np.array([1, 2, 3, 4])
arr_2 = arr_1[0]  # integer indexing yields a single element
arr_2 = 10  # rebinds the name arr_2 only; arr_1 is not modified
list_1 = [1, 2, 3, 4]
list_2 = list_1[:3]  # slicing a list builds a new list
list_2[0] = 10  # so this assignment does not reach list_1
print(list_1, arr_1)

[1, 2, 3, 4] [1 2 3 4]


In [58]:
# NOTE(review): the recorded output (10) does not match list_2 as built in the
# preceding cell ([10, 2, 3]); presumably this cell ran out of order -- re-run to confirm.
list_2

10

In [71]:
# Scratch check: ndarray.sort() sorts in place and returns None, while np.sort()
# returns a sorted array. By the time of the print, arr_1 itself is already sorted.
arr_1 = np.array([10,  2,  8,  7,  6,  5,  4,  1,  9,  3])
arr_sorted_1 = arr_1.sort()
arr_sorted_2 = np.sort(arr_1)
print(arr_1, arr_sorted_1, arr_sorted_2)

[ 1  2  3  4  5  6  7  8  9 10] None [ 1  2  3  4  5  6  7  8  9 10]


In [66]:
# Show the sorted version of arr_1 returned by np.sort.
np.sort(arr_1)

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10])

In [77]:
# Scratch check: indexing an array with a list of positions picks those elements;
# the negative index -2 selects the second element from the end.
arr_1 = np.array([10,  2,  8,  7,  6,  5,  4,  1,  9,  3])
list_1 = [0, 3, 5, -2]
print(arr_1[list_1])

[10  7  5  9]


In [83]:
# Scratch check: reversing np.argsort gives the indices of the elements from
# largest to smallest.
arr_1 = np.array([10,  2,  8,  7,  6,  5])
print(np.argsort(arr_1)[::-1])

[0 2 3 4 5 1]


In [82]:
# Values of arr_1 in descending order (relies on arr_1 defined in another cell).
print(np.sort(arr_1)[::-1])

[10  8  7  6  5  2]
