In [1]:
%load_ext autoreload
%autoreload 2

# Value Iteration (Maximize Margin)

Instead of the reward being 1 for winning and -1 for losing, the reward is the winning margin: the number of pieces by which the winner leads the loser.

In [3]:
import numpy as np
from dynamic_programming import policy_improve, policy_iteration, generate_deterministic_policy, deterministic_policy_eval_step, policy_improve

In [4]:
# The file holds a pickled Python object wrapped in a 0-d object array (hence .item()).
# NumPy >= 1.16.3 refuses to load object arrays unless allow_pickle=True is explicit.
# NOTE: unpickling can execute arbitrary code -- only load files you generated yourself.
states_actions = np.load('states_actions.npy', allow_pickle=True).item()

In [27]:
def value_iteration(states_actions, theta=1e-8, winning_reward=1e3, verbose=True):
    """Run in-place value iteration using a piece-margin terminal reward.

    Each state's value is the maximum, over its actions, of the negated
    outcome of the move:

    * if the move's ``'winner'`` entry is 0 (game not finished), the outcome
      is the current value of ``'next_node'``;
    * otherwise the move ends the game and the outcome is the sum of the
      entries of ``'next_node'``.

    The negation presumably reflects that ``'next_node'`` is expressed from
    the opponent's point of view -- confirm against the code that builds
    ``states_actions``.

    Values are updated in place during a sweep, so states visited later in the
    same sweep already see the new values.

    Parameters
    ----------
    states_actions : dict
        Maps ``state -> {action: {'next_node': ..., 'winner': ...}}``.
    theta : float
        Sweeps stop once the largest absolute value change in a sweep is
        not greater than ``theta``.
    winning_reward : float
        Unused; kept only so existing callers keep working.
    verbose : bool
        When true (the default), print the sweep number, the largest change
        and the mean absolute change after every sweep.

    Returns
    -------
    (dict, number)
        The value of every state, and the largest change seen in the final sweep.

    Raises
    ------
    KeyError
        If a non-terminal move leads to a state missing from ``states_actions``.
    """
    V = {state: 0 for state in states_actions}
    n_states = len(states_actions)
    delta = theta + 1  # guarantees that at least one sweep is executed
    sweep = 0
    while theta < delta:
        total_change = 0
        delta = 0
        for state, actions in states_actions.items():
            if not actions:
                # No move available: max() would raise on an empty list, so
                # the state simply keeps its current value.
                continue
            expected_rewards = []
            for data in actions.values():
                next_state = data['next_node']
                if data['winner'] == 0:
                    expected_rewards.append(-V[next_state])
                else:
                    # Terminal node: the reward is the piece sum of the final
                    # position (only computed when it is actually needed).
                    expected_rewards.append(-np.array(next_state).sum())
            V_updated = max(expected_rewards)
            change = np.abs(V_updated - V[state])
            total_change = total_change + change
            delta = max(delta, change)
            V[state] = V_updated
        sweep += 1
        if verbose:
            # Avoid 0/0 when there are no states at all.
            mean_change = total_change / n_states if n_states else 0.0
            print(sweep, delta, mean_change)
    return V, delta

In [28]:
%time V, delta = value_iteration(states_actions)

1 16 1.3893429153611885
2 16 1.8213996187638308
3 16 1.6185995048311825
4 16 1.1144913564558183
5 16 0.632315242873732
6 16 0.26934116255121493
7 16 0.115093884884205
8 14 0.03661181832124625
9 14 0.013003658990819658
10 9 0.004108148375364256
11 8 0.0015994391008084835
12 8 0.00010955062334304682
13 0 0.0
CPU times: user 9.86 s, sys: 3.25 ms, total: 9.86 s
Wall time: 10.1 s


In [29]:
def policy_improve(V, states_actions):
    """Build the greedy deterministic policy for a value function.

    For every state the action with the highest score is selected, where the
    score of a move is:

    * ``-V[next_node]`` when the move's ``'winner'`` entry is 0 (game goes on);
    * minus the sum of the entries of ``'next_node'`` otherwise (terminal move).

    Ties are resolved in favour of the first action in the state's action dict.

    Parameters
    ----------
    V : dict
        Maps state -> value; must contain every non-terminal ``'next_node'``.
    states_actions : dict
        Maps ``state -> {action: {'next_node': ..., 'winner': ...}}``.

    Returns
    -------
    dict
        Maps state -> chosen action. States that have no actions are omitted,
        because there is nothing to choose (``np.argmax`` would raise on them).
    """
    pi = {}
    for state, actions in states_actions.items():
        if not actions:
            continue
        actions_list = list(actions)
        expected_rewards = np.empty(len(actions_list))
        for i, action in enumerate(actions_list):
            data = actions[action]
            next_state = data['next_node']
            if data['winner'] == 0:
                expected_rewards[i] = -V[next_state]
            else:
                # Terminal node: score the move by the final piece sum.
                expected_rewards[i] = -np.array(next_state).sum()
        pi[state] = actions_list[np.argmax(expected_rewards)]
    return pi

In [30]:
%time pi = policy_improve(V, states_actions)

CPU times: user 879 ms, sys: 206 ms, total: 1.09 s
Wall time: 888 ms


In [31]:
# Keep copies of the computed policy and value function under the names the
# following cells use, so re-running the cells above does not alter them.
optimum_policy = pi.copy()
optimum_V = V.copy()

In [32]:
from othello.OthelloGame import OthelloGame as Game
from othello.OthelloGame import display as displayGame

In [33]:
# Create an n x n Othello game and print its starting board.
n = 4
game = Game(n)
board = game.getInitBoard()
player = 1
print(board)

[[ 0  0  0  0]
 [ 0 -1  1  0]
 [ 0  1 -1  0]
 [ 0  0  0  0]]


In [34]:
# player_1 moves first. The value function is keyed by the flattened board
# (as a tuple) multiplied by the player to move.
first_player = 1
print(optimum_V[tuple(first_player * board.reshape(-1))])

-8


In [35]:
# Persist the value function and the policy; np.save appends ".npy" to the names.
# Python dicts are written as pickled object arrays, so reading them back with
# np.load requires allow_pickle=True on recent NumPy versions.
np.save('Value_func_margin_reward_val_iter', optimum_V)
np.save('pi_func_margin_reward_val_iter', optimum_policy)

In [36]:
!ls -lah *.npy

-rw-rw-r-- 1 usuario usuario  18M mar 16 15:49 chess_min_steps_pi.npy
-rw-rw-r-- 1 usuario usuario  18M mar 16 16:38 chess_min_steps_pi_value_iter.npy
-rw-rw-r-- 1 usuario usuario  32M mar 16 15:49 chess_min_steps_V.npy
-rw-rw-r-- 1 usuario usuario  32M mar 16 16:38 chess_min_steps_V_value_iter.npy
-rw-rw-r-- 1 usuario usuario  18M mar 16 17:17 chess_pi.npy
-rw-rw-r-- 1 usuario usuario  14M mar 16 17:17 chess_V.npy
-rw-rw-r-- 1 usuario usuario  80M mar 16 12:46 pi_func_margin_reward.npy
-rw-rw-r-- 1 usuario usuario  80M mar 19 11:15 pi_func_margin_reward_val_iter.npy
-rw-rw-r-- 1 usuario usuario  80M mar 16 13:04 pi_func_min_pieces_reward.npy
-rw-rw-r-- 1 usuario usuario  80M feb 23 02:50 pi_func.npy
-rw-rw-r-- 1 usuario usuario  80M mar 16 12:40 pi_func_only_winner.npy
-rw-rw-r-- 1 usuario usuario  80M mar 16 15:12 pi_func_steps_reward.npy
-rw-r--r-- 1 usuario usuario 272M mar 16 10:17 rook_final.npy
-rw-rw-r-- 1 usuario usuario 144M mar 16 12:38 states_actions.npy
-rw-rw-r-- 1 usuari

# Let's play the game

In [37]:
from othello.OthelloGame import OthelloGame as Game
from othello.OthelloGame import display as displayGame
import numpy as np

from playing_stats import EvaluatePolicy

In [39]:
# The policy was saved as a pickled dict inside a 0-d object array, so it must be
# loaded with allow_pickle=True (mandatory since NumPy 1.16.3) and unwrapped with .item().
# NOTE: unpickling can execute arbitrary code -- only load files you generated yourself.
optimum_policy = np.load('pi_func_margin_reward_val_iter.npy', allow_pickle=True).item()
evalPolicy = EvaluatePolicy(optimum_policy)
n = 4
game = Game(n)
board = game.getInitBoard()
player = 1

In [40]:
def display_results(player_1_wins, player_2_wins, ties, margins, steps_array, pieces, episodes=None):
    """Print win/tie percentages and max/mean/min summaries of a set of games.

    Parameters
    ----------
    player_1_wins, player_2_wins, ties : int
        Number of games with each outcome.
    margins, steps_array, pieces : array-like
        Per-game values; the maximum, mean and minimum of each are printed.
    episodes : int, optional
        Number of games used as the denominator of the percentages. When
        omitted it is taken as ``player_1_wins + player_2_wins + ties`` instead
        of being read from a notebook-level global (this assumes every game
        ends in exactly one of those three outcomes).
    """
    if episodes is None:
        episodes = player_1_wins + player_2_wins + ties

    def as_percent(count):
        # Adding 0.5 before truncating rounds to the nearest whole percent.
        return str(int(100*count/episodes + 0.5)) + '%'

    print('player_1 wins:', as_percent(player_1_wins))
    print('player_2 wins:', as_percent(player_2_wins))
    print('ties:', as_percent(ties))
    for label, values in (('margins', margins), ('steps', steps_array), ('pieces', pieces)):
        print('Max, Mean, Min ' + label + ': ', end='')
        print(np.max(values), np.mean(values), np.min(values))

## Policy plays second against random

In [41]:
# Play `episodes` games from the initial board. The dict maps each player id
# (1 and -1) to the callable that picks its moves: here the random player is
# player 1 and the stored policy is player -1.
episodes = 1000
player_1_wins, player_2_wins, ties, margins, steps_array, pieces = evalPolicy.get_stats(game, 
                                                board, 
                                                {1: evalPolicy.random_player, -1: evalPolicy.policy_player}, 
                                                episodes)
display_results(player_1_wins, player_2_wins, ties, margins, steps_array, pieces)

player_1 wins: 0%
player_2 wins: 100%
ties: 0%
Max, Mean, Min margins: -8 -13.449 -16
Max, Mean, Min steps: 16 13.812 10
Max, Mean, Min pieces: 16 15.667 13


## Policy plays first against random

In [42]:
# Same experiment with the roles swapped: the stored policy is player 1 and the
# random player is player -1.
episodes = 1000
player_1_wins, player_2_wins, ties, margins, steps_array, pieces = evalPolicy.get_stats(game, 
                                                board, 
                                                {1: evalPolicy.policy_player, -1: evalPolicy.random_player}, 
                                                episodes)
display_results(player_1_wins, player_2_wins, ties, margins, steps_array, pieces)

player_1 wins: 96%
player_2 wins: 2%
ties: 2%
Max, Mean, Min margins: 15 8.624 -8
Max, Mean, Min steps: 15 13.071 10
Max, Mean, Min pieces: 16 15.332 13


## Player 1: Optimum Policy (margin), Player 2: Optimum Policy (only win)

In [43]:
!ls *.npy

chess_min_steps_pi.npy		    pi_func_only_winner.npy
chess_min_steps_pi_value_iter.npy   pi_func_steps_reward.npy
chess_min_steps_V.npy		    rook_final.npy
chess_min_steps_V_value_iter.npy    states_actions.npy
chess_pi.npy			    Value_func_margin_reward.npy
chess_V.npy			    Value_func_margin_reward_val_iter.npy
pi_func_margin_reward.npy	    Value_func_min_pieces_reward.npy
pi_func_margin_reward_val_iter.npy  Value_func.npy
pi_func_min_pieces_reward.npy	    Value_func_only_winner.npy
pi_func.npy			    Value_func_steps_reward.npy


In [44]:
# Both files hold a pickled dict inside a 0-d object array: allow_pickle=True is
# mandatory since NumPy 1.16.3, and .item(0) unwraps the single stored object.
# NOTE: unpickling can execute arbitrary code -- only load files you generated yourself.
pi_margin = np.load('pi_func_margin_reward.npy', allow_pickle=True).item(0)
pi_only_wins = np.load('pi_func_only_winner.npy', allow_pickle=True).item(0)

In [45]:
# Evaluator holding two stored policies so they can play against each other.
# Presumably policy_player uses the first policy (pi_margin) and
# policy_player_pi2 the second (pi_only_wins) -- confirm in playing_stats.
evalPolicy = EvaluatePolicy(pi_margin, pi_only_wins)
n = 4
game = Game(n)
board = game.getInitBoard()
player = 1

In [46]:
# A single game is played: policy_player is player 1 and policy_player_pi2 is
# player -1.
episodes = 1
player_1_wins, player_2_wins, ties, margins, steps_array, pieces = evalPolicy.get_stats(game, 
                                                board, 
                                                {1: evalPolicy.policy_player, -1: evalPolicy.policy_player_pi2}, 
                                                episodes)
display_results(player_1_wins, player_2_wins, ties, margins, steps_array, pieces)

player_1 wins: 0%
player_2 wins: 100%
ties: 0%
Max, Mean, Min margins: -2 -2.0 -2
Max, Mean, Min steps: 13 13.0 13
Max, Mean, Min pieces: 16 16.0 16


- It makes no sense to play more than one game, because both policies are deterministic.
- Interestingly, player 2 (the win-only policy) wins, but only by 2 pieces (reported as -2 because margins are negative when player 2 wins), which is the minimum margin.

## Player 1: Optimum Policy (only win), Player 2: Optimum Policy (margin)

In [47]:
# A single game with the roles swapped: policy_player_pi2 is player 1 and
# policy_player is player -1.
episodes = 1
player_1_wins, player_2_wins, ties, margins, steps_array, pieces = evalPolicy.get_stats(game, 
                                                board, 
                                                {1: evalPolicy.policy_player_pi2, -1: evalPolicy.policy_player}, 
                                                episodes)
display_results(player_1_wins, player_2_wins, ties, margins, steps_array, pieces)

player_1 wins: 0%
player_2 wins: 100%
ties: 0%
Max, Mean, Min margins: -9 -9.0 -9
Max, Mean, Min steps: 10 10.0 10
Max, Mean, Min pieces: 13 13.0 13


- It makes no sense to play more than one game, because both policies are deterministic.
- The margin policy, playing second, wins by a margin of 9. A margin of at least 8 is expected, consistent with the initial-state value of -8 computed above for the first player.