In [7]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Dynamic Programming

In [8]:
from othello.OthelloGame import OthelloGame as Game
from othello.OthelloGame import display as displayGame
from matplotlib import pyplot as plt
from matplotlib import patches
import numpy as np
from tree_search_algs import bfs_cannonical

## Tree Search to find all states
Note: we are finding all states given that player 1 (white) starts playing. 
States of player 2 are expressed in cannonical form. 

In [9]:
n = 4
game = Game(n)
board = game.getInitBoard()

In [10]:
# Find all states of game doing a search tree if player 1 starts
first_player = 1
%time states_actions_player_1 = bfs_cannonical(game, board, first_player) # (1 white)

CPU times: user 30.6 s, sys: 342 ms, total: 30.9 s
Wall time: 30.3 s


In [11]:
# Find all states of game doing a search tree if player 2 starts
first_player = -1
%time states_actions_player_2 = bfs_cannonical(game, board, first_player) #(-1 black)

CPU times: user 30.6 s, sys: 462 ms, total: 31.1 s
Wall time: 30.4 s


In [12]:
states_actions = {**states_actions_player_1, **states_actions_player_2}

In [13]:
print('Number of states:', len(states_actions_player_1))
print('Number of states:', len(states_actions_player_2))

Number of states: 53651
Number of states: 53651


In [14]:
print('Number of states:', len(states_actions))

Number of states: 91282


In [15]:
np.save('states_actions', states_actions)

In [16]:
# Size of the file. Only works if linux or mac. Try dir for windows
! ls -lah states_actions.npy

-rw-rw-r-- 1 usuario usuario 144M mar 16 12:38 states_actions.npy


In [17]:
state = list(states_actions.keys())[0]
print('Example of state:')
print(state)
print()
print('Actions, rewards and next nodes of state:')
for k, v in states_actions[state].items():
    print(k, v)

Example of state:
(0, 0, 0, 0, 0, -1, 1, 0, 0, 1, -1, 0, 0, 0, 0, 0)

Actions, rewards and next nodes of state:
1 {'winner': 0, 'next_node': (0, -1, 0, 0, 0, -1, -1, 0, 0, -1, 1, 0, 0, 0, 0, 0)}
4 {'winner': 0, 'next_node': (0, 0, 0, 0, -1, -1, -1, 0, 0, -1, 1, 0, 0, 0, 0, 0)}
11 {'winner': 0, 'next_node': (0, 0, 0, 0, 0, 1, -1, 0, 0, -1, -1, -1, 0, 0, 0, 0)}
14 {'winner': 0, 'next_node': (0, 0, 0, 0, 0, 1, -1, 0, 0, -1, -1, 0, 0, 0, -1, 0)}


### Questions:

In [18]:
# Board after player 1 plays action 1
board_1 = np.array(
[[0,  1,  0,  0],
[ 0,  1,  1,  0],
[ 0,  1, -1,  0],
[ 0,  0,  0,  0]])
states_actions[tuple(board_1.reshape(-1))]

KeyError: (0, 1, 0, 0, 0, 1, 1, 0, 0, 1, -1, 0, 0, 0, 0, 0)

why the above is not a valid state? Which is the state for the Board after player 1 plays action 1?

## Generate a uniform stochastic policy
It is used as initial policy to test stochastic policy evalution 

In [19]:
states_actions = np.load('states_actions.npy').item()

In [20]:
from dynamic_programming import generate_uniform_stochastic_policy
pi = generate_uniform_stochastic_policy(states_actions)

for i in range(2):
    state = list(pi.keys())[i]
    print('Example of state:')
    print(state)
    print('Actions, Probabilities:')
    for k, v in pi[state].items():
        print(k, v)
    print('------------------')

Example of state:
(0, 0, 0, 0, 0, -1, 1, 0, 0, 1, -1, 0, 0, 0, 0, 0)
Actions, Probabilities:
1 0.25
4 0.25
11 0.25
14 0.25
------------------
Example of state:
(0, -1, 0, 0, 0, -1, -1, 0, 0, -1, 1, 0, 0, 0, 0, 0)
Actions, Probabilities:
0 0.3333333333333333
2 0.3333333333333333
8 0.3333333333333333
------------------


In [21]:
pi[(0, 0, 0, 0, 0, -1, 1, 0, 0, 1, -1, 0, 0, 0, 0, 0)]

{1: 0.25, 4: 0.25, 11: 0.25, 14: 0.25}

## Policy evaluation test
### Stochastic policy evaluation

In [22]:
from dynamic_programming import policy_evaluation, stochastic_policy_eval_step
# stochastic_policy_eval_step does policy_evaluation using probabilities. Check library

In [23]:
# Run it multiple times to check it takes different number of iterations to converge
stochastic_pi = generate_uniform_stochastic_policy(states_actions)
V_stochastic, iters = policy_evaluation(stochastic_policy_eval_step, 
                             states_actions, 
                             stochastic_pi, 1e-8, verbose=1)

Iteration number:  1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 


In [24]:
print('10 examples of Value function:')
i = 0 
for k, v in V_stochastic.items():
    print(k, v)
    if i == 10:
        break
    i += 1

10 examples of Value function:
(0, 0, 0, 0, 0, -1, 1, 0, 0, 1, -1, 0, 0, 0, 0, 0) -0.19926756827206293
(0, -1, 0, 0, 0, -1, -1, 0, 0, -1, 1, 0, 0, 0, 0, 0) 0.19926756827206293
(0, 0, 0, 0, -1, -1, -1, 0, 0, -1, 1, 0, 0, 0, 0, 0) 0.19926756827206293
(0, 0, 0, 0, 0, 1, -1, 0, 0, -1, -1, -1, 0, 0, 0, 0) 0.1992675682720629
(0, 0, 0, 0, 0, 1, -1, 0, 0, -1, -1, 0, 0, 0, -1, 0) 0.19926756827206293
(-1, 1, 0, 0, 0, -1, 1, 0, 0, 1, -1, 0, 0, 0, 0, 0) -0.7334317552457347
(0, 1, -1, 0, 0, 1, -1, 0, 0, 1, -1, 0, 0, 0, 0, 0) 0.10847170895012459
(0, 1, 0, 0, 0, 1, 1, 0, -1, -1, -1, 0, 0, 0, 0, 0) 0.027157341479421185
(-1, 0, 0, 0, 1, -1, 1, 0, 0, 1, -1, 0, 0, 0, 0, 0) -0.7334317552457347
(0, 0, -1, 0, 1, 1, -1, 0, 0, 1, -1, 0, 0, 0, 0, 0) 0.027157341479421192
(0, 0, 0, 0, 1, 1, 1, 0, -1, -1, -1, 0, 0, 0, 0, 0) 0.10847170895012459


### Deterministic policy evaluation

In [25]:
from dynamic_programming import generate_deterministic_policy, deterministic_policy_eval_step

In [26]:
deterministic_pi = generate_deterministic_policy(states_actions)
for i in range(2):
    state = list(pi.keys())[i]
    print('Example of state:', state)
    print('"best" action:', deterministic_pi[state])
    print('------------------')

Example of state: (0, 0, 0, 0, 0, -1, 1, 0, 0, 1, -1, 0, 0, 0, 0, 0)
"best" action: 4
------------------
Example of state: (0, -1, 0, 0, 0, -1, -1, 0, 0, -1, 1, 0, 0, 0, 0, 0)
"best" action: 2
------------------


In [27]:
deterministic_pi[(0, 0, 0, 0, 0, -1, 1, 0, 0, 1, -1, 0, 0, 0, 0, 0)]

4

In [28]:
# Run it multiple times to check it always takes the same number of iterations
V_deterministic, iters = policy_evaluation(deterministic_policy_eval_step, 
                             states_actions, 
                             deterministic_pi, 1e-8, verbose=1)

Iteration number:  1 2 3 4 5 6 7 8 9 10 11 12 13 


In [29]:
print('10 examples of Value function:')
i = 0 
for k, v in V_deterministic.items():
    print(k, v)
    if i == 10:
        break
    i += 1

10 examples of Value function:
(0, 0, 0, 0, 0, -1, 1, 0, 0, 1, -1, 0, 0, 0, 0, 0) -1
(0, -1, 0, 0, 0, -1, -1, 0, 0, -1, 1, 0, 0, 0, 0, 0) -1
(0, 0, 0, 0, -1, -1, -1, 0, 0, -1, 1, 0, 0, 0, 0, 0) 1
(0, 0, 0, 0, 0, 1, -1, 0, 0, -1, -1, -1, 0, 0, 0, 0) 1
(0, 0, 0, 0, 0, 1, -1, 0, 0, -1, -1, 0, 0, 0, -1, 0) -1e-06
(-1, 1, 0, 0, 0, -1, 1, 0, 0, 1, -1, 0, 0, 0, 0, 0) 1e-06
(0, 1, -1, 0, 0, 1, -1, 0, 0, 1, -1, 0, 0, 0, 0, 0) 1
(0, 1, 0, 0, 0, 1, 1, 0, -1, -1, -1, 0, 0, 0, 0, 0) -1
(-1, 0, 0, 0, 1, -1, 1, 0, 0, 1, -1, 0, 0, 0, 0, 0) -1
(0, 0, -1, 0, 1, 1, -1, 0, 0, 1, -1, 0, 0, 0, 0, 0) -1
(0, 0, 0, 0, 1, 1, 1, 0, -1, -1, -1, 0, 0, 0, 0, 0) 1


See values are just 1 or -1 (They could be +-0.2 for ties)

## Policy Improve

In [30]:
from dynamic_programming import policy_improve, policy_iteration

In [31]:
initial_policy = generate_deterministic_policy(states_actions)
optimum_policy, optimum_V = policy_iteration(states_actions, initial_policy, verbose = 1)

Iteration number:  1 2 3 4 5 6 7 8 9 10 11 12 13 
Number of differences of new policy vs old policy: 23010
---------------------------
Iteration number:  1 2 3 4 5 6 7 8 9 10 11 12 13 14 
Number of differences of new policy vs old policy: 4100
---------------------------
Iteration number:  1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 
Number of differences of new policy vs old policy: 1092
---------------------------
Iteration number:  1 2 3 4 5 6 7 8 9 10 11 12 13 14 
Number of differences of new policy vs old policy: 271
---------------------------
Iteration number:  1 2 3 4 5 6 7 8 9 10 11 12 13 
Number of differences of new policy vs old policy: 66
---------------------------
Iteration number:  1 2 3 4 5 6 7 8 9 10 11 12 13 
Number of differences of new policy vs old policy: 18
---------------------------
Iteration number:  1 2 3 4 5 6 7 8 9 10 11 12 13 
Number of differences of new policy vs old policy: 9
---------------------------
Iteration number:  1 2 3 4 5 6 7 8 9 10 11 12 13 
Number 

In [32]:
n = 4
game = Game(n)
board = game.getInitBoard()
player = 1
print(board)

[[ 0  0  0  0]
 [ 0 -1  1  0]
 [ 0  1 -1  0]
 [ 0  0  0  0]]


In [33]:
# Empieza player_1
first_player = 1
print(optimum_V[tuple(first_player * board.reshape(-1))])

-1


The above result indicates that player 1 always looses

In [34]:
# Empieza player_2
first_player = -1
print(optimum_V[tuple(first_player * board.reshape(-1))])

-1


In [35]:
np.save('Value_func_only_winner', optimum_V)

In [36]:
np.save('pi_func_only_winner', optimum_policy)

In [37]:
! ls -lah *.npy

-rw-rw-r-- 1 usuario usuario  80M feb 23 02:50 pi_func.npy
-rw-rw-r-- 1 usuario usuario  80M mar 16 12:40 pi_func_only_winner.npy
-rw-r--r-- 1 usuario usuario 272M mar 16 10:17 rook_final.npy
-rw-rw-r-- 1 usuario usuario 144M mar 16 12:38 states_actions.npy
-rw-rw-r-- 1 usuario usuario  133 mar 16 10:16 Value_func_margin_reward.npy
-rw-rw-r-- 1 usuario usuario  76M feb 23 02:50 Value_func.npy
-rw-rw-r-- 1 usuario usuario  76M mar 16 12:39 Value_func_only_winner.npy
-rw-rw-r-- 1 usuario usuario  133 mar 16 10:16 Value_func_steps_reward.npy


# Lets play game with optimun policy

In [38]:
from othello.OthelloGame import OthelloGame as Game
from othello.OthelloGame import display as displayGame
from matplotlib import pyplot as plt
from matplotlib import patches
import numpy as np

from playing_stats import EvaluatePolicy

In [39]:
optimum_policy = np.load('pi_func_only_winner.npy').item()

In [40]:
evalPolicy = EvaluatePolicy(optimum_policy)

In [41]:
n = 4
game = Game(n)
board = game.getInitBoard()
player = 1

# Random vs Random as and Greedy vs Greedy as reference

In [42]:
def display_results(player_1_wins, player_2_wins, ties, margins, steps_array, pieces):
    print('player_1 wins:', str(int(100*player_1_wins/episodes + 0.5)) + '%')
    print('player_2 wins:', str(int(100*player_2_wins/episodes + 0.5)) +'%')
    print('ties:', str(int(100*ties/episodes + 0.5))+ '%')
    print('Max, Mean, Min margins: ', end ='')
    print(np.max(margins), np.mean(margins), np.min(margins))
    print('Max, Mean, Min steps: ', end ='')
    print(np.max(steps_array), np.mean(steps_array), np.min(steps_array))
    print('Max, Mean, Min pieces: ', end ='')
    print(np.max(pieces), np.mean(pieces), np.min(pieces))

In [43]:
episodes = 1000
player_1_wins, player_2_wins, ties, margins, steps_array, pieces = evalPolicy.get_stats(game, 
                                                board, 
                                                {1: evalPolicy.random_player, -1: evalPolicy.random_player}, 
                                                episodes)
display_results(player_1_wins, player_2_wins, ties, margins, steps_array, pieces)

player_1 wins: 35%
player_2 wins: 56%
ties: 9%
Max, Mean, Min margins: 16 -1.703 -16
Max, Mean, Min steps: 16 12.522 7
Max, Mean, Min pieces: 16 15.725 10


Second player seems to have more probability to win playing random

In [44]:
episodes = 1000
player_1_wins, player_2_wins, ties, margins, steps_array, pieces = evalPolicy.get_stats(game, 
                                                board, 
                                                {1: evalPolicy.greedy_player, -1: evalPolicy.greedy_player}, 
                                                episodes)
display_results(player_1_wins, player_2_wins, ties, margins, steps_array, pieces)

player_1 wins: 46%
player_2 wins: 46%
ties: 8%
Max, Mean, Min margins: 15 -0.209 -16
Max, Mean, Min steps: 15 12.196 7
Max, Mean, Min pieces: 16 15.513 10


## Policy plays second against random

In [45]:
episodes = 1000
player_1_wins, player_2_wins, ties, margins, steps_array, pieces = evalPolicy.get_stats(game, 
                                                board, 
                                                {1: evalPolicy.random_player, -1: evalPolicy.policy_player}, 
                                                episodes)
display_results(player_1_wins, player_2_wins, ties, margins, steps_array, pieces)

player_1 wins: 0%
player_2 wins: 100%
ties: 0%
Max, Mean, Min margins: -2 -6.873 -16
Max, Mean, Min steps: 14 12.653 10
Max, Mean, Min pieces: 16 15.859 13


Optimal policy always win as second player

## Policy plays first against random

In [46]:
episodes = 1000
player_1_wins, player_2_wins, ties, margins, steps_array, pieces = evalPolicy.get_stats(game, 
                                                board, 
                                                {1: evalPolicy.policy_player, -1: evalPolicy.random_player}, 
                                                episodes)
display_results(player_1_wins, player_2_wins, ties, margins, steps_array, pieces)

player_1 wins: 82%
player_2 wins: 18%
ties: 0%
Max, Mean, Min margins: 14 3.404 -9
Max, Mean, Min steps: 14 11.633 9
Max, Mean, Min pieces: 16 15.128 12


Obtimal policy can't win all but does much better than random player 1

## Policy plays first against greedy

In [47]:
episodes = 1000
player_1_wins, player_2_wins, ties, margins, steps_array, pieces = evalPolicy.get_stats(game, 
                                                board, 
                                                {1: evalPolicy.policy_player, -1: evalPolicy.greedy_player}, 
                                                episodes)
display_results(player_1_wins, player_2_wins, ties, margins, steps_array, pieces)

player_1 wins: 67%
player_2 wins: 33%
ties: 0%
Max, Mean, Min margins: 14 2.675 -9
Max, Mean, Min steps: 14 11.645 9
Max, Mean, Min pieces: 16 15.065 12


Also does good as player 1 against greedy