In [1]:
%load_ext autoreload
%autoreload 2

# Dynamic Programming

In [2]:
from othello.OthelloGame import OthelloGame as Game
from othello.OthelloGame import display as displayGame
from matplotlib import pyplot as plt
from matplotlib import patches
import numpy as np
from tree_search_algs import bfs_cannonical

## Tree Search to find all states
Note: we are finding all states given that player 1 (white) starts playing. 
States of player 2 are expressed in cannonical form. 

In [3]:
n = 4
game = Game(n)
board = game.getInitBoard()

In [4]:
# Find all states of game doing a search tree
first_player = 1
%time states_actions_player_1 = bfs_cannonical(game, board, first_player) # (1 white)

CPU times: user 55.4 s, sys: 385 ms, total: 55.8 s
Wall time: 56 s


In [5]:
# Find all states of game doing a search tree
first_player = -1
%time states_actions_player_2 = bfs_cannonical(game, board, first_player) #(-1 black)

CPU times: user 55.3 s, sys: 352 ms, total: 55.6 s
Wall time: 55.7 s


In [6]:
states_actions = {**states_actions_player_1, **states_actions_player_2}

In [7]:
print('Number of states:', len(states_actions_player_1))
print('Number of states:', len(states_actions_player_2))

Number of states: 53651
Number of states: 53651


In [8]:
print('Number of states:', len(states_actions))

Number of states: 91282


In [9]:
np.save('states_actions', states_actions)

In [30]:
# Size of the file. Only works if linux or mac. Try dir for windows
! ls -lah states_actions.npy

-rw-r--r--  1 julianganzabal  staff   144M Feb 23 12:31 states_actions.npy


In [31]:
state = list(states_actions.keys())[0]
print('Example of state:')
print(state)
print()
print('Actions, rewards and next nodes of state:')
for k, v in states_actions[state].items():
    print(k, v)

Example of state:
(0, 0, 0, 0, 0, -1, 1, 0, 0, 1, -1, 0, 0, 0, 0, 0)

Actions, rewards and next nodes of state:
1 {'reward': 0, 'next_node': (0, -1, 0, 0, 0, -1, -1, 0, 0, -1, 1, 0, 0, 0, 0, 0)}
4 {'reward': 0, 'next_node': (0, 0, 0, 0, -1, -1, -1, 0, 0, -1, 1, 0, 0, 0, 0, 0)}
11 {'reward': 0, 'next_node': (0, 0, 0, 0, 0, 1, -1, 0, 0, -1, -1, -1, 0, 0, 0, 0)}
14 {'reward': 0, 'next_node': (0, 0, 0, 0, 0, 1, -1, 0, 0, -1, -1, 0, 0, 0, -1, 0)}


### Questions:

In [32]:
# Board after player 1 plays action 1
board_1 = np.array(
[[0,  1,  0,  0],
[ 0,  1,  1,  0],
[ 0,  1, -1,  0],
[ 0,  0,  0,  0]])
states_actions[tuple(board_1.reshape(-1))]

KeyError: (0, 1, 0, 0, 0, 1, 1, 0, 0, 1, -1, 0, 0, 0, 0, 0)

why the above is not a valid state? Which is the state for the Board after player 1 plays action 1?

## Generate a uniform stochastic policy
It is used as initial policy to test stochastic policy evalution 

In [34]:
from dynamic_programming import generate_uniform_stochastic_policy
pi = generate_uniform_stochastic_policy(states_actions)

for i in range(2):
    state = list(pi.keys())[i]
    print('Example of state:')
    print(state)
    print('Actions, Probabilities:')
    for k, v in pi[state].items():
        print(k, v)
    print('------------------')

Example of state:
(0, 0, 0, 0, 0, -1, 1, 0, 0, 1, -1, 0, 0, 0, 0, 0)
Actions, Probabilities:
1 0.25
4 0.25
11 0.25
14 0.25
------------------
Example of state:
(0, -1, 0, 0, 0, -1, -1, 0, 0, -1, 1, 0, 0, 0, 0, 0)
Actions, Probabilities:
0 0.3333333333333333
2 0.3333333333333333
8 0.3333333333333333
------------------


## Policy evaluation test
### Stochastic policy evaluation

In [35]:
from dynamic_programming import policy_evaluation, stochastic_policy_eval_step
# stochastic_policy_eval_step does policy_evaluation using probabilities. Check library

In [36]:
# Run it multiple times to check it takes different number of iterations to converge
stochastic_pi = generate_uniform_stochastic_policy(states_actions)
V_stochastic, iters = policy_evaluation(stochastic_policy_eval_step, 
                             states_actions, 
                             stochastic_pi, 1e-8, verbose=1)

Iteration number:  1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 


In [37]:
print('10 examples of Value function:')
i = 0 
for k, v in V_stochastic.items():
    print(k, v)
    if i == 10:
        break
    i += 1

10 examples of Value function:
(0, 0, 0, 0, 0, -1, 1, 0, 0, 1, -1, 0, 0, 0, 0, 0) -0.20959130704875395
(0, -1, 0, 0, 0, -1, -1, 0, 0, -1, 1, 0, 0, 0, 0, 0) 0.20959130704875395
(0, 0, 0, 0, -1, -1, -1, 0, 0, -1, 1, 0, 0, 0, 0, 0) 0.20959130704875395
(0, 0, 0, 0, 0, 1, -1, 0, 0, -1, -1, -1, 0, 0, 0, 0) 0.20959130704875395
(0, 0, 0, 0, 0, 1, -1, 0, 0, -1, -1, 0, 0, 0, -1, 0) 0.209591307048754
(-1, 1, 0, 0, 0, -1, 1, 0, 0, 1, -1, 0, 0, 0, 0, 0) -0.7356919474451302
(0, 1, -1, 0, 0, 1, -1, 0, 0, 1, -1, 0, 0, 0, 0, 0) 0.0992612276020233
(0, 1, 0, 0, 0, 1, 1, 0, -1, -1, -1, 0, 0, 0, 0, 0) 0.007656798696845005
(-1, 0, 0, 0, 1, -1, 1, 0, 0, 1, -1, 0, 0, 0, 0, 0) -0.7356919474451302
(0, 0, -1, 0, 1, 1, -1, 0, 0, 1, -1, 0, 0, 0, 0, 0) 0.007656798696845005
(0, 0, 0, 0, 1, 1, 1, 0, -1, -1, -1, 0, 0, 0, 0, 0) 0.0992612276020233


### Deterministic policy evaluation

In [38]:
from dynamic_programming import generate_deterministic_policy, deterministic_policy_eval_step

In [39]:
deterministic_pi = generate_deterministic_policy(states_actions)
for i in range(2):
    state = list(pi.keys())[i]
    print('Example of state:', state)
    print('"best" action:', deterministic_pi[state])
    print('------------------')

Example of state: (0, 0, 0, 0, 0, -1, 1, 0, 0, 1, -1, 0, 0, 0, 0, 0)
"best" action: 14
------------------
Example of state: (0, -1, 0, 0, 0, -1, -1, 0, 0, -1, 1, 0, 0, 0, 0, 0)
"best" action: 2
------------------


In [40]:
# Run it multiple times to check it always takes the same number of iterations
V_deterministic, iters = policy_evaluation(deterministic_policy_eval_step, 
                             states_actions, 
                             deterministic_pi, 1e-8, verbose=1)

Iteration number:  1 2 3 4 5 6 7 8 9 10 11 12 13 14 


In [41]:
print('10 examples of Value function:')
i = 0 
for k, v in V_deterministic.items():
    print(k, v)
    if i == 10:
        break
    i += 1

10 examples of Value function:
(0, 0, 0, 0, 0, -1, 1, 0, 0, 1, -1, 0, 0, 0, 0, 0) 0.2
(0, -1, 0, 0, 0, -1, -1, 0, 0, -1, 1, 0, 0, 0, 0, 0) -1
(0, 0, 0, 0, -1, -1, -1, 0, 0, -1, 1, 0, 0, 0, 0, 0) 1
(0, 0, 0, 0, 0, 1, -1, 0, 0, -1, -1, -1, 0, 0, 0, 0) 1
(0, 0, 0, 0, 0, 1, -1, 0, 0, -1, -1, 0, 0, 0, -1, 0) -0.2
(-1, 1, 0, 0, 0, -1, 1, 0, 0, 1, -1, 0, 0, 0, 0, 0) 1
(0, 1, -1, 0, 0, 1, -1, 0, 0, 1, -1, 0, 0, 0, 0, 0) 1
(0, 1, 0, 0, 0, 1, 1, 0, -1, -1, -1, 0, 0, 0, 0, 0) 1
(-1, 0, 0, 0, 1, -1, 1, 0, 0, 1, -1, 0, 0, 0, 0, 0) -1
(0, 0, -1, 0, 1, 1, -1, 0, 0, 1, -1, 0, 0, 0, 0, 0) 1
(0, 0, 0, 0, 1, 1, 1, 0, -1, -1, -1, 0, 0, 0, 0, 0) -1


See values are just 1 or -1 (They could be +-0.2 for ties)

## Policy Improve

In [44]:
from dynamic_programming import policy_improve, policy_iteration

In [45]:
initial_policy = generate_deterministic_policy(states_actions)
optimum_policy, optimum_V = policy_iteration(states_actions, initial_policy, verbose = 1)

Iteration number:  1 2 3 4 5 6 7 8 9 10 11 12 13 14 
Number of differences of new policy vs old policy: 23051
---------------------------
Iteration number:  1 2 3 4 5 6 7 8 9 10 11 12 13 14 
Number of differences of new policy vs old policy: 3996
---------------------------
Iteration number:  1 2 3 4 5 6 7 8 9 10 11 12 13 
Number of differences of new policy vs old policy: 1172
---------------------------
Iteration number:  1 2 3 4 5 6 7 8 9 10 11 12 13 14 
Number of differences of new policy vs old policy: 293
---------------------------
Iteration number:  1 2 3 4 5 6 7 8 9 10 11 12 13 
Number of differences of new policy vs old policy: 72
---------------------------
Iteration number:  1 2 3 4 5 6 7 8 9 10 11 12 13 
Number of differences of new policy vs old policy: 23
---------------------------
Iteration number:  1 2 3 4 5 6 7 8 9 10 11 12 13 
Number of differences of new policy vs old policy: 9
---------------------------
Iteration number:  1 2 3 4 5 6 7 8 9 10 11 12 13 
Number of 

In [46]:
n = 4
game = Game(n)
board = game.getInitBoard()
player = 1
print(board)

[[ 0  0  0  0]
 [ 0 -1  1  0]
 [ 0  1 -1  0]
 [ 0  0  0  0]]


In [49]:
# Empieza player_1
first_player = 1
print(optimum_V[tuple(first_player * board.reshape(-1))])

-1


The above result indicates that player 1 always looses

In [50]:
# Empieza player_2
first_player = -1
print(optimum_V[tuple(first_player * board.reshape(-1))])

-1


In [51]:
np.save('Value_func', optimum_V)

In [52]:
np.save('pi_func', optimum_policy)

In [53]:
! ls -lah *.npy

-rw-r--r--  1 julianganzabal  staff    76M Feb 23 12:46 Value_func.npy
-rw-r--r--  1 julianganzabal  staff    80M Feb 23 12:46 pi_func.npy
-rw-r--r--  1 julianganzabal  staff   144M Feb 23 12:31 states_actions.npy


# Lets play game with optimun policy

In [54]:
from playing_stats import EvaluatePolicy

In [55]:
evalPolicy = EvaluatePolicy(optimum_policy)

In [56]:
n = 4
game = Game(n)
board = game.getInitBoard()
player = 1

# Random vs Random as and Greedy vs Greedy as reference

In [57]:
episodes = 1000
player_1_wins, player_2_wins, ties = evalPolicy.get_stats(game, 
                                                board, 
                                                {1: evalPolicy.random_player, -1: evalPolicy.random_player}, 
                                                episodes)
print('player_1 wins:', str(int(100*player_1_wins/episodes + 0.5)) + '%')
print('player_2 wins:', str(int(100*player_2_wins/episodes + 0.5)) +'%')
print('ties:', str(int(100*ties/episodes + 0.5))+ '%')

player_1 wins: 38%
player_2 wins: 53%
ties: 9%


Second player seems to have more probability to win playing random

In [62]:
episodes = 1000
player_1_wins, player_2_wins, ties = evalPolicy.get_stats(game, 
                                                board, 
                                                {1: evalPolicy.greedy_player, -1: evalPolicy.greedy_player}, 
                                                episodes)
print('player_1 wins:', str(int(100*player_1_wins/episodes + 0.5)) + '%')
print('player_2 wins:', str(int(100*player_2_wins/episodes + 0.5)) +'%')
print('ties:', str(int(100*ties/episodes + 0.5))+ '%')

player_1 wins: 44%
player_2 wins: 50%
ties: 6%


## Policy plays second against random

In [58]:
episodes = 1000
player_1_wins, player_2_wins, ties = evalPolicy.get_stats(game, 
                                                board, 
                                                {1: evalPolicy.random_player, -1: evalPolicy.policy_player}, 
                                                episodes)
print('player_1 wins:', str(int(100*player_1_wins/episodes + 0.5)) + '%')
print('player_2 wins:', str(int(100*player_2_wins/episodes + 0.5)) +'%')
print('ties:', str(int(100*ties/episodes + 0.5))+ '%')

player_1 wins: 0%
player_2 wins: 100%
ties: 0%


Optimal policy always win as second player

## Policy plays first against random

In [59]:
episodes = 1000
player_1_wins, player_2_wins, ties = evalPolicy.get_stats(game, 
                                                board, 
                                                {1: evalPolicy.policy_player, -1: evalPolicy.random_player}, 
                                                episodes)
print('player_1 wins:', str(int(100*player_1_wins/episodes + 0.5)) + '%')
print('player_2 wins:', str(int(100*player_2_wins/episodes + 0.5)) +'%')
print('ties:', str(int(100*ties/episodes + 0.5))+ '%')

player_1 wins: 82%
player_2 wins: 18%
ties: 0%


Obtimal policy can't win all but does much better than random player 1

## Policy plays first against greedy

In [60]:
episodes = 1000
player_1_wins, player_2_wins, ties = evalPolicy.get_stats(game, 
                                                board, 
                                                {1: evalPolicy.policy_player, -1: evalPolicy.greedy_player}, 
                                                episodes)
print('player_1 wins:', str(int(100*player_1_wins/episodes + 0.5)) + '%')
print('player_2 wins:', str(int(100*player_2_wins/episodes + 0.5)) +'%')
print('ties:', str(int(100*ties/episodes + 0.5))+ '%')

player_1 wins: 67%
player_2 wins: 33%
ties: 0%


Also does good as player 1 against greedy