In [99]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Dynamic Programming

In [100]:
from othello.OthelloGame import OthelloGame as Game
from othello.OthelloGame import display as displayGame
from matplotlib import pyplot as plt
from matplotlib import patches
import numpy as np
from tree_search_algs import bfs_cannonical

## Tree Search to find all states
Note: we are finding all states given that player 1 (white) starts playing. 
States of player 2 are expressed in cannonical form. 

In [156]:
n = 4
game = Game(n)
board = game.getInitBoard()

In [157]:
# Find all states of game doing a search tree
first_player = 1
%time states_actions_player_1 = bfs_cannonical(game, board, first_player) # (1 white)

CPU times: user 29.4 s, sys: 167 ms, total: 29.6 s
Wall time: 29.3 s


In [158]:
# Find all states of game doing a search tree
first_player = -1
%time states_actions_player_2 = bfs_cannonical(game, board, first_player) #(-1 black)

CPU times: user 28.9 s, sys: 248 ms, total: 29.2 s
Wall time: 29 s


In [163]:
states_actions = {**states_actions_player_1, **states_actions_player_2}

In [164]:
print('Number of states:', len(states_actions_player_1))
print('Number of states:', len(states_actions_player_2))

Number of states: 53651
Number of states: 53651


In [165]:
print('Number of states:', len(states_actions))

Number of states: 91282


In [196]:
np.save('states_actions', states_actions)

In [198]:
# Los estados ocupan 144M
% ls -lah sta*

-rw-rw-r-- 1 usuario usuario 144M feb 23 02:47 states_actions.npy


In [166]:
state = list(states_actions.keys())[0]
print('Example of state:')
print(state)
print()
print('Actions, rewards and next nodes of state:')
for k, v in states_actions[state].items():
    print(k, v)

Example of state:
(0, 0, 0, 0, 0, -1, 1, 0, 0, 1, -1, 0, 0, 0, 0, 0)

Actions, rewards and next nodes of state:
1 {'reward': 0, 'next_node': (0, -1, 0, 0, 0, -1, -1, 0, 0, -1, 1, 0, 0, 0, 0, 0)}
4 {'reward': 0, 'next_node': (0, 0, 0, 0, -1, -1, -1, 0, 0, -1, 1, 0, 0, 0, 0, 0)}
11 {'reward': 0, 'next_node': (0, 0, 0, 0, 0, 1, -1, 0, 0, -1, -1, -1, 0, 0, 0, 0)}
14 {'reward': 0, 'next_node': (0, 0, 0, 0, 0, 1, -1, 0, 0, -1, -1, 0, 0, 0, -1, 0)}


### Questions:

In [167]:
# Board after player 1 plays action 1
board_1 = np.array(
[[0,  1,  0,  0],
[ 0,  1,  1,  0],
[ 0,  1, -1,  0],
[ 0,  0,  0,  0]])
states_actions[tuple(board_1.reshape(-1))]

KeyError: (0, 1, 0, 0, 0, 1, 1, 0, 0, 1, -1, 0, 0, 0, 0, 0)

In [168]:
board

array([[ 0,  0,  0,  0],
       [ 0, -1,  1,  0],
       [ 0,  1, -1,  0],
       [ 0,  0,  0,  0]])

In [169]:
board_2 = np.array(
[[0,  0,  -1,  0],
[ 0,  -1,  -1,  0],
[ 0,  1, -1,  0],
[ 0,  0,  0,  0]])
states_actions[tuple(board_2.reshape(-1))]

{1: {'reward': 0,
  'next_node': (0, -1, 1, 0, 0, -1, 1, 0, 0, -1, 1, 0, 0, 0, 0, 0)},
 3: {'reward': 0,
  'next_node': (0, 0, 1, -1, 0, 1, -1, 0, 0, -1, 1, 0, 0, 0, 0, 0)},
 11: {'reward': 0,
  'next_node': (0, 0, 1, 0, 0, 1, 1, 0, 0, -1, -1, -1, 0, 0, 0, 0)}}

why the above is not a valid state? Which is the state for the Board after player 1 plays action 1?

## Generate a uniform stochastic policy
It is used as initial policy to test stochastic policy evalution 

In [170]:
from dynamic_programming import generate_uniform_stochastic_policy
pi = generate_uniform_stochastic_policy(states_actions)

for i in range(2):
    state = list(pi.keys())[i]
    print('Example of state:')
    print(state)
    print('Actions, Probabilities:')
    for k, v in pi[state].items():
        print(k, v)
    print('------------------')

Example of state:
(0, 0, 0, 0, 0, -1, 1, 0, 0, 1, -1, 0, 0, 0, 0, 0)
Actions, Probabilities:
1 0.25
4 0.25
11 0.25
14 0.25
------------------
Example of state:
(0, -1, 0, 0, 0, -1, -1, 0, 0, -1, 1, 0, 0, 0, 0, 0)
Actions, Probabilities:
0 0.3333333333333333
2 0.3333333333333333
8 0.3333333333333333
------------------


## Policy evaluation test
### Stochastic policy evaluation

In [171]:
from dynamic_programming import policy_evaluation, stochastic_policy_eval_step
# stochastic_policy_eval_step does policy_evaluation using probabilities. Check library

In [172]:
# Run it multiple times to check it takes different number of iterations to converge
stochastic_pi = generate_uniform_stochastic_policy(states_actions)
V_stochastic, iters = policy_evaluation(stochastic_policy_eval_step, 
                             states_actions, 
                             stochastic_pi, 1e-8, verbose=1)

Iteration number:  1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 


In [173]:
print('10 examples of Value function:')
i = 0 
for k, v in V_stochastic.items():
    print(k, v)
    if i == 10:
        break
    i += 1

10 examples of Value function:
(0, 0, 0, 0, 0, -1, 1, 0, 0, 1, -1, 0, 0, 0, 0, 0) -0.20959130704875395
(0, -1, 0, 0, 0, -1, -1, 0, 0, -1, 1, 0, 0, 0, 0, 0) 0.20959130704875395
(0, 0, 0, 0, -1, -1, -1, 0, 0, -1, 1, 0, 0, 0, 0, 0) 0.20959130704875395
(0, 0, 0, 0, 0, 1, -1, 0, 0, -1, -1, -1, 0, 0, 0, 0) 0.20959130704875395
(0, 0, 0, 0, 0, 1, -1, 0, 0, -1, -1, 0, 0, 0, -1, 0) 0.209591307048754
(-1, 1, 0, 0, 0, -1, 1, 0, 0, 1, -1, 0, 0, 0, 0, 0) -0.7356919474451302
(0, 1, -1, 0, 0, 1, -1, 0, 0, 1, -1, 0, 0, 0, 0, 0) 0.0992612276020233
(0, 1, 0, 0, 0, 1, 1, 0, -1, -1, -1, 0, 0, 0, 0, 0) 0.007656798696845005
(-1, 0, 0, 0, 1, -1, 1, 0, 0, 1, -1, 0, 0, 0, 0, 0) -0.7356919474451302
(0, 0, -1, 0, 1, 1, -1, 0, 0, 1, -1, 0, 0, 0, 0, 0) 0.007656798696845005
(0, 0, 0, 0, 1, 1, 1, 0, -1, -1, -1, 0, 0, 0, 0, 0) 0.0992612276020233


### Deterministic policy evaluation

In [174]:
from dynamic_programming import generate_deterministic_policy, deterministic_policy_eval_step

In [175]:
deterministic_pi = generate_deterministic_policy(states_actions)
for i in range(2):
    state = list(pi.keys())[i]
    print('Example of state:', state)
    print('"best" action:', deterministic_pi[state])
    print('------------------')

Example of state: (0, 0, 0, 0, 0, -1, 1, 0, 0, 1, -1, 0, 0, 0, 0, 0)
"best" action: 14
------------------
Example of state: (0, -1, 0, 0, 0, -1, -1, 0, 0, -1, 1, 0, 0, 0, 0, 0)
"best" action: 8
------------------


In [176]:
# Run it multiple times to check it always takes the same number of iterations
V_deterministic, iters = policy_evaluation(deterministic_policy_eval_step, 
                             states_actions, 
                             deterministic_pi, 1e-8, verbose=1)

Iteration number:  1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 


In [177]:
print('10 examples of Value function:')
i = 0 
for k, v in V_deterministic.items():
    print(k, v)
    if i == 10:
        break
    i += 1

10 examples of Value function:
(0, 0, 0, 0, 0, -1, 1, 0, 0, 1, -1, 0, 0, 0, 0, 0) 1
(0, -1, 0, 0, 0, -1, -1, 0, 0, -1, 1, 0, 0, 0, 0, 0) 1
(0, 0, 0, 0, -1, -1, -1, 0, 0, -1, 1, 0, 0, 0, 0, 0) 1
(0, 0, 0, 0, 0, 1, -1, 0, 0, -1, -1, -1, 0, 0, 0, 0) 1
(0, 0, 0, 0, 0, 1, -1, 0, 0, -1, -1, 0, 0, 0, -1, 0) -1
(-1, 1, 0, 0, 0, -1, 1, 0, 0, 1, -1, 0, 0, 0, 0, 0) -1
(0, 1, -1, 0, 0, 1, -1, 0, 0, 1, -1, 0, 0, 0, 0, 0) -1
(0, 1, 0, 0, 0, 1, 1, 0, -1, -1, -1, 0, 0, 0, 0, 0) -1
(-1, 0, 0, 0, 1, -1, 1, 0, 0, 1, -1, 0, 0, 0, 0, 0) -1
(0, 0, -1, 0, 1, 1, -1, 0, 0, 1, -1, 0, 0, 0, 0, 0) 1
(0, 0, 0, 0, 1, 1, 1, 0, -1, -1, -1, 0, 0, 0, 0, 0) 1


See values are just 1 or -1 (They could be +-0.2 for ties)

## Policy Improve

In [178]:
from dynamic_programming import policy_improve, policy_iteration

In [179]:
initial_policy = generate_deterministic_policy(states_actions)
optimum_policy, optimum_V = policy_iteration(states_actions, initial_policy, verbose = 1)

Iteration number:  1 2 3 4 5 6 7 8 9 10 11 12 13 
Cantidad de diferencias de la vieja politica con la nueva: 23166
---------------------------
Iteration number:  1 2 3 4 5 6 7 8 9 10 11 12 13 14 
Cantidad de diferencias de la vieja politica con la nueva: 4278
---------------------------
Iteration number:  1 2 3 4 5 6 7 8 9 10 11 12 13 14 
Cantidad de diferencias de la vieja politica con la nueva: 1221
---------------------------
Iteration number:  1 2 3 4 5 6 7 8 9 10 11 12 13 
Cantidad de diferencias de la vieja politica con la nueva: 303
---------------------------
Iteration number:  1 2 3 4 5 6 7 8 9 10 11 12 13 
Cantidad de diferencias de la vieja politica con la nueva: 82
---------------------------
Iteration number:  1 2 3 4 5 6 7 8 9 10 11 12 13 
Cantidad de diferencias de la vieja politica con la nueva: 27
---------------------------
Iteration number:  1 2 3 4 5 6 7 8 9 10 11 12 13 
Cantidad de diferencias de la vieja politica con la nueva: 9
---------------------------
Iterati

In [183]:
n = 4
game = Game(n)
board = game.getInitBoard()
player = 1
print(board)

[[ 0  0  0  0]
 [ 0 -1  1  0]
 [ 0  1 -1  0]
 [ 0  0  0  0]]


In [184]:
# Empieza player_1
print(optimum_V[tuple(board.reshape(-1))])

-1


The above result indicates that player 1 always looses

In [186]:
# Empieza player_2
print(optimum_V[tuple(-board.reshape(-1))])

-1


In [200]:
np.save('Value_func', optimum_V)

In [201]:
np.save('pi_func', optimum_policy)

In [207]:
! ls -lah *.npy

-rw-rw-r-- 1 usuario usuario  80M feb 23 02:50 pi_func.npy
-rw-rw-r-- 1 usuario usuario 144M feb 23 02:47 states_actions.npy
-rw-rw-r-- 1 usuario usuario  76M feb 23 02:50 Value_func.npy


# Lets play game with optimun policy

In [187]:
from playing_stats import EvaluatePolicy

In [188]:
evalPolicy = EvaluatePolicy(optimum_policy)

In [189]:
n = 4
game = Game(n)
board = game.getInitBoard()
player = 1

# Random vs Random as a reference

In [190]:
episodes = 1000
player_1_wins, player_2_wins, ties = evalPolicy.get_stats(game, 
                                                board, 
                                                {1: evalPolicy.random_player, -1: evalPolicy.random_player}, 
                                                episodes)
print('player_1 wins:', str(int(100*player_1_wins/episodes + 0.5)) + '%')
print('player_2 wins:', str(int(100*player_2_wins/episodes + 0.5)) +'%')
print('ties:', str(int(100*ties/episodes + 0.5))+ '%')

player_1 wins: 34%
player_2 wins: 56%
ties: 9%


Second player seems to have more probability to win playing random

## Policy plays second against random

In [193]:
episodes = 1000
player_1_wins, player_2_wins, ties = evalPolicy.get_stats(game, 
                                                board, 
                                                {1: evalPolicy.random_player, -1: evalPolicy.policy_player}, 
                                                episodes)
print('player_1 wins:', str(int(100*player_1_wins/episodes + 0.5)) + '%')
print('player_2 wins:', str(int(100*player_2_wins/episodes + 0.5)) +'%')
print('ties:', str(int(100*ties/episodes + 0.5))+ '%')

player_1 wins: 0%
player_2 wins: 100%
ties: 0%


Optimal policy always win as second player

## Policy plays first against random

In [194]:
episodes = 1000
player_1_wins, player_2_wins, ties = evalPolicy.get_stats(game, 
                                                board, 
                                                {1: evalPolicy.policy_player, -1: evalPolicy.random_player}, 
                                                episodes)
print('player_1 wins:', str(int(100*player_1_wins/episodes + 0.5)) + '%')
print('player_2 wins:', str(int(100*player_2_wins/episodes + 0.5)) +'%')
print('ties:', str(int(100*ties/episodes + 0.5))+ '%')

player_1 wins: 81%
player_2 wins: 19%
ties: 0%


Obtimal policy can't win all but does much better than random player 1

## Policy plays first against greedy

In [195]:
episodes = 1000
player_1_wins, player_2_wins, ties = evalPolicy.get_stats(game, 
                                                board, 
                                                {1: evalPolicy.policy_player, -1: evalPolicy.greedy_player}, 
                                                episodes)
print('player_1 wins:', str(int(100*player_1_wins/episodes + 0.5)) + '%')
print('player_2 wins:', str(int(100*player_2_wins/episodes + 0.5)) +'%')
print('ties:', str(int(100*ties/episodes + 0.5))+ '%')

player_1 wins: 67%
player_2 wins: 33%
ties: 0%


Also does good as player 1 against greedy