In [1]:
# Reload imported modules automatically before each cell runs, so edits to the
# project's .py files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Dynamic Programming

In [4]:
from othello.OthelloGame import OthelloGame as Game
from othello.OthelloGame import display as displayGame
from matplotlib import pyplot as plt
from matplotlib import patches
import numpy as np
from tree_search_algs import bfs_cannonical
from numba import jit

In [5]:
# Size argument for the Othello game (presumably an n x n board — confirm in OthelloGame).
n = 4
game = Game(n)
# Starting board; passed to bfs_cannonical in the next cell.
board = game.getInitBoard()

In [6]:
# Build the state/action table used by every later cell. Later code reads it as
# a mapping state -> {action: {'next_node': ..., 'reward': ...}}.
# NOTE(review): presumably a breadth-first enumeration over canonical boards —
# confirm in tree_search_algs.bfs_cannonical.
%time states_actions = bfs_cannonical(game, board)

CPU times: user 28.7 s, sys: 257 ms, total: 28.9 s
Wall time: 28.6 s


In [7]:
# len(states_actions), list(states_actions.values())[0]

In [8]:
def generate_random_policy(states_actions):
    """Build a uniform random policy over the available actions of every state.

    Args:
        states_actions: mapping ``state -> {action: data}``. Only the action
            keys are used here; the per-action ``data`` is ignored.

    Returns:
        dict mapping each state to ``{action: probability}``, where every
        action of a state gets probability ``1 / number_of_actions``. A state
        with no actions maps to an empty dict.
    """
    pi = {}
    for state, actions in states_actions.items():
        if not actions:
            # No action to choose from: nothing to distribute, and this avoids
            # dividing by zero below.
            pi[state] = {}
            continue
        prob = 1 / len(actions)
        pi[state] = dict.fromkeys(actions, prob)
    return pi

In [9]:
# Descomentar para probar imprimir resultados de policy
# pi = generate_random_policy(states_actions)
# len(pi), list(pi.values())[0]

In [10]:
# @jit
def policy_eval_step_random(states_actions, V, pi):
    """Run one in-place sweep of policy evaluation for a stochastic policy.

    Args:
        states_actions: mapping ``state -> {action: {'next_node': ..., 'reward': ...}}``.
        V: mapping ``state -> value``. It is overwritten during the sweep, so
            states visited later already see values updated earlier in the
            same sweep.
        pi: mapping ``state -> {action: probability}``.

    Returns:
        ``(V, delta)``: the same ``V`` object and the largest absolute change
        applied to any state's value during this sweep.
    """
    largest_change = 0
    for node, moves in states_actions.items():
        expected = 0
        for move, info in moves.items():
            successor = info['next_node']
            payoff = info['reward']
            weight = pi[node][move]
            # Zero reward: use the successor's value, negated. Non-zero reward
            # marks a terminal transition, and the reward itself is negated.
            term = weight*(-V[successor]) if payoff == 0 else -(weight*payoff)
            expected = expected + term
        change = np.abs(expected - V[node])
        if change > largest_change:
            largest_change = change
        V[node] = expected
    return V, largest_change

In [11]:
def policy_eval(policy_eval_step, states_actions, pi, theta, verbose=0):
    """Repeat a policy-evaluation sweep until the value function stabilises.

    Args:
        policy_eval_step: callable ``(states_actions, V, pi) -> (V, delta)``
            that performs one sweep and reports the largest value change.
        states_actions: mapping whose keys are the states to evaluate; passed
            unchanged to ``policy_eval_step``.
        pi: policy, passed unchanged to ``policy_eval_step``.
        theta: convergence threshold; sweeping stops once ``delta <= theta``.
        verbose: when truthy, print a header and the running sweep count.
            When falsy, nothing is printed.

    Returns:
        ``(V, iters)``: the value mapping (every state starts at 0) and the
        number of sweeps performed.
    """
    if verbose:
        print('Iteración numero:')

    V = dict.fromkeys(states_actions, 0)
    iters = 0
    delta = theta + 1  # start above the threshold so the first sweep runs
    while theta < delta:
        V, delta = policy_eval_step(states_actions, V, pi)
        iters += 1
        if verbose:
            print(iters, end=' ')
    if verbose:
        # Close the line of sweep counters; stay silent in non-verbose mode.
        print()
    return V, iters

In [12]:
# Evaluate the uniform random policy: sweep until the largest value change in a
# sweep is at most 1e-6, printing the sweep counter as it goes.
initial_random_pi = generate_random_policy(states_actions)
%time V, iters = policy_eval(policy_eval_step_random, states_actions, initial_random_pi, 1e-6, verbose=1)

Iteración numero:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 
CPU times: user 1.08 s, sys: 0 ns, total: 1.08 s
Wall time: 1.08 s


In [13]:
# Descomentar para ver resultados
# len(V), iters, V

# Policy fixed

In [14]:
def generate_fixed_policy_rand(states_actions):
    """Build a deterministic policy by drawing one action at random per state.

    Args:
        states_actions: mapping ``state -> {action: data}``; only the action
            keys are used.

    Returns:
        dict mapping each state to a single action drawn with
        ``np.random.choice`` from that state's available actions.
    """
    return {
        node: np.random.choice(list(moves))
        for node, moves in states_actions.items()
    }

In [15]:
def generate_fixed_policy_first(states_actions):
    """Build a deterministic policy that picks each state's first listed action.

    Args:
        states_actions: mapping ``state -> {action: data}``; only the action
            keys are used, in their iteration order.

    Returns:
        dict mapping each state to the first action of its action mapping.
    """
    return {node: list(moves)[0] for node, moves in states_actions.items()}

In [16]:
def policy_eval_step(states_actions, V, pi):
    """Run one in-place sweep of policy evaluation for a deterministic policy.

    Args:
        states_actions: mapping ``state -> {action: {'next_node': ..., 'reward': ...}}``.
        V: mapping ``state -> value``. It is overwritten during the sweep, so
            states visited later already see values updated earlier in the
            same sweep.
        pi: mapping ``state -> action`` giving the single action taken in each state.

    Returns:
        ``(V, delta)``: the same ``V`` object and the largest absolute change
        applied to any state's value during this sweep.
    """
    largest_change = 0
    for node, moves in states_actions.items():
        chosen = moves[pi[node]]
        successor = chosen['next_node']
        payoff = chosen['reward']
        # Zero reward: back up the successor's value. Non-zero reward marks a
        # terminal transition and the reward itself is used.
        backed_up = V[successor] if payoff == 0 else payoff
        # Written as 0 - x (not -x) so a float zero never becomes -0.0.
        new_value = 0 - backed_up
        change = np.abs(new_value - V[node])
        if change > largest_change:
            largest_change = change
        V[node] = new_value
    return V, largest_change

In [17]:
# Evaluate the deterministic "first action" policy with the same 1e-6 stopping
# threshold, printing the sweep counter as it goes.
initial_fixed_pi = generate_fixed_policy_first(states_actions)
%time V_fixed, iters = policy_eval(policy_eval_step, states_actions, initial_fixed_pi, 1e-6, verbose=1)

Iteración numero:
1 2 3 4 5 6 7 8 9 10 11 12 13 
CPU times: user 670 ms, sys: 7.98 ms, total: 678 ms
Wall time: 676 ms


In [18]:
from collections import Counter
# Count how many states share each value in V_fixed, most frequent value first.
Counter(list(V_fixed.values())).most_common()

[(1, 24729), (-1, 23819), (0.2, 2819), (-0.2, 2284)]

In [19]:
def policy_improve(V, states_actions):
    """Derive the greedy deterministic policy for a given value function.

    Args:
        V: mapping ``state -> value``; looked up for the successor of every
            action whose reward is zero.
        states_actions: mapping ``state -> {action: {'next_node': ..., 'reward': ...}}``.

    Returns:
        dict mapping each state to the action with the highest score, where an
        action scores ``-V[next_node]`` when its reward is zero and ``-reward``
        otherwise. Ties go to the action listed first.
    """
    improved = {}
    for node, moves in states_actions.items():
        candidates = list(moves)
        scores = []
        for info in moves.values():
            successor, payoff = info['next_node'], info['reward']
            # A non-zero reward marks a terminal transition.
            scores.append(-V[successor] if payoff == 0 else -payoff)
        best = np.argmax(np.asarray(scores, dtype=float))
        improved[node] = candidates[best]
    return improved

In [20]:
def policy_iteration(pi_old, verbose=0, theta=1e-6):
    """Alternate policy evaluation and greedy improvement until the policy is stable.

    Reads the notebook-level ``states_actions`` mapping and relies on the
    ``policy_eval``, ``policy_eval_step`` and ``policy_improve`` functions
    defined in this notebook.

    Args:
        pi_old: initial deterministic policy, mapping ``state -> action``.
        verbose: forwarded to ``policy_eval``; when truthy, also print how many
            states changed action after each improvement step.
        theta: convergence threshold forwarded to ``policy_eval``. Defaults to
            ``1e-6``, the value that used to be hard-coded.

    Returns:
        ``(pi, V)``: the final policy and the value mapping produced by the
        last evaluation.
    """
    while True:
        # Evaluate the current policy.
        V, _ = policy_eval(policy_eval_step, states_actions, pi_old, theta, verbose=verbose)
        # Improve the policy greedily with respect to those values.
        pi = policy_improve(V, states_actions)

        # Number of states whose chosen action differs from the previous policy.
        policy_updates = sum(1 for state, accion in pi.items() if accion != pi_old[state])
        pi_old = pi
        if verbose:
            print('Cantidad de diferencias de la vieja politica con la nueva:', policy_updates)
            print('---------------------------')
        if policy_updates == 0:
            # No state changed its action, so the policy is stable.
            return pi_old, V

# Two possible starting policies: the first listed action per state (disabled
# below) or one randomly drawn action per state (the one in use).
#initial_policy = generate_fixed_policy_first(states_actions)
initial_policy = generate_fixed_policy_rand(states_actions)
# Run policy iteration from that start, printing progress after each round.
final_policy, V = policy_iteration(initial_policy, verbose = 1)

Iteración numero:
1 2 3 4 5 6 7 8 9 10 11 12 13 
Cantidad de diferencias de la vieja politica con la nueva: 12703
---------------------------
Iteración numero:
1 2 3 4 5 6 7 8 9 10 11 12 13 
Cantidad de diferencias de la vieja politica con la nueva: 2207
---------------------------
Iteración numero:
1 2 3 4 5 6 7 8 9 10 11 12 13 
Cantidad de diferencias de la vieja politica con la nueva: 619
---------------------------
Iteración numero:
1 2 3 4 5 6 7 8 9 10 11 12 
Cantidad de diferencias de la vieja politica con la nueva: 184
---------------------------
Iteración numero:
1 2 3 4 5 6 7 8 9 10 11 12 13 
Cantidad de diferencias de la vieja politica con la nueva: 47
---------------------------
Iteración numero:
1 2 3 4 5 6 7 8 9 10 11 12 13 
Cantidad de diferencias de la vieja politica con la nueva: 16
---------------------------
Iteración numero:
1 2 3 4 5 6 7 8 9 10 11 12 13 
Cantidad de diferencias de la vieja politica con la nueva: 4
---------------------------
Iteración numero:
1 2 3 

# Jugamos con la policy obtenida

In [42]:
from playing_stats import EvaluatePolicy

In [43]:
# Hand the policy produced by policy iteration to the evaluation helper; the
# cells below use its random_player / policy_player / greedy_player and get_stats.
evalPolicy = EvaluatePolicy(final_policy)

In [44]:
# Re-create the game and starting board used by the evaluation matches below.
n = 4
game = Game(n)
board = game.getInitBoard()
# NOTE(review): `player` is not read by any cell visible here — confirm it is still needed.
player = 1

## Policy plays second against random

In [45]:
episodes = 1000
# Player 1 uses the random player; player -1 uses the policy player.
player_1_wins, player_2_wins, ties = evalPolicy.get_stats(
    game,
    board,
    {1: evalPolicy.random_player, -1: evalPolicy.policy_player},
    episodes,
)
# int(x + 0.5) rounds a non-negative percentage to the nearest whole number.
print('player_1 wins:', '{}%'.format(int(100*player_1_wins/episodes + 0.5)))
print('player_2 wins:', '{}%'.format(int(100*player_2_wins/episodes + 0.5)))
print('ties:', '{}%'.format(int(100*ties/episodes + 0.5)))

player_1 wins: 0%
player_2 wins: 100%
ties: 0%


Notar que la política gana siempre cuando juega segunda (como jugador -1) contra el jugador aleatorio.

## Policy plays first against random

In [49]:
episodes = 1000
# Player 1 uses the policy player; player -1 uses the random player.
player_1_wins, player_2_wins, ties = evalPolicy.get_stats(
    game,
    board,
    {1: evalPolicy.policy_player, -1: evalPolicy.random_player},
    episodes,
)
# int(x + 0.5) rounds a non-negative percentage to the nearest whole number.
print('player_1 wins:', '{}%'.format(int(100*player_1_wins/episodes + 0.5)))
print('player_2 wins:', '{}%'.format(int(100*player_2_wins/episodes + 0.5)))
print('ties:', '{}%'.format(int(100*ties/episodes + 0.5)))

player_1 wins: 81%
player_2 wins: 19%
ties: 0%


## Policy plays first against greedy

In [50]:
episodes = 1000
# Player 1 uses the policy player; player -1 uses the greedy player.
player_1_wins, player_2_wins, ties = evalPolicy.get_stats(
    game,
    board,
    {1: evalPolicy.policy_player, -1: evalPolicy.greedy_player},
    episodes,
)
# Each figure is rounded independently, so the three may not sum to exactly 100.
print('player_1 wins:', '{}%'.format(int(100*player_1_wins/episodes + 0.5)))
print('player_2 wins:', '{}%'.format(int(100*player_2_wins/episodes + 0.5)))
print('ties:', '{}%'.format(int(100*ties/episodes + 0.5)))

player_1 wins: 69%
player_2 wins: 32%
ties: 0%
