In [1]:
import numpy as np

In [2]:
# iterative algorithms and utility functions for Markov Decision Processes (MDPs)
import mdp

In [3]:
# environment
from env import MazeGhostEnv

In [4]:
# "reward" on regular fields
r = -0.04
print('r:', r)

# Build the environment from the maze description file and the field reward.
e = MazeGhostEnv('ghost_maze.txt', r)

# gamma is handed to the mdp solvers and to e.play below
# (presumably the usual MDP discount factor -- confirm in mdp).
gamma = 0.99
print('gamma:', gamma)


def _masked_policy_error(candidate):
    """Norm of the difference between `candidate` and `pol`, weighted by e.pmask."""
    masked_diff = (candidate - pol) * e.pmask
    return np.linalg.norm(masked_diff)


# ---- value iteration ----
u = mdp.value_iteration(e.tprob, e.rewards, gamma)
print('value function for ghost at first accessible location:')
# Only the leading len(e.locs) entries of u are shown, laid out via maze_array.
if not hasattr(np, 'printoptions'):
    # NumPy without the printoptions context manager: use default formatting.
    print(e.maze_array(u[:len(e.locs)]))
else:
    with np.printoptions(precision=3):
        print(e.maze_array(u[:len(e.locs)]))

# ---- policy derived from the utilities in u ----
pol = mdp.policy_from_utility(e.tprob, u)
print('optimal policy (all possible ghost locations):')
policy_picture = e.draw_policy(pol)
print(policy_picture)

# ---- cross-checks that are only run for gamma < 1 ----
# NOTE(review): presumably these two solvers require gamma < 1 -- confirm in mdp.
if gamma < 1:
    # Re-evaluate the policy and compare the resulting utilities with u.
    upol = mdp.utility_from_policy(e.tprob, e.rewards, gamma, pol)
    uerr = np.linalg.norm(upol - u)
    print('utility from policy consistency check error:', uerr)

    # Alternative route: policy iteration, compared with pol under e.pmask.
    pal = mdp.policy_iteration(e.tprob, e.rewards, gamma)
    palerr = _masked_policy_error(pal)
    print('policy iteration consistency check error:', palerr)

# ---- Q-value function and the quantities derived from it ----
Q = mdp.q_iteration(e.tprob, e.rewards, gamma)

# Utilities recovered from Q are compared with u from value iteration.
uQ = mdp.utility_from_qvalue(Q)
uQerr = np.linalg.norm(u - uQ)
print('utility from Q-value consistency check error:', uQerr)

# Policy recovered from Q is compared with pol under e.pmask.
pQ = mdp.policy_from_qvalue(Q)
pQerr = _masked_policy_error(pQ)
print('policy from Q-value consistency check error:', pQerr)

# ---- play a game using pol ----
print('playing a game...')
# Kept as the final bare expression so the cell's displayed result is unchanged.
e.play(pol, gamma)

r: -0.04
gamma: 0.99
value iteration with epsilon=1e-14 completed after 154 iterations
value function for ghost at first accessible location:
[[-1.223 -1.242    nan  1.   ]
 [-1.235    nan  0.824  0.924]
 [-1.141    nan  0.563 -1.   ]
 [-2.     0.035  0.302  0.064]]
optimal policy (all possible ghost locations):
↓ ← █ E
↑ █ → ↑
↓ █ ↑ F
G → ↑ ←

↓ ← █ E
↓ █ → ↑
↑ █ ↑ F
→ G ↑ ↓

↓ ← █ E
↓ █ → ↑
↓ █ ↑ F
← → G ←

↓ ← █ E
↓ █ → ↑
↓ █ ↑ F
→ ↑ ↑ G

↑ ← █ E
↓ █ → ↑
G █ ↑ F
→ → ↑ ←

↓ ← █ E
↓ █ → ↑
↓ █ G F
→ ↑ ↑ ↓

↓ ← █ E
↓ █ ↑ ↑
↓ █ ↑ G
→ → ↓ ←

↓ → █ E
G █ → ↑
↓ █ ↑ F
→ → ↑ ←

↓ ← █ E
↓ █ G ↑
↓ █ ↓ F
→ → ↓ ←

↓ ← █ E
↓ █ → G
↓ █ ← F
→ → ↑ ↓

G ← █ E
↓ █ → ↑
↓ █ ↑ F
→ → ↑ ←

↓ G █ E
↓ █ → ↑
↓ █ ↑ F
→ → ↑ ←

↓ ← █ G
↓ █ → ↑
↓ █ ↑ F
→ → ↑ ←
utility from policy consistency check error: 3.046221841855575e-15
policy iteration completed after 4 iterations
policy iteration consistency check error: 0.0
Q-value iteration with epsilon=1e-14 completed after 139 iterations
utility from Q-value consistenc