In [1]:
import numpy as np

In [2]:
# iterative algorithms and utility functions for Markov Decision Processes (MDPs)
import mdp

In [3]:
# environment
from env import MazeEnv

In [4]:
# "reward" on regular fields
r = -0.04
# gamma is handed to every mdp routine below (presumably the discount factor)
gamma = 0.99

print('r:', r)

# environment built from 'simple_maze.txt', with r passed as second argument
e = MazeEnv('simple_maze.txt', r)

print('gamma:', gamma)

# --- value iteration -------------------------------------------------------
u = mdp.value_iteration(e.tprob, e.rewards, gamma)
print('value function:')
# np.printoptions is missing from older NumPy releases; look it up defensively
# and fall back to the default print format when it is unavailable
_printoptions = getattr(np, 'printoptions', None)
if _printoptions is None:
    print(e.maze_array(u))
else:
    with _printoptions(precision=3):
        print(e.maze_array(u))

# --- policy derived from the utility u -------------------------------------
pol = mdp.policy_from_utility(e.tprob, u)
print('optimal policy:')
print(e.draw_policy(pol))

# --- cross-checks that are only run for gamma < 1 --------------------------
# NOTE(review): presumably these routines require a strictly discounted
# problem -- confirm against the mdp module
if gamma < 1:
    # utility recomputed from the policy should agree with u
    upol = mdp.utility_from_policy(e.tprob, e.rewards, gamma, pol)
    uerr = np.linalg.norm(upol - u)
    print('utility from policy consistency check error:', uerr)

    # policy iteration as an alternative route to the policy;
    # the difference to pol is weighted by e.pmask before taking the norm
    pal = mdp.policy_iteration(e.tprob, e.rewards, gamma)
    palerr = np.linalg.norm((pal - pol) * e.pmask)
    print('policy iteration consistency check error:', palerr)

# --- Q-value function ------------------------------------------------------
Q = mdp.q_iteration(e.tprob, e.rewards, gamma)
print(Q)

# utility recovered from Q, compared against the value-iteration result
uQ = mdp.utility_from_qvalue(Q)
uQerr = np.linalg.norm(u - uQ)
print('utility from Q-value consistency check error:', uQerr)

# policy recovered from Q, compared against pol (again weighted by e.pmask)
pQ = mdp.policy_from_qvalue(Q)
pQerr = np.linalg.norm((pQ - pol) * e.pmask)
print('policy from Q-value consistency check error:', pQerr)

# --- play a game following pol ---------------------------------------------
print('playing a game...')
e.play(pol, gamma)

r: -0.04
gamma: 0.99
value iteration with epsilon=1e-14 completed after 56 iterations
value function:
[[ 0.776  0.844  0.905  1.   ]
 [ 0.717    nan  0.641 -1.   ]
 [ 0.651  0.593  0.56   0.338]]
optimal policy:
→ → → E
↑ █ ↑ F
↑ ← ↑ ←
utility from policy consistency check error: 3.554447978966673e-16
policy iteration completed after 3 iterations
policy iteration consistency check error: 0.0
Q-value iteration with epsilon=1e-14 completed after 52 iterations
[[ 0.56476064  0.65066309  0.61068739  0.59841561]
 [ 0.52092694  0.54926123  0.59267477  0.54926123]
 [ 0.34666916  0.5600724   0.54833699  0.49571846]
 [ 0.1621969  -0.74308651  0.33804366  0.31664407]
 [ 0.66883065  0.71663212  0.66883065  0.61721832]
 [-0.68694834  0.64132736  0.61298293  0.36806875]
 [-1.         -1.         -1.         -1.        ]
 [ 0.77618555  0.7351309   0.72252791  0.68796458]
 [ 0.84393511  0.79484347  0.74183811  0.79484347]
 [ 0.9050959   0.85938553  0.78149251  0.65048085]
 [ 1.          1.          1