In [1]:
import numpy as np

In [2]:
# policy gradient iteration
from pg import policy_gradient_iteration
# corresponding network
from policy_net import PolicyNet

In [3]:
# environment
from env import MazeEnv

In [4]:
# reference calculation
import mdp

In [5]:
# "reward" on regular fields
r = -0.04
print('r:', r)

# maze environment built from the layout file, with r passed as second argument
e = MazeEnv('simple_maze.txt', r)

# discount factor; used for training and for the reference calculation below
gamma = 0.99
print('gamma:', gamma)

# network constructed from the size of one observation and the number of actions
net = PolicyNet(e.observation(0).size, e.num_actions)

print('starting policy gradient iteration...')
net = policy_gradient_iteration(net, e, gamma, nepisodes=50000)

# turn the trained network into a deterministic policy: for every state,
# pick the action whose entry in the network output is largest
pol = np.zeros(e.num_states, dtype=int)
for s in range(e.num_states):
    # flatten the observation, then prepend an axis of length 1 for evaluation
    x = np.reshape(e.observation(s), -1)
    aprob = net.evaluate(x[np.newaxis, :])[0]
    pol[s] = np.argmax(aprob)

# reference solution: policy iteration on the environment's transition
# probabilities and rewards
pref = mdp.policy_iteration(e.tprob, e.rewards, gamma)
# value function that belongs to the reference policy
uref = mdp.utility_from_policy(e.tprob, e.rewards, gamma, pref)
# average over all entries except the last one ("game over" is left out)
umean = np.mean(uref[:-1])
print('optimal value function average:', umean)

# show the learned policy and compare it with the reference
print('policy (most likely action, for all possible ghost locations):')
print(e.draw_policy(pol))
# a state counts as a deviation when the two actions differ and the
# corresponding e.pmask entry is nonzero
ndev = np.count_nonzero((pol - pref) * e.pmask)
print('number of deviations from reference:', ndev)

r: -0.04
gamma: 0.99
starting policy gradient iteration...
episode 500 completed, nsteps: 4, total discounted reward: -1.0891, running mean: 0.367476
episode 1000 completed, nsteps: 5, total discounted reward: -1.11821, running mean: 0.124843
episode 1500 completed, nsteps: 3, total discounted reward: -1.0597, running mean: 0.0151022
episode 2000 completed, nsteps: 7, total discounted reward: 0.707401, running mean: -0.0106291
episode 2500 completed, nsteps: 6, total discounted reward: 0.75495, running mean: 0.0294901
episode 3000 completed, nsteps: 2, total discounted reward: 0.95, running mean: 0.0999806
episode 3500 completed, nsteps: 2, total discounted reward: 0.95, running mean: 0.164274
episode 4000 completed, nsteps: 3, total discounted reward: -1.0597, running mean: 0.200543
episode 4500 completed, nsteps: 2, total discounted reward: 0.95, running mean: 0.254042
frames of episode 5000:
step 0
reward: -1.0
░ ░ ░ E
░ █ ░ ☺
░ ░ ░ ░
action: ↓
_____________
Game over!
episode 5000 

episode 37500 completed, nsteps: 1, total discounted reward: 1, running mean: 0.520468
episode 38000 completed, nsteps: 6, total discounted reward: -1.14703, running mean: 0.503242
episode 38500 completed, nsteps: 5, total discounted reward: 0.80298, running mean: 0.531567
episode 39000 completed, nsteps: 1, total discounted reward: -1, running mean: 0.537886
episode 39500 completed, nsteps: 8, total discounted reward: 0.660327, running mean: 0.530623
frames of episode 40000:
step 0
reward: -1.0
░ ░ ░ E
░ █ ░ ☺
░ ░ ░ ░
action: ←
_____________
Game over!
episode 40000 completed, nsteps: 1, total discounted reward: -1, running mean: 0.512641
episode 40500 completed, nsteps: 5, total discounted reward: 0.80298, running mean: 0.528818
episode 41000 completed, nsteps: 7, total discounted reward: 0.707401, running mean: 0.517344
episode 41500 completed, nsteps: 3, total discounted reward: 0.9005, running mean: 0.51134
episode 42000 completed, nsteps: 7, total discounted reward: 0.707401, run