In [1]:
import numpy as np

In [2]:
# policy gradient iteration
from pg import policy_gradient_iteration
# corresponding network
from policy_net import PolicyNet

In [3]:
# environment
from env import MazeGhostEnv

In [4]:
# reference calculation
import mdp

In [5]:
# "reward" on regular fields
r = -0.04
print('r:', r)

# build the environment from its text description, using r as the
# per-step reward on regular fields
e = MazeGhostEnv('ghost_maze.txt', r)

# value passed as `gamma` to both the training routine and the reference solver
gamma = 0.99
print('gamma:', gamma)

# network is sized from a single observation and the number of actions
net = PolicyNet(e.observation(0).size, e.num_actions)

print('starting policy gradient iteration...')
net = policy_gradient_iteration(net, e, gamma)

# greedy policy: for every state keep the action the trained network
# rates as most likely
pol = np.zeros(e.num_states, dtype=int)
for s in range(e.num_states):
    # flatten the observation, then add a leading axis of length one
    # before handing it to the network
    x = e.observation(s).reshape(-1)
    aprob = net.evaluate(x[np.newaxis, :])[0]
    pol[s] = np.argmax(aprob)

# reference policy computed by policy iteration on the environment's
# transition probabilities and rewards
pref = mdp.policy_iteration(e.tprob, e.rewards, gamma)
# value function belonging to that reference policy
uref = mdp.utility_from_policy(e.tprob, e.rewards, gamma, pref)
# the last entry ("game over") is left out of the average
umean = np.mean(uref[:-1])
print('optimal value function average:', umean)

# show the learned policy and count where it differs from the reference;
# multiplying by e.pmask zeroes out entries that should not be counted
print('policy (most likely action, for all possible ghost locations):')
print(e.draw_policy(pol))
print('number of deviations from reference:',
      np.count_nonzero((pol - pref) * e.pmask))


r: -0.04
gamma: 0.99
starting policy gradient iteration...
episode 1000 completed, nsteps: 2, total discounted reward: -1.03, running mean: -1.89397
episode 2000 completed, nsteps: 3, total discounted reward: -1.0597, running mean: -1.60206
episode 3000 completed, nsteps: 2, total discounted reward: 0.95, running mean: -1.44134
episode 4000 completed, nsteps: 1, total discounted reward: -1, running mean: -1.3422
episode 5000 completed, nsteps: 34, total discounted reward: -2.56454, running mean: -1.18577
episode 6000 completed, nsteps: 9, total discounted reward: -2.15451, running mean: -1.11136
episode 7000 completed, nsteps: 19, total discounted reward: -2.33097, running mean: -1.03203
episode 8000 completed, nsteps: 12, total discounted reward: -1.31399, running mean: -0.929293
episode 9000 completed, nsteps: 4, total discounted reward: -2.0594, running mean: -0.897773
frames of episode 10000:
step 0
reward: -0.04
░ ░ █ E
░ █ ░ G
░ █ ░ F
░ ☺ ░ ░
action: →
_____________
step 1
reward

episode 69000 completed, nsteps: 5, total discounted reward: 0.80298, running mean: -0.459147
frames of episode 70000:
step 0
reward: -0.04
░ ☺ █ G
░ █ ░ ░
░ █ ░ F
░ ░ ░ ░
action: ↑
_____________
step 1
reward: -0.04
░ ☺ █ G
░ █ ░ ░
░ █ ░ F
░ ░ ░ ░
action: ↑
_____________
step 2
reward: -0.04
░ ☺ █ E
░ █ ░ G
░ █ ░ F
░ ░ ░ ░
action: ←
_____________
step 3
reward: -0.04
☺ ░ █ E
░ █ G ░
░ █ ░ F
░ ░ ░ ░
action: ↓
_____________
step 4
reward: -0.04
░ ░ █ E
☺ █ ░ G
░ █ ░ F
░ ░ ░ ░
action: ↓
_____________
step 5
reward: -0.04
░ ░ █ G
░ █ ░ ░
☺ █ ░ F
░ ░ ░ ░
action: ↓
_____________
step 6
reward: -0.04
░ ░ █ G
░ █ ░ ░
░ █ ░ F
☺ ░ ░ ░
action: →
_____________
step 7
reward: -0.04
░ ░ █ E
░ █ ░ G
☺ █ ░ F
░ ░ ░ ░
action: ↓
_____________
step 8
reward: -0.04
░ ░ █ G
░ █ ░ ░
░ █ ░ F
☺ ░ ░ ░
action: →
_____________
step 9
reward: -0.04
░ ░ █ G
░ █ ░ ░
░ █ ░ F
░ ☺ ░ ░
action: →
_____________
step 10
reward: -0.04
░ ░ █ G
░ █ ░ ░
░ █ ░ F
░ ░ ☺ ░
action: ↑
_____________
step 11
reward: -0.04
░ ░ █ E
░ █