In [1]:
import numpy as np

In [2]:
# iterative algorithms and utility functions for Markov Decision Processes (MDPs)
import mdp
# environment
from env import MazeGhostEnv

In [None]:
def q_learn(e, gamma, eta, nepisodes=100000):
    """
    Q-learning algorithm.

    Plays `nepisodes` episodes, each starting from a uniformly random state.
    Actions are drawn from a softmax distribution over the current Q-values of
    the visited state, and after every transition the entry for the visited
    state/action pair is moved towards the temporal-difference target:

        Q[s, a] += eta * (r + gamma * max_a' Q[s', a'] - Q[s, a])

    where `r` is the reward of the current state `s` and `s'` the next state.

    Args:
        e:         environment; must provide `num_states`, `num_actions`,
                   `rewards` (indexable by state) and
                   `step(s, a)` returning `(next_state, next_reward, game_over)`
        gamma:     discount factor
        eta:       learning rate
        nepisodes: number of episodes (to-be played games)

    Returns:
        numpy.ndarray: Q-value function of shape (num_states, num_actions)
    """
    Q = np.zeros((e.num_states, e.num_actions))
    # progress is reported about every 10% of the episodes; clamp to 1 so that
    # fewer than 10 episodes do not trigger a modulo-by-zero
    progress_step = max(1, nepisodes // 10)
    for i in range(nepisodes):
        # initial (uniformly random) state and reward
        s = np.random.randint(e.num_states)
        r = e.rewards[s]
        game_over = False
        while not game_over:
            # choose an action based on softmax probability derived from Q[s, :];
            # shifting by the row maximum leaves the probabilities unchanged
            # but keeps np.exp from overflowing for large Q-values
            pqs = np.exp(Q[s, :] - np.max(Q[s, :]))
            pqs /= np.sum(pqs)
            a = np.random.choice(e.num_actions, p=pqs)
            # transition to next state
            (snext, rnext, game_over) = e.step(s, a)
            # update Q-value function for state 's' and action 'a' towards the
            # one-step target built from the greedy value of the next state
            Q[s, a] += eta * (r + gamma * np.amax(Q[snext, :]) - Q[s, a])
            s = snext
            r = rnext
        if (i+1) % progress_step == 0:
            print('{}%'.format(100*(i+1)//nepisodes))
    return Q

In [3]:
# Solution:
#
#def q_learn(e, gamma, eta, nepisodes=100000):
#    """
#    Q-learning algorithm.
#
#    Args:
#        e:         environment
#        gamma:     discount factor
#        eta:       learning rate
#        nepisodes: number of episodes (to-be played games)
#
#    Returns:
#        numpy.ndarray: Q-value function
#    """
#    Q = np.zeros((e.num_states, e.num_actions))
#    for i in range(nepisodes):
#        # initial (uniformly random) state and reward
#        s = np.random.randint(e.num_states)
#        r = e.rewards[s]
#        game_over = False
#        while not game_over:
#            # choose an action based on softmax probability derived from Q[s, :]
#            pqs = np.exp(Q[s, :])
#            pqs /= np.sum(pqs)
#            a = np.random.choice(e.num_actions, p=pqs)
#            # transition to next state
#            (snext, rnext, game_over) = e.step(s, a)
#            # update Q-value function
#            Q[s, a] += eta * (r + gamma * np.amax(Q[snext, :]) - Q[s, a])
#            s = snext
#            r = rnext
#        if (i+1) % (nepisodes//10) == 0:
#            print('{}%'.format(100*(i+1)//nepisodes))
#    return Q

In [4]:
# Print the maze description file verbatim (no extra trailing newline).
with open('ghost_maze.txt', 'r') as f:
    print(f.read(), end='')

# S: start, X: inaccessible, E: exit with reward +1, F: exit with reward -1
..XE
.X..
.X.F
S...


In [5]:
# "reward" on regular fields
r = -0.04
print('r:', r)

# Environment built from the maze description and the per-field reward.
e = MazeGhostEnv('ghost_maze.txt', r)

# Discount factor.
gamma = 0.99
print('gamma:', gamma)

# Step size of the Q-learning update.
eta = 0.01
print('learning rate eta:', eta)

# Learn a Q-value function from sampled episodes and derive its policy.
# q_learn draws from NumPy's global random generator and this cell sets no
# seed, so the numbers printed below vary between runs.
print('starting Q-learning algorithm...')
Qlearn = q_learn(e, gamma, eta)
pol = mdp.policy_from_qvalue(Qlearn)

# Reference Q-value function and policy, computed by Q-value iteration from
# the environment's transition probabilities and rewards.
Qref = mdp.q_iteration(e.tprob, e.rewards, gamma)
pref = mdp.policy_from_qvalue(Qref)

# Largest absolute entry-wise difference between learned and reference
# Q-values (the infinity norm of the flattened difference).
print('|Qlearn - Qref|:', np.abs(Qlearn - Qref).max())

# Show the learned policy and count where it differs from the reference;
# the difference is multiplied by e.pmask before counting non-zero entries.
print('Qlearn policy (all possible ghost locations):')
print(e.draw_policy(pol))
masked_diff = (pol - pref) * e.pmask
print('number of deviations from reference:', np.sum(masked_diff != 0))


r: -0.04
gamma: 0.99
learning rate eta: 0.01
starting Q-learning algorithm...
10%
20%
30%
40%
50%
60%
70%
80%
90%
100%
Q-value iteration with epsilon=1e-14 completed after 139 iterations
|Qlearn - Qref|: 0.30784180848870796
Qlearn policy (all possible ghost locations):
↓ ← █ E
← █ → ↑
↓ █ ↑ F
G → ↑ ←

↓ ← █ E
↓ █ → ↑
→ █ ↑ F
↑ G ↑ ↓

↓ ← █ E
↓ █ → ↑
↓ █ ↑ F
← → G ←

↓ ← █ E
↓ █ → ↑
↓ █ ↑ F
→ ↑ ↑ G

↑ ← █ E
↓ █ → ↑
G █ ↑ F
→ → ↑ ←

↓ ← █ E
↓ █ → ↑
↓ █ G F
→ ↓ ↑ ↓

↓ ← █ E
↓ █ ↑ ↑
↓ █ ↑ G
→ → ↓ ←

↓ ↑ █ E
G █ → ↑
↓ █ ↑ F
→ → ↑ ←

↓ ← █ E
↓ █ G ↑
↓ █ ↓ F
→ → ↓ ←

↓ ← █ E
↓ █ → G
↓ █ ← F
→ → ↑ ↓

G ← █ E
↓ █ → ↑
↓ █ ↑ F
→ → ↑ ←

↓ G █ E
↓ █ → ↑
↓ █ ↑ F
→ → ↑ ←

↓ ← █ G
↓ █ → ↑
↓ █ ↑ F
→ → ↑ ←
number of deviations from reference: 5
