# Gridworld Example

Rules:
1. If you are at $A$, every action yields a reward of +10 and moves you to $A'$
2. If you are at $B$, every action yields a reward of +5 and moves you to $B'$
3. In any state other than $A$ and $B$, an action that would take you out of bounds yields a reward of -1 and leaves you in the same state
4. Any other step has zero reward

In [1]:
import numpy as np
from numba import jit
from scipy import optimize

## Suboptimal policy

To solve the gridworld example, we suppose that the agent selects each of the four actions with equal probability in every state ($\pi(a\vert s) = \frac{1}{4}$ for all $a$). By the Bellman equation for $v_\pi$, we have

$$
\begin{aligned}
    v_\pi(s) &= \sum_{a} \pi(a\vert s) \sum_{s',r} p(s', r \vert s, a)[r + \gamma v_\pi(s')]\\
             &= \frac{1}{4}\sum_{a, s', r} p(s', r \vert s, a) [r + \gamma v_\pi(s')]
\end{aligned}
$$

Hence, every state $s$ needs to satisfy

$$
    \sum_{s'}v_\pi(s') \left[\mathbb{1}(s' = s) - \frac{\gamma}{4}\sum_{a}p(s' \vert s, a)\right] = \frac{1}{4} \sum_{a,s',r} r \cdot p(s',r\vert s,a)
$$

In [2]:
# Problem dimensions: number of actions, number of distinct reward
# values, and number of grid cells (states).
n_actions, n_rewards, state_size = 4, 4, 25

# Transition tensor p(s', r | a, s), indexed as [s', r, a, s];
# allocated here as all zeros.
p_gridworld = np.zeros(shape=(state_size, n_rewards, n_actions, state_size))
p_gridworld.shape

(25, 4, 4, 25)

In [3]:
# Inclusive bounds on row/column indices: the upper bound is the integer
# square root of the state count, minus one.
grid_side = np.sqrt(state_size).astype(int)
lower_bound = 0
upper_bound = grid_side - 1

In [4]:
# Distinct reward values; reward_map gives each value's index in `rewards`.
rewards = np.array([0, 5, 10, -1])
reward_map = dict(zip(rewards, range(len(rewards))))

# Action names in index order, plus the name -> index lookup.
actions = ["up", "right", "down", "left"]
actions_ix_map = dict(zip(actions, range(len(actions))))

# (row, col) displacement produced by each action.
action_map = dict(
    up=np.array([-1, 0]),
    right=np.array([0, 1]),
    down=np.array([1, 0]),
    left=np.array([0, -1]),
)

# Reward associated with each special state.
special_map = {1: 10, 3: 5}

# Destination state for each special state (a jump target, not a
# terminal state); the special states themselves are the keys.
special_state_map = {1: 21, 3: 13}
special_states = list(special_state_map)

In [5]:
def get_pos(ix):
    """Convert a flat state index into a ``[row, col]`` position.

    States are numbered row by row on a grid that is 5 columns wide.

    Parameters
    ----------
    ix : int
        Flat state index.

    Returns
    -------
    numpy.ndarray
        Two-element array ``[row, col]``.
    """
    row, col = divmod(ix, 5)
    return np.asarray([row, col])

def get_state(position):
    """Convert a ``(row, col)`` position into its flat state index.

    Uses a fixed width of 5 columns, i.e. ``5 * row + col``.

    Parameters
    ----------
    position : sequence of two numbers
        ``(row, col)`` pair; it is unpacked into exactly two values.

    Returns
    -------
    The flat index ``5 * row + col``.
    """
    row_ix, col_ix = position
    return col_ix + row_ix * 5

def move(state, action):
    """Return the position reached by applying ``action`` from ``state``.

    The displacement is looked up in ``action_map`` and added to the
    position of ``state``. No bounds check is done here, so the result
    may lie outside the grid.

    Parameters
    ----------
    state : int
        Flat index of the current state.
    action : str
        A key of ``action_map``.

    Returns
    -------
    numpy.ndarray
        Candidate ``[row, col]`` position after the move.
    """
    return get_pos(state) + action_map[action]

def is_out_of_bounds(position, lb=0, ub=4):
    """Report whether any coordinate of ``position`` lies outside ``[lb, ub]``.

    Parameters
    ----------
    position : array-like
        Coordinates to check, e.g. a ``[row, col]`` pair.
    lb : int, optional
        Smallest valid coordinate (inclusive). Defaults to 0.
    ub : int, optional
        Largest valid coordinate (inclusive). Defaults to 4.

    Returns
    -------
    bool
        True if at least one coordinate is below ``lb`` or above ``ub``.
    """
    # Inspect the argument that was passed in, not a module-level variable,
    # so the answer depends only on the caller's input.
    coords = np.asarray(position)
    return bool(((coords < lb) | (coords > ub)).any())

In [6]:
# Transition tensor p(s', r | a, s), indexed as [s', r, a, s].
p_gridworld = np.zeros((state_size, n_rewards, n_actions, state_size))
for s in range(state_size):
    curr_pos = get_pos(s)
    for action, a_pos in actions_ix_map.items():
        # Work out where this (state, action) pair leads and which reward
        # it earns; that outcome is the same for every reward slot below.
        new_pos = move(s, action)
        if s in special_states:
            # Special states ignore the action: fixed reward, fixed jump.
            earned = special_map[s]
            new_state = special_state_map[s]
            new_pos = get_pos(new_state)
        elif is_out_of_bounds(new_pos):
            # Leaving the grid costs -1 and the agent stays where it is.
            earned = -1
            new_state, new_pos = s, curr_pos
        else:
            earned = 0
            new_state = get_state(new_pos)

        # Write a 1 in the slot of the earned reward and 0 in the others.
        for r, r_pos in reward_map.items():
            val = 1 if r == earned else 0
            if val == 1 and r == 10:
                print(f"{r=:2}, {action=:5}, {curr_pos} -> {new_pos}")
            p_gridworld[new_state, r_pos, a_pos, s] = val

# Normalise so that, for every (a, s), the entries sum to one over (s', r).
p_gridworld = p_gridworld / p_gridworld.sum(axis=(0, 1), keepdims=True)

r=10, action=up   , [0 1] -> [4 1]
r=10, action=right, [0 1] -> [4 1]
r=10, action=down , [0 1] -> [4 1]
r=10, action=left , [0 1] -> [4 1]


In [7]:
# Right-hand side of the linear system, one entry per current state s:
# (1 / 4) * Σ_{s', r, a} r * p(s', r | a, s)
reward_weights = rewards.reshape(1, -1, 1, 1)
b = (p_gridworld * reward_weights).sum(axis=(0, 1, 2)) / 4

In [8]:
# Coefficient matrix of the linear system A v = b:
# A[s, s'] = 1(s' = s) - (γ / 4) * Σ_a p(s' | s, a)
γ = 0.9
I = np.eye(state_size)
A = I - (γ / 4) * p_gridworld.sum(axis=(1, 2)).T

In [9]:
# Solve A v = b and display the state values on the 5x5 grid, to 1 decimal.
np.round(np.linalg.solve(A, b).reshape(5, 5), 1)

array([[ 3.3,  8.8,  4.4,  5.3,  1.5],
       [ 1.5,  3. ,  2.3,  1.9,  0.5],
       [ 0.1,  0.7,  0.7,  0.4, -0.4],
       [-1. , -0.4, -0.4, -0.6, -1.2],
       [-1.9, -1.3, -1.2, -1.4, -2. ]])

## Optimal Policy

$$
    v_{*}(s) = \max_{a\in\mathcal{A}} \sum_{s',r}p\left( s', r \vert s, a \right)\left[r + \gamma v_*(s')\right]
$$

In [10]:
def bellman_optimality(vs):
    """Residual of the Bellman optimality equation at ``vs``.

    Computes ``vs[s] - max_a Σ_{s', r} p(s', r | a, s) [r + γ vs[s']]`` for
    every state ``s``, reading ``rewards``, ``γ`` and ``p_gridworld`` from
    module scope. The residual is zero where ``vs`` satisfies the equation.

    Parameters
    ----------
    vs : numpy.ndarray
        One candidate value per state.

    Returns
    -------
    numpy.ndarray
        Residual with one entry per state.
    """
    # r + γ v(s') for every (s', r) pair.
    targets = rewards[None, :] + γ * vs[:, None]
    # Weight by p(s', r | a, s); trailing axes broadcast over (a, s).
    weighted = targets[..., None, None] * p_gridworld
    # Sum out s' and then r, leaving one value per (a, s).
    action_values = weighted.sum(axis=0).sum(axis=0)
    return vs - action_values.max(axis=0)

In [11]:
# Start from a random guess and find a root of the Bellman optimality
# residual, then display the result on the 5x5 grid to 1 decimal.
vs = np.random.randn(25)
vs_star = optimize.broyden1(F=bellman_optimality, xin=vs)
np.round(vs_star.reshape(5, 5), 1)

array([[22. , 24.4, 22. , 19.4, 17.5],
       [19.8, 22. , 19.8, 17.8, 16. ],
       [17.8, 19.8, 17.8, 16. , 14.4],
       [16. , 17.8, 16. , 14.4, 13. ],
       [14.4, 16. , 14.4, 13. , 11.7]])

### Optimal actions

In [54]:
# Decimals kept when comparing action values against v*.
n_dec = 3
# Action values under v*: Σ_{s', r} p(s', r | a, s) [r + γ v*(s')], shape (a, s).
q_star = (rewards[None, :] + γ * vs_star[:, None])[..., None, None] * p_gridworld
q_star = q_star.sum(axis=0).sum(axis=0)
# An action is marked optimal in s when its value matches v*(s) after rounding.
optimal_actions = q_star.round(n_dec) == vs_star[None, :].round(n_dec)

In [68]:
# One-letter labels along the action axis, in order: up, right, down, left.
action_labels = np.array(["u", "r", "d", "l"])
# Keep the label where the action is optimal, "" otherwise; one row per state.
actions_str = np.where(optimal_actions, action_labels[:, None], "").T
# Join each state's labels, pad to width 4, and lay out on the 5x5 grid.
np.array(["".join(row).ljust(4) for row in actions_str]).reshape(5, 5)

array([['r   ', 'urdl', 'l   ', 'urdl', 'l   '],
       ['ur  ', 'u   ', 'ul  ', 'l   ', 'l   '],
       ['ur  ', 'u   ', 'ul  ', 'ul  ', 'ul  '],
       ['ur  ', 'u   ', 'ul  ', 'ul  ', 'ul  '],
       ['ur  ', 'u   ', 'ul  ', 'ul  ', 'ul  ']], dtype='<U4')