In [145]:
from copy import deepcopy
import numpy as np

In [146]:
# --- Gridworld configuration -------------------------------------------------
n = 4                 # the grid is n x n
discount_factor = 1   # multiplier applied to the successor cell's value

# Moves as (first-index offset, second-index offset) into the n x n arrays.
# A move's position in this list is the action id stored in `policy`.
actions = [(0, -1), (-1, 0), (0, 1), (1,0)]

# Display letter for each policy entry, keyed by str(action id).
# l/u/r/d match the offsets above (left/up/right/down when the array is
# printed); 'g' labels the -1 entries (presumably "goal" - confirm).
action_names = {'-1' : 'g', '0' : 'l', '1' : 'u', '2' : 'r', '3' : 'd'}

# Value table starts at zero and every cell starts with action id 0.
values = np.zeros((n, n))
policy = np.zeros((n, n), dtype=int)

# The two opposite corners are marked with -1.
policy[0, 0] = policy[n - 1, n - 1] = -1

In [147]:
"""
* Policy Iteration Method
"""

def evaluate_policy(theta=2, max_sweeps=None):
    """Run in-place iterative policy evaluation for the global ``policy``.

    Each sweep visits every cell of the ``n`` x ``n`` grid. For a cell whose
    ``policy`` entry is not -1, the global ``values`` entry is overwritten
    with the one-step return of the policy's action: a reward of -1 plus
    ``discount_factor`` times the value of the cell the action leads to.
    An action that would leave the grid keeps the agent in its current
    cell. Cells whose policy entry is -1 are never updated.

    Updates are written straight into ``values``, so cells visited later in
    a sweep already see the new values of cells visited earlier.

    Parameters
    ----------
    theta : float, optional
        Sweeping stops after the first sweep whose largest absolute value
        change is below ``theta``. Defaults to 2, the tolerance this
        notebook originally hard-coded. Note that with
        ``discount_factor = 1`` a cell whose action runs into the wall is
        lowered by 1 on every sweep, so for such a policy a ``theta`` of 1
        or less is never reached; pass ``max_sweeps`` in that case.
    max_sweeps : int or None, optional
        Upper bound on the number of sweeps. ``None`` (the default) means
        no bound, which is the original behaviour.

    Returns
    -------
    numpy.ndarray
        The global ``values`` array (the same object, updated in place).
    """
    sweeps = 0
    while max_sweeps is None or sweeps < max_sweeps:
        delta = 0  # largest absolute change seen during this sweep
        for i in range(n):
            for j in range(n):
                if policy[i, j] == -1:
                    # Cells marked -1 keep their value and contribute no change.
                    continue

                old_value_s = values[i, j]
                action = actions[policy[i, j]]
                reward = -1.0

                # Successor cell; a move that would leave the grid is
                # replaced by staying in place.
                x, y = i + action[0], j + action[1]
                if not (0 <= x < n and 0 <= y < n):
                    x, y = i, j

                values[i, j] = reward + discount_factor * values[x, y]
                delta = max(delta, np.abs(old_value_s - values[i, j]))

        sweeps += 1
        if delta < theta:
            break
    return values
        
def improve_policy():
    """Make the global ``policy`` greedy with respect to the global ``values``.

    For every cell whose policy entry is not -1, each move in ``actions`` is
    scored as a reward of -1 plus ``discount_factor`` times the value of the
    cell it leads to (a move that would leave the grid stays in place). The
    first move with the strictly highest score becomes the cell's new policy
    entry, stored as that move's index in ``actions``.

    Returns
    -------
    bool
        ``True`` if at least one policy entry changed, ``False`` if the
        policy was left exactly as it was.
    """
    changed = False
    for row in range(n):
        for col in range(n):
            previous = policy[row, col]
            if previous == -1:
                # Cells marked -1 are never reassigned.
                continue

            # Start from the current move; any finite score replaces it.
            chosen = actions[previous]
            top_return = -np.inf
            for move in actions:
                nxt_row, nxt_col = row + move[0], col + move[1]
                if not (0 <= nxt_row < n and 0 <= nxt_col < n):
                    # Off-grid move: the agent stays where it is.
                    nxt_row, nxt_col = row, col
                candidate = -1.0 + discount_factor * values[nxt_row, nxt_col]
                if candidate > top_return:
                    # Strict comparison: ties keep the earlier move.
                    top_return, chosen = candidate, move

            policy[row, col] = actions.index(chosen)
            if policy[row, col] != previous:
                changed = True
    return changed

In [148]:
# Policy iteration: evaluate the current policy, then improve it greedily;
# stop as soon as an improvement pass leaves every policy entry unchanged.
while True:
    values = evaluate_policy()
    if not improve_policy():
        break

In [149]:
# Replace each integer policy entry by its display letter from action_names.
# NOTE: this rebinds `policy` from an ndarray to a nested list of strings,
# so the cell cannot be run twice without rebuilding `policy` first.
policy = [
    [action_names[str(code)] for code in row]
    for row in policy.tolist()
]

In [150]:
# Show the policy-iteration result as letters (see action_names for the key).
policy

[['g', 'l', 'l', 'l'],
 ['u', 'l', 'l', 'd'],
 ['u', 'l', 'r', 'd'],
 ['u', 'r', 'r', 'g']]

In [151]:
# Show the value table returned by the last evaluate_policy() call.
values

array([[ 0., -1., -2., -3.],
       [-1., -2., -3., -2.],
       [-2., -3., -2., -1.],
       [-3., -2., -1.,  0.]])

In [159]:
"""
* Value Iteration
"""

# Start over for value iteration: every cell gets action id 0 again, the two
# opposite corners are marked with -1, and the value table is zeroed.
policy = np.zeros((n, n), dtype=int)
policy[0, 0] = policy[n - 1, n - 1] = -1
values = np.zeros((n, n))

In [160]:
theta = 1e-4  # stop once the largest value change in a sweep is below this

# Value iteration, updating `values` in place: each sweep replaces a cell's
# value by the best one-step return over all moves and records the move that
# achieved it in `policy`.
while True:
    delta = 0  # largest absolute value change seen during this sweep
    for i in range(n):
        for j in range(n):
            if policy[i, j] == -1:
                # Cells marked -1 keep their value and their policy entry.
                continue

            previous_value = values[i, j]
            top_return = -np.inf
            greedy_move = actions[policy[i, j]]
            for move in actions:
                ni, nj = i + move[0], j + move[1]
                if not (0 <= ni < n and 0 <= nj < n):
                    # Off-grid move: the agent stays where it is.
                    ni, nj = i, j
                # Reward of -1 per move plus the weighted successor value.
                candidate = -1.0 + discount_factor * values[ni, nj]
                if candidate > top_return:
                    # Strict comparison: ties keep the earlier move.
                    top_return, greedy_move = candidate, move

            values[i, j] = top_return
            policy[i, j] = actions.index(greedy_move)
            delta = max(delta, np.abs(previous_value - values[i, j]))
    if delta < theta:
        break

In [161]:
# Replace each integer policy entry by its display letter from action_names.
# NOTE: this rebinds `policy` from an ndarray to a nested list of strings,
# so the cell cannot be run twice without rebuilding `policy` first.
policy = [
    [action_names[str(code)] for code in row]
    for row in policy.tolist()
]

In [162]:
# Show the value table produced by the value-iteration loop.
values

array([[ 0., -1., -2., -3.],
       [-1., -2., -3., -2.],
       [-2., -3., -2., -1.],
       [-3., -2., -1.,  0.]])

In [163]:
# Show the value-iteration policy as letters (see action_names for the key).
policy

[['g', 'l', 'l', 'l'],
 ['u', 'l', 'l', 'd'],
 ['u', 'l', 'r', 'd'],
 ['u', 'r', 'r', 'g']]