In [1]:
import numpy as np

In [2]:
# Define the grid for gridworld
grid = (4, 4)

In [3]:
# Terminal states for the gridworld
terminal_states = [
    (0, 0),
    (3, 3)
]

In [4]:
def random_policy(state):
    # Policy returns random probabilities for each direction
    return np.array([0.25, 0.25, 0.25, 0.25])

In [5]:
def reward(state):
    # Reward is always -1
    return -1

In [6]:
def state_transition(state, action):
    # North is 0, East is 1, South is 2, West is 3
    if action == 0:
        state_ = (state[0]-1, state[1])
    elif action == 1:
        state_ = (state[0], state[1]+1)
    elif action == 2:
        state_ = (state[0]+1, state[1])
    elif action == 3:
        state_ = (state[0], state[1]-1)
        
    # Get reward for the next state (-1 or 0)
    r = reward(state_)
        
    # If next state is greater than the bounds, move in bounds
    state_ = (min(grid[0]-1, max(0, state_[0])), min(grid[1]-1, max(0, state_[1])))
    
    return state_, r

In [7]:
# Function used to calculate the state_value of a given state
def calculate_state_value(policy, state, d, stop_condition, discount):
    if d == stop_condition or state in terminal_states:
        return 0
    
    # Sample policy probabilities
    probs = policy(state)
    
    total_value = 0
    
    # Iterate over all possible actions
    for a in range(0, len(probs)):
        # Policy probability pi(a | s)
        p = probs[a]
        
        # With probability 1, we go to the next state
        state_, r = state_transition(state, a)
        
        # Calculate value of future state
        v = calculate_state_value(policy, state_, d+1, stop_condition, discount)
        
        # Sum of future states is just the next state as
        # we deterministically move to the next state
        # p(s', r | s, a) = 1
        sum_ = 1 * (r + discount * v)
        
        # Weigh by the policy probability
        value = p * sum_
        
        # Add to total value
        total_value += value
        
    return total_value

In [8]:
# Iterative policy evaluation, estimating v_k where k is the number of iterations
k = 10
thresh = 1e-4

# Discount factor
discount = 1.0

# state-values initialized to zeros
values = np.zeros(grid, dtype=float)


# Loop until the threshold is met
delta = 1
while delta > thresh:
    delta = 0
    # Loop over all states
    for (i, j) in np.ndindex(grid):
        # Skip terminal states
        if (i, j) in terminal_states:
            continue
        
        # Get the current state
        state = (i, j)
        
        # Get the value for this state
        value = values[state]
        
        # Update the value for this state
        values[state] = calculate_state_value(random_policy, state, 0, k, discount)
        
        # Get the delta
        delta = max(delta, abs(value - values[state]))

In [9]:
delta

0

In [10]:
values.round(2)

array([[ 0.  , -6.14, -8.35, -8.97],
       [-6.14, -7.74, -8.43, -8.35],
       [-8.35, -8.43, -7.74, -6.14],
       [-8.97, -8.35, -6.14,  0.  ]])